npm - @sanity/ailf - Versions diffs - 2.3.0 → 2.3.2 - Mend

@sanity/ailf 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/config/models.ts +7 -3
package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
package/dist/agent-observer/provider.d.ts +15 -2
package/dist/agent-observer/provider.js +91 -6
package/dist/commands/pipeline-action.js +11 -0
package/dist/config/models.ts +7 -3
package/dist/orchestration/steps/fetch-docs-step.js +23 -9
package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
package/dist/orchestration/steps/generate-configs-step.js +44 -0
package/dist/orchestration/steps/run-eval-step.js +14 -0
package/dist/webhook/eval-request-handler.js +24 -15
package/package.json +1 -1

package/config/models.ts CHANGED Viewed

@@ -37,7 +37,10 @@ export default defineModels({
     {
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
-      config: { temperature: 0.2, max_tokens: 4096 },
+      config: {
+        max_completion_tokens: 8192,
+        verbosity: "medium",
+      },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
@@ -45,8 +48,9 @@ export default defineModels({
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
-        reasoning_effort: "medium",
-        max_output_tokens: 4096,
+        reasoning: { effort: "medium", summary: "auto" },
+        verbosity: "medium",
+        max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
         maxRetries: 1,
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom

package/dist/adapters/task-sources/content-lake-task-source.js CHANGED Viewed

@@ -28,7 +28,13 @@
  */
 const TASKS_QUERY = /* groq */ `
 *[_type == "ailf.task"
-  && (!defined($areas) || area->areaId.current in $areas)
+  && (
+    !defined($areas)
+    // Current field name
+    || area->areaId.current in $areas
+    // Legacy field name (pre-schema-rename documents)
+    || featureArea->areaId.current in $areas
+  )
   && (!defined($taskIds) || id.current in $taskIds)
   && (
     // Status-based filtering (unified — replaces execution.enabled)
@@ -39,13 +45,15 @@ const TASKS_QUERY = /* groq */ `
     || (defined($taskIds) && status != "archived")
   )
   && (!defined($tags) || count((tags)[@ in $tags]) > 0)
-] | order(area->areaId.current asc, id.current asc) {
+] | order(coalesce(area->areaId.current, featureArea->areaId.current) asc, id.current asc) {
   "taskId": id.current,
-  title,
-  "areaId": area->areaId.current,
-  promptText,
+  // Coalesce current and legacy field names so documents created before
+  // the schema rename are still readable.
+  "title": coalesce(title, description),
+  "areaId": coalesce(area->areaId.current, featureArea->areaId.current),
+  "promptText": coalesce(promptText, taskPrompt),
   docCoverage,
-  "contextDocs": contextDocs[] {
+  "contextDocs": coalesce(contextDocs, canonicalDocs)[] {
     refType,
     "slug": doc->slug.current,
     "docRefId": doc->_id,
@@ -55,7 +63,7 @@ const TASKS_QUERY = /* groq */ `
     perspective,
     reason
   },
-  assertions,
+  "assertions": coalesce(assertions, assert),
   rawAssert,
   baseline,
   tags,

package/dist/agent-observer/provider.d.ts CHANGED Viewed

@@ -69,9 +69,22 @@ export default class InstrumentedProvider {
     getRecorder(): RequestRecorder;
     id(): string;
     /**
-     * Calls OpenAI Chat Completions API directly. Uses the recorder's
-     * fetch wrapper so the LLM call itself is captured in the observation log.
+     * Detect whether the model should use the Responses API based on config.
+     */
+    private isResponsesModel;
+    /**
+     * Calls OpenAI directly. Routes to the Responses API for reasoning
+     * models and Chat Completions API for everything else. Uses the
+     * recorder's fetch wrapper so the API call is captured.
      */
     private callOpenAI;
+    /**
+     * Calls OpenAI Chat Completions API (non-reasoning models).
+     */
+    private callOpenAIChatCompletions;
+    /**
+     * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
+     */
+    private callOpenAIResponses;
 }
 export {};

package/dist/agent-observer/provider.js CHANGED Viewed

@@ -94,13 +94,26 @@ export default class InstrumentedProvider {
         return `instrumented:${this.providerId}`;
     }
     /**
-     * Calls OpenAI Chat Completions API directly. Uses the recorder's
-     * fetch wrapper so the LLM call itself is captured in the observation log.
+     * Detect whether the model should use the Responses API based on config.
+     */
+    isResponsesModel() {
+        const model = this.config.modelName || this.config.model || "";
+        return (this.config.reasoning != null ||
+            this.config.reasoning_effort != null ||
+            model.startsWith("gpt-5") ||
+            model.startsWith("o1") ||
+            model.startsWith("o3") ||
+            model.startsWith("o4"));
+    }
+    /**
+     * Calls OpenAI directly. Routes to the Responses API for reasoning
+     * models and Chat Completions API for everything else. Uses the
+     * recorder's fetch wrapper so the API call is captured.
      */
     async callOpenAI(prompt) {
-        const model = this.config.model || "gpt-4o";
-        const temperature = this.config.temperature ?? 0;
-        const maxTokens = this.config.max_tokens || 4096;
+        const model = this.config.modelName ||
+            this.config.model ||
+            "gpt-4o";
         const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
         if (!apiKey) {
             return {
@@ -108,7 +121,17 @@ export default class InstrumentedProvider {
                 output: undefined,
             };
         }
-        // Use the recorder's fetch wrapper so the API call is recorded
+        if (this.isResponsesModel()) {
+            return this.callOpenAIResponses(prompt, model, apiKey);
+        }
+        return this.callOpenAIChatCompletions(prompt, model, apiKey);
+    }
+    /**
+     * Calls OpenAI Chat Completions API (non-reasoning models).
+     */
+    async callOpenAIChatCompletions(prompt, model, apiKey) {
+        const temperature = this.config.temperature ?? 0;
+        const maxTokens = this.config.max_tokens || 4096;
         const fetchFn = this.recorder.isRunning()
             ? this.recorder.fetch.bind(this.recorder)
             : globalThis.fetch;
@@ -148,4 +171,66 @@ export default class InstrumentedProvider {
             },
         };
     }
+    /**
+     * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
+     */
+    async callOpenAIResponses(prompt, model, apiKey) {
+        const maxOutputTokens = this.config.max_output_tokens || 32_000;
+        const reasoning = this.config.reasoning;
+        const reasoningEffort = reasoning?.effort || this.config.reasoning_effort || "medium";
+        const reasoningSummary = reasoning?.summary;
+        const verbosity = this.config.verbosity;
+        const fetchFn = this.recorder.isRunning()
+            ? this.recorder.fetch.bind(this.recorder)
+            : globalThis.fetch;
+        const startTime = Date.now();
+        const response = await fetchFn("https://api.openai.com/v1/responses", {
+            body: JSON.stringify({
+                input: prompt,
+                max_output_tokens: maxOutputTokens,
+                model,
+                reasoning: {
+                    effort: reasoningEffort,
+                    ...(reasoningSummary ? { summary: reasoningSummary } : {}),
+                },
+                ...(verbosity ? { text: { format: { type: "text" }, verbosity } } : {}),
+            }),
+            headers: {
+                Authorization: `Bearer ${apiKey}`,
+                "Content-Type": "application/json",
+            },
+            method: "POST",
+        });
+        const data = (await response.json());
+        if (data.error) {
+            return {
+                error: data.error.message ?? "Unknown OpenAI error",
+                output: undefined,
+            };
+        }
+        // Extract text from Responses API output format
+        let output = "";
+        for (const item of data.output ?? []) {
+            if (item.type === "message" && item.content) {
+                for (const block of item.content) {
+                    if (block.type === "output_text" && block.text) {
+                        output += block.text;
+                    }
+                }
+            }
+        }
+        return {
+            cost: calculateCost(model, data.usage?.input_tokens ?? 0, data.usage?.output_tokens ?? 0),
+            metadata: {
+                latencyMs: Date.now() - startTime,
+                model,
+            },
+            output,
+            tokenUsage: {
+                completion: data.usage?.output_tokens,
+                prompt: data.usage?.input_tokens,
+                total: data.usage?.total_tokens,
+            },
+        };
+    }
 }

package/dist/commands/pipeline-action.js CHANGED Viewed

@@ -315,6 +315,17 @@ export async function executePipeline(cliOpts) {
         }
         // Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
         config.outputDir = resolveOutputDir(cliOpts.outputDir);
+        // Capture options — CLI flags and env vars aren't in the config file,
+        // so merge them here (same logic as resolveOptions).
+        config.captureEnabled = cliOpts.capture || process.env.AILF_CAPTURE === "1";
+        if (cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR) {
+            config.captureDir = cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR;
+        }
+        config.captureCompress =
+            cliOpts.captureCompress !== false &&
+                process.env.AILF_CAPTURE_COMPRESS !== "0";
+        config.captureExtras =
+            cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
         // Create AppContext directly from the merged config so adapters
         // (especially taskSource) are wired from the file config's
         // taskSourceType — not from CLI defaults.

package/dist/config/models.ts CHANGED Viewed

@@ -37,7 +37,10 @@ export default defineModels({
     {
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
-      config: { temperature: 0.2, max_tokens: 4096 },
+      config: {
+        max_completion_tokens: 8192,
+        verbosity: "medium",
+      },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
@@ -45,8 +48,9 @@ export default defineModels({
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
-        reasoning_effort: "medium",
-        max_output_tokens: 4096,
+        reasoning: { effort: "medium", summary: "auto" },
+        verbosity: "medium",
+        max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
         maxRetries: 1,
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom

package/dist/orchestration/steps/fetch-docs-step.js CHANGED Viewed

@@ -29,15 +29,29 @@ export class FetchDocsStep {
             return { status: "skipped", reason: "--skip-fetch" };
         }
         const start = Date.now();
-        // Load tasks from the filesystem — the same source GenerateConfigsStep
-        // uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
-        // have no ailf.task documents, causing a mismatch where generated
-        // configs reference context files that were never fetched.
-        const allTasks = await loadPipelineTasks({
-            rootDir: ctx.config.rootDir,
-            mode: ctx.config.mode,
-            repoTasksPath: ctx.config.repoTasksPath,
-        });
+        // Load tasks — use the same source as GenerateConfigsStep to avoid
+        // a mismatch where configs reference context files that were never
+        // fetched.
+        //
+        // Content Lake path: use ctx.taskSource (ContentLakeTaskSource) which
+        // loads Studio-owned ailf.task documents via GROQ.
+        // Filesystem path: load from .task.ts files (repo/inline tasks).
+        let allTasks;
+        if (ctx.config.taskSourceType === "content-lake") {
+            const filter = {
+                ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
+                ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
+                ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+            };
+            allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
+        }
+        else {
+            allTasks = await loadPipelineTasks({
+                rootDir: ctx.config.rootDir,
+                mode: ctx.config.mode,
+                repoTasksPath: ctx.config.repoTasksPath,
+            });
+        }
         // Bridge: narrow to literacy tasks for canonical doc access
         const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
         const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);

package/dist/orchestration/steps/generate-configs-step.d.ts CHANGED Viewed

@@ -18,6 +18,21 @@ export declare class GenerateConfigsStep implements PipelineStep {
     private compileLiteracyVariants;
     private compileSingleMode;
     private loadTasks;
+    /**
+     * Load tasks from the Content Lake via ctx.taskSource.
+     *
+     * The ContentLakeTaskSource adapter handles area/task/tag filtering
+     * in the GROQ query itself, so we build a FilterOptions and pass it
+     * through rather than filtering in-memory after loading.
+     */
+    private loadTasksFromContentLake;
+    /**
+     * Load tasks from filesystem .task.ts files.
+     *
+     * This is the original path used for repo-based and inline tasks.
+     * It scans tasks/{mode}/ and optionally --repo-tasks-path.
+     */
+    private loadTasksFromFilesystem;
     private applyFilters;
     /**
      * Build a descriptive error message when no tasks match the current filters.

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -209,6 +209,50 @@ export class GenerateConfigsStep {
     // Task loading — unified for all modes
     // ---------------------------------------------------------------------------
     async loadTasks(ctx, mode, state) {
+        // Content Lake path — use ctx.taskSource (ContentLakeTaskSource) which
+        // loads ailf.task documents via GROQ. This is the only path that sees
+        // Studio-owned tasks (ownership: "studio").
+        if (ctx.config.taskSourceType === "content-lake") {
+            return this.loadTasksFromContentLake(ctx, state);
+        }
+        // Filesystem path — load from .task.ts files (repo tasks, inline tasks).
+        return this.loadTasksFromFilesystem(ctx, mode, state);
+    }
+    /**
+     * Load tasks from the Content Lake via ctx.taskSource.
+     *
+     * The ContentLakeTaskSource adapter handles area/task/tag filtering
+     * in the GROQ query itself, so we build a FilterOptions and pass it
+     * through rather than filtering in-memory after loading.
+     */
+    async loadTasksFromContentLake(ctx, state) {
+        const filter = {
+            ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
+            ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
+            ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+        };
+        const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
+        // Capture loaded IDs for error messages (same as filesystem path)
+        this.lastLoadedTaskIds = tasks
+            .map((t) => t.id)
+            .filter((id) => !!id);
+        // Release auto-scope
+        if (state.releaseAutoScope && !ctx.config.noAutoScope) {
+            const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
+            const beforeCount = tasks.length;
+            const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
+            ctx.logger.info(`  🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
+            return scoped;
+        }
+        return tasks;
+    }
+    /**
+     * Load tasks from filesystem .task.ts files.
+     *
+     * This is the original path used for repo-based and inline tasks.
+     * It scans tasks/{mode}/ and optionally --repo-tasks-path.
+     */
+    async loadTasksFromFilesystem(ctx, mode, state) {
         const { resolve } = await import("path");
         const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
         const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");

package/dist/orchestration/steps/run-eval-step.js CHANGED Viewed

@@ -113,6 +113,11 @@ export class RunEvalStep {
                 // required eval modes were satisfied from the remote cache.
                 state.remoteCacheHits ??= new Set();
                 state.remoteCacheHits.add(this.mode);
+                // Carry forward Promptfoo share URLs from the cached report
+                if (remoteCacheResult.promptfooUrls?.length) {
+                    state.promptfooUrls ??= [];
+                    state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
+                }
                 // Capture the restored score-summary from remote cache
                 const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
                 if (existsSync(cachedSummaryPath)) {
@@ -189,6 +194,14 @@ export class RunEvalStep {
                 mode: this.mode,
             });
         }
+        // Extract Promptfoo share URL from eval results (Step 3b)
+        if (ctx.evalRunner.extractShareUrl) {
+            const shareUrl = ctx.evalRunner.extractShareUrl(resolve(rootDir, resultsFileForMode(this.mode)));
+            if (shareUrl) {
+                state.promptfooUrls ??= [];
+                state.promptfooUrls.push({ mode: this.mode, url: shareUrl });
+            }
+        }
         const durationMs = Date.now() - start;
         return {
             durationMs,
@@ -224,6 +237,7 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
         console.log(`  ℹ️  Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
         return {
             completedAt: cachedReport.completedAt,
+            promptfooUrls: cachedReport.provenance?.promptfooUrls,
             reportId: cachedReport.id,
         };
     }

package/dist/webhook/eval-request-handler.js CHANGED Viewed

@@ -165,24 +165,33 @@ async function dispatchGitHubEval(repo, payload, config) {
     const hasPerspective = !!payload.perspective;
     const hasTasks = Array.isArray(payload.tasks) && payload.tasks.length > 0;
     const hasAreas = Array.isArray(payload.areas) && payload.areas.length > 0;
+    // Nest the PipelineRequest under `request` to stay within GitHub's
+    // 10-property limit on client_payload. Workflow-level metadata
+    // (caller_repo) stays at the top level for the workflow to read.
     const body = {
         client_payload: {
             caller_repo: "sanity-io/www-sanity-io",
-            dataset: payload.dataset,
-            mode: payload.mode,
-            projectId: payload.projectId,
-            publish: true,
-            source: "production",
-            // Release-scoped fields
-            ...(hasPerspective ? { perspective: payload.perspective } : {}),
-            // Task-scoped fields
-            ...(hasTasks ? { tasks: payload.tasks } : {}),
-            ...(hasAreas ? { areas: payload.areas } : {}),
-            ...(payload.debug ? { debug: true } : {}),
-            ...(payload.tag ? { publishTag: payload.tag } : {}),
-            ...(payload.sourceReportId
-                ? { sourceReportId: payload.sourceReportId }
-                : {}),
+            request: {
+                dataset: payload.dataset,
+                mode: payload.mode,
+                projectId: payload.projectId,
+                publish: true,
+                source: "production",
+                // Studio-initiated evals always use Content Lake as the task source.
+                // Without this, the pipeline only loads filesystem .task.ts files and
+                // Studio-owned tasks are invisible.
+                taskMode: "content-lake",
+                // Release-scoped fields
+                ...(hasPerspective ? { perspective: payload.perspective } : {}),
+                // Task-scoped fields
+                ...(hasTasks ? { tasks: payload.tasks } : {}),
+                ...(hasAreas ? { areas: payload.areas } : {}),
+                ...(payload.debug ? { debug: true } : {}),
+                ...(payload.tag ? { publishTag: payload.tag } : {}),
+                ...(payload.sourceReportId
+                    ? { sourceReportId: payload.sourceReportId }
+                    : {}),
+            },
         },
         event_type: "external-eval",
     };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "2.3.0",
+  "version": "2.3.2",
   "private": false,
   "publishConfig": {
     "access": "public"