npm - @sanity/ailf - Versions diffs - 2.3.1 → 2.3.3 - Mend

@sanity/ailf 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/config/models.ts +7 -3
package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
package/dist/agent-observer/provider.d.ts +15 -2
package/dist/agent-observer/provider.js +91 -6
package/dist/commands/pipeline-action.js +11 -0
package/dist/config/models.ts +7 -3
package/dist/pipeline/calculate-scores.js +7 -0
package/dist/webhook/eval-request-handler.js +24 -19
package/package.json +1 -1

package/config/models.ts CHANGED

@@ -37,7 +37,10 @@ export default defineModels({
     {
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
-      config: { temperature: 0.2, max_tokens: 4096 },
+      config: {
+        max_completion_tokens: 8192,
+        verbosity: "medium",
+      },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
@@ -45,8 +48,9 @@ export default defineModels({
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
-        reasoning_effort: "medium",
-        max_output_tokens: 4096,
+        reasoning: { effort: "medium", summary: "auto" },
+        verbosity: "medium",
+        max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
         maxRetries: 1,
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED

@@ -256,6 +256,13 @@ export interface GraderJudgment {
     dimension: string;
     /** The model that produced the response being graded */
     modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty response,
+     * API error, or refusal). Distinguishes infrastructure failures from
+     * genuinely incorrect responses — a score of 0 from no output is
+     * fundamentally different from a score of 0 from wrong output.
+     */
+    outputFailure?: boolean;
     /** The grader's natural language reasoning */
     reason: string;
     /** The numeric score (0–100) */

package/dist/agent-observer/provider.d.ts CHANGED

@@ -69,9 +69,22 @@ export default class InstrumentedProvider {
     getRecorder(): RequestRecorder;
     id(): string;
     /**
-     * Calls OpenAI Chat Completions API directly. Uses the recorder's
-     * fetch wrapper so the LLM call itself is captured in the observation log.
+     * Detect whether the model should use the Responses API based on config.
+     */
+    private isResponsesModel;
+    /**
+     * Calls OpenAI directly. Routes to the Responses API for reasoning
+     * models and Chat Completions API for everything else. Uses the
+     * recorder's fetch wrapper so the API call is captured.
      */
     private callOpenAI;
+    /**
+     * Calls OpenAI Chat Completions API (non-reasoning models).
+     */
+    private callOpenAIChatCompletions;
+    /**
+     * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
+     */
+    private callOpenAIResponses;
 }
 export {};

package/dist/agent-observer/provider.js CHANGED

@@ -94,13 +94,26 @@ export default class InstrumentedProvider {
         return `instrumented:${this.providerId}`;
     }
     /**
-     * Calls OpenAI Chat Completions API directly. Uses the recorder's
-     * fetch wrapper so the LLM call itself is captured in the observation log.
+     * Detect whether the model should use the Responses API based on config.
+     */
+    isResponsesModel() {
+        const model = this.config.modelName || this.config.model || "";
+        return (this.config.reasoning != null ||
+            this.config.reasoning_effort != null ||
+            model.startsWith("gpt-5") ||
+            model.startsWith("o1") ||
+            model.startsWith("o3") ||
+            model.startsWith("o4"));
+    }
+    /**
+     * Calls OpenAI directly. Routes to the Responses API for reasoning
+     * models and Chat Completions API for everything else. Uses the
+     * recorder's fetch wrapper so the API call is captured.
      */
     async callOpenAI(prompt) {
-        const model = this.config.model || "gpt-4o";
-        const temperature = this.config.temperature ?? 0;
-        const maxTokens = this.config.max_tokens || 4096;
+        const model = this.config.modelName ||
+            this.config.model ||
+            "gpt-4o";
         const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
         if (!apiKey) {
             return {
@@ -108,7 +121,17 @@ export default class InstrumentedProvider {
                 output: undefined,
             };
         }
-        // Use the recorder's fetch wrapper so the API call is recorded
+        if (this.isResponsesModel()) {
+            return this.callOpenAIResponses(prompt, model, apiKey);
+        }
+        return this.callOpenAIChatCompletions(prompt, model, apiKey);
+    }
+    /**
+     * Calls OpenAI Chat Completions API (non-reasoning models).
+     */
+    async callOpenAIChatCompletions(prompt, model, apiKey) {
+        const temperature = this.config.temperature ?? 0;
+        const maxTokens = this.config.max_tokens || 4096;
         const fetchFn = this.recorder.isRunning()
             ? this.recorder.fetch.bind(this.recorder)
             : globalThis.fetch;
@@ -148,4 +171,66 @@ export default class InstrumentedProvider {
             },
         };
     }
+    /**
+     * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
+     */
+    async callOpenAIResponses(prompt, model, apiKey) {
+        const maxOutputTokens = this.config.max_output_tokens || 32_000;
+        const reasoning = this.config.reasoning;
+        const reasoningEffort = reasoning?.effort || this.config.reasoning_effort || "medium";
+        const reasoningSummary = reasoning?.summary;
+        const verbosity = this.config.verbosity;
+        const fetchFn = this.recorder.isRunning()
+            ? this.recorder.fetch.bind(this.recorder)
+            : globalThis.fetch;
+        const startTime = Date.now();
+        const response = await fetchFn("https://api.openai.com/v1/responses", {
+            body: JSON.stringify({
+                input: prompt,
+                max_output_tokens: maxOutputTokens,
+                model,
+                reasoning: {
+                    effort: reasoningEffort,
+                    ...(reasoningSummary ? { summary: reasoningSummary } : {}),
+                },
+                ...(verbosity ? { text: { format: { type: "text" }, verbosity } } : {}),
+            }),
+            headers: {
+                Authorization: `Bearer ${apiKey}`,
+                "Content-Type": "application/json",
+            },
+            method: "POST",
+        });
+        const data = (await response.json());
+        if (data.error) {
+            return {
+                error: data.error.message ?? "Unknown OpenAI error",
+                output: undefined,
+            };
+        }
+        // Extract text from Responses API output format
+        let output = "";
+        for (const item of data.output ?? []) {
+            if (item.type === "message" && item.content) {
+                for (const block of item.content) {
+                    if (block.type === "output_text" && block.text) {
+                        output += block.text;
+                    }
+                }
+            }
+        }
+        return {
+            cost: calculateCost(model, data.usage?.input_tokens ?? 0, data.usage?.output_tokens ?? 0),
+            metadata: {
+                latencyMs: Date.now() - startTime,
+                model,
+            },
+            output,
+            tokenUsage: {
+                completion: data.usage?.output_tokens,
+                prompt: data.usage?.input_tokens,
+                total: data.usage?.total_tokens,
+            },
+        };
+    }
 }

package/dist/commands/pipeline-action.js CHANGED

@@ -315,6 +315,17 @@ export async function executePipeline(cliOpts) {
         }
         // Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
         config.outputDir = resolveOutputDir(cliOpts.outputDir);
+        // Capture options — CLI flags and env vars aren't in the config file,
+        // so merge them here (same logic as resolveOptions).
+        config.captureEnabled = cliOpts.capture || process.env.AILF_CAPTURE === "1";
+        if (cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR) {
+            config.captureDir = cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR;
+        }
+        config.captureCompress =
+            cliOpts.captureCompress !== false &&
+                process.env.AILF_CAPTURE_COMPRESS !== "0";
+        config.captureExtras =
+            cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
         // Create AppContext directly from the merged config so adapters
         // (especially taskSource) are wired from the file config's
         // taskSourceType — not from CLI defaults.

package/dist/config/models.ts CHANGED

@@ -37,7 +37,10 @@ export default defineModels({
     {
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
-      config: { temperature: 0.2, max_tokens: 4096 },
+      config: {
+        max_completion_tokens: 8192,
+        verbosity: "medium",
+      },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
@@ -45,8 +48,9 @@ export default defineModels({
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
-        reasoning_effort: "medium",
-        max_output_tokens: 4096,
+        reasoning: { effort: "medium", summary: "auto" },
+        verbosity: "medium",
+        max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
         maxRetries: 1,
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom

package/dist/pipeline/calculate-scores.js CHANGED

@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
     for (const result of results) {
         const taskId = result.description;
         const modelId = result.providerId ?? result.providerLabel ?? "unknown";
+        // Detect output failures: empty/whitespace response means the model
+        // failed to produce output (API error, token exhaustion, refusal).
+        const output = result.response?.output ?? "";
+        const isOutputFailure = !output.trim();
         for (const comp of result.gradingResult.componentResults) {
             if (comp.assertion?.type !== "llm-rubric") {
                 continue;
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
                     // Not JSON — use raw reason string
                 }
             }
+            // Also flag synthesized api-error judgments as output failures
+            const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
             judgments.push({
                 dimension: kind,
                 modelId,
+                ...(outputFailure && { outputFailure: true }),
                 reason,
                 score,
                 taskId,

package/dist/webhook/eval-request-handler.js CHANGED

@@ -165,28 +165,33 @@ async function dispatchGitHubEval(repo, payload, config) {
     const hasPerspective = !!payload.perspective;
     const hasTasks = Array.isArray(payload.tasks) && payload.tasks.length > 0;
     const hasAreas = Array.isArray(payload.areas) && payload.areas.length > 0;
+    // Nest the PipelineRequest under `request` to stay within GitHub's
+    // 10-property limit on client_payload. Workflow-level metadata
+    // (caller_repo) stays at the top level for the workflow to read.
     const body = {
         client_payload: {
             caller_repo: "sanity-io/www-sanity-io",
-            dataset: payload.dataset,
-            mode: payload.mode,
-            projectId: payload.projectId,
-            publish: true,
-            source: "production",
-            // Studio-initiated evals always use Content Lake as the task source.
-            // Without this, the pipeline only loads filesystem .task.ts files and
-            // Studio-owned tasks are invisible.
-            taskMode: "content-lake",
-            // Release-scoped fields
-            ...(hasPerspective ? { perspective: payload.perspective } : {}),
-            // Task-scoped fields
-            ...(hasTasks ? { tasks: payload.tasks } : {}),
-            ...(hasAreas ? { areas: payload.areas } : {}),
-            ...(payload.debug ? { debug: true } : {}),
-            ...(payload.tag ? { publishTag: payload.tag } : {}),
-            ...(payload.sourceReportId
-                ? { sourceReportId: payload.sourceReportId }
-                : {}),
+            request: {
+                dataset: payload.dataset,
+                mode: payload.mode,
+                projectId: payload.projectId,
+                publish: true,
+                source: "production",
+                // Studio-initiated evals always use Content Lake as the task source.
+                // Without this, the pipeline only loads filesystem .task.ts files and
+                // Studio-owned tasks are invisible.
+                taskMode: "content-lake",
+                // Release-scoped fields
+                ...(hasPerspective ? { perspective: payload.perspective } : {}),
+                // Task-scoped fields
+                ...(hasTasks ? { tasks: payload.tasks } : {}),
+                ...(hasAreas ? { areas: payload.areas } : {}),
+                ...(payload.debug ? { debug: true } : {}),
+                ...(payload.tag ? { publishTag: payload.tag } : {}),
+                ...(payload.sourceReportId
+                    ? { sourceReportId: payload.sourceReportId }
+                    : {}),
+            },
         },
         event_type: "external-eval",
     };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "2.3.1",
+  "version": "2.3.3",
   "private": false,
   "publishConfig": {
     "access": "public"