npm - @sanity/ailf - Versions diffs - 2.3.2 → 2.3.3 - Mend

@sanity/ailf 2.3.2 → 2.3.3

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (3)

package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
package/dist/pipeline/calculate-scores.js +7 -0
package/package.json +1 -1

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -256,6 +256,13 @@ export interface GraderJudgment {
     dimension: string;
     /** The model that produced the response being graded */
     modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty response,
+     * API error, or refusal). Distinguishes infrastructure failures from
+     * genuinely incorrect responses — a score of 0 from no output is
+     * fundamentally different from a score of 0 from wrong output.
+     */
+    outputFailure?: boolean;
     /** The grader's natural language reasoning */
     reason: string;
     /** The numeric score (0–100) */

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
     for (const result of results) {
         const taskId = result.description;
         const modelId = result.providerId ?? result.providerLabel ?? "unknown";
+        // Detect output failures: empty/whitespace response means the model
+        // failed to produce output (API error, token exhaustion, refusal).
+        const output = result.response?.output ?? "";
+        const isOutputFailure = !output.trim();
         for (const comp of result.gradingResult.componentResults) {
             if (comp.assertion?.type !== "llm-rubric") {
                 continue;
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
                     // Not JSON — use raw reason string
                 }
             }
+            // Also flag synthesized api-error judgments as output failures
+            const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
             judgments.push({
                 dimension: kind,
                 modelId,
+                ...(outputFailure && { outputFailure: true }),
                 reason,
                 score,
                 taskId,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "2.3.2",
+  "version": "2.3.3",
   "private": false,
   "publishConfig": {
     "access": "public"