@sanity/ailf 2.3.2 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
|
@@ -256,6 +256,13 @@ export interface GraderJudgment {
|
|
|
256
256
|
dimension: string;
|
|
257
257
|
/** The model that produced the response being graded */
|
|
258
258
|
modelId: string;
|
|
259
|
+
/**
|
|
260
|
+
* True when the model failed to produce meaningful output (empty response,
|
|
261
|
+
* API error, or refusal). Distinguishes infrastructure failures from
|
|
262
|
+
* genuinely incorrect responses — a score of 0 from no output is
|
|
263
|
+
* fundamentally different from a score of 0 from wrong output.
|
|
264
|
+
*/
|
|
265
|
+
outputFailure?: boolean;
|
|
259
266
|
/** The grader's natural language reasoning */
|
|
260
267
|
reason: string;
|
|
261
268
|
/** The numeric score (0–100) */
|
|
@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
114
114
|
for (const result of results) {
|
|
115
115
|
const taskId = result.description;
|
|
116
116
|
const modelId = result.providerId ?? result.providerLabel ?? "unknown";
|
|
117
|
+
// Detect output failures: empty/whitespace response means the model
|
|
118
|
+
// failed to produce output (API error, token exhaustion, refusal).
|
|
119
|
+
const output = result.response?.output ?? "";
|
|
120
|
+
const isOutputFailure = !output.trim();
|
|
117
121
|
for (const comp of result.gradingResult.componentResults) {
|
|
118
122
|
if (comp.assertion?.type !== "llm-rubric") {
|
|
119
123
|
continue;
|
|
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
139
143
|
// Not JSON — use raw reason string
|
|
140
144
|
}
|
|
141
145
|
}
|
|
146
|
+
// Also flag synthesized api-error judgments as output failures
|
|
147
|
+
const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
|
|
142
148
|
judgments.push({
|
|
143
149
|
dimension: kind,
|
|
144
150
|
modelId,
|
|
151
|
+
...(outputFailure && { outputFailure: true }),
|
|
145
152
|
reason,
|
|
146
153
|
score,
|
|
147
154
|
taskId,
|