@sanity/ailf 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.ts +7 -3
- package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
- package/dist/agent-observer/provider.d.ts +15 -2
- package/dist/agent-observer/provider.js +91 -6
- package/dist/commands/pipeline-action.js +11 -0
- package/dist/config/models.ts +7 -3
- package/dist/pipeline/calculate-scores.js +7 -0
- package/dist/webhook/eval-request-handler.js +24 -19
- package/package.json +1 -1
package/config/models.ts
CHANGED
|
@@ -37,7 +37,10 @@ export default defineModels({
|
|
|
37
37
|
{
|
|
38
38
|
id: "openai:chat:gpt-5.2",
|
|
39
39
|
label: "GPT 5.2",
|
|
40
|
-
config: {
|
|
40
|
+
config: {
|
|
41
|
+
max_completion_tokens: 8192,
|
|
42
|
+
verbosity: "medium",
|
|
43
|
+
},
|
|
41
44
|
modes: ["literacy", "knowledge-probe"],
|
|
42
45
|
// All literacy variants included by default
|
|
43
46
|
},
|
|
@@ -45,8 +48,9 @@ export default defineModels({
|
|
|
45
48
|
id: "openai:responses:gpt-5.4",
|
|
46
49
|
label: "GPT 5.4",
|
|
47
50
|
config: {
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
reasoning: { effort: "medium", summary: "auto" },
|
|
52
|
+
verbosity: "medium",
|
|
53
|
+
max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
|
|
50
54
|
maxRetries: 1,
|
|
51
55
|
},
|
|
52
56
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
package/dist/_vendor/ailf-core/types/index.d.ts
CHANGED
|
@@ -256,6 +256,13 @@ export interface GraderJudgment {
|
|
|
256
256
|
dimension: string;
|
|
257
257
|
/** The model that produced the response being graded */
|
|
258
258
|
modelId: string;
|
|
259
|
+
/**
|
|
260
|
+
* True when the model failed to produce meaningful output (empty response,
|
|
261
|
+
* API error, or refusal). Distinguishes infrastructure failures from
|
|
262
|
+
* genuinely incorrect responses — a score of 0 from no output is
|
|
263
|
+
* fundamentally different from a score of 0 from wrong output.
|
|
264
|
+
*/
|
|
265
|
+
outputFailure?: boolean;
|
|
259
266
|
/** The grader's natural language reasoning */
|
|
260
267
|
reason: string;
|
|
261
268
|
/** The numeric score (0–100) */
|
|
package/dist/agent-observer/provider.d.ts
CHANGED
|
@@ -69,9 +69,22 @@ export default class InstrumentedProvider {
|
|
|
69
69
|
getRecorder(): RequestRecorder;
|
|
70
70
|
id(): string;
|
|
71
71
|
/**
|
|
72
|
-
*
|
|
73
|
-
|
|
72
|
+
* Detect whether the model should use the Responses API based on config.
|
|
73
|
+
*/
|
|
74
|
+
private isResponsesModel;
|
|
75
|
+
/**
|
|
76
|
+
* Calls OpenAI directly. Routes to the Responses API for reasoning
|
|
77
|
+
* models and Chat Completions API for everything else. Uses the
|
|
78
|
+
* recorder's fetch wrapper so the API call is captured.
|
|
74
79
|
*/
|
|
75
80
|
private callOpenAI;
|
|
81
|
+
/**
|
|
82
|
+
* Calls OpenAI Chat Completions API (non-reasoning models).
|
|
83
|
+
*/
|
|
84
|
+
private callOpenAIChatCompletions;
|
|
85
|
+
/**
|
|
86
|
+
* Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
|
|
87
|
+
*/
|
|
88
|
+
private callOpenAIResponses;
|
|
76
89
|
}
|
|
77
90
|
export {};
|
|
package/dist/agent-observer/provider.js
CHANGED
|
@@ -94,13 +94,26 @@ export default class InstrumentedProvider {
|
|
|
94
94
|
return `instrumented:${this.providerId}`;
|
|
95
95
|
}
|
|
96
96
|
/**
|
|
97
|
-
*
|
|
98
|
-
|
|
97
|
+
* Detect whether the model should use the Responses API based on config.
|
|
98
|
+
*/
|
|
99
|
+
isResponsesModel() {
|
|
100
|
+
const model = this.config.modelName || this.config.model || "";
|
|
101
|
+
return (this.config.reasoning != null ||
|
|
102
|
+
this.config.reasoning_effort != null ||
|
|
103
|
+
model.startsWith("gpt-5") ||
|
|
104
|
+
model.startsWith("o1") ||
|
|
105
|
+
model.startsWith("o3") ||
|
|
106
|
+
model.startsWith("o4"));
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Calls OpenAI directly. Routes to the Responses API for reasoning
|
|
110
|
+
* models and Chat Completions API for everything else. Uses the
|
|
111
|
+
* recorder's fetch wrapper so the API call is captured.
|
|
99
112
|
*/
|
|
100
113
|
async callOpenAI(prompt) {
|
|
101
|
-
const model = this.config.
|
|
102
|
-
|
|
103
|
-
|
|
114
|
+
const model = this.config.modelName ||
|
|
115
|
+
this.config.model ||
|
|
116
|
+
"gpt-4o";
|
|
104
117
|
const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
|
|
105
118
|
if (!apiKey) {
|
|
106
119
|
return {
|
|
@@ -108,7 +121,17 @@ export default class InstrumentedProvider {
|
|
|
108
121
|
output: undefined,
|
|
109
122
|
};
|
|
110
123
|
}
|
|
111
|
-
|
|
124
|
+
if (this.isResponsesModel()) {
|
|
125
|
+
return this.callOpenAIResponses(prompt, model, apiKey);
|
|
126
|
+
}
|
|
127
|
+
return this.callOpenAIChatCompletions(prompt, model, apiKey);
|
|
128
|
+
}
|
|
129
|
+
/**
|
|
130
|
+
* Calls OpenAI Chat Completions API (non-reasoning models).
|
|
131
|
+
*/
|
|
132
|
+
async callOpenAIChatCompletions(prompt, model, apiKey) {
|
|
133
|
+
const temperature = this.config.temperature ?? 0;
|
|
134
|
+
const maxTokens = this.config.max_tokens || 4096;
|
|
112
135
|
const fetchFn = this.recorder.isRunning()
|
|
113
136
|
? this.recorder.fetch.bind(this.recorder)
|
|
114
137
|
: globalThis.fetch;
|
|
@@ -148,4 +171,66 @@ export default class InstrumentedProvider {
|
|
|
148
171
|
},
|
|
149
172
|
};
|
|
150
173
|
}
|
|
174
|
+
/**
|
|
175
|
+
* Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
|
|
176
|
+
*/
|
|
177
|
+
async callOpenAIResponses(prompt, model, apiKey) {
|
|
178
|
+
const maxOutputTokens = this.config.max_output_tokens || 32_000;
|
|
179
|
+
const reasoning = this.config.reasoning;
|
|
180
|
+
const reasoningEffort = reasoning?.effort || this.config.reasoning_effort || "medium";
|
|
181
|
+
const reasoningSummary = reasoning?.summary;
|
|
182
|
+
const verbosity = this.config.verbosity;
|
|
183
|
+
const fetchFn = this.recorder.isRunning()
|
|
184
|
+
? this.recorder.fetch.bind(this.recorder)
|
|
185
|
+
: globalThis.fetch;
|
|
186
|
+
const startTime = Date.now();
|
|
187
|
+
const response = await fetchFn("https://api.openai.com/v1/responses", {
|
|
188
|
+
body: JSON.stringify({
|
|
189
|
+
input: prompt,
|
|
190
|
+
max_output_tokens: maxOutputTokens,
|
|
191
|
+
model,
|
|
192
|
+
reasoning: {
|
|
193
|
+
effort: reasoningEffort,
|
|
194
|
+
...(reasoningSummary ? { summary: reasoningSummary } : {}),
|
|
195
|
+
},
|
|
196
|
+
...(verbosity ? { text: { format: { type: "text" }, verbosity } } : {}),
|
|
197
|
+
}),
|
|
198
|
+
headers: {
|
|
199
|
+
Authorization: `Bearer ${apiKey}`,
|
|
200
|
+
"Content-Type": "application/json",
|
|
201
|
+
},
|
|
202
|
+
method: "POST",
|
|
203
|
+
});
|
|
204
|
+
const data = (await response.json());
|
|
205
|
+
if (data.error) {
|
|
206
|
+
return {
|
|
207
|
+
error: data.error.message ?? "Unknown OpenAI error",
|
|
208
|
+
output: undefined,
|
|
209
|
+
};
|
|
210
|
+
}
|
|
211
|
+
// Extract text from Responses API output format
|
|
212
|
+
let output = "";
|
|
213
|
+
for (const item of data.output ?? []) {
|
|
214
|
+
if (item.type === "message" && item.content) {
|
|
215
|
+
for (const block of item.content) {
|
|
216
|
+
if (block.type === "output_text" && block.text) {
|
|
217
|
+
output += block.text;
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
return {
|
|
223
|
+
cost: calculateCost(model, data.usage?.input_tokens ?? 0, data.usage?.output_tokens ?? 0),
|
|
224
|
+
metadata: {
|
|
225
|
+
latencyMs: Date.now() - startTime,
|
|
226
|
+
model,
|
|
227
|
+
},
|
|
228
|
+
output,
|
|
229
|
+
tokenUsage: {
|
|
230
|
+
completion: data.usage?.output_tokens,
|
|
231
|
+
prompt: data.usage?.input_tokens,
|
|
232
|
+
total: data.usage?.total_tokens,
|
|
233
|
+
},
|
|
234
|
+
};
|
|
235
|
+
}
|
|
151
236
|
}
|
|
package/dist/commands/pipeline-action.js
CHANGED
|
@@ -315,6 +315,17 @@ export async function executePipeline(cliOpts) {
|
|
|
315
315
|
}
|
|
316
316
|
// Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
|
|
317
317
|
config.outputDir = resolveOutputDir(cliOpts.outputDir);
|
|
318
|
+
// Capture options — CLI flags and env vars aren't in the config file,
|
|
319
|
+
// so merge them here (same logic as resolveOptions).
|
|
320
|
+
config.captureEnabled = cliOpts.capture || process.env.AILF_CAPTURE === "1";
|
|
321
|
+
if (cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR) {
|
|
322
|
+
config.captureDir = cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR;
|
|
323
|
+
}
|
|
324
|
+
config.captureCompress =
|
|
325
|
+
cliOpts.captureCompress !== false &&
|
|
326
|
+
process.env.AILF_CAPTURE_COMPRESS !== "0";
|
|
327
|
+
config.captureExtras =
|
|
328
|
+
cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
|
|
318
329
|
// Create AppContext directly from the merged config so adapters
|
|
319
330
|
// (especially taskSource) are wired from the file config's
|
|
320
331
|
// taskSourceType — not from CLI defaults.
|
package/dist/config/models.ts
CHANGED
|
@@ -37,7 +37,10 @@ export default defineModels({
|
|
|
37
37
|
{
|
|
38
38
|
id: "openai:chat:gpt-5.2",
|
|
39
39
|
label: "GPT 5.2",
|
|
40
|
-
config: {
|
|
40
|
+
config: {
|
|
41
|
+
max_completion_tokens: 8192,
|
|
42
|
+
verbosity: "medium",
|
|
43
|
+
},
|
|
41
44
|
modes: ["literacy", "knowledge-probe"],
|
|
42
45
|
// All literacy variants included by default
|
|
43
46
|
},
|
|
@@ -45,8 +48,9 @@ export default defineModels({
|
|
|
45
48
|
id: "openai:responses:gpt-5.4",
|
|
46
49
|
label: "GPT 5.4",
|
|
47
50
|
config: {
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
reasoning: { effort: "medium", summary: "auto" },
|
|
52
|
+
verbosity: "medium",
|
|
53
|
+
max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
|
|
50
54
|
maxRetries: 1,
|
|
51
55
|
},
|
|
52
56
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
package/dist/pipeline/calculate-scores.js
CHANGED
|
@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
114
114
|
for (const result of results) {
|
|
115
115
|
const taskId = result.description;
|
|
116
116
|
const modelId = result.providerId ?? result.providerLabel ?? "unknown";
|
|
117
|
+
// Detect output failures: empty/whitespace response means the model
|
|
118
|
+
// failed to produce output (API error, token exhaustion, refusal).
|
|
119
|
+
const output = result.response?.output ?? "";
|
|
120
|
+
const isOutputFailure = !output.trim();
|
|
117
121
|
for (const comp of result.gradingResult.componentResults) {
|
|
118
122
|
if (comp.assertion?.type !== "llm-rubric") {
|
|
119
123
|
continue;
|
|
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
139
143
|
// Not JSON — use raw reason string
|
|
140
144
|
}
|
|
141
145
|
}
|
|
146
|
+
// Also flag synthesized api-error judgments as output failures
|
|
147
|
+
const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
|
|
142
148
|
judgments.push({
|
|
143
149
|
dimension: kind,
|
|
144
150
|
modelId,
|
|
151
|
+
...(outputFailure && { outputFailure: true }),
|
|
145
152
|
reason,
|
|
146
153
|
score,
|
|
147
154
|
taskId,
|
|
package/dist/webhook/eval-request-handler.js
CHANGED
|
@@ -165,28 +165,33 @@ async function dispatchGitHubEval(repo, payload, config) {
|
|
|
165
165
|
const hasPerspective = !!payload.perspective;
|
|
166
166
|
const hasTasks = Array.isArray(payload.tasks) && payload.tasks.length > 0;
|
|
167
167
|
const hasAreas = Array.isArray(payload.areas) && payload.areas.length > 0;
|
|
168
|
+
// Nest the PipelineRequest under `request` to stay within GitHub's
|
|
169
|
+
// 10-property limit on client_payload. Workflow-level metadata
|
|
170
|
+
// (caller_repo) stays at the top level for the workflow to read.
|
|
168
171
|
const body = {
|
|
169
172
|
client_payload: {
|
|
170
173
|
caller_repo: "sanity-io/www-sanity-io",
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
174
|
+
request: {
|
|
175
|
+
dataset: payload.dataset,
|
|
176
|
+
mode: payload.mode,
|
|
177
|
+
projectId: payload.projectId,
|
|
178
|
+
publish: true,
|
|
179
|
+
source: "production",
|
|
180
|
+
// Studio-initiated evals always use Content Lake as the task source.
|
|
181
|
+
// Without this, the pipeline only loads filesystem .task.ts files and
|
|
182
|
+
// Studio-owned tasks are invisible.
|
|
183
|
+
taskMode: "content-lake",
|
|
184
|
+
// Release-scoped fields
|
|
185
|
+
...(hasPerspective ? { perspective: payload.perspective } : {}),
|
|
186
|
+
// Task-scoped fields
|
|
187
|
+
...(hasTasks ? { tasks: payload.tasks } : {}),
|
|
188
|
+
...(hasAreas ? { areas: payload.areas } : {}),
|
|
189
|
+
...(payload.debug ? { debug: true } : {}),
|
|
190
|
+
...(payload.tag ? { publishTag: payload.tag } : {}),
|
|
191
|
+
...(payload.sourceReportId
|
|
192
|
+
? { sourceReportId: payload.sourceReportId }
|
|
193
|
+
: {}),
|
|
194
|
+
},
|
|
190
195
|
},
|
|
191
196
|
event_type: "external-eval",
|
|
192
197
|
};
|