@sanity/ailf 2.3.1 → 2.3.2

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/config/models.ts CHANGED
@@ -37,7 +37,10 @@ export default defineModels({
37
37
  {
38
38
  id: "openai:chat:gpt-5.2",
39
39
  label: "GPT 5.2",
40
- config: { temperature: 0.2, max_tokens: 4096 },
40
+ config: {
41
+ max_completion_tokens: 8192,
42
+ verbosity: "medium",
43
+ },
41
44
  modes: ["literacy", "knowledge-probe"],
42
45
  // All literacy variants included by default
43
46
  },
@@ -45,8 +48,9 @@ export default defineModels({
45
48
  id: "openai:responses:gpt-5.4",
46
49
  label: "GPT 5.4",
47
50
  config: {
48
- reasoning_effort: "medium",
49
- max_output_tokens: 4096,
51
+ reasoning: { effort: "medium", summary: "auto" },
52
+ verbosity: "medium",
53
+ max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
50
54
  maxRetries: 1,
51
55
  },
52
56
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
@@ -69,9 +69,22 @@ export default class InstrumentedProvider {
69
69
  getRecorder(): RequestRecorder;
70
70
  id(): string;
71
71
  /**
72
- * Calls OpenAI Chat Completions API directly. Uses the recorder's
73
- * fetch wrapper so the LLM call itself is captured in the observation log.
72
+ * Detect whether the model should use the Responses API based on config.
73
+ */
74
+ private isResponsesModel;
75
+ /**
76
+ * Calls OpenAI directly. Routes to the Responses API for reasoning
77
+ * models and Chat Completions API for everything else. Uses the
78
+ * recorder's fetch wrapper so the API call is captured.
74
79
  */
75
80
  private callOpenAI;
81
+ /**
82
+ * Calls OpenAI Chat Completions API (non-reasoning models).
83
+ */
84
+ private callOpenAIChatCompletions;
85
+ /**
86
+ * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
87
+ */
88
+ private callOpenAIResponses;
76
89
  }
77
90
  export {};
@@ -94,13 +94,26 @@ export default class InstrumentedProvider {
94
94
  return `instrumented:${this.providerId}`;
95
95
  }
96
96
  /**
97
- * Calls OpenAI Chat Completions API directly. Uses the recorder's
98
- * fetch wrapper so the LLM call itself is captured in the observation log.
97
+ * Detect whether the model should use the Responses API based on config.
98
+ */
99
+ isResponsesModel() {
100
+ const model = this.config.modelName || this.config.model || "";
101
+ return (this.config.reasoning != null ||
102
+ this.config.reasoning_effort != null ||
103
+ model.startsWith("gpt-5") ||
104
+ model.startsWith("o1") ||
105
+ model.startsWith("o3") ||
106
+ model.startsWith("o4"));
107
+ }
108
+ /**
109
+ * Calls OpenAI directly. Routes to the Responses API for reasoning
110
+ * models and Chat Completions API for everything else. Uses the
111
+ * recorder's fetch wrapper so the API call is captured.
99
112
  */
100
113
  async callOpenAI(prompt) {
101
- const model = this.config.model || "gpt-4o";
102
- const temperature = this.config.temperature ?? 0;
103
- const maxTokens = this.config.max_tokens || 4096;
114
+ const model = this.config.modelName ||
115
+ this.config.model ||
116
+ "gpt-4o";
104
117
  const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
105
118
  if (!apiKey) {
106
119
  return {
@@ -108,7 +121,17 @@ export default class InstrumentedProvider {
108
121
  output: undefined,
109
122
  };
110
123
  }
111
- // Use the recorder's fetch wrapper so the API call is recorded
124
+ if (this.isResponsesModel()) {
125
+ return this.callOpenAIResponses(prompt, model, apiKey);
126
+ }
127
+ return this.callOpenAIChatCompletions(prompt, model, apiKey);
128
+ }
129
+ /**
130
+ * Calls OpenAI Chat Completions API (non-reasoning models).
131
+ */
132
+ async callOpenAIChatCompletions(prompt, model, apiKey) {
133
+ const temperature = this.config.temperature ?? 0;
134
+ const maxTokens = this.config.max_tokens || 4096;
112
135
  const fetchFn = this.recorder.isRunning()
113
136
  ? this.recorder.fetch.bind(this.recorder)
114
137
  : globalThis.fetch;
@@ -148,4 +171,66 @@ export default class InstrumentedProvider {
148
171
  },
149
172
  };
150
173
  }
174
+ /**
175
+ * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
176
+ */
177
+ async callOpenAIResponses(prompt, model, apiKey) {
178
+ const maxOutputTokens = this.config.max_output_tokens || 32_000;
179
+ const reasoning = this.config.reasoning;
180
+ const reasoningEffort = reasoning?.effort || this.config.reasoning_effort || "medium";
181
+ const reasoningSummary = reasoning?.summary;
182
+ const verbosity = this.config.verbosity;
183
+ const fetchFn = this.recorder.isRunning()
184
+ ? this.recorder.fetch.bind(this.recorder)
185
+ : globalThis.fetch;
186
+ const startTime = Date.now();
187
+ const response = await fetchFn("https://api.openai.com/v1/responses", {
188
+ body: JSON.stringify({
189
+ input: prompt,
190
+ max_output_tokens: maxOutputTokens,
191
+ model,
192
+ reasoning: {
193
+ effort: reasoningEffort,
194
+ ...(reasoningSummary ? { summary: reasoningSummary } : {}),
195
+ },
196
+ ...(verbosity ? { text: { format: { type: "text" }, verbosity } } : {}),
197
+ }),
198
+ headers: {
199
+ Authorization: `Bearer ${apiKey}`,
200
+ "Content-Type": "application/json",
201
+ },
202
+ method: "POST",
203
+ });
204
+ const data = (await response.json());
205
+ if (data.error) {
206
+ return {
207
+ error: data.error.message ?? "Unknown OpenAI error",
208
+ output: undefined,
209
+ };
210
+ }
211
+ // Extract text from Responses API output format
212
+ let output = "";
213
+ for (const item of data.output ?? []) {
214
+ if (item.type === "message" && item.content) {
215
+ for (const block of item.content) {
216
+ if (block.type === "output_text" && block.text) {
217
+ output += block.text;
218
+ }
219
+ }
220
+ }
221
+ }
222
+ return {
223
+ cost: calculateCost(model, data.usage?.input_tokens ?? 0, data.usage?.output_tokens ?? 0),
224
+ metadata: {
225
+ latencyMs: Date.now() - startTime,
226
+ model,
227
+ },
228
+ output,
229
+ tokenUsage: {
230
+ completion: data.usage?.output_tokens,
231
+ prompt: data.usage?.input_tokens,
232
+ total: data.usage?.total_tokens,
233
+ },
234
+ };
235
+ }
151
236
  }
@@ -315,6 +315,17 @@ export async function executePipeline(cliOpts) {
315
315
  }
316
316
  // Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
317
317
  config.outputDir = resolveOutputDir(cliOpts.outputDir);
318
+ // Capture options — CLI flags and env vars aren't in the config file,
319
+ // so merge them here (same logic as resolveOptions).
320
+ config.captureEnabled = cliOpts.capture || process.env.AILF_CAPTURE === "1";
321
+ if (cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR) {
322
+ config.captureDir = cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR;
323
+ }
324
+ config.captureCompress =
325
+ cliOpts.captureCompress !== false &&
326
+ process.env.AILF_CAPTURE_COMPRESS !== "0";
327
+ config.captureExtras =
328
+ cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
318
329
  // Create AppContext directly from the merged config so adapters
319
330
  // (especially taskSource) are wired from the file config's
320
331
  // taskSourceType — not from CLI defaults.
@@ -37,7 +37,10 @@ export default defineModels({
37
37
  {
38
38
  id: "openai:chat:gpt-5.2",
39
39
  label: "GPT 5.2",
40
- config: { temperature: 0.2, max_tokens: 4096 },
40
+ config: {
41
+ max_completion_tokens: 8192,
42
+ verbosity: "medium",
43
+ },
41
44
  modes: ["literacy", "knowledge-probe"],
42
45
  // All literacy variants included by default
43
46
  },
@@ -45,8 +48,9 @@ export default defineModels({
45
48
  id: "openai:responses:gpt-5.4",
46
49
  label: "GPT 5.4",
47
50
  config: {
48
- reasoning_effort: "medium",
49
- max_output_tokens: 4096,
51
+ reasoning: { effort: "medium", summary: "auto" },
52
+ verbosity: "medium",
53
+ max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
50
54
  maxRetries: 1,
51
55
  },
52
56
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
@@ -165,28 +165,33 @@ async function dispatchGitHubEval(repo, payload, config) {
165
165
  const hasPerspective = !!payload.perspective;
166
166
  const hasTasks = Array.isArray(payload.tasks) && payload.tasks.length > 0;
167
167
  const hasAreas = Array.isArray(payload.areas) && payload.areas.length > 0;
168
+ // Nest the PipelineRequest under `request` to stay within GitHub's
169
+ // 10-property limit on client_payload. Workflow-level metadata
170
+ // (caller_repo) stays at the top level for the workflow to read.
168
171
  const body = {
169
172
  client_payload: {
170
173
  caller_repo: "sanity-io/www-sanity-io",
171
- dataset: payload.dataset,
172
- mode: payload.mode,
173
- projectId: payload.projectId,
174
- publish: true,
175
- source: "production",
176
- // Studio-initiated evals always use Content Lake as the task source.
177
- // Without this, the pipeline only loads filesystem .task.ts files and
178
- // Studio-owned tasks are invisible.
179
- taskMode: "content-lake",
180
- // Release-scoped fields
181
- ...(hasPerspective ? { perspective: payload.perspective } : {}),
182
- // Task-scoped fields
183
- ...(hasTasks ? { tasks: payload.tasks } : {}),
184
- ...(hasAreas ? { areas: payload.areas } : {}),
185
- ...(payload.debug ? { debug: true } : {}),
186
- ...(payload.tag ? { publishTag: payload.tag } : {}),
187
- ...(payload.sourceReportId
188
- ? { sourceReportId: payload.sourceReportId }
189
- : {}),
174
+ request: {
175
+ dataset: payload.dataset,
176
+ mode: payload.mode,
177
+ projectId: payload.projectId,
178
+ publish: true,
179
+ source: "production",
180
+ // Studio-initiated evals always use Content Lake as the task source.
181
+ // Without this, the pipeline only loads filesystem .task.ts files and
182
+ // Studio-owned tasks are invisible.
183
+ taskMode: "content-lake",
184
+ // Release-scoped fields
185
+ ...(hasPerspective ? { perspective: payload.perspective } : {}),
186
+ // Task-scoped fields
187
+ ...(hasTasks ? { tasks: payload.tasks } : {}),
188
+ ...(hasAreas ? { areas: payload.areas } : {}),
189
+ ...(payload.debug ? { debug: true } : {}),
190
+ ...(payload.tag ? { publishTag: payload.tag } : {}),
191
+ ...(payload.sourceReportId
192
+ ? { sourceReportId: payload.sourceReportId }
193
+ : {}),
194
+ },
190
195
  },
191
196
  event_type: "external-eval",
192
197
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.3.1",
3
+ "version": "2.3.2",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"