@sanity/ailf 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/config/models.ts CHANGED
@@ -37,7 +37,10 @@ export default defineModels({
37
37
  {
38
38
  id: "openai:chat:gpt-5.2",
39
39
  label: "GPT 5.2",
40
- config: { temperature: 0.2, max_tokens: 4096 },
40
+ config: {
41
+ max_completion_tokens: 8192,
42
+ verbosity: "medium",
43
+ },
41
44
  modes: ["literacy", "knowledge-probe"],
42
45
  // All literacy variants included by default
43
46
  },
@@ -45,8 +48,9 @@ export default defineModels({
45
48
  id: "openai:responses:gpt-5.4",
46
49
  label: "GPT 5.4",
47
50
  config: {
48
- reasoning_effort: "medium",
49
- max_output_tokens: 4096,
51
+ reasoning: { effort: "medium", summary: "auto" },
52
+ verbosity: "medium",
53
+ max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
50
54
  maxRetries: 1,
51
55
  },
52
56
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
@@ -28,7 +28,13 @@
28
28
  */
29
29
  const TASKS_QUERY = /* groq */ `
30
30
  *[_type == "ailf.task"
31
- && (!defined($areas) || area->areaId.current in $areas)
31
+ && (
32
+ !defined($areas)
33
+ // Current field name
34
+ || area->areaId.current in $areas
35
+ // Legacy field name (pre-schema-rename documents)
36
+ || featureArea->areaId.current in $areas
37
+ )
32
38
  && (!defined($taskIds) || id.current in $taskIds)
33
39
  && (
34
40
  // Status-based filtering (unified — replaces execution.enabled)
@@ -39,13 +45,15 @@ const TASKS_QUERY = /* groq */ `
39
45
  || (defined($taskIds) && status != "archived")
40
46
  )
41
47
  && (!defined($tags) || count((tags)[@ in $tags]) > 0)
42
- ] | order(area->areaId.current asc, id.current asc) {
48
+ ] | order(coalesce(area->areaId.current, featureArea->areaId.current) asc, id.current asc) {
43
49
  "taskId": id.current,
44
- title,
45
- "areaId": area->areaId.current,
46
- promptText,
50
+ // Coalesce current and legacy field names so documents created before
51
+ // the schema rename are still readable.
52
+ "title": coalesce(title, description),
53
+ "areaId": coalesce(area->areaId.current, featureArea->areaId.current),
54
+ "promptText": coalesce(promptText, taskPrompt),
47
55
  docCoverage,
48
- "contextDocs": contextDocs[] {
56
+ "contextDocs": coalesce(contextDocs, canonicalDocs)[] {
49
57
  refType,
50
58
  "slug": doc->slug.current,
51
59
  "docRefId": doc->_id,
@@ -55,7 +63,7 @@ const TASKS_QUERY = /* groq */ `
55
63
  perspective,
56
64
  reason
57
65
  },
58
- assertions,
66
+ "assertions": coalesce(assertions, assert),
59
67
  rawAssert,
60
68
  baseline,
61
69
  tags,
@@ -69,9 +69,22 @@ export default class InstrumentedProvider {
69
69
  getRecorder(): RequestRecorder;
70
70
  id(): string;
71
71
  /**
72
- * Calls OpenAI Chat Completions API directly. Uses the recorder's
73
- * fetch wrapper so the LLM call itself is captured in the observation log.
72
+ * Detect whether the model should use the Responses API based on config.
73
+ */
74
+ private isResponsesModel;
75
+ /**
76
+ * Calls OpenAI directly. Routes to the Responses API for reasoning
77
+ * models and Chat Completions API for everything else. Uses the
78
+ * recorder's fetch wrapper so the API call is captured.
74
79
  */
75
80
  private callOpenAI;
81
+ /**
82
+ * Calls OpenAI Chat Completions API (non-reasoning models).
83
+ */
84
+ private callOpenAIChatCompletions;
85
+ /**
86
+ * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
87
+ */
88
+ private callOpenAIResponses;
76
89
  }
77
90
  export {};
@@ -94,13 +94,26 @@ export default class InstrumentedProvider {
94
94
  return `instrumented:${this.providerId}`;
95
95
  }
96
96
  /**
97
- * Calls OpenAI Chat Completions API directly. Uses the recorder's
98
- * fetch wrapper so the LLM call itself is captured in the observation log.
97
+ * Detect whether the model should use the Responses API based on config.
98
+ */
99
+ isResponsesModel() {
100
+ const model = this.config.modelName || this.config.model || "";
101
+ return (this.config.reasoning != null ||
102
+ this.config.reasoning_effort != null ||
103
+ model.startsWith("gpt-5") ||
104
+ model.startsWith("o1") ||
105
+ model.startsWith("o3") ||
106
+ model.startsWith("o4"));
107
+ }
108
+ /**
109
+ * Calls OpenAI directly. Routes to the Responses API for reasoning
110
+ * models and Chat Completions API for everything else. Uses the
111
+ * recorder's fetch wrapper so the API call is captured.
99
112
  */
100
113
  async callOpenAI(prompt) {
101
- const model = this.config.model || "gpt-4o";
102
- const temperature = this.config.temperature ?? 0;
103
- const maxTokens = this.config.max_tokens || 4096;
114
+ const model = this.config.modelName ||
115
+ this.config.model ||
116
+ "gpt-4o";
104
117
  const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
105
118
  if (!apiKey) {
106
119
  return {
@@ -108,7 +121,17 @@ export default class InstrumentedProvider {
108
121
  output: undefined,
109
122
  };
110
123
  }
111
- // Use the recorder's fetch wrapper so the API call is recorded
124
+ if (this.isResponsesModel()) {
125
+ return this.callOpenAIResponses(prompt, model, apiKey);
126
+ }
127
+ return this.callOpenAIChatCompletions(prompt, model, apiKey);
128
+ }
129
+ /**
130
+ * Calls OpenAI Chat Completions API (non-reasoning models).
131
+ */
132
+ async callOpenAIChatCompletions(prompt, model, apiKey) {
133
+ const temperature = this.config.temperature ?? 0;
134
+ const maxTokens = this.config.max_tokens || 4096;
112
135
  const fetchFn = this.recorder.isRunning()
113
136
  ? this.recorder.fetch.bind(this.recorder)
114
137
  : globalThis.fetch;
@@ -148,4 +171,66 @@ export default class InstrumentedProvider {
148
171
  },
149
172
  };
150
173
  }
174
+ /**
175
+ * Calls OpenAI Responses API (reasoning models like GPT-5.x, o-series).
176
+ */
177
+ async callOpenAIResponses(prompt, model, apiKey) {
178
+ const maxOutputTokens = this.config.max_output_tokens || 32_000;
179
+ const reasoning = this.config.reasoning;
180
+ const reasoningEffort = reasoning?.effort || this.config.reasoning_effort || "medium";
181
+ const reasoningSummary = reasoning?.summary;
182
+ const verbosity = this.config.verbosity;
183
+ const fetchFn = this.recorder.isRunning()
184
+ ? this.recorder.fetch.bind(this.recorder)
185
+ : globalThis.fetch;
186
+ const startTime = Date.now();
187
+ const response = await fetchFn("https://api.openai.com/v1/responses", {
188
+ body: JSON.stringify({
189
+ input: prompt,
190
+ max_output_tokens: maxOutputTokens,
191
+ model,
192
+ reasoning: {
193
+ effort: reasoningEffort,
194
+ ...(reasoningSummary ? { summary: reasoningSummary } : {}),
195
+ },
196
+ ...(verbosity ? { text: { format: { type: "text" }, verbosity } } : {}),
197
+ }),
198
+ headers: {
199
+ Authorization: `Bearer ${apiKey}`,
200
+ "Content-Type": "application/json",
201
+ },
202
+ method: "POST",
203
+ });
204
+ const data = (await response.json());
205
+ if (data.error) {
206
+ return {
207
+ error: data.error.message ?? "Unknown OpenAI error",
208
+ output: undefined,
209
+ };
210
+ }
211
+ // Extract text from Responses API output format
212
+ let output = "";
213
+ for (const item of data.output ?? []) {
214
+ if (item.type === "message" && item.content) {
215
+ for (const block of item.content) {
216
+ if (block.type === "output_text" && block.text) {
217
+ output += block.text;
218
+ }
219
+ }
220
+ }
221
+ }
222
+ return {
223
+ cost: calculateCost(model, data.usage?.input_tokens ?? 0, data.usage?.output_tokens ?? 0),
224
+ metadata: {
225
+ latencyMs: Date.now() - startTime,
226
+ model,
227
+ },
228
+ output,
229
+ tokenUsage: {
230
+ completion: data.usage?.output_tokens,
231
+ prompt: data.usage?.input_tokens,
232
+ total: data.usage?.total_tokens,
233
+ },
234
+ };
235
+ }
151
236
  }
@@ -315,6 +315,17 @@ export async function executePipeline(cliOpts) {
315
315
  }
316
316
  // Output dir: explicit CLI flag → $CWD/.ailf/results/latest/
317
317
  config.outputDir = resolveOutputDir(cliOpts.outputDir);
318
+ // Capture options — CLI flags and env vars aren't in the config file,
319
+ // so merge them here (same logic as resolveOptions).
320
+ config.captureEnabled = cliOpts.capture || process.env.AILF_CAPTURE === "1";
321
+ if (cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR) {
322
+ config.captureDir = cliOpts.captureDir ?? process.env.AILF_CAPTURE_DIR;
323
+ }
324
+ config.captureCompress =
325
+ cliOpts.captureCompress !== false &&
326
+ process.env.AILF_CAPTURE_COMPRESS !== "0";
327
+ config.captureExtras =
328
+ cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
318
329
  // Create AppContext directly from the merged config so adapters
319
330
  // (especially taskSource) are wired from the file config's
320
331
  // taskSourceType — not from CLI defaults.
@@ -37,7 +37,10 @@ export default defineModels({
37
37
  {
38
38
  id: "openai:chat:gpt-5.2",
39
39
  label: "GPT 5.2",
40
- config: { temperature: 0.2, max_tokens: 4096 },
40
+ config: {
41
+ max_completion_tokens: 8192,
42
+ verbosity: "medium",
43
+ },
41
44
  modes: ["literacy", "knowledge-probe"],
42
45
  // All literacy variants included by default
43
46
  },
@@ -45,8 +48,9 @@ export default defineModels({
45
48
  id: "openai:responses:gpt-5.4",
46
49
  label: "GPT 5.4",
47
50
  config: {
48
- reasoning_effort: "medium",
49
- max_output_tokens: 4096,
51
+ reasoning: { effort: "medium", summary: "auto" },
52
+ verbosity: "medium",
53
+ max_output_tokens: 32_000, // reasoning tokens share this budget — 4096 was too low
50
54
  maxRetries: 1,
51
55
  },
52
56
  timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
@@ -29,15 +29,29 @@ export class FetchDocsStep {
29
29
  return { status: "skipped", reason: "--skip-fetch" };
30
30
  }
31
31
  const start = Date.now();
32
- // Load tasks from the filesystem — the same source GenerateConfigsStep
33
- // uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
34
- // have no ailf.task documents, causing a mismatch where generated
35
- // configs reference context files that were never fetched.
36
- const allTasks = await loadPipelineTasks({
37
- rootDir: ctx.config.rootDir,
38
- mode: ctx.config.mode,
39
- repoTasksPath: ctx.config.repoTasksPath,
40
- });
32
+ // Load tasks from the same source as GenerateConfigsStep to avoid
33
+ // a mismatch where configs reference context files that were never
34
+ // fetched.
35
+ //
36
+ // Content Lake path: use ctx.taskSource (ContentLakeTaskSource) which
37
+ // loads Studio-owned ailf.task documents via GROQ.
38
+ // Filesystem path: load from .task.ts files (repo/inline tasks).
39
+ let allTasks;
40
+ if (ctx.config.taskSourceType === "content-lake") {
41
+ const filter = {
42
+ ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
43
+ ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
44
+ ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
45
+ };
46
+ allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
47
+ }
48
+ else {
49
+ allTasks = await loadPipelineTasks({
50
+ rootDir: ctx.config.rootDir,
51
+ mode: ctx.config.mode,
52
+ repoTasksPath: ctx.config.repoTasksPath,
53
+ });
54
+ }
41
55
  // Bridge: narrow to literacy tasks for canonical doc access
42
56
  const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
43
57
  const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
@@ -18,6 +18,21 @@ export declare class GenerateConfigsStep implements PipelineStep {
18
18
  private compileLiteracyVariants;
19
19
  private compileSingleMode;
20
20
  private loadTasks;
21
+ /**
22
+ * Load tasks from the Content Lake via ctx.taskSource.
23
+ *
24
+ * The ContentLakeTaskSource adapter handles area/task/tag filtering
25
+ * in the GROQ query itself, so we build a FilterOptions and pass it
26
+ * through rather than filtering in-memory after loading.
27
+ */
28
+ private loadTasksFromContentLake;
29
+ /**
30
+ * Load tasks from filesystem .task.ts files.
31
+ *
32
+ * This is the original path used for repo-based and inline tasks.
33
+ * It scans tasks/{mode}/ and optionally --repo-tasks-path.
34
+ */
35
+ private loadTasksFromFilesystem;
21
36
  private applyFilters;
22
37
  /**
23
38
  * Build a descriptive error message when no tasks match the current filters.
@@ -209,6 +209,50 @@ export class GenerateConfigsStep {
209
209
  // Task loading — unified for all modes
210
210
  // ---------------------------------------------------------------------------
211
211
  async loadTasks(ctx, mode, state) {
212
+ // Content Lake path — use ctx.taskSource (ContentLakeTaskSource) which
213
+ // loads ailf.task documents via GROQ. This is the only path that sees
214
+ // Studio-owned tasks (ownership: "studio").
215
+ if (ctx.config.taskSourceType === "content-lake") {
216
+ return this.loadTasksFromContentLake(ctx, state);
217
+ }
218
+ // Filesystem path — load from .task.ts files (repo tasks, inline tasks).
219
+ return this.loadTasksFromFilesystem(ctx, mode, state);
220
+ }
221
+ /**
222
+ * Load tasks from the Content Lake via ctx.taskSource.
223
+ *
224
+ * The ContentLakeTaskSource adapter handles area/task/tag filtering
225
+ * in the GROQ query itself, so we build a FilterOptions and pass it
226
+ * through rather than filtering in-memory after loading.
227
+ */
228
+ async loadTasksFromContentLake(ctx, state) {
229
+ const filter = {
230
+ ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
231
+ ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
232
+ ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
233
+ };
234
+ const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
235
+ // Capture loaded IDs for error messages (same as filesystem path)
236
+ this.lastLoadedTaskIds = tasks
237
+ .map((t) => t.id)
238
+ .filter((id) => !!id);
239
+ // Release auto-scope
240
+ if (state.releaseAutoScope && !ctx.config.noAutoScope) {
241
+ const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
242
+ const beforeCount = tasks.length;
243
+ const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
244
+ ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
245
+ return scoped;
246
+ }
247
+ return tasks;
248
+ }
249
+ /**
250
+ * Load tasks from filesystem .task.ts files.
251
+ *
252
+ * This is the original path used for repo-based and inline tasks.
253
+ * It scans tasks/{mode}/ and optionally --repo-tasks-path.
254
+ */
255
+ async loadTasksFromFilesystem(ctx, mode, state) {
212
256
  const { resolve } = await import("path");
213
257
  const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
214
258
  const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
@@ -113,6 +113,11 @@ export class RunEvalStep {
113
113
  // required eval modes were satisfied from the remote cache.
114
114
  state.remoteCacheHits ??= new Set();
115
115
  state.remoteCacheHits.add(this.mode);
116
+ // Carry forward Promptfoo share URLs from the cached report
117
+ if (remoteCacheResult.promptfooUrls?.length) {
118
+ state.promptfooUrls ??= [];
119
+ state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
120
+ }
116
121
  // Capture the restored score-summary from remote cache
117
122
  const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
118
123
  if (existsSync(cachedSummaryPath)) {
@@ -189,6 +194,14 @@ export class RunEvalStep {
189
194
  mode: this.mode,
190
195
  });
191
196
  }
197
+ // Extract Promptfoo share URL from eval results (Step 3b)
198
+ if (ctx.evalRunner.extractShareUrl) {
199
+ const shareUrl = ctx.evalRunner.extractShareUrl(resolve(rootDir, resultsFileForMode(this.mode)));
200
+ if (shareUrl) {
201
+ state.promptfooUrls ??= [];
202
+ state.promptfooUrls.push({ mode: this.mode, url: shareUrl });
203
+ }
204
+ }
192
205
  const durationMs = Date.now() - start;
193
206
  return {
194
207
  durationMs,
@@ -224,6 +237,7 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
224
237
  console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
225
238
  return {
226
239
  completedAt: cachedReport.completedAt,
240
+ promptfooUrls: cachedReport.provenance?.promptfooUrls,
227
241
  reportId: cachedReport.id,
228
242
  };
229
243
  }
@@ -165,24 +165,33 @@ async function dispatchGitHubEval(repo, payload, config) {
165
165
  const hasPerspective = !!payload.perspective;
166
166
  const hasTasks = Array.isArray(payload.tasks) && payload.tasks.length > 0;
167
167
  const hasAreas = Array.isArray(payload.areas) && payload.areas.length > 0;
168
+ // Nest the PipelineRequest under `request` to stay within GitHub's
169
+ // 10-property limit on client_payload. Workflow-level metadata
170
+ // (caller_repo) stays at the top level for the workflow to read.
168
171
  const body = {
169
172
  client_payload: {
170
173
  caller_repo: "sanity-io/www-sanity-io",
171
- dataset: payload.dataset,
172
- mode: payload.mode,
173
- projectId: payload.projectId,
174
- publish: true,
175
- source: "production",
176
- // Release-scoped fields
177
- ...(hasPerspective ? { perspective: payload.perspective } : {}),
178
- // Task-scoped fields
179
- ...(hasTasks ? { tasks: payload.tasks } : {}),
180
- ...(hasAreas ? { areas: payload.areas } : {}),
181
- ...(payload.debug ? { debug: true } : {}),
182
- ...(payload.tag ? { publishTag: payload.tag } : {}),
183
- ...(payload.sourceReportId
184
- ? { sourceReportId: payload.sourceReportId }
185
- : {}),
174
+ request: {
175
+ dataset: payload.dataset,
176
+ mode: payload.mode,
177
+ projectId: payload.projectId,
178
+ publish: true,
179
+ source: "production",
180
+ // Studio-initiated evals always use Content Lake as the task source.
181
+ // Without this, the pipeline only loads filesystem .task.ts files and
182
+ // Studio-owned tasks are invisible.
183
+ taskMode: "content-lake",
184
+ // Release-scoped fields
185
+ ...(hasPerspective ? { perspective: payload.perspective } : {}),
186
+ // Task-scoped fields
187
+ ...(hasTasks ? { tasks: payload.tasks } : {}),
188
+ ...(hasAreas ? { areas: payload.areas } : {}),
189
+ ...(payload.debug ? { debug: true } : {}),
190
+ ...(payload.tag ? { publishTag: payload.tag } : {}),
191
+ ...(payload.sourceReportId
192
+ ? { sourceReportId: payload.sourceReportId }
193
+ : {}),
194
+ },
186
195
  },
187
196
  event_type: "external-eval",
188
197
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.3.0",
3
+ "version": "2.3.2",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"