@ai-sdk/google 3.0.74 → 3.0.75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,8 +11,8 @@ import { z } from 'zod/v4';
11
11
  * Helpers are defined as factories (invoked only inside the exported
12
12
  * `lazySchema(() => ...)` callbacks) so no `z.object(...)` / `z.union(...)`
13
13
  * runs at module import. Schemas are intentionally narrow on the fields the
14
- * SDK consumes (text + thought) and lenient (`loose()` / `unknown`) on the
15
- * rest, so subsequent additions can widen without breaking the basic path.
14
+ * SDK consumes and lenient (`loose()` / `unknown`) on the rest, so subsequent
15
+ * additions can widen without breaking the basic path.
16
16
  */
17
17
 
18
18
  const tokenByModalitySchema = () =>
@@ -77,7 +77,7 @@ const annotationSchema = () => {
77
77
  type: z.literal('file_citation'),
78
78
  file_name: z.string().nullish(),
79
79
  document_uri: z.string().nullish(),
80
- source: z.string().nullish(),
80
+ url: z.string().nullish(),
81
81
  page_number: z.number().nullish(),
82
82
  media_id: z.string().nullish(),
83
83
  start_index: z.number().nullish(),
@@ -116,9 +116,10 @@ const thoughtSummaryItemSchema = () =>
116
116
  .loose();
117
117
 
118
118
  /*
119
- * Catch-all content block schema. Specific variants (`text`, `thought`,
120
- * `function_call`, built-in tool call/result) are narrowly typed; unknown
121
- * block types fall through `loose()`.
119
+ * Content block schemas — these populate the `content` array of a
120
+ * `model_output` step. Function calls, thoughts, and built-in tool
121
+ * call/result blocks are top-level step types (see `stepSchema` below), not
122
+ * content blocks.
122
123
  */
123
124
  const contentBlockSchema = () => {
124
125
  const textContent = z
@@ -129,15 +130,71 @@ const contentBlockSchema = () => {
129
130
  })
130
131
  .loose();
131
132
 
132
- const thoughtContent = z
133
+ const imageContent = z
133
134
  .object({
134
- type: z.literal('thought'),
135
- signature: z.string().nullish(),
136
- summary: z.array(thoughtSummaryItemSchema()).nullish(),
135
+ type: z.literal('image'),
136
+ data: z.string().nullish(),
137
+ mime_type: z.string().nullish(),
138
+ resolution: z.enum(['low', 'medium', 'high', 'ultra_high']).nullish(),
139
+ uri: z.string().nullish(),
140
+ })
141
+ .loose();
142
+
143
+ return z.union([
144
+ textContent,
145
+ imageContent,
146
+ z.object({ type: z.string() }).loose(),
147
+ ]);
148
+ };
149
+
150
+ export type GoogleInteractionsContentBlock = z.infer<
151
+ ReturnType<typeof contentBlockSchema>
152
+ >;
153
+
154
+ const BUILTIN_TOOL_CALL_STEP_TYPES = [
155
+ 'google_search_call',
156
+ 'code_execution_call',
157
+ 'url_context_call',
158
+ 'file_search_call',
159
+ 'google_maps_call',
160
+ 'mcp_server_tool_call',
161
+ ] as const;
162
+
163
+ const BUILTIN_TOOL_RESULT_STEP_TYPES = [
164
+ 'google_search_result',
165
+ 'code_execution_result',
166
+ 'url_context_result',
167
+ 'file_search_result',
168
+ 'google_maps_result',
169
+ 'mcp_server_tool_result',
170
+ ] as const;
171
+
172
+ /*
173
+ * Step schema union — elements of `response.steps[]` and the `step` field on
174
+ * `step.start` SSE events.
175
+ *
176
+ * - `user_input` echoes a turn the client sent; only appears on
177
+ * `GET /interactions/{id}` (the full timeline). The SDK skips it.
178
+ * - `model_output` wraps the model's text/image content in `step.content[]`.
179
+ * - `function_call`, `thought`, and the built-in `*_call`/`*_result` steps
180
+ * carry their payload directly on the step (no `content` indirection).
181
+ */
182
+ const stepSchema = () => {
183
+ const userInputStep = z
184
+ .object({
185
+ type: z.literal('user_input'),
186
+ content: z.array(contentBlockSchema()).nullish(),
187
+ })
188
+ .loose();
189
+
190
+ const modelOutputStep = z
191
+ .object({
192
+ type: z.literal('model_output'),
193
+ content: z.array(contentBlockSchema()).nullish(),
137
194
  })
138
195
  .loose();
139
196
 
140
- const functionCallContent = z
197
+ const functionCallStep = z
141
198
  .object({
142
199
  type: z.literal('function_call'),
143
200
  id: z.string(),
@@ -147,26 +204,17 @@ const contentBlockSchema = () => {
147
204
  })
148
205
  .loose();
149
206
 
150
- const imageContent = z
207
+ const thoughtStep = z
151
208
  .object({
152
- type: z.literal('image'),
153
- data: z.string().nullish(),
154
- mime_type: z.string().nullish(),
155
- resolution: z.enum(['low', 'medium', 'high', 'ultra_high']).nullish(),
156
- uri: z.string().nullish(),
209
+ type: z.literal('thought'),
210
+ signature: z.string().nullish(),
211
+ summary: z.array(thoughtSummaryItemSchema()).nullish(),
157
212
  })
158
213
  .loose();
159
214
 
160
- const builtinToolCall = z
215
+ const builtinToolCallStep = z
161
216
  .object({
162
- type: z.enum([
163
- 'google_search_call',
164
- 'code_execution_call',
165
- 'url_context_call',
166
- 'file_search_call',
167
- 'google_maps_call',
168
- 'mcp_server_tool_call',
169
- ]),
217
+ type: z.enum(BUILTIN_TOOL_CALL_STEP_TYPES),
170
218
  id: z.string(),
171
219
  arguments: z.record(z.string(), z.unknown()).nullish(),
172
220
  name: z.string().nullish(),
@@ -176,16 +224,9 @@ const contentBlockSchema = () => {
176
224
  })
177
225
  .loose();
178
226
 
179
- const builtinToolResult = z
227
+ const builtinToolResultStep = z
180
228
  .object({
181
- type: z.enum([
182
- 'google_search_result',
183
- 'code_execution_result',
184
- 'url_context_result',
185
- 'file_search_result',
186
- 'google_maps_result',
187
- 'mcp_server_tool_result',
188
- ]),
229
+ type: z.enum(BUILTIN_TOOL_RESULT_STEP_TYPES),
189
230
  call_id: z.string(),
190
231
  result: z.unknown().nullish(),
191
232
  is_error: z.boolean().nullish(),
@@ -196,19 +237,17 @@ const contentBlockSchema = () => {
196
237
  .loose();
197
238
 
198
239
  return z.union([
199
- textContent,
200
- imageContent,
201
- thoughtContent,
202
- functionCallContent,
203
- builtinToolCall,
204
- builtinToolResult,
240
+ userInputStep,
241
+ modelOutputStep,
242
+ functionCallStep,
243
+ thoughtStep,
244
+ builtinToolCallStep,
245
+ builtinToolResultStep,
205
246
  z.object({ type: z.string() }).loose(),
206
247
  ]);
207
248
  };
208
249
 
209
- export type GoogleInteractionsContentBlock = z.infer<
210
- ReturnType<typeof contentBlockSchema>
211
- >;
250
+ export type GoogleInteractionsStep = z.infer<ReturnType<typeof stepSchema>>;
212
251
 
213
252
  export const googleInteractionsResponseSchema = lazySchema(() =>
214
253
  zodSchema(
@@ -225,7 +264,7 @@ export const googleInteractionsResponseSchema = lazySchema(() =>
225
264
  status: interactionStatusSchema(),
226
265
  model: z.string().nullish(),
227
266
  agent: z.string().nullish(),
228
- outputs: z.array(contentBlockSchema()).nullish(),
267
+ steps: z.array(stepSchema()).nullish(),
229
268
  usage: usageSchema().nullish(),
230
269
  service_tier: z.string().nullish(),
231
270
  previous_interaction_id: z.string().nullish(),
@@ -246,9 +285,9 @@ export const googleInteractionsEventSchema = lazySchema(() =>
246
285
  const annotation = annotationSchema();
247
286
  const thoughtSummaryItem = thoughtSummaryItemSchema();
248
287
 
249
- const interactionStartEvent = z
288
+ const interactionCreatedEvent = z
250
289
  .object({
251
- event_type: z.literal('interaction.start'),
290
+ event_type: z.literal('interaction.created'),
252
291
  event_id: z.string().nullish(),
253
292
  interaction: z
254
293
  .object({
@@ -266,30 +305,35 @@ export const googleInteractionsEventSchema = lazySchema(() =>
266
305
  })
267
306
  .loose();
268
307
 
269
- const contentStartEvent = z
308
+ /*
309
+ * `step.start` carries the discriminated step shape under `step`. For
310
+ * `function_call` steps the `name` is included here; for `thought`
311
+ * steps the initial `signature` and `summary` arrive here when set.
312
+ */
313
+ const stepStartEvent = z
270
314
  .object({
271
- event_type: z.literal('content.start'),
315
+ event_type: z.literal('step.start'),
272
316
  event_id: z.string().nullish(),
273
317
  index: z.number(),
274
- content: contentBlockSchema(),
318
+ step: stepSchema(),
275
319
  })
276
320
  .loose();
277
321
 
278
- const contentDeltaText = z
322
+ const stepDeltaText = z
279
323
  .object({
280
324
  type: z.literal('text'),
281
325
  text: z.string(),
282
326
  })
283
327
  .loose();
284
328
 
285
- const contentDeltaThoughtSummary = z
329
+ const stepDeltaThoughtSummary = z
286
330
  .object({
287
331
  type: z.literal('thought_summary'),
288
332
  content: thoughtSummaryItem.nullish(),
289
333
  })
290
334
  .loose();
291
335
 
292
- const contentDeltaThoughtSignature = z
336
+ const stepDeltaThoughtSignature = z
293
337
  .object({
294
338
  type: z.literal('thought_signature'),
295
339
  signature: z.string().nullish(),
@@ -297,33 +341,40 @@ export const googleInteractionsEventSchema = lazySchema(() =>
297
341
  .loose();
298
342
 
299
343
  /*
300
- * `function_call` content deltas carry the entire call (id + name +
301
- * arguments) — there is no per-token argument streaming. See js-genai
302
- * `src/interactions/resources/interactions.ts` `ContentDelta.FunctionCall`.
344
+ * `function_call` step deltas stream the JSON arguments as a partial
345
+ * string. Wire shape:
346
+ * { type: 'arguments_delta', arguments: '<partial-json-string>' }
347
+ * The partial JSON lives in `arguments` (a string), not in a separate
348
+ * `arguments_delta` field — the discriminator name is the only place
349
+ * `arguments_delta` appears. Consumers accumulate the substrings and
350
+ * parse on `step.stop`.
303
351
  */
304
- const contentDeltaFunctionCall = z
352
+ const stepDeltaArgumentsDelta = z
305
353
  .object({
306
- type: z.literal('function_call'),
307
- id: z.string(),
308
- name: z.string(),
309
- arguments: z.record(z.string(), z.unknown()).nullish(),
354
+ type: z.literal('arguments_delta'),
355
+ arguments: z.string().nullish(),
356
+ id: z.string().nullish(),
310
357
  signature: z.string().nullish(),
311
358
  })
312
359
  .loose();
313
360
 
314
- const contentDeltaTextAnnotation = z
361
+ /*
362
+ * URL/file/place-citation deltas. The discriminator is
363
+ * `text_annotation_delta` (matching the `_delta` suffix used by
364
+ * `arguments_delta`); `text_annotation` is also accepted as an alias.
365
+ */
366
+ const stepDeltaTextAnnotation = z
315
367
  .object({
316
- type: z.literal('text_annotation'),
368
+ type: z.enum(['text_annotation_delta', 'text_annotation']),
317
369
  annotations: z.array(annotation).nullish(),
318
370
  })
319
371
  .loose();
320
372
 
321
373
  /*
322
- * `image` content deltas carry the entire image payload (`data` base64 +
323
- * `mime_type`, or `uri`) — there is no per-byte streaming. See js-genai
324
- * `src/interactions/resources/interactions.ts` `ContentDelta.Image`.
374
+ * `image` deltas carry the entire payload per delta (`data` base64 +
375
+ * `mime_type`, or `uri`) — there is no per-byte streaming.
325
376
  */
326
- const contentDeltaImage = z
377
+ const stepDeltaImage = z
327
378
  .object({
328
379
  type: z.literal('image'),
329
380
  data: z.string().nullish(),
@@ -334,21 +385,15 @@ export const googleInteractionsEventSchema = lazySchema(() =>
334
385
  .loose();
335
386
 
336
387
  /*
337
- * Built-in tool call deltas mirror the same shape as their content-block
338
- * counterparts (full payload per delta -- there is no per-token streaming
339
- * of arguments). Result deltas carry the populated `result` payload.
388
+ * Built-in tool call/result step deltas mirror the shape of their step
389
+ * counterparts (full payload per delta — there is no per-token
390
+ * streaming of arguments). Result deltas carry the populated `result`
391
+ * payload.
340
392
  */
341
- const contentDeltaBuiltinToolCall = z
393
+ const stepDeltaBuiltinToolCall = z
342
394
  .object({
343
- type: z.enum([
344
- 'google_search_call',
345
- 'code_execution_call',
346
- 'url_context_call',
347
- 'file_search_call',
348
- 'google_maps_call',
349
- 'mcp_server_tool_call',
350
- ]),
351
- id: z.string(),
395
+ type: z.enum(BUILTIN_TOOL_CALL_STEP_TYPES),
396
+ id: z.string().nullish(),
352
397
  arguments: z.record(z.string(), z.unknown()).nullish(),
353
398
  name: z.string().nullish(),
354
399
  server_name: z.string().nullish(),
@@ -357,17 +402,10 @@ export const googleInteractionsEventSchema = lazySchema(() =>
357
402
  })
358
403
  .loose();
359
404
 
360
- const contentDeltaBuiltinToolResult = z
405
+ const stepDeltaBuiltinToolResult = z
361
406
  .object({
362
- type: z.enum([
363
- 'google_search_result',
364
- 'code_execution_result',
365
- 'url_context_result',
366
- 'file_search_result',
367
- 'google_maps_result',
368
- 'mcp_server_tool_result',
369
- ]),
370
- call_id: z.string(),
407
+ type: z.enum(BUILTIN_TOOL_RESULT_STEP_TYPES),
408
+ call_id: z.string().nullish(),
371
409
  result: z.unknown().nullish(),
372
410
  is_error: z.boolean().nullish(),
373
411
  name: z.string().nullish(),
@@ -376,49 +414,73 @@ export const googleInteractionsEventSchema = lazySchema(() =>
376
414
  })
377
415
  .loose();
378
416
 
379
- const contentDeltaUnknown = z.object({ type: z.string() }).loose();
380
-
381
- const contentDeltaUnion = z.union([
382
- contentDeltaText,
383
- contentDeltaImage,
384
- contentDeltaThoughtSummary,
385
- contentDeltaThoughtSignature,
386
- contentDeltaFunctionCall,
387
- contentDeltaTextAnnotation,
388
- contentDeltaBuiltinToolCall,
389
- contentDeltaBuiltinToolResult,
390
- contentDeltaUnknown,
417
+ const stepDeltaUnknown = z.object({ type: z.string() }).loose();
418
+
419
+ const stepDeltaUnion = z.union([
420
+ stepDeltaText,
421
+ stepDeltaImage,
422
+ stepDeltaThoughtSummary,
423
+ stepDeltaThoughtSignature,
424
+ stepDeltaArgumentsDelta,
425
+ stepDeltaTextAnnotation,
426
+ stepDeltaBuiltinToolCall,
427
+ stepDeltaBuiltinToolResult,
428
+ stepDeltaUnknown,
391
429
  ]);
392
430
 
393
- const contentDeltaEvent = z
431
+ const stepDeltaEvent = z
394
432
  .object({
395
- event_type: z.literal('content.delta'),
433
+ event_type: z.literal('step.delta'),
396
434
  event_id: z.string().nullish(),
397
435
  index: z.number(),
398
- delta: contentDeltaUnion,
436
+ delta: stepDeltaUnion,
399
437
  })
400
438
  .loose();
401
439
 
402
- const contentStopEvent = z
440
+ const stepStopEvent = z
403
441
  .object({
404
- event_type: z.literal('content.stop'),
442
+ event_type: z.literal('step.stop'),
405
443
  event_id: z.string().nullish(),
406
444
  index: z.number(),
407
445
  })
408
446
  .loose();
409
447
 
448
+ /*
449
+ * Status-transition events. The API emits `interaction.status_update`
450
+ * for in-progress and requires-action transitions; the more specific
451
+ * `interaction.in_progress` and `interaction.requires_action` shapes
452
+ * are accepted so all three route through the same handler.
453
+ */
410
454
  const interactionStatusUpdateEvent = z
411
455
  .object({
412
456
  event_type: z.literal('interaction.status_update'),
413
457
  event_id: z.string().nullish(),
414
458
  interaction_id: z.string().nullish(),
415
- status,
459
+ status: status.nullish(),
460
+ })
461
+ .loose();
462
+
463
+ const interactionInProgressEvent = z
464
+ .object({
465
+ event_type: z.literal('interaction.in_progress'),
466
+ event_id: z.string().nullish(),
467
+ interaction_id: z.string().nullish(),
468
+ status: status.nullish(),
469
+ })
470
+ .loose();
471
+
472
+ const interactionRequiresActionEvent = z
473
+ .object({
474
+ event_type: z.literal('interaction.requires_action'),
475
+ event_id: z.string().nullish(),
476
+ interaction_id: z.string().nullish(),
477
+ status: status.nullish(),
416
478
  })
417
479
  .loose();
418
480
 
419
- const interactionCompleteEvent = z
481
+ const interactionCompletedEvent = z
420
482
  .object({
421
- event_type: z.literal('interaction.complete'),
483
+ event_type: z.literal('interaction.completed'),
422
484
  event_id: z.string().nullish(),
423
485
  interaction: z
424
486
  .object({
@@ -448,12 +510,14 @@ export const googleInteractionsEventSchema = lazySchema(() =>
448
510
  const unknownEvent = z.object({ event_type: z.string() }).loose();
449
511
 
450
512
  return z.union([
451
- interactionStartEvent,
452
- contentStartEvent,
453
- contentDeltaEvent,
454
- contentStopEvent,
513
+ interactionCreatedEvent,
514
+ stepStartEvent,
515
+ stepDeltaEvent,
516
+ stepStopEvent,
455
517
  interactionStatusUpdateEvent,
456
- interactionCompleteEvent,
518
+ interactionInProgressEvent,
519
+ interactionRequiresActionEvent,
520
+ interactionCompletedEvent,
457
521
  errorEvent,
458
522
  unknownEvent,
459
523
  ]);
@@ -68,6 +68,67 @@ export const googleInteractionsLanguageModelOptions = lazySchema(() =>
68
68
 
69
69
  thinkingLevel: z.enum(['minimal', 'low', 'medium', 'high']).nullish(),
70
70
  thinkingSummaries: z.enum(['auto', 'none']).nullish(),
71
+
72
+ /**
73
+ * Output-format entries that map directly to the API's `response_format`
74
+ * array. Use this to request image, audio, or non-JSON text outputs
75
+ * with full control over `mime_type`, `aspect_ratio`, and `image_size`.
76
+ *
77
+ * Entries are sent in order. The AI SDK call-level `responseFormat: {
78
+ * type: 'json', schema }` still drives JSON-mode and adds a matching
79
+ * text entry automatically; entries listed here are appended.
80
+ */
81
+ responseFormat: z
82
+ .array(
83
+ z.union([
84
+ z
85
+ .object({
86
+ type: z.literal('text'),
87
+ mimeType: z.string().nullish(),
88
+ schema: z.unknown().nullish(),
89
+ })
90
+ .loose(),
91
+ z
92
+ .object({
93
+ type: z.literal('image'),
94
+ mimeType: z.string().nullish(),
95
+ aspectRatio: z
96
+ .enum([
97
+ '1:1',
98
+ '2:3',
99
+ '3:2',
100
+ '3:4',
101
+ '4:3',
102
+ '4:5',
103
+ '5:4',
104
+ '9:16',
105
+ '16:9',
106
+ '21:9',
107
+ '1:8',
108
+ '8:1',
109
+ '1:4',
110
+ '4:1',
111
+ ])
112
+ .nullish(),
113
+ imageSize: z.enum(['1K', '2K', '4K', '512']).nullish(),
114
+ })
115
+ .loose(),
116
+ z
117
+ .object({
118
+ type: z.literal('audio'),
119
+ mimeType: z.string().nullish(),
120
+ })
121
+ .loose(),
122
+ ]),
123
+ )
124
+ .nullish(),
125
+
126
+ /**
127
+ * @deprecated Use `responseFormat` with a `{ type: 'image', ... }`
128
+ * entry instead. Retained for backwards compatibility; the SDK
129
+ * translates it into a matching `response_format` image entry and
130
+ * emits a warning when set.
131
+ */
71
132
  imageConfig: z
72
133
  .object({
73
134
  aspectRatio: z