@ai-sdk/google 3.0.73 → 3.0.75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/index.d.mts +17 -0
  3. package/dist/index.d.ts +17 -0
  4. package/dist/index.js +521 -340
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +521 -340
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/internal/index.d.mts +1 -0
  9. package/dist/internal/index.d.ts +1 -0
  10. package/dist/internal/index.js +43 -28
  11. package/dist/internal/index.js.map +1 -1
  12. package/dist/internal/index.mjs +43 -28
  13. package/dist/internal/index.mjs.map +1 -1
  14. package/docs/15-google-generative-ai.mdx +72 -16
  15. package/package.json +1 -1
  16. package/src/convert-to-google-generative-ai-messages.ts +20 -2
  17. package/src/google-generative-ai-language-model.ts +5 -4
  18. package/src/google-generative-ai-prompt.ts +5 -1
  19. package/src/interactions/build-google-interactions-stream-transform.ts +285 -154
  20. package/src/interactions/convert-to-google-interactions-input.ts +57 -133
  21. package/src/interactions/extract-google-interactions-sources.ts +3 -3
  22. package/src/interactions/google-interactions-api.ts +179 -115
  23. package/src/interactions/google-interactions-language-model-options.ts +61 -0
  24. package/src/interactions/google-interactions-language-model.ts +100 -38
  25. package/src/interactions/google-interactions-prompt.ts +189 -114
  26. package/src/interactions/map-google-interactions-finish-reason.ts +3 -5
  27. package/src/interactions/parse-google-interactions-outputs.ts +80 -74
  28. package/src/interactions/prepare-google-interactions-tools.ts +1 -1
  29. package/src/interactions/stream-google-interactions.ts +1 -1
  30. package/src/interactions/synthesize-google-interactions-agent-stream.ts +1 -1
@@ -7,11 +7,12 @@ import type {
7
7
  import { convertToBase64 } from '@ai-sdk/provider-utils';
8
8
  import type {
9
9
  GoogleInteractionsContent,
10
+ GoogleInteractionsContentBlock,
10
11
  GoogleInteractionsFunctionResultContent,
11
12
  GoogleInteractionsImageContent,
12
13
  GoogleInteractionsInput,
14
+ GoogleInteractionsStep,
13
15
  GoogleInteractionsTextContent,
14
- GoogleInteractionsTurn,
15
16
  } from './google-interactions-prompt';
16
17
 
17
18
  function getTopLevelMediaType(mediaType: string): string {
@@ -42,22 +43,20 @@ export type ConvertToGoogleInteractionsInputResult = {
42
43
 
43
44
  /**
44
45
  * Converts an AI SDK `LanguageModelV3Prompt` into the Gemini Interactions
45
- * request shape (`{ input, system_instruction }`).
46
+ * request shape (`{ input: Array<Step>, system_instruction }`).
46
47
  *
47
- * Handles text parts, file parts (image / audio / document / video, all four
48
- * `data.type` shapes), tool-call/tool-result round-tripping, per-block
49
- * `signature` round-tripping (`thought.signature`, `function_call.signature`),
50
- * and statefulness compaction (drop assistant/tool turns whose
51
- * `providerOptions.google.interactionId === previousInteractionId`).
48
+ * Prior assistant content round-trips as discrete steps:
49
+ * - text / image content → `model_output` step with a single `content` array
50
+ * - reasoning → `thought` step (`signature` + `summary`)
51
+ * - tool-call → `function_call` step
52
+ * User turns (and tool-result turns from the previous round) are sent as
53
+ * `user_input` steps whose `content[]` holds the user's parts (text, files,
54
+ * and — for tool-result turns — `function_result` blocks).
52
55
  *
53
- * NOTE on PRD Open Q3 (empty-text-with-signature carrier hack from the
54
- * `:generateContent` provider): unnecessary on Interactions because
55
- * `thought.signature` and `function_call.signature` are explicit fields on
56
- * the wire (verified against `googleapis/js-genai`
57
- * `src/interactions/resources/interactions.ts` `ThoughtContent` /
58
- * `FunctionCallContent`). When an input reasoning part has empty text + a
59
- * signature, the converter emits a `thought` block with `signature` and an
60
- * omitted `summary` — no synthetic empty-text carrier needed.
56
+ * Handles text parts, file parts (image / audio / document / video, all four
57
+ * `data.type` shapes), tool-call/tool-result round-tripping, per-step
58
+ * `signature` round-tripping, and statefulness compaction (drop assistant/tool
59
+ * turns whose `providerOptions.google.interactionId === previousInteractionId`).
61
60
  */
62
61
  export function convertToGoogleInteractionsInput({
63
62
  prompt,
@@ -68,20 +67,12 @@ export function convertToGoogleInteractionsInput({
68
67
  prompt: LanguageModelV3Prompt;
69
68
  previousInteractionId?: string;
70
69
  store?: boolean;
71
- /**
72
- * Per-block media resolution applied to every image / video input block
73
- * (the Interactions wire format places `resolution` on the block, not at
74
- * the top level). See js-genai
75
- * `src/interactions/resources/interactions.ts` `ImageContent.resolution`
76
- * and `VideoContent.resolution`.
77
- */
78
70
  mediaResolution?: GoogleInteractionsMediaResolution;
79
71
  }): ConvertToGoogleInteractionsInputResult {
80
72
  const warnings: Array<SharedV3Warning> = [];
81
73
 
82
74
  /*
83
- * Behavior matrix per PRD § "Public-API contracts" → "Configurable behavior
84
- * matrix":
75
+ * Behavior matrix for compaction:
85
76
  *
86
77
  * - `previousInteractionId` set + `store !== false` → compact history (drop
87
78
  * assistant/tool turns whose `providerMetadata.google.interactionId`
@@ -90,10 +81,6 @@ export function convertToGoogleInteractionsInput({
90
81
  * (incoherent combo), still send full history (NO compaction).
91
82
  * - `store === false`, no `previousInteractionId` → no compaction.
92
83
  * - Default → no compaction.
93
- *
94
- * The actual `previous_interaction_id` / `store` body fields are emitted in
95
- * the language model's `getArgs`; this converter only handles the history
96
- * shape and the warning.
97
84
  */
98
85
  const incoherentCombo = previousInteractionId != null && store === false;
99
86
  const shouldCompact = previousInteractionId != null && store !== false;
@@ -113,7 +100,7 @@ export function convertToGoogleInteractionsInput({
113
100
  : prompt;
114
101
 
115
102
  const systemTexts: Array<string> = [];
116
- const turns: Array<GoogleInteractionsTurn> = [];
103
+ const steps: Array<GoogleInteractionsStep> = [];
117
104
 
118
105
  for (const message of compactedPrompt) {
119
106
  switch (message.role) {
@@ -122,14 +109,10 @@ export function convertToGoogleInteractionsInput({
122
109
  break;
123
110
  }
124
111
  case 'user': {
125
- const content: Array<GoogleInteractionsContent> = [];
112
+ const content: Array<GoogleInteractionsContentBlock> = [];
126
113
  for (const part of message.content) {
127
114
  if (part.type === 'text') {
128
- const block: GoogleInteractionsTextContent = {
129
- type: 'text',
130
- text: part.text,
131
- };
132
- content.push(block);
115
+ content.push({ type: 'text', text: part.text });
133
116
  } else if (part.type === 'file') {
134
117
  const fileBlock = convertFilePartToContent({
135
118
  part,
@@ -143,20 +126,34 @@ export function convertToGoogleInteractionsInput({
143
126
  }
144
127
  const merged = mergeAdjacentTextContent(content);
145
128
  if (merged.length > 0) {
146
- turns.push({ role: 'user', content: merged });
129
+ steps.push({ type: 'user_input', content: merged });
147
130
  }
148
131
  break;
149
132
  }
150
133
  case 'assistant': {
151
- const content: Array<GoogleInteractionsContent> = [];
134
+ /*
135
+ * Prior assistant content fans out into one step per logical block.
136
+ * Adjacent text/image content blocks are coalesced into a single
137
+ * `model_output` step (matching how the API emits them on output);
138
+ * reasoning and tool-calls each become their own step.
139
+ */
140
+ let pendingModelOutput: Array<GoogleInteractionsContentBlock> = [];
141
+ const flushModelOutput = () => {
142
+ if (pendingModelOutput.length > 0) {
143
+ steps.push({ type: 'model_output', content: pendingModelOutput });
144
+ pendingModelOutput = [];
145
+ }
146
+ };
147
+
152
148
  for (const part of message.content) {
153
149
  if (part.type === 'text') {
154
- content.push({ type: 'text', text: part.text });
150
+ pendingModelOutput.push({ type: 'text', text: part.text });
155
151
  } else if (part.type === 'reasoning') {
152
+ flushModelOutput();
156
153
  const signature = part.providerOptions?.google?.signature as
157
154
  | string
158
155
  | undefined;
159
- content.push({
156
+ steps.push({
160
157
  type: 'thought',
161
158
  ...(signature != null ? { signature } : {}),
162
159
  summary:
@@ -171,9 +168,10 @@ export function convertToGoogleInteractionsInput({
171
168
  mediaResolution,
172
169
  });
173
170
  if (fileBlock != null) {
174
- content.push(fileBlock);
171
+ pendingModelOutput.push(fileBlock);
175
172
  }
176
173
  } else if (part.type === 'tool-call') {
174
+ flushModelOutput();
177
175
  const signature = part.providerOptions?.google?.signature as
178
176
  | string
179
177
  | undefined;
@@ -181,7 +179,7 @@ export function convertToGoogleInteractionsInput({
181
179
  typeof part.input === 'string'
182
180
  ? safeParseToolArgs(part.input)
183
181
  : ((part.input ?? {}) as Record<string, unknown>);
184
- content.push({
182
+ steps.push({
185
183
  type: 'function_call',
186
184
  id: part.toolCallId,
187
185
  name: part.toolName,
@@ -195,51 +193,17 @@ export function convertToGoogleInteractionsInput({
195
193
  });
196
194
  }
197
195
  }
198
- if (content.length > 0) {
199
- turns.push({ role: 'model', content });
200
- }
196
+ flushModelOutput();
201
197
  break;
202
198
  }
203
199
  case 'tool': {
204
200
  /*
205
- * Tool-result messages are emitted as a `user` turn whose content
206
- * holds one `function_result` block per tool-result part. Wire shape
207
- * (verified against `googleapis/js-genai`
208
- * `samples/interactions_function_calling_client_state.ts` and
209
- * `src/interactions/resources/interactions.ts` `FunctionResultContent`
210
- * around line 979 — RESOLVES PRD Open Q2):
211
- *
212
- * {
213
- * role: 'user',
214
- * content: [
215
- * {
216
- * type: 'function_result',
217
- * call_id: <id from the matching function_call block>,
218
- * name: <tool name>,
219
- * result: <string | unknown | Array<TextContent|ImageContent>>,
220
- * is_error?: boolean,
221
- * signature?: string,
222
- * },
223
- * ],
224
- * }
225
- *
226
- * The `result` field is a discriminated union: a plain string for
227
- * text-only results, or an array of `text` / `image` content blocks
228
- * for mixed text/image results. Our converter takes the AI SDK
229
- * canonical `LanguageModelV3ToolResultOutput` and maps:
230
- * - `{ type: 'text', value }` → `result: <string>`
231
- * - `{ type: 'json', value }` → `result: <stringified JSON>`
232
- * - `{ type: 'error-text', value }` → `result: <string>` + `is_error: true`
233
- * - `{ type: 'error-json', value }` → `result: <stringified JSON>` + `is_error: true`
234
- * - `{ type: 'execution-denied', reason }` → `result: <reason>` + `is_error: true`
235
- * - `{ type: 'content', value: [...] }` → `result: Array<text|image>`
236
- * where each AI SDK `file` part with `mediaType: image/*` becomes
237
- * an Interactions `image` block (file-data path matches
238
- * `convertFilePartToContent` for top-level user images), and `text`
239
- * parts pass through. Non-image file parts fall back to a warning
240
- * because `FunctionResultContent.result` only accepts text/image.
201
+ * Tool-result messages are emitted as a `user_input` step whose
202
+ * content holds one `function_result` block per tool-result part.
203
+ * `function_result` remains a content-block type (it sits inside
204
+ * a step), not a top-level step type.
241
205
  */
242
- const content: Array<GoogleInteractionsContent> = [];
206
+ const content: Array<GoogleInteractionsContentBlock> = [];
243
207
  for (const part of message.content) {
244
208
  if (part.type !== 'tool-result') {
245
209
  warnings.push({
@@ -260,7 +224,7 @@ export function convertToGoogleInteractionsInput({
260
224
  content.push(block);
261
225
  }
262
226
  if (content.length > 0) {
263
- turns.push({ role: 'user', content });
227
+ steps.push({ type: 'user_input', content });
264
228
  }
265
229
  break;
266
230
  }
@@ -270,24 +234,7 @@ export function convertToGoogleInteractionsInput({
270
234
  const systemInstruction =
271
235
  systemTexts.length > 0 ? systemTexts.join('\n\n') : undefined;
272
236
 
273
- let input: GoogleInteractionsInput;
274
- if (turns.length === 0) {
275
- input = '';
276
- } else if (
277
- turns.length === 1 &&
278
- turns[0].role === 'user' &&
279
- Array.isArray(turns[0].content)
280
- ) {
281
- /*
282
- * Single-turn user prompt: send the bare `Array<Content>` shape per the
283
- * Interactions API's preferred single-turn format.
284
- */
285
- input = turns[0].content;
286
- } else {
287
- input = turns;
288
- }
289
-
290
- return { input, systemInstruction, warnings };
237
+ return { input: steps, systemInstruction, warnings };
291
238
  }
292
239
 
293
240
  /**
@@ -337,12 +284,6 @@ function convertFilePartToContent({
337
284
  return undefined;
338
285
  }
339
286
 
340
- /*
341
- * `resolution` is per-block on the wire (`ImageContent.resolution`,
342
- * `VideoContent.resolution`); only image and video carry it (see
343
- * `googleapis/js-genai` `src/interactions/resources/interactions.ts`).
344
- * Audio / document blocks ignore the option silently.
345
- */
346
287
  const resolutionField =
347
288
  mediaResolution != null && (kind === 'image' || kind === 'video')
348
289
  ? { resolution: mediaResolution }
@@ -374,23 +315,9 @@ function convertFilePartToContent({
374
315
  }
375
316
 
376
317
  /*
377
- * Drops assistant turns that were part of the linked interaction
378
- * (`previousInteractionId`) so the API doesn't see them re-sent on top of its
379
- * server-side state. Also drops any subsequent `tool` (tool-result) message
380
- * whose `tool-result.toolCallId` matches a `tool-call.toolCallId` from the
381
- * dropped assistant turn — server-state already has the matching tool result
382
- * baked in, and re-sending it without its paired call would be malformed.
383
- *
384
- * An assistant message is considered "part of the linked interaction" if any
385
- * of its content parts carry `providerOptions.google.interactionId ===
386
- * previousInteractionId`. This is stamped by `parseGoogleInteractionsOutputs`
387
- * (and the stream transformer) on every output content part.
388
- *
389
- * User messages are always kept regardless of where they fell in the prior
390
- * conversation — only assistant model output and its tool plumbing live on the
391
- * server. (Note that the AI SDK does not stamp `interactionId` onto user
392
- * messages, so even if it did, this function would not have a way to identify
393
- * which user message belongs to which interaction.)
318
+ * Drops assistant messages that were part of the linked interaction
319
+ * (`previousInteractionId`). Tool-result turns whose tool-call counterpart
320
+ * was dropped are also pruned to keep the message stream well-formed.
394
321
  */
395
322
  function compactPromptForPreviousInteraction({
396
323
  prompt,
@@ -602,21 +529,18 @@ function filePartToImageBlock({
602
529
  }
603
530
 
604
531
  /*
605
- * Collapses runs of adjacent text content blocks within a single user message
606
- * into one combined text block, separated by a blank line. The Interactions
607
- * API has no `text+data` shape, so a `data.type === 'text'` file part is
608
- * already lowered to a `text` block by `convertFilePartToContent`; merging
609
- * keeps the wire shape compact and preserves intent when an inline text file
610
- * sits next to a regular text part. Text blocks carrying `annotations` are
611
- * left untouched (annotations are tied to specific text spans).
532
+ * Collapses runs of adjacent text content blocks within a single user step
533
+ * into one combined text block, separated by a blank line. Text blocks
534
+ * carrying `annotations` are left untouched (annotations are tied to specific
535
+ * text spans).
612
536
  */
613
537
  function mergeAdjacentTextContent(
614
- content: Array<GoogleInteractionsContent>,
615
- ): Array<GoogleInteractionsContent> {
538
+ content: Array<GoogleInteractionsContentBlock>,
539
+ ): Array<GoogleInteractionsContentBlock> {
616
540
  if (content.length < 2) {
617
541
  return content;
618
542
  }
619
- const result: Array<GoogleInteractionsContent> = [];
543
+ const result: Array<GoogleInteractionsContentBlock> = [];
620
544
  for (const block of content) {
621
545
  const last = result[result.length - 1];
622
546
  if (
@@ -60,7 +60,7 @@ export function annotationToSource({
60
60
  }
61
61
  case 'file_citation': {
62
62
  const a = annotation as GoogleInteractionsFileCitation;
63
- const uri = a.document_uri ?? a.source ?? a.file_name;
63
+ const uri = a.url ?? a.document_uri ?? a.file_name;
64
64
  if (uri == null || uri.length === 0) return undefined;
65
65
  if (uri.startsWith('http://') || uri.startsWith('https://')) {
66
66
  return {
@@ -176,10 +176,10 @@ export function builtinToolResultToSources({
176
176
  const entry = raw as {
177
177
  file_name?: string;
178
178
  document_uri?: string;
179
- source?: string;
179
+ url?: string;
180
180
  title?: string;
181
181
  };
182
- const uri = entry.document_uri ?? entry.source ?? entry.file_name;
182
+ const uri = entry.url ?? entry.document_uri ?? entry.file_name;
183
183
  if (uri == null || uri.length === 0) continue;
184
184
  if (uri.startsWith('http://') || uri.startsWith('https://')) {
185
185
  sources.push({