@ai-sdk/google 3.0.25 → 3.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -929,7 +929,7 @@ The `vertexRagStore` tool accepts the following configuration options:
929
929
 
930
930
  ### Image Outputs
931
931
 
932
- Gemini models with image generation capabilities (`gemini-2.5-flash-image`) support image generation. Images are exposed as files in the response.
932
+ Gemini models with image generation capabilities (e.g. `gemini-2.5-flash-image`) support generating images as part of a multimodal response. Images are exposed as files in the response.
933
933
 
934
934
  ```ts
935
935
  import { google } from '@ai-sdk/google';
@@ -948,6 +948,12 @@ for (const file of result.files) {
948
948
  }
949
949
  ```
950
950
 
951
+ <Note>
952
+ If you primarily want to generate images without text output, you can also use
953
+ Gemini image models with the `generateImage()` function. See [Gemini Image
954
+ Models](#gemini-image-models) for details.
955
+ </Note>
956
+
951
957
  ### Safety Ratings
952
958
 
953
959
  The safety ratings provide insight into the safety of the model's response.
@@ -1146,9 +1152,18 @@ The following optional provider options are available for Google Generative AI e
1146
1152
 
1147
1153
  ## Image Models
1148
1154
 
1149
- You can create [Imagen](https://ai.google.dev/gemini-api/docs/imagen) models that call the Google Generative AI API using the `.image()` factory method.
1155
+ You can create image models that call the Google Generative AI API using the `.image()` factory method.
1150
1156
  For more on image generation with the AI SDK see [generateImage()](/docs/reference/ai-sdk-core/generate-image).
1151
1157
 
1158
+ The Google provider supports two types of image models:
1159
+
1160
+ - **Imagen models**: Dedicated image generation models using the `:predict` API
1161
+ - **Gemini image models**: Multimodal language models with image output capabilities using the `:generateContent` API
1162
+
1163
+ ### Imagen Models
1164
+
1165
+ [Imagen](https://ai.google.dev/gemini-api/docs/imagen) models are dedicated image generation models.
1166
+
1152
1167
  ```ts
1153
1168
  import { google } from '@ai-sdk/google';
1154
1169
  import { generateImage } from 'ai';
@@ -1178,7 +1193,7 @@ const { image } = await generateImage({
1178
1193
  });
1179
1194
  ```
1180
1195
 
1181
- The following provider options are available:
1196
+ The following provider options are available for Imagen models:
1182
1197
 
1183
1198
  - **personGeneration** `allow_adult` | `allow_all` | `dont_allow`
1184
1199
  Whether to allow person generation. Defaults to `allow_adult`.
@@ -1188,10 +1203,84 @@ The following provider options are available:
1188
1203
  parameter instead.
1189
1204
  </Note>
1190
1205
 
1191
- #### Model Capabilities
1206
+ #### Imagen Model Capabilities
1192
1207
 
1193
1208
  | Model | Aspect Ratios |
1194
1209
  | ------------------------------- | ------------------------- |
1195
1210
  | `imagen-4.0-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
1196
1211
  | `imagen-4.0-ultra-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
1197
1212
  | `imagen-4.0-fast-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
1213
+
1214
+ ### Gemini Image Models
1215
+
1216
+ [Gemini image models](https://ai.google.dev/gemini-api/docs/image-generation) (e.g. `gemini-2.5-flash-image`) are technically multimodal output language models, but they can be used with the `generateImage()` function for a simpler image generation experience. Internally, the provider calls the language model API with `responseModalities: ['IMAGE']`.
1217
+
1218
+ ```ts
1219
+ import { google } from '@ai-sdk/google';
1220
+ import { generateImage } from 'ai';
1221
+
1222
+ const { image } = await generateImage({
1223
+ model: google.image('gemini-2.5-flash-image'),
1224
+ prompt: 'A photorealistic image of a cat wearing a wizard hat',
1225
+ aspectRatio: '1:1',
1226
+ });
1227
+ ```
1228
+
1229
+ Gemini image models also support image editing by providing input images:
1230
+
1231
+ ```ts
1232
+ import { google } from '@ai-sdk/google';
1233
+ import { generateImage } from 'ai';
1234
+ import fs from 'node:fs';
1235
+
1236
+ const sourceImage = fs.readFileSync('./cat.png');
1237
+
1238
+ const { image } = await generateImage({
1239
+ model: google.image('gemini-2.5-flash-image'),
1240
+ prompt: {
1241
+ text: 'Add a small wizard hat to this cat',
1242
+ images: [sourceImage],
1243
+ },
1244
+ });
1245
+ ```
1246
+
1247
+ You can also use URLs for input images:
1248
+
1249
+ ```ts
1250
+ import { google } from '@ai-sdk/google';
1251
+ import { generateImage } from 'ai';
1252
+
1253
+ const { image } = await generateImage({
1254
+ model: google.image('gemini-2.5-flash-image'),
1255
+ prompt: {
1256
+ text: 'Add a small wizard hat to this cat',
1257
+ images: ['https://example.com/cat.png'],
1258
+ },
1259
+ });
1260
+ ```
1261
+
1262
+ <Note>
1263
+ Gemini image models do not support the `size` parameter or `n` values greater than 1. Use
1264
+ `aspectRatio` instead of `size`. Mask-based inpainting is also not supported.
1265
+ </Note>
1266
+
1267
+ <Note>
1268
+ For more advanced use cases where you need both text and image outputs, or
1269
+ want more control over the generation process, you can use Gemini image models
1270
+ directly with `generateText()`. See [Image Outputs](#image-outputs) for
1271
+ details.
1272
+ </Note>
1273
+
1274
+ #### Gemini Image Model Capabilities
1275
+
1276
+ | Model | Image Generation | Image Editing | Aspect Ratios |
1277
+ | ---------------------------- | ------------------- | ------------------- | --------------------------------------------------- |
1278
+ | `gemini-2.5-flash-image` | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
1279
+ | `gemini-3-pro-image-preview` | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
1280
+
1281
+ <Note>
1282
+ `gemini-3-pro-image-preview` supports additional features including up to 14
1283
+ reference images for editing (including up to 6 objects and 5 humans), resolution options (1K,
1284
+ 2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
1285
+ grounding.
1286
+ </Note>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ai-sdk/google",
3
- "version": "3.0.25",
3
+ "version": "3.0.26",
4
4
  "license": "Apache-2.0",
5
5
  "sideEffects": false,
6
6
  "main": "./dist/index.js",
@@ -36,8 +36,8 @@
36
36
  }
37
37
  },
38
38
  "dependencies": {
39
- "@ai-sdk/provider": "3.0.8",
40
- "@ai-sdk/provider-utils": "4.0.14"
39
+ "@ai-sdk/provider-utils": "4.0.14",
40
+ "@ai-sdk/provider": "3.0.8"
41
41
  },
42
42
  "devDependencies": {
43
43
  "@types/node": "20.17.24",
@@ -1,11 +1,19 @@
1
- import { ImageModelV3, SharedV3Warning } from '@ai-sdk/provider';
1
+ import {
2
+ ImageModelV3,
3
+ LanguageModelV3Prompt,
4
+ SharedV3Warning,
5
+ } from '@ai-sdk/provider';
2
6
  import {
3
7
  combineHeaders,
8
+ convertToBase64,
4
9
  createJsonResponseHandler,
10
+ FetchFunction,
11
+ generateId as defaultGenerateId,
5
12
  type InferSchema,
6
13
  lazySchema,
7
14
  parseProviderOptions,
8
15
  postJsonToApi,
16
+ Resolvable,
9
17
  resolve,
10
18
  zodSchema,
11
19
  } from '@ai-sdk/provider-utils';
@@ -15,7 +23,7 @@ import {
15
23
  GoogleGenerativeAIImageModelId,
16
24
  GoogleGenerativeAIImageSettings,
17
25
  } from './google-generative-ai-image-settings';
18
- import { FetchFunction, Resolvable } from '@ai-sdk/provider-utils';
26
+ import { GoogleGenerativeAILanguageModel } from './google-generative-ai-language-model';
19
27
 
20
28
  interface GoogleGenerativeAIImageModelConfig {
21
29
  provider: string;
@@ -32,8 +40,15 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
32
40
  readonly specificationVersion = 'v3';
33
41
 
34
42
  get maxImagesPerCall(): number {
43
+ if (this.settings.maxImagesPerCall != null) {
44
+ return this.settings.maxImagesPerCall;
45
+ }
46
+ // https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-image
47
+ if (isGeminiModel(this.modelId)) {
48
+ return 10;
49
+ }
35
50
  // https://ai.google.dev/gemini-api/docs/imagen#imagen-model
36
- return this.settings.maxImagesPerCall ?? 4;
51
+ return 4;
37
52
  }
38
53
 
39
54
  get provider(): string {
@@ -48,6 +63,16 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
48
63
 
49
64
  async doGenerate(
50
65
  options: Parameters<ImageModelV3['doGenerate']>[0],
66
+ ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
67
+ // Gemini image models use the language model API internally
68
+ if (isGeminiModel(this.modelId)) {
69
+ return this.doGenerateGemini(options);
70
+ }
71
+ return this.doGenerateImagen(options);
72
+ }
73
+
74
+ private async doGenerateImagen(
75
+ options: Parameters<ImageModelV3['doGenerate']>[0],
51
76
  ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
52
77
  const {
53
78
  prompt,
@@ -63,10 +88,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
63
88
  } = options;
64
89
  const warnings: Array<SharedV3Warning> = [];
65
90
 
66
- // Google Generative AI does not support image editing
91
+ // Imagen API endpoints do not support image editing
67
92
  if (files != null && files.length > 0) {
68
93
  throw new Error(
69
- 'Google Generative AI does not support image editing. ' +
94
+ 'Google Generative AI does not support image editing with Imagen models. ' +
70
95
  'Use Google Vertex AI (@ai-sdk/google-vertex) for image editing capabilities.',
71
96
  );
72
97
  }
@@ -138,10 +163,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
138
163
  images: response.predictions.map(
139
164
  (p: { bytesBase64Encoded: string }) => p.bytesBase64Encoded,
140
165
  ),
141
- warnings: warnings ?? [],
166
+ warnings,
142
167
  providerMetadata: {
143
168
  google: {
144
- images: response.predictions.map(prediction => ({
169
+ images: response.predictions.map(() => ({
145
170
  // Add any prediction-specific metadata here
146
171
  })),
147
172
  },
@@ -153,6 +178,146 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
153
178
  },
154
179
  };
155
180
  }
181
+
182
+ private async doGenerateGemini(
183
+ options: Parameters<ImageModelV3['doGenerate']>[0],
184
+ ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
185
+ const {
186
+ prompt,
187
+ n,
188
+ size,
189
+ aspectRatio,
190
+ seed,
191
+ providerOptions,
192
+ headers,
193
+ abortSignal,
194
+ files,
195
+ mask,
196
+ } = options;
197
+ const warnings: Array<SharedV3Warning> = [];
198
+
199
+ // Gemini does not support mask-based inpainting
200
+ if (mask != null) {
201
+ throw new Error(
202
+ 'Gemini image models do not support mask-based image editing.',
203
+ );
204
+ }
205
+
206
+ // Gemini does not support generating multiple images per call via n parameter
207
+ if (n != null && n > 1) {
208
+ throw new Error(
209
+ 'Gemini image models do not support generating a set number of images per call. Use n=1 or omit the n parameter.',
210
+ );
211
+ }
212
+
213
+ if (size != null) {
214
+ warnings.push({
215
+ type: 'unsupported',
216
+ feature: 'size',
217
+ details:
218
+ 'This model does not support the `size` option. Use `aspectRatio` instead.',
219
+ });
220
+ }
221
+
222
+ // Build user message content for language model
223
+ const userContent: Array<
224
+ | { type: 'text'; text: string }
225
+ | { type: 'file'; data: string | Uint8Array | URL; mediaType: string }
226
+ > = [];
227
+
228
+ // Add text prompt
229
+ if (prompt != null) {
230
+ userContent.push({ type: 'text', text: prompt });
231
+ }
232
+
233
+ // Add input images for editing
234
+ if (files != null && files.length > 0) {
235
+ for (const file of files) {
236
+ if (file.type === 'url') {
237
+ userContent.push({
238
+ type: 'file',
239
+ data: new URL(file.url),
240
+ mediaType: 'image/*',
241
+ });
242
+ } else {
243
+ userContent.push({
244
+ type: 'file',
245
+ data:
246
+ typeof file.data === 'string'
247
+ ? file.data
248
+ : new Uint8Array(file.data),
249
+ mediaType: file.mediaType,
250
+ });
251
+ }
252
+ }
253
+ }
254
+
255
+ const languageModelPrompt: LanguageModelV3Prompt = [
256
+ { role: 'user', content: userContent },
257
+ ];
258
+
259
+ // Instantiate language model
260
+ const languageModel = new GoogleGenerativeAILanguageModel(this.modelId, {
261
+ provider: this.config.provider,
262
+ baseURL: this.config.baseURL,
263
+ headers: this.config.headers ?? {},
264
+ fetch: this.config.fetch,
265
+ generateId: this.config.generateId ?? defaultGenerateId,
266
+ });
267
+
268
+ // Call language model with image-only response modality
269
+ const result = await languageModel.doGenerate({
270
+ prompt: languageModelPrompt,
271
+ seed,
272
+ providerOptions: {
273
+ google: {
274
+ responseModalities: ['IMAGE'],
275
+ imageConfig: aspectRatio ? { aspectRatio } : undefined,
276
+ ...((providerOptions?.google as Record<string, unknown>) ?? {}),
277
+ },
278
+ },
279
+ headers,
280
+ abortSignal,
281
+ });
282
+
283
+ const currentDate = this.config._internal?.currentDate?.() ?? new Date();
284
+
285
+ // Extract images from language model response
286
+ const images: string[] = [];
287
+ for (const part of result.content) {
288
+ if (part.type === 'file' && part.mediaType.startsWith('image/')) {
289
+ images.push(convertToBase64(part.data));
290
+ }
291
+ }
292
+
293
+ return {
294
+ images,
295
+ warnings,
296
+ providerMetadata: {
297
+ google: {
298
+ images: images.map(() => ({})),
299
+ },
300
+ },
301
+ response: {
302
+ timestamp: currentDate,
303
+ modelId: this.modelId,
304
+ headers: result.response?.headers,
305
+ },
306
+ usage: result.usage
307
+ ? {
308
+ inputTokens: result.usage.inputTokens.total,
309
+ outputTokens: result.usage.outputTokens.total,
310
+ totalTokens:
311
+ (result.usage.inputTokens.total ?? 0) +
312
+ (result.usage.outputTokens.total ?? 0),
313
+ }
314
+ : undefined,
315
+ };
316
+ }
317
+ }
318
+
319
+ function isGeminiModel(modelId: string): boolean {
320
+ return modelId.startsWith('gemini-');
156
321
  }
157
322
 
158
323
  // minimal version of the schema
@@ -1,7 +1,11 @@
1
1
  export type GoogleGenerativeAIImageModelId =
2
+ // Imagen models (use :predict API)
2
3
  | 'imagen-4.0-generate-001'
3
4
  | 'imagen-4.0-ultra-generate-001'
4
5
  | 'imagen-4.0-fast-generate-001'
6
+ // Gemini image models (technically multimodal output language models, use :generateContent API)
7
+ | 'gemini-2.5-flash-image'
8
+ | 'gemini-3-pro-image-preview'
5
9
  | (string & {});
6
10
 
7
11
  export interface GoogleGenerativeAIImageSettings {