@ai-sdk/google 3.0.25 → 3.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/index.d.mts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +123 -6
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +125 -6
- package/dist/index.mjs.map +1 -1
- package/docs/15-google-generative-ai.mdx +93 -4
- package/package.json +3 -3
- package/src/google-generative-ai-image-model.ts +172 -7
- package/src/google-generative-ai-image-settings.ts +4 -0
|
@@ -929,7 +929,7 @@ The `vertexRagStore` tool accepts the following configuration options:
|
|
|
929
929
|
|
|
930
930
|
### Image Outputs
|
|
931
931
|
|
|
932
|
-
Gemini models with image generation capabilities (`gemini-2.5-flash-image`) support
|
|
932
|
+
Gemini models with image generation capabilities (e.g. `gemini-2.5-flash-image`) support generating images as part of a multimodal response. Images are exposed as files in the response.
|
|
933
933
|
|
|
934
934
|
```ts
|
|
935
935
|
import { google } from '@ai-sdk/google';
|
|
@@ -948,6 +948,12 @@ for (const file of result.files) {
|
|
|
948
948
|
}
|
|
949
949
|
```
|
|
950
950
|
|
|
951
|
+
<Note>
|
|
952
|
+
If you primarily want to generate images without text output, you can also use
|
|
953
|
+
Gemini image models with the `generateImage()` function. See [Gemini Image
|
|
954
|
+
Models](#gemini-image-models) for details.
|
|
955
|
+
</Note>
|
|
956
|
+
|
|
951
957
|
### Safety Ratings
|
|
952
958
|
|
|
953
959
|
The safety ratings provide insight into the safety of the model's response.
|
|
@@ -1146,9 +1152,18 @@ The following optional provider options are available for Google Generative AI e
|
|
|
1146
1152
|
|
|
1147
1153
|
## Image Models
|
|
1148
1154
|
|
|
1149
|
-
You can create
|
|
1155
|
+
You can create image models that call the Google Generative AI API using the `.image()` factory method.
|
|
1150
1156
|
For more on image generation with the AI SDK see [generateImage()](/docs/reference/ai-sdk-core/generate-image).
|
|
1151
1157
|
|
|
1158
|
+
The Google provider supports two types of image models:
|
|
1159
|
+
|
|
1160
|
+
- **Imagen models**: Dedicated image generation models using the `:predict` API
|
|
1161
|
+
- **Gemini image models**: Multimodal language models with image output capabilities using the `:generateContent` API
|
|
1162
|
+
|
|
1163
|
+
### Imagen Models
|
|
1164
|
+
|
|
1165
|
+
[Imagen](https://ai.google.dev/gemini-api/docs/imagen) models are dedicated image generation models.
|
|
1166
|
+
|
|
1152
1167
|
```ts
|
|
1153
1168
|
import { google } from '@ai-sdk/google';
|
|
1154
1169
|
import { generateImage } from 'ai';
|
|
@@ -1178,7 +1193,7 @@ const { image } = await generateImage({
|
|
|
1178
1193
|
});
|
|
1179
1194
|
```
|
|
1180
1195
|
|
|
1181
|
-
The following provider options are available:
|
|
1196
|
+
The following provider options are available for Imagen models:
|
|
1182
1197
|
|
|
1183
1198
|
- **personGeneration** `allow_adult` | `allow_all` | `dont_allow`
|
|
1184
1199
|
Whether to allow person generation. Defaults to `allow_adult`.
|
|
@@ -1188,10 +1203,84 @@ The following provider options are available:
|
|
|
1188
1203
|
parameter instead.
|
|
1189
1204
|
</Note>
|
|
1190
1205
|
|
|
1191
|
-
#### Model Capabilities
|
|
1206
|
+
#### Imagen Model Capabilities
|
|
1192
1207
|
|
|
1193
1208
|
| Model | Aspect Ratios |
|
|
1194
1209
|
| ------------------------------- | ------------------------- |
|
|
1195
1210
|
| `imagen-4.0-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
|
|
1196
1211
|
| `imagen-4.0-ultra-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
|
|
1197
1212
|
| `imagen-4.0-fast-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
|
|
1213
|
+
|
|
1214
|
+
### Gemini Image Models
|
|
1215
|
+
|
|
1216
|
+
[Gemini image models](https://ai.google.dev/gemini-api/docs/image-generation) (e.g. `gemini-2.5-flash-image`) are technically multimodal output language models, but they can be used with the `generateImage()` function for a simpler image generation experience. Internally, the provider calls the language model API with `responseModalities: ['IMAGE']`.
|
|
1217
|
+
|
|
1218
|
+
```ts
|
|
1219
|
+
import { google } from '@ai-sdk/google';
|
|
1220
|
+
import { generateImage } from 'ai';
|
|
1221
|
+
|
|
1222
|
+
const { image } = await generateImage({
|
|
1223
|
+
model: google.image('gemini-2.5-flash-image'),
|
|
1224
|
+
prompt: 'A photorealistic image of a cat wearing a wizard hat',
|
|
1225
|
+
aspectRatio: '1:1',
|
|
1226
|
+
});
|
|
1227
|
+
```
|
|
1228
|
+
|
|
1229
|
+
Gemini image models also support image editing by providing input images:
|
|
1230
|
+
|
|
1231
|
+
```ts
|
|
1232
|
+
import { google } from '@ai-sdk/google';
|
|
1233
|
+
import { generateImage } from 'ai';
|
|
1234
|
+
import fs from 'node:fs';
|
|
1235
|
+
|
|
1236
|
+
const sourceImage = fs.readFileSync('./cat.png');
|
|
1237
|
+
|
|
1238
|
+
const { image } = await generateImage({
|
|
1239
|
+
model: google.image('gemini-2.5-flash-image'),
|
|
1240
|
+
prompt: {
|
|
1241
|
+
text: 'Add a small wizard hat to this cat',
|
|
1242
|
+
images: [sourceImage],
|
|
1243
|
+
},
|
|
1244
|
+
});
|
|
1245
|
+
```
|
|
1246
|
+
|
|
1247
|
+
You can also use URLs for input images:
|
|
1248
|
+
|
|
1249
|
+
```ts
|
|
1250
|
+
import { google } from '@ai-sdk/google';
|
|
1251
|
+
import { generateImage } from 'ai';
|
|
1252
|
+
|
|
1253
|
+
const { image } = await generateImage({
|
|
1254
|
+
model: google.image('gemini-2.5-flash-image'),
|
|
1255
|
+
prompt: {
|
|
1256
|
+
text: 'Add a small wizard hat to this cat',
|
|
1257
|
+
images: ['https://example.com/cat.png'],
|
|
1258
|
+
},
|
|
1259
|
+
});
|
|
1260
|
+
```
|
|
1261
|
+
|
|
1262
|
+
<Note>
|
|
1263
|
+
Gemini image models do not support the `size` or `n` parameters. Use
|
|
1264
|
+
`aspectRatio` instead of `size`. Mask-based inpainting is also not supported.
|
|
1265
|
+
</Note>
|
|
1266
|
+
|
|
1267
|
+
<Note>
|
|
1268
|
+
For more advanced use cases where you need both text and image outputs, or
|
|
1269
|
+
want more control over the generation process, you can use Gemini image models
|
|
1270
|
+
directly with `generateText()`. See [Image Outputs](#image-outputs) for
|
|
1271
|
+
details.
|
|
1272
|
+
</Note>
|
|
1273
|
+
|
|
1274
|
+
#### Gemini Image Model Capabilities
|
|
1275
|
+
|
|
1276
|
+
| Model | Image Generation | Image Editing | Aspect Ratios |
|
|
1277
|
+
| ---------------------------- | ------------------- | ------------------- | --------------------------------------------------- |
|
|
1278
|
+
| `gemini-2.5-flash-image` | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
|
|
1279
|
+
| `gemini-3-pro-image-preview` | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
|
|
1280
|
+
|
|
1281
|
+
<Note>
|
|
1282
|
+
`gemini-3-pro-image-preview` supports additional features including up to 14
|
|
1283
|
+
reference images for editing (6 objects, 5 humans), resolution options (1K,
|
|
1284
|
+
2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
|
|
1285
|
+
grounding.
|
|
1286
|
+
</Note>
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ai-sdk/google",
|
|
3
|
-
"version": "3.0.
|
|
3
|
+
"version": "3.0.26",
|
|
4
4
|
"license": "Apache-2.0",
|
|
5
5
|
"sideEffects": false,
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -36,8 +36,8 @@
|
|
|
36
36
|
}
|
|
37
37
|
},
|
|
38
38
|
"dependencies": {
|
|
39
|
-
"@ai-sdk/provider": "
|
|
40
|
-
"@ai-sdk/provider
|
|
39
|
+
"@ai-sdk/provider-utils": "4.0.14",
|
|
40
|
+
"@ai-sdk/provider": "3.0.8"
|
|
41
41
|
},
|
|
42
42
|
"devDependencies": {
|
|
43
43
|
"@types/node": "20.17.24",
|
|
@@ -1,11 +1,19 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
ImageModelV3,
|
|
3
|
+
LanguageModelV3Prompt,
|
|
4
|
+
SharedV3Warning,
|
|
5
|
+
} from '@ai-sdk/provider';
|
|
2
6
|
import {
|
|
3
7
|
combineHeaders,
|
|
8
|
+
convertToBase64,
|
|
4
9
|
createJsonResponseHandler,
|
|
10
|
+
FetchFunction,
|
|
11
|
+
generateId as defaultGenerateId,
|
|
5
12
|
type InferSchema,
|
|
6
13
|
lazySchema,
|
|
7
14
|
parseProviderOptions,
|
|
8
15
|
postJsonToApi,
|
|
16
|
+
Resolvable,
|
|
9
17
|
resolve,
|
|
10
18
|
zodSchema,
|
|
11
19
|
} from '@ai-sdk/provider-utils';
|
|
@@ -15,7 +23,7 @@ import {
|
|
|
15
23
|
GoogleGenerativeAIImageModelId,
|
|
16
24
|
GoogleGenerativeAIImageSettings,
|
|
17
25
|
} from './google-generative-ai-image-settings';
|
|
18
|
-
import {
|
|
26
|
+
import { GoogleGenerativeAILanguageModel } from './google-generative-ai-language-model';
|
|
19
27
|
|
|
20
28
|
interface GoogleGenerativeAIImageModelConfig {
|
|
21
29
|
provider: string;
|
|
@@ -32,8 +40,15 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
|
|
|
32
40
|
readonly specificationVersion = 'v3';
|
|
33
41
|
|
|
34
42
|
get maxImagesPerCall(): number {
|
|
43
|
+
if (this.settings.maxImagesPerCall != null) {
|
|
44
|
+
return this.settings.maxImagesPerCall;
|
|
45
|
+
}
|
|
46
|
+
// https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-image
|
|
47
|
+
if (isGeminiModel(this.modelId)) {
|
|
48
|
+
return 10;
|
|
49
|
+
}
|
|
35
50
|
// https://ai.google.dev/gemini-api/docs/imagen#imagen-model
|
|
36
|
-
return
|
|
51
|
+
return 4;
|
|
37
52
|
}
|
|
38
53
|
|
|
39
54
|
get provider(): string {
|
|
@@ -48,6 +63,16 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
|
|
|
48
63
|
|
|
49
64
|
async doGenerate(
|
|
50
65
|
options: Parameters<ImageModelV3['doGenerate']>[0],
|
|
66
|
+
): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
|
|
67
|
+
// Gemini image models use the language model API internally
|
|
68
|
+
if (isGeminiModel(this.modelId)) {
|
|
69
|
+
return this.doGenerateGemini(options);
|
|
70
|
+
}
|
|
71
|
+
return this.doGenerateImagen(options);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
private async doGenerateImagen(
|
|
75
|
+
options: Parameters<ImageModelV3['doGenerate']>[0],
|
|
51
76
|
): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
|
|
52
77
|
const {
|
|
53
78
|
prompt,
|
|
@@ -63,10 +88,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
|
|
|
63
88
|
} = options;
|
|
64
89
|
const warnings: Array<SharedV3Warning> = [];
|
|
65
90
|
|
|
66
|
-
//
|
|
91
|
+
// Imagen API endpoints do not support image editing
|
|
67
92
|
if (files != null && files.length > 0) {
|
|
68
93
|
throw new Error(
|
|
69
|
-
'Google Generative AI does not support image editing. ' +
|
|
94
|
+
'Google Generative AI does not support image editing with Imagen models. ' +
|
|
70
95
|
'Use Google Vertex AI (@ai-sdk/google-vertex) for image editing capabilities.',
|
|
71
96
|
);
|
|
72
97
|
}
|
|
@@ -138,10 +163,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
|
|
|
138
163
|
images: response.predictions.map(
|
|
139
164
|
(p: { bytesBase64Encoded: string }) => p.bytesBase64Encoded,
|
|
140
165
|
),
|
|
141
|
-
warnings
|
|
166
|
+
warnings,
|
|
142
167
|
providerMetadata: {
|
|
143
168
|
google: {
|
|
144
|
-
images: response.predictions.map(
|
|
169
|
+
images: response.predictions.map(() => ({
|
|
145
170
|
// Add any prediction-specific metadata here
|
|
146
171
|
})),
|
|
147
172
|
},
|
|
@@ -153,6 +178,146 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
|
|
|
153
178
|
},
|
|
154
179
|
};
|
|
155
180
|
}
|
|
181
|
+
|
|
182
|
+
private async doGenerateGemini(
|
|
183
|
+
options: Parameters<ImageModelV3['doGenerate']>[0],
|
|
184
|
+
): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
|
|
185
|
+
const {
|
|
186
|
+
prompt,
|
|
187
|
+
n,
|
|
188
|
+
size,
|
|
189
|
+
aspectRatio,
|
|
190
|
+
seed,
|
|
191
|
+
providerOptions,
|
|
192
|
+
headers,
|
|
193
|
+
abortSignal,
|
|
194
|
+
files,
|
|
195
|
+
mask,
|
|
196
|
+
} = options;
|
|
197
|
+
const warnings: Array<SharedV3Warning> = [];
|
|
198
|
+
|
|
199
|
+
// Gemini does not support mask-based inpainting
|
|
200
|
+
if (mask != null) {
|
|
201
|
+
throw new Error(
|
|
202
|
+
'Gemini image models do not support mask-based image editing.',
|
|
203
|
+
);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// Gemini does not support generating multiple images per call via n parameter
|
|
207
|
+
if (n != null && n > 1) {
|
|
208
|
+
throw new Error(
|
|
209
|
+
'Gemini image models do not support generating a set number of images per call. Use n=1 or omit the n parameter.',
|
|
210
|
+
);
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if (size != null) {
|
|
214
|
+
warnings.push({
|
|
215
|
+
type: 'unsupported',
|
|
216
|
+
feature: 'size',
|
|
217
|
+
details:
|
|
218
|
+
'This model does not support the `size` option. Use `aspectRatio` instead.',
|
|
219
|
+
});
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// Build user message content for language model
|
|
223
|
+
const userContent: Array<
|
|
224
|
+
| { type: 'text'; text: string }
|
|
225
|
+
| { type: 'file'; data: string | Uint8Array | URL; mediaType: string }
|
|
226
|
+
> = [];
|
|
227
|
+
|
|
228
|
+
// Add text prompt
|
|
229
|
+
if (prompt != null) {
|
|
230
|
+
userContent.push({ type: 'text', text: prompt });
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// Add input images for editing
|
|
234
|
+
if (files != null && files.length > 0) {
|
|
235
|
+
for (const file of files) {
|
|
236
|
+
if (file.type === 'url') {
|
|
237
|
+
userContent.push({
|
|
238
|
+
type: 'file',
|
|
239
|
+
data: new URL(file.url),
|
|
240
|
+
mediaType: 'image/*',
|
|
241
|
+
});
|
|
242
|
+
} else {
|
|
243
|
+
userContent.push({
|
|
244
|
+
type: 'file',
|
|
245
|
+
data:
|
|
246
|
+
typeof file.data === 'string'
|
|
247
|
+
? file.data
|
|
248
|
+
: new Uint8Array(file.data),
|
|
249
|
+
mediaType: file.mediaType,
|
|
250
|
+
});
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
const languageModelPrompt: LanguageModelV3Prompt = [
|
|
256
|
+
{ role: 'user', content: userContent },
|
|
257
|
+
];
|
|
258
|
+
|
|
259
|
+
// Instantiate language model
|
|
260
|
+
const languageModel = new GoogleGenerativeAILanguageModel(this.modelId, {
|
|
261
|
+
provider: this.config.provider,
|
|
262
|
+
baseURL: this.config.baseURL,
|
|
263
|
+
headers: this.config.headers ?? {},
|
|
264
|
+
fetch: this.config.fetch,
|
|
265
|
+
generateId: this.config.generateId ?? defaultGenerateId,
|
|
266
|
+
});
|
|
267
|
+
|
|
268
|
+
// Call language model with image-only response modality
|
|
269
|
+
const result = await languageModel.doGenerate({
|
|
270
|
+
prompt: languageModelPrompt,
|
|
271
|
+
seed,
|
|
272
|
+
providerOptions: {
|
|
273
|
+
google: {
|
|
274
|
+
responseModalities: ['IMAGE'],
|
|
275
|
+
imageConfig: aspectRatio ? { aspectRatio } : undefined,
|
|
276
|
+
...((providerOptions?.google as Record<string, unknown>) ?? {}),
|
|
277
|
+
},
|
|
278
|
+
},
|
|
279
|
+
headers,
|
|
280
|
+
abortSignal,
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
const currentDate = this.config._internal?.currentDate?.() ?? new Date();
|
|
284
|
+
|
|
285
|
+
// Extract images from language model response
|
|
286
|
+
const images: string[] = [];
|
|
287
|
+
for (const part of result.content) {
|
|
288
|
+
if (part.type === 'file' && part.mediaType.startsWith('image/')) {
|
|
289
|
+
images.push(convertToBase64(part.data));
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
return {
|
|
294
|
+
images,
|
|
295
|
+
warnings,
|
|
296
|
+
providerMetadata: {
|
|
297
|
+
google: {
|
|
298
|
+
images: images.map(() => ({})),
|
|
299
|
+
},
|
|
300
|
+
},
|
|
301
|
+
response: {
|
|
302
|
+
timestamp: currentDate,
|
|
303
|
+
modelId: this.modelId,
|
|
304
|
+
headers: result.response?.headers,
|
|
305
|
+
},
|
|
306
|
+
usage: result.usage
|
|
307
|
+
? {
|
|
308
|
+
inputTokens: result.usage.inputTokens.total,
|
|
309
|
+
outputTokens: result.usage.outputTokens.total,
|
|
310
|
+
totalTokens:
|
|
311
|
+
(result.usage.inputTokens.total ?? 0) +
|
|
312
|
+
(result.usage.outputTokens.total ?? 0),
|
|
313
|
+
}
|
|
314
|
+
: undefined,
|
|
315
|
+
};
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
function isGeminiModel(modelId: string): boolean {
|
|
320
|
+
return modelId.startsWith('gemini-');
|
|
156
321
|
}
|
|
157
322
|
|
|
158
323
|
// minimal version of the schema
|
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
export type GoogleGenerativeAIImageModelId =
|
|
2
|
+
// Imagen models (use :predict API)
|
|
2
3
|
| 'imagen-4.0-generate-001'
|
|
3
4
|
| 'imagen-4.0-ultra-generate-001'
|
|
4
5
|
| 'imagen-4.0-fast-generate-001'
|
|
6
|
+
// Gemini image models (technically multimodal output language models, use :generateContent API)
|
|
7
|
+
| 'gemini-2.5-flash-image'
|
|
8
|
+
| 'gemini-3-pro-image-preview'
|
|
5
9
|
| (string & {});
|
|
6
10
|
|
|
7
11
|
export interface GoogleGenerativeAIImageSettings {
|