npm - @ai-sdk/google - Versions diffs - 3.0.25 → 3.0.26 - Mend

@ai-sdk/google 3.0.25 → 3.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/CHANGELOG.md +6 -0
package/dist/index.d.mts +1 -1
package/dist/index.d.ts +1 -1
package/dist/index.js +123 -6
package/dist/index.js.map +1 -1
package/dist/index.mjs +125 -6
package/dist/index.mjs.map +1 -1
package/docs/15-google-generative-ai.mdx +93 -4
package/package.json +3 -3
package/src/google-generative-ai-image-model.ts +172 -7
package/src/google-generative-ai-image-settings.ts +4 -0

package/docs/15-google-generative-ai.mdx CHANGED

@@ -929,7 +929,7 @@ The `vertexRagStore` tool accepts the following configuration options:
 ### Image Outputs
-Gemini models with image generation capabilities (`gemini-2.5-flash-image`) support image generation. Images are exposed as files in the response.
+Gemini models with image generation capabilities (e.g. `gemini-2.5-flash-image`) support generating images as part of a multimodal response. Images are exposed as files in the response.
 ```ts
 import { google } from '@ai-sdk/google';
@@ -948,6 +948,12 @@ for (const file of result.files) {
 }
 ```
+<Note>
+  If you primarily want to generate images without text output, you can also use
+  Gemini image models with the `generateImage()` function. See [Gemini Image
+  Models](#gemini-image-models) for details.
+</Note>
 ### Safety Ratings
 The safety ratings provide insight into the safety of the model's response.
@@ -1146,9 +1152,18 @@ The following optional provider options are available for Google Generative AI e
 ## Image Models
-You can create [Imagen](https://ai.google.dev/gemini-api/docs/imagen) models that call the Google Generative AI API using the `.image()` factory method.
+You can create image models that call the Google Generative AI API using the `.image()` factory method.
 For more on image generation with the AI SDK see [generateImage()](/docs/reference/ai-sdk-core/generate-image).
+The Google provider supports two types of image models:
+- **Imagen models**: Dedicated image generation models using the `:predict` API
+- **Gemini image models**: Multimodal language models with image output capabilities using the `:generateContent` API
+### Imagen Models
+[Imagen](https://ai.google.dev/gemini-api/docs/imagen) models are dedicated image generation models.
 ```ts
 import { google } from '@ai-sdk/google';
 import { generateImage } from 'ai';
@@ -1178,7 +1193,7 @@ const { image } = await generateImage({
 });
 ```
-The following provider options are available:
+The following provider options are available for Imagen models:
 - **personGeneration** `allow_adult` | `allow_all` | `dont_allow`
   Whether to allow person generation. Defaults to `allow_adult`.
@@ -1188,10 +1203,84 @@ The following provider options are available:
   parameter instead.
 </Note>
-#### Model Capabilities
+#### Imagen Model Capabilities
 | Model                           | Aspect Ratios             |
 | ------------------------------- | ------------------------- |
 | `imagen-4.0-generate-001`       | 1:1, 3:4, 4:3, 9:16, 16:9 |
 | `imagen-4.0-ultra-generate-001` | 1:1, 3:4, 4:3, 9:16, 16:9 |
 | `imagen-4.0-fast-generate-001`  | 1:1, 3:4, 4:3, 9:16, 16:9 |
+### Gemini Image Models
+[Gemini image models](https://ai.google.dev/gemini-api/docs/image-generation) (e.g. `gemini-2.5-flash-image`) are technically multimodal output language models, but they can be used with the `generateImage()` function for a simpler image generation experience. Internally, the provider calls the language model API with `responseModalities: ['IMAGE']`.
+```ts
+import { google } from '@ai-sdk/google';
+import { generateImage } from 'ai';
+const { image } = await generateImage({
+  model: google.image('gemini-2.5-flash-image'),
+  prompt: 'A photorealistic image of a cat wearing a wizard hat',
+  aspectRatio: '1:1',
+});
+```
+Gemini image models also support image editing by providing input images:
+```ts
+import { google } from '@ai-sdk/google';
+import { generateImage } from 'ai';
+import fs from 'node:fs';
+const sourceImage = fs.readFileSync('./cat.png');
+const { image } = await generateImage({
+  model: google.image('gemini-2.5-flash-image'),
+  prompt: {
+    text: 'Add a small wizard hat to this cat',
+    images: [sourceImage],
+  },
+});
+```
+You can also use URLs for input images:
+```ts
+import { google } from '@ai-sdk/google';
+import { generateImage } from 'ai';
+const { image } = await generateImage({
+  model: google.image('gemini-2.5-flash-image'),
+  prompt: {
+    text: 'Add a small wizard hat to this cat',
+    images: ['https://example.com/cat.png'],
+  },
+});
+```
+<Note>
+  Gemini image models do not support the `size` or `n` parameters. Use
+  `aspectRatio` instead of `size`. Mask-based inpainting is also not supported.
+</Note>
+<Note>
+  For more advanced use cases where you need both text and image outputs, or
+  want more control over the generation process, you can use Gemini image models
+  directly with `generateText()`. See [Image Outputs](#image-outputs) for
+  details.
+</Note>
+#### Gemini Image Model Capabilities
+| Model                        | Image Generation    | Image Editing       | Aspect Ratios                                       |
+| ---------------------------- | ------------------- | ------------------- | --------------------------------------------------- |
+| `gemini-2.5-flash-image`     | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
+| `gemini-3-pro-image-preview` | <Check size={18} /> | <Check size={18} /> | 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 |
+<Note>
+  `gemini-3-pro-image-preview` supports additional features including up to 14
+  reference images for editing (6 objects, 5 humans), resolution options (1K,
+  2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
+  grounding.
+</Note>

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@ai-sdk/google",
-  "version": "3.0.25",
+  "version": "3.0.26",
   "license": "Apache-2.0",
   "sideEffects": false,
   "main": "./dist/index.js",
@@ -36,8 +36,8 @@
     }
   },
   "dependencies": {
-    "@ai-sdk/provider": "3.0.8",
-    "@ai-sdk/provider-utils": "4.0.14"
+    "@ai-sdk/provider-utils": "4.0.14",
+    "@ai-sdk/provider": "3.0.8"
   },
   "devDependencies": {
     "@types/node": "20.17.24",

package/src/google-generative-ai-image-model.ts CHANGED

@@ -1,11 +1,19 @@
-import { ImageModelV3, SharedV3Warning } from '@ai-sdk/provider';
+import {
+  ImageModelV3,
+  LanguageModelV3Prompt,
+  SharedV3Warning,
+} from '@ai-sdk/provider';
 import {
   combineHeaders,
+  convertToBase64,
   createJsonResponseHandler,
+  FetchFunction,
+  generateId as defaultGenerateId,
   type InferSchema,
   lazySchema,
   parseProviderOptions,
   postJsonToApi,
+  Resolvable,
   resolve,
   zodSchema,
 } from '@ai-sdk/provider-utils';
@@ -15,7 +23,7 @@ import {
   GoogleGenerativeAIImageModelId,
   GoogleGenerativeAIImageSettings,
 } from './google-generative-ai-image-settings';
-import { FetchFunction, Resolvable } from '@ai-sdk/provider-utils';
+import { GoogleGenerativeAILanguageModel } from './google-generative-ai-language-model';
 interface GoogleGenerativeAIImageModelConfig {
   provider: string;
@@ -32,8 +40,15 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
   readonly specificationVersion = 'v3';
   get maxImagesPerCall(): number {
+    if (this.settings.maxImagesPerCall != null) {
+      return this.settings.maxImagesPerCall;
+    }
+    // https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash-image
+    if (isGeminiModel(this.modelId)) {
+      return 10;
+    }
     // https://ai.google.dev/gemini-api/docs/imagen#imagen-model
-    return this.settings.maxImagesPerCall ?? 4;
+    return 4;
   }
   get provider(): string {
@@ -48,6 +63,16 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
   async doGenerate(
     options: Parameters<ImageModelV3['doGenerate']>[0],
+  ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
+    // Gemini image models use the language model API internally
+    if (isGeminiModel(this.modelId)) {
+      return this.doGenerateGemini(options);
+    }
+    return this.doGenerateImagen(options);
+  }
+  private async doGenerateImagen(
+    options: Parameters<ImageModelV3['doGenerate']>[0],
   ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
     const {
       prompt,
@@ -63,10 +88,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
     } = options;
     const warnings: Array<SharedV3Warning> = [];
-    // Google Generative AI does not support image editing
+    // Imagen API endpoints do not support image editing
     if (files != null && files.length > 0) {
       throw new Error(
-        'Google Generative AI does not support image editing. ' +
+        'Google Generative AI does not support image editing with Imagen models. ' +
           'Use Google Vertex AI (@ai-sdk/google-vertex) for image editing capabilities.',
       );
     }
@@ -138,10 +163,10 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
       images: response.predictions.map(
         (p: { bytesBase64Encoded: string }) => p.bytesBase64Encoded,
       ),
-      warnings: warnings ?? [],
+      warnings,
       providerMetadata: {
         google: {
-          images: response.predictions.map(prediction => ({
+          images: response.predictions.map(() => ({
             // Add any prediction-specific metadata here
           })),
         },
@@ -153,6 +178,146 @@ export class GoogleGenerativeAIImageModel implements ImageModelV3 {
       },
     };
   }
+  private async doGenerateGemini(
+    options: Parameters<ImageModelV3['doGenerate']>[0],
+  ): Promise<Awaited<ReturnType<ImageModelV3['doGenerate']>>> {
+    const {
+      prompt,
+      n,
+      size,
+      aspectRatio,
+      seed,
+      providerOptions,
+      headers,
+      abortSignal,
+      files,
+      mask,
+    } = options;
+    const warnings: Array<SharedV3Warning> = [];
+    // Gemini does not support mask-based inpainting
+    if (mask != null) {
+      throw new Error(
+        'Gemini image models do not support mask-based image editing.',
+      );
+    }
+    // Gemini does not support generating multiple images per call via n parameter
+    if (n != null && n > 1) {
+      throw new Error(
+        'Gemini image models do not support generating a set number of images per call. Use n=1 or omit the n parameter.',
+      );
+    }
+    if (size != null) {
+      warnings.push({
+        type: 'unsupported',
+        feature: 'size',
+        details:
+          'This model does not support the `size` option. Use `aspectRatio` instead.',
+      });
+    }
+    // Build user message content for language model
+    const userContent: Array<
+      | { type: 'text'; text: string }
+      | { type: 'file'; data: string | Uint8Array | URL; mediaType: string }
+    > = [];
+    // Add text prompt
+    if (prompt != null) {
+      userContent.push({ type: 'text', text: prompt });
+    }
+    // Add input images for editing
+    if (files != null && files.length > 0) {
+      for (const file of files) {
+        if (file.type === 'url') {
+          userContent.push({
+            type: 'file',
+            data: new URL(file.url),
+            mediaType: 'image/*',
+          });
+        } else {
+          userContent.push({
+            type: 'file',
+            data:
+              typeof file.data === 'string'
+                ? file.data
+                : new Uint8Array(file.data),
+            mediaType: file.mediaType,
+          });
+        }
+      }
+    }
+    const languageModelPrompt: LanguageModelV3Prompt = [
+      { role: 'user', content: userContent },
+    ];
+    // Instantiate language model
+    const languageModel = new GoogleGenerativeAILanguageModel(this.modelId, {
+      provider: this.config.provider,
+      baseURL: this.config.baseURL,
+      headers: this.config.headers ?? {},
+      fetch: this.config.fetch,
+      generateId: this.config.generateId ?? defaultGenerateId,
+    });
+    // Call language model with image-only response modality
+    const result = await languageModel.doGenerate({
+      prompt: languageModelPrompt,
+      seed,
+      providerOptions: {
+        google: {
+          responseModalities: ['IMAGE'],
+          imageConfig: aspectRatio ? { aspectRatio } : undefined,
+          ...((providerOptions?.google as Record<string, unknown>) ?? {}),
+        },
+      },
+      headers,
+      abortSignal,
+    });
+    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
+    // Extract images from language model response
+    const images: string[] = [];
+    for (const part of result.content) {
+      if (part.type === 'file' && part.mediaType.startsWith('image/')) {
+        images.push(convertToBase64(part.data));
+      }
+    }
+    return {
+      images,
+      warnings,
+      providerMetadata: {
+        google: {
+          images: images.map(() => ({})),
+        },
+      },
+      response: {
+        timestamp: currentDate,
+        modelId: this.modelId,
+        headers: result.response?.headers,
+      },
+      usage: result.usage
+        ? {
+            inputTokens: result.usage.inputTokens.total,
+            outputTokens: result.usage.outputTokens.total,
+            totalTokens:
+              (result.usage.inputTokens.total ?? 0) +
+              (result.usage.outputTokens.total ?? 0),
+          }
+        : undefined,
+    };
+  }
+}
+function isGeminiModel(modelId: string): boolean {
+  return modelId.startsWith('gemini-');
 }
 // minimal version of the schema

package/src/google-generative-ai-image-settings.ts CHANGED

@@ -1,7 +1,11 @@
 export type GoogleGenerativeAIImageModelId =
+  // Imagen models (use :predict API)
   | 'imagen-4.0-generate-001'
   | 'imagen-4.0-ultra-generate-001'
   | 'imagen-4.0-fast-generate-001'
+  // Gemini image models (technically multimodal output language models, use :generateContent API)
+  | 'gemini-2.5-flash-image'
+  | 'gemini-3-pro-image-preview'
   | (string & {});
 export interface GoogleGenerativeAIImageSettings {