npm - @ai-sdk/google - Versions diffs - 4.0.0-canary.75 → 4.0.0-canary.76 - Mend

@ai-sdk/google 4.0.0-canary.75 → 4.0.0-canary.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/CHANGELOG.md +6 -0
package/dist/index.d.ts +25 -2
package/dist/index.js +557 -290
package/dist/index.js.map +1 -1
package/docs/15-google.mdx +77 -0
package/package.json +1 -1
package/src/google-provider.ts +23 -0
package/src/google-speech-api.ts +36 -0
package/src/google-speech-model-options.ts +48 -0
package/src/google-speech-model.ts +286 -0
package/src/index.ts +4 -0

package/docs/15-google.mdx CHANGED Viewed

@@ -1935,3 +1935,80 @@ console.log(result.providerMetadata?.google?.groundingMetadata);
   2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
   grounding.
 </Note>
+## Speech Models
+You can create models that call the [Gemini text-to-speech API](https://ai.google.dev/gemini-api/docs/speech-generation)
+using the `.speech()` factory method.
+The first argument is the model id e.g. `gemini-2.5-flash-preview-tts`.
+```ts
+const model = google.speech('gemini-2.5-flash-preview-tts');
+```
+The `voice` argument can be set to one of Gemini's [30 prebuilt voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
+e.g. `Kore`, `Puck`, `Zephyr`, or `Charon`. Voice names are case-sensitive. It defaults to `Kore`.
+```ts highlight="6"
+import { experimental_generateSpeech as generateSpeech } from 'ai';
+import { google } from '@ai-sdk/google';
+const result = await generateSpeech({
+  model: google.speech('gemini-2.5-flash-preview-tts'),
+  text: 'Hello, world!',
+  voice: 'Kore', // Gemini voice name
+});
+```
+By default the generated audio is returned as a playable WAV file (`result.audio.mediaType` is
+`audio/wav`). Set `outputFormat: 'pcm'` to receive the raw signed 16-bit little-endian mono PCM
+bytes instead; the sample rate is reported in `result.providerMetadata.google.sampleRate`.
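+Gemini honors natural-language style direction. The `instructions` argument is prepended to the
+text sent to the model, so `instructions: 'Say cheerfully'` with `text: 'Hello'` sends the prompt `Say cheerfully: Hello`. When `multiSpeakerVoiceConfig` is set, `instructions` is ignored and a warning is added to `result.warnings`.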
+### Multi-speaker audio
+For multi-speaker dialogue, pass a `multiSpeakerVoiceConfig` through `providerOptions`. Each speaker
+name must match a name used in the input text. When set, it overrides the top-level `voice`.
+```ts highlight="8-23"
+import { experimental_generateSpeech as generateSpeech } from 'ai';
+import { google, type GoogleSpeechModelOptions } from '@ai-sdk/google';
+const result = await generateSpeech({
+  model: google.speech('gemini-2.5-flash-preview-tts'),
+  text: 'Joe: How are you? Jane: Doing great, thanks!',
+  providerOptions: {
+    google: {
+      multiSpeakerVoiceConfig: {
+        speakerVoiceConfigs: [
+          {
+            speaker: 'Joe',
+            voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
+          },
+          {
+            speaker: 'Jane',
+            voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Puck' } },
+          },
+        ],
+      },
+    } satisfies GoogleSpeechModelOptions,
+  },
+});
+```
+<Note>
+  Gemini TTS models do not support the `speed` or `language` options; passing
+  them adds a warning to `result.warnings`. Language is detected automatically
+  from the input text.
+</Note>
+### Model Capabilities
+| Model                          | Multi-speaker       | Style via instructions |
+| ------------------------------ | ------------------- | ---------------------- |
+| `gemini-2.5-flash-preview-tts` | <Check size={18} /> | <Check size={18} />    |
+| `gemini-2.5-pro-preview-tts`   | <Check size={18} /> | <Check size={18} />    |
+| `gemini-3.1-flash-tts-preview` | <Check size={18} /> | <Check size={18} />    |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ai-sdk/google",
-  "version": "4.0.0-canary.75",
+  "version": "4.0.0-canary.76",
   "type": "module",
   "license": "Apache-2.0",
   "sideEffects": false,

package/src/google-provider.ts CHANGED Viewed

@@ -5,6 +5,7 @@ import type {
   ImageModelV4,
   LanguageModelV4,
   ProviderV4,
+  SpeechModelV4,
 } from '@ai-sdk/provider';
 import {
   generateId,
@@ -28,6 +29,8 @@ import { GoogleImageModel } from './google-image-model';
 import { GoogleFiles } from './google-files';
 import { GoogleVideoModel } from './google-video-model';
 import type { GoogleVideoModelId } from './google-video-settings';
+import { GoogleSpeechModel } from './google-speech-model';
+import type { GoogleSpeechModelId } from './google-speech-model-options';
 import {
   GoogleInteractionsLanguageModel,
   type GoogleInteractionsModelInput,
@@ -85,6 +88,16 @@ export interface GoogleProvider extends ProviderV4 {
    */
   videoModel(modelId: GoogleVideoModelId): Experimental_VideoModelV4;
+  /**
+   * Creates a model for speech generation (text-to-speech).
+   */
+  speech(modelId: GoogleSpeechModelId): SpeechModelV4;
+  /**
+   * Creates a model for speech generation (text-to-speech).
+   */
+  speechModel(modelId: GoogleSpeechModelId): SpeechModelV4;
   files(): FilesV4;
   /**
@@ -223,6 +236,14 @@ export function createGoogle(
       generateId: options.generateId ?? generateId,
     });
+  const createSpeechModel = (modelId: GoogleSpeechModelId) => // factory assigned to both provider.speech and provider.speechModel
+    new GoogleSpeechModel(modelId, {
+      provider: `${providerName}.speech`, // value returned by the model's `provider` getter
+      baseURL,
+      headers: getHeaders,
+      fetch: options.fetch,
+    });
   const createInteractionsModel = (
     modelIdOrAgent:
       | GoogleInteractionsModelId
@@ -263,6 +284,8 @@ export function createGoogle(
   provider.video = createVideoModel;
   provider.videoModel = createVideoModel;
   provider.files = createFiles;
+  provider.speech = createSpeechModel;
+  provider.speechModel = createSpeechModel;
   provider.interactions = createInteractionsModel;
   provider.tools = googleTools;

package/src/google-speech-api.ts ADDED Viewed

@@ -0,0 +1,36 @@
+import { lazySchema, zodSchema } from '@ai-sdk/provider-utils';
+import { z } from 'zod/v4';
+/**
+ * Response schema for the Gemini `:generateContent` endpoint when called with
+ * `responseModalities: ['AUDIO']`. The generated audio is returned as base64
+ * encoded raw PCM in the first inline-data part.
+ *
+ * The `candidates`, `content`, `parts`, `inlineData`, `mimeType`, and `data`
+ * fields are all declared `.nullish()`, so a response that omits any of them
+ * (or sets them to `null`) still validates; consumers must handle the gaps.
+ */
+export const googleSpeechResponseSchema = lazySchema(() =>
+  zodSchema(
+    z.object({
+      candidates: z
+        .array(
+          z.object({
+            content: z
+              .object({
+                parts: z
+                  .array(
+                    z.object({
+                      inlineData: z
+                        .object({
+                          mimeType: z.string().nullish(),
+                          data: z.string().nullish(),
+                        })
+                        .nullish(),
+                    }),
+                  )
+                  .nullish(),
+              })
+              .nullish(),
+          }),
+        )
+        .nullish(),
+    }),
+  ),
+);

package/src/google-speech-model-options.ts ADDED Viewed

@@ -0,0 +1,48 @@
+import {
+  lazySchema,
+  zodSchema,
+  type InferSchema,
+} from '@ai-sdk/provider-utils';
+import { z } from 'zod/v4';
+export type GoogleSpeechModelId = // model ids accepted by the speech model factory
+  | 'gemini-2.5-flash-preview-tts'
+  | 'gemini-2.5-pro-preview-tts'
+  | 'gemini-3.1-flash-tts-preview'
+  | (string & {}); // any other string is also accepted; the literals above are the listed ids
+const prebuiltVoiceConfigSchema = z.object({ // selects a prebuilt voice by name
+  voiceName: z.string(), // validated only as a string; not checked against a list of voices here
+});
+const voiceConfigSchema = z.object({ // wrapper object holding `prebuiltVoiceConfig`
+  prebuiltVoiceConfig: prebuiltVoiceConfigSchema,
+});
+export const googleSpeechProviderOptionsSchema = lazySchema(() => // schema for the `google` entry of `providerOptions` on speech calls
+  zodSchema(
+    z.object({
+      /**
+       * Multi-speaker configuration for dialogue audio. When provided, this
+       * overrides the top-level `voice`. The Gemini TTS API supports up to two
+       * speakers; each speaker name must match a name used in the input text.
+       * Neither constraint is enforced by this schema: the array has no length
+       * limit and `speaker` is only validated as a string.
+       *
+       * https://ai.google.dev/gemini-api/docs/speech-generation#multi-speaker
+       */
+      multiSpeakerVoiceConfig: z
+        .object({
+          speakerVoiceConfigs: z.array(
+            z.object({
+              speaker: z.string(),
+              voiceConfig: voiceConfigSchema,
+            }),
+          ),
+        })
+        .optional(),
+    }),
+  ),
+);
+export type GoogleSpeechModelOptions = InferSchema< // TypeScript type inferred from the provider options schema
+  typeof googleSpeechProviderOptionsSchema
+>;

package/src/google-speech-model.ts ADDED Viewed

@@ -0,0 +1,286 @@
+import type { SpeechModelV4, SharedV4Warning } from '@ai-sdk/provider';
+import {
+  combineHeaders,
+  convertBase64ToUint8Array,
+  createJsonResponseHandler,
+  parseProviderOptions,
+  postJsonToApi,
+  resolve,
+  serializeModelOptions,
+  WORKFLOW_DESERIALIZE,
+  WORKFLOW_SERIALIZE,
+  type FetchFunction,
+  type Resolvable,
+} from '@ai-sdk/provider-utils';
+import { googleFailedResponseHandler } from './google-error';
+import { googleSpeechResponseSchema } from './google-speech-api';
+import {
+  googleSpeechProviderOptionsSchema,
+  type GoogleSpeechModelId,
+} from './google-speech-model-options';
+interface GoogleSpeechModelConfig { // construction-time settings consumed by GoogleSpeechModel
+  provider: string; // returned by the model's `provider` getter
+  baseURL: string; // prefixed to `/models/<modelId>:generateContent` when building the request URL
+  headers?: Resolvable<Record<string, string | undefined>>; // resolved per request and combined with per-call headers
+  fetch?: FetchFunction; // passed through to postJsonToApi
+  _internal?: {
+    currentDate?: () => Date; // when set, supplies the response timestamp instead of `new Date()`
+  };
+}
+const DEFAULT_VOICE = 'Kore'; // voice used when the caller does not pass `voice`
+// Gemini TTS returns raw PCM at 24kHz when the response does not specify a rate.
+const DEFAULT_SAMPLE_RATE = 24000; // fallback when the response mime type has no `rate=` parameter
+export class GoogleSpeechModel implements SpeechModelV4 { // SpeechModelV4 implementation that calls the Gemini `:generateContent` endpoint and returns WAV-wrapped or raw PCM audio
+  readonly specificationVersion = 'v4';
+  static [WORKFLOW_SERIALIZE](model: GoogleSpeechModel) { // hands modelId and config to serializeModelOptions
+    return serializeModelOptions({
+      modelId: model.modelId,
+      config: model.config,
+    });
+  }
+  static [WORKFLOW_DESERIALIZE](options: {
+    modelId: GoogleSpeechModelId;
+    config: GoogleSpeechModelConfig;
+  }) { // reconstructs a model from a modelId/config pair
+    return new GoogleSpeechModel(options.modelId, options.config);
+  }
+  get provider(): string { // provider name comes straight from the config
+    return this.config.provider;
+  }
+  constructor(
+    readonly modelId: GoogleSpeechModelId,
+    private readonly config: GoogleSpeechModelConfig,
+  ) {}
+  private async getArgs({ // builds the request body, collects warnings, and resolves the output format
+    text,
+    voice = DEFAULT_VOICE, // falls back to the default voice when undefined
+    outputFormat,
+    instructions,
+    speed,
+    language,
+    providerOptions,
+  }: Parameters<SpeechModelV4['doGenerate']>[0]) {
+    const warnings: SharedV4Warning[] = [];
+    const googleOptions = await parseProviderOptions({ // validates `providerOptions.google` against the speech options schema
+      provider: 'google',
+      providerOptions,
+      schema: googleSpeechProviderOptionsSchema,
+    });
+    // Multi-speaker (provider option) takes precedence over the single voice.
+    const multiSpeakerVoiceConfig = googleOptions?.multiSpeakerVoiceConfig;
+    const speechConfig = multiSpeakerVoiceConfig
+      ? { multiSpeakerVoiceConfig }
+      : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };
+    // Gemini honors natural-language style direction expressed in the prompt
+    // text, so map `instructions` onto the spoken content. With multi-speaker
+    // the transcript starts with speaker labels (e.g. `Joe: ...`), so prepending
+    // instructions would corrupt that parsing — ignore them there (with a warning).
+    let promptText = text;
+    if (instructions != null) {
+      if (multiSpeakerVoiceConfig) {
+        warnings.push({
+          type: 'unsupported',
+          feature: 'instructions',
+          details:
+            'Google Gemini TTS ignores `instructions` when `multiSpeakerVoiceConfig` is set, ' +
+            'because prepending them would break multi-speaker transcript parsing.',
+        });
+      } else {
+        promptText = `${instructions}: ${text}`; // single-voice case: instructions, a colon and space, then the text
+      }
+    }
+    if (speed != null) {
+      warnings.push({
+        type: 'unsupported',
+        feature: 'speed',
+        details:
+          'Google Gemini TTS models do not support the `speed` option. It was ignored.',
+      });
+    }
+    if (language != null) {
+      warnings.push({
+        type: 'unsupported',
+        feature: 'language',
+        details:
+          'Google Gemini TTS models do not support the `language` option. ' +
+          'Language is detected automatically from the input text.',
+      });
+    }
+    // Only `wav` (default, WAV-wrapped) and `pcm` (raw) are supported.
+    let resolvedOutputFormat: 'wav' | 'pcm' = 'wav'; // stays 'wav' for undefined, 'wav', and any unsupported value
+    if (outputFormat === 'pcm') {
+      resolvedOutputFormat = 'pcm';
+    } else if (outputFormat != null && outputFormat !== 'wav') {
+      warnings.push({
+        type: 'unsupported',
+        feature: 'outputFormat',
+        details: `Unsupported output format: ${outputFormat}. Using wav instead.`,
+      });
+    }
+    const requestBody = {
+      contents: [{ parts: [{ text: promptText }] }],
+      generationConfig: {
+        responseModalities: ['AUDIO'],
+        speechConfig,
+      },
+    };
+    return { requestBody, warnings, outputFormat: resolvedOutputFormat };
+  }
+  async doGenerate( // sends the request, extracts the first inline audio part, and packages the result
+    options: Parameters<SpeechModelV4['doGenerate']>[0],
+  ): Promise<Awaited<ReturnType<SpeechModelV4['doGenerate']>>> {
+    const currentDate = this.config._internal?.currentDate?.() ?? new Date(); // configured clock if present, otherwise the current time
+    const { requestBody, warnings, outputFormat } = await this.getArgs(options);
+    const {
+      value: response,
+      responseHeaders,
+      rawValue: rawResponse,
+    } = await postJsonToApi({
+      url: `${this.config.baseURL}/models/${this.modelId}:generateContent`,
+      headers: combineHeaders(
+        this.config.headers ? await resolve(this.config.headers) : undefined,
+        options.headers, // per-call headers are combined with the configured ones
+      ),
+      body: requestBody,
+      failedResponseHandler: googleFailedResponseHandler,
+      successfulResponseHandler: createJsonResponseHandler(
+        googleSpeechResponseSchema,
+      ),
+      abortSignal: options.abortSignal,
+      fetch: this.config.fetch,
+    });
+    // `generateSpeech` returns a single audio result, and Gemini returns one
+    // inline audio part per request, so take the first inline-data part.
+    let base64Audio: string | undefined;
+    let mimeType: string | undefined;
+    for (const candidate of response.candidates ?? []) {
+      for (const part of candidate.content?.parts ?? []) {
+        if (part.inlineData?.data) {
+          base64Audio = part.inlineData.data;
+          mimeType = part.inlineData.mimeType ?? undefined;
+          break;
+        }
+      }
+      if (base64Audio != null) {
+        break;
+      }
+    }
+    const sampleRate = parseSampleRate(mimeType) ?? DEFAULT_SAMPLE_RATE; // default applies when the mime type is missing or has no rate
+    const pcm =
+      base64Audio != null
+        ? convertBase64ToUint8Array(base64Audio)
+        : new Uint8Array(0); // no inline audio data found in any candidate
+    // Gemini returns headerless raw PCM (e.g. `audio/L16;rate=24000`). Unlike
+    // providers that return a container format (mp3/opus/wav) directly,
+    // `generateSpeech`'s `detectMediaType` can't identify raw PCM and would
+    // mislabel it `audio/mp3` (not playable), so wrap it in a minimal WAV header
+    // by default; `outputFormat: 'pcm'` returns the raw bytes untouched.
+    // Empty audio is returned as-is so the core layer throws NoSpeechGeneratedError.
+    const audio =
+      outputFormat === 'pcm' || pcm.length === 0
+        ? pcm
+        : addWavHeader(pcm, sampleRate);
+    if (outputFormat === 'pcm' && pcm.length > 0) { // tell callers that the returned bytes are headerless PCM
+      warnings.push({
+        type: 'unsupported',
+        feature: 'outputFormat',
+        details:
+          `Returning raw PCM audio (signed 16-bit little-endian, mono, ${sampleRate} Hz). ` +
+          'These bytes have no container header and are not directly playable; ' +
+          'see providerMetadata.google for the sample rate and mime type.',
+      });
+    }
+    return {
+      audio,
+      warnings,
+      request: {
+        body: JSON.stringify(requestBody),
+      },
+      response: {
+        timestamp: currentDate,
+        modelId: this.modelId,
+        headers: responseHeaders,
+        body: rawResponse,
+      },
+      providerMetadata: {
+        google: {
+          sampleRate,
+          mimeType: mimeType ?? null, // null when the response supplied no mime type
+        },
+      },
+    };
+  }
+}
+/**
+ * Parses the sample rate from a PCM mime type such as `audio/L16;rate=24000`.
+ *
+ * @param mimeType - Mime type string reported with the audio, or `undefined`.
+ * @returns The digits following the first `rate=` parsed as a base-10 integer,
+ *   or `undefined` when `mimeType` is null/undefined or contains no
+ *   `rate=<digits>` sequence.
+ */
+function parseSampleRate(mimeType: string | undefined): number | undefined {
+  if (mimeType == null) {
+    return undefined;
+  }
+  const match = /rate=(\d+)/.exec(mimeType);
+  return match ? Number.parseInt(match[1], 10) : undefined;
+}
+/**
+ * Wraps raw signed 16-bit little-endian mono PCM in a minimal 44-byte WAV
+ * (RIFF/WAVE) container so the output is playable and detectable as `audio/wav`.
+ *
+ * @param pcm - Raw PCM sample bytes; copied unchanged after the header.
+ * @param sampleRate - Sample rate written into the header; also used to
+ *   derive the byte rate.
+ * @returns A new `Uint8Array` of length `44 + pcm.length`; `pcm` itself is
+ *   not modified.
+ */
+function addWavHeader(pcm: Uint8Array, sampleRate: number): Uint8Array {
+  const numChannels = 1;
+  const bitsPerSample = 16;
+  const blockAlign = (numChannels * bitsPerSample) / 8; // bytes per sample frame (2 for 16-bit mono)
+  const byteRate = sampleRate * blockAlign; // bytes of audio per second
+  const dataSize = pcm.length;
+  const buffer = new ArrayBuffer(44 + dataSize);
+  const view = new DataView(buffer);
+  writeAscii(view, 0, 'RIFF');
+  view.setUint32(4, 36 + dataSize, true); // RIFF chunk size: total length minus the first 8 bytes; `true` = little-endian
+  writeAscii(view, 8, 'WAVE');
+  writeAscii(view, 12, 'fmt ');
+  view.setUint32(16, 16, true); // PCM fmt chunk size
+  view.setUint16(20, 1, true); // audio format = PCM
+  view.setUint16(22, numChannels, true);
+  view.setUint32(24, sampleRate, true);
+  view.setUint32(28, byteRate, true);
+  view.setUint16(32, blockAlign, true);
+  view.setUint16(34, bitsPerSample, true);
+  writeAscii(view, 36, 'data');
+  view.setUint32(40, dataSize, true);
+  const out = new Uint8Array(buffer);
+  out.set(pcm, 44); // sample bytes start right after the 44-byte header
+  return out;
+}
+function writeAscii(view: DataView, offset: number, text: string): void { // writes `text` into `view` one byte per character starting at `offset`; callers in this file pass only 4-character ASCII chunk tags
+  for (let i = 0; i < text.length; i++) {
+    view.setUint8(offset + i, text.charCodeAt(i));
+  }
+}

package/src/index.ts CHANGED Viewed

@@ -29,6 +29,10 @@ export type {
   /** @deprecated Use `GoogleVideoModelId` instead. */
   GoogleVideoModelId as GoogleGenerativeAIVideoModelId,
 } from './google-video-settings';
+export type {
+  GoogleSpeechModelOptions,
+  GoogleSpeechModelId,
+} from './google-speech-model-options';
 export type { GoogleFilesUploadOptions } from './google-files';
 export type {
   GoogleLanguageModelInteractionsOptions,