@ai-sdk/google 4.0.0-canary.75 → 4.0.0-canary.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1935,3 +1935,80 @@ console.log(result.providerMetadata?.google?.groundingMetadata);
1935
1935
  2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
1936
1936
  grounding.
1937
1937
  </Note>
1938
+
1939
+ ## Speech Models
1940
+
1941
+ You can create models that call the [Gemini text-to-speech API](https://ai.google.dev/gemini-api/docs/speech-generation)
1942
+ using the `.speech()` factory method.
1943
+
1944
+ The first argument is the model ID, e.g. `gemini-2.5-flash-preview-tts`.
1945
+
1946
+ ```ts
1947
+ const model = google.speech('gemini-2.5-flash-preview-tts');
1948
+ ```
1949
+
1950
+ The `voice` argument can be set to one of Gemini's [30 prebuilt voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
1951
+ , e.g. `Kore`, `Puck`, `Zephyr`, or `Charon`. Voice names are case-sensitive. If `voice` is omitted, it defaults to `Kore`.
1952
+
1953
+ ```ts highlight="7"
1954
+ import { experimental_generateSpeech as generateSpeech } from 'ai';
1955
+ import { google } from '@ai-sdk/google';
1956
+
1957
+ const result = await generateSpeech({
1958
+ model: google.speech('gemini-2.5-flash-preview-tts'),
1959
+ text: 'Hello, world!',
1960
+ voice: 'Kore', // Gemini voice name
1961
+ });
1962
+ ```
1963
+
1964
+ By default the generated audio is returned as a playable WAV file (`result.audio.mediaType` is
1965
+ `audio/wav`). Set `outputFormat: 'pcm'` to receive the raw signed 16-bit little-endian mono PCM
1966
+ bytes instead; the sample rate is reported in `result.providerMetadata.google.sampleRate`.
1967
+
1968
+ Gemini honors natural-language style direction. The `instructions` argument is prepended to the
1969
+ spoken text, so `instructions: 'Say cheerfully'` with `text: 'Hello'` sends `Say cheerfully: Hello` as the prompt.
1970
+
1971
+ ### Multi-speaker audio
1972
+
1973
+ For multi-speaker dialogue, pass a `multiSpeakerVoiceConfig` through `providerOptions`. Each speaker
1974
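+ name must match a name used in the input text, and the Gemini TTS API supports up to two speakers. When set, it overrides the top-level `voice`, and `instructions` is ignored (a warning is added to `result.warnings`).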
1975
+
1976
+ ```ts highlight="7-22"
1977
+ import { experimental_generateSpeech as generateSpeech } from 'ai';
1978
+ import { google, type GoogleSpeechModelOptions } from '@ai-sdk/google';
1979
+
1980
+ const result = await generateSpeech({
1981
+ model: google.speech('gemini-2.5-flash-preview-tts'),
1982
+ text: 'Joe: How are you? Jane: Doing great, thanks!',
1983
+ providerOptions: {
1984
+ google: {
1985
+ multiSpeakerVoiceConfig: {
1986
+ speakerVoiceConfigs: [
1987
+ {
1988
+ speaker: 'Joe',
1989
+ voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
1990
+ },
1991
+ {
1992
+ speaker: 'Jane',
1993
+ voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Puck' } },
1994
+ },
1995
+ ],
1996
+ },
1997
+ } satisfies GoogleSpeechModelOptions,
1998
+ },
1999
+ });
2000
+ ```
2001
+
2002
+ <Note>
2003
+ Gemini TTS models do not support the `speed` or `language` options; passing
2004
+ them adds a warning to `result.warnings`. Language is detected automatically
2005
+ from the input text.
2006
+ </Note>
2007
+
2008
+ ### Model Capabilities
2009
+
2010
+ | Model | Multi-speaker | Style via instructions |
2011
+ | ------------------------------ | ------------------- | ---------------------- |
2012
+ | `gemini-2.5-flash-preview-tts` | <Check size={18} /> | <Check size={18} /> |
2013
+ | `gemini-2.5-pro-preview-tts` | <Check size={18} /> | <Check size={18} /> |
2014
+ | `gemini-3.1-flash-tts-preview` | <Check size={18} /> | <Check size={18} /> |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ai-sdk/google",
3
- "version": "4.0.0-canary.75",
3
+ "version": "4.0.0-canary.76",
4
4
  "type": "module",
5
5
  "license": "Apache-2.0",
6
6
  "sideEffects": false,
@@ -5,6 +5,7 @@ import type {
5
5
  ImageModelV4,
6
6
  LanguageModelV4,
7
7
  ProviderV4,
8
+ SpeechModelV4,
8
9
  } from '@ai-sdk/provider';
9
10
  import {
10
11
  generateId,
@@ -28,6 +29,8 @@ import { GoogleImageModel } from './google-image-model';
28
29
  import { GoogleFiles } from './google-files';
29
30
  import { GoogleVideoModel } from './google-video-model';
30
31
  import type { GoogleVideoModelId } from './google-video-settings';
32
+ import { GoogleSpeechModel } from './google-speech-model';
33
+ import type { GoogleSpeechModelId } from './google-speech-model-options';
31
34
  import {
32
35
  GoogleInteractionsLanguageModel,
33
36
  type GoogleInteractionsModelInput,
@@ -85,6 +88,16 @@ export interface GoogleProvider extends ProviderV4 {
85
88
  */
86
89
  videoModel(modelId: GoogleVideoModelId): Experimental_VideoModelV4;
87
90
 
91
+ /**
92
+ * Creates a model for speech generation (text-to-speech).
93
+ */
94
+ speech(modelId: GoogleSpeechModelId): SpeechModelV4;
95
+
96
+ /**
97
+ * Creates a model for speech generation (text-to-speech).
98
+ */
99
+ speechModel(modelId: GoogleSpeechModelId): SpeechModelV4;
100
+
88
101
  files(): FilesV4;
89
102
 
90
103
  /**
@@ -223,6 +236,14 @@ export function createGoogle(
223
236
  generateId: options.generateId ?? generateId,
224
237
  });
225
238
 
239
// Factory shared by `provider.speech` and `provider.speechModel`: builds a
// speech model that reuses the provider's base URL, header resolver and fetch.
const createSpeechModel = (modelId: GoogleSpeechModelId) => {
  const speechConfig = {
    provider: `${providerName}.speech`,
    baseURL,
    headers: getHeaders,
    fetch: options.fetch,
  };

  return new GoogleSpeechModel(modelId, speechConfig);
};
246
+
226
247
  const createInteractionsModel = (
227
248
  modelIdOrAgent:
228
249
  | GoogleInteractionsModelId
@@ -263,6 +284,8 @@ export function createGoogle(
263
284
  provider.video = createVideoModel;
264
285
  provider.videoModel = createVideoModel;
265
286
  provider.files = createFiles;
287
+ provider.speech = createSpeechModel;
288
+ provider.speechModel = createSpeechModel;
266
289
  provider.interactions = createInteractionsModel;
267
290
  provider.tools = googleTools;
268
291
 
@@ -0,0 +1,36 @@
1
+ import { lazySchema, zodSchema } from '@ai-sdk/provider-utils';
2
+ import { z } from 'zod/v4';
3
+
4
/**
 * Schema for the body returned by the Gemini `:generateContent` endpoint when
 * the request asks for `responseModalities: ['AUDIO']`. The audio arrives as
 * base64-encoded raw PCM inside the first inline-data part. Every level below
 * the root is nullable/optional.
 */
export const googleSpeechResponseSchema = lazySchema(() => {
  // Innermost payload: the base64 audio and its mime type.
  const inlineDataSchema = z.object({
    mimeType: z.string().nullish(),
    data: z.string().nullish(),
  });

  const partSchema = z.object({
    inlineData: inlineDataSchema.nullish(),
  });

  const contentSchema = z.object({
    parts: z.array(partSchema).nullish(),
  });

  const candidateSchema = z.object({
    content: contentSchema.nullish(),
  });

  return zodSchema(
    z.object({
      candidates: z.array(candidateSchema).nullish(),
    }),
  );
});
@@ -0,0 +1,48 @@
1
+ import {
2
+ lazySchema,
3
+ zodSchema,
4
+ type InferSchema,
5
+ } from '@ai-sdk/provider-utils';
6
+ import { z } from 'zod/v4';
7
+
8
/**
 * Identifier of a Gemini text-to-speech model. The listed literals are the
 * known model ids; the trailing `(string & {})` member keeps any other string
 * assignable while preserving editor suggestions for the literals.
 */
export type GoogleSpeechModelId =
  | 'gemini-2.5-flash-preview-tts'
  | 'gemini-2.5-pro-preview-tts'
  | 'gemini-3.1-flash-tts-preview'
  | (string & {});
13
+
14
// Selects a prebuilt voice by name (e.g. `Kore`).
const prebuiltVoiceConfigSchema = z.object({
  voiceName: z.string(),
});

// Voice configuration wrapper; only prebuilt voices are modelled here.
const voiceConfigSchema = z.object({
  prebuiltVoiceConfig: prebuiltVoiceConfigSchema,
});
21
+
22
/**
 * Schema for the `providerOptions.google` object accepted by the speech model.
 */
export const googleSpeechProviderOptionsSchema = lazySchema(() => {
  // One entry per speaker: the label used in the transcript plus its voice.
  const speakerVoiceConfigSchema = z.object({
    speaker: z.string(),
    voiceConfig: voiceConfigSchema,
  });

  const multiSpeakerVoiceConfigSchema = z.object({
    speakerVoiceConfigs: z.array(speakerVoiceConfigSchema),
  });

  return zodSchema(
    z.object({
      /**
       * Multi-speaker configuration for dialogue audio. When provided, this
       * overrides the top-level `voice`. The Gemini TTS API supports up to two
       * speakers; each speaker name must match a name used in the input text.
       *
       * https://ai.google.dev/gemini-api/docs/speech-generation#multi-speaker
       */
      multiSpeakerVoiceConfig: multiSpeakerVoiceConfigSchema.optional(),
    }),
  );
});
45
+
46
/**
 * Type of the `providerOptions.google` object for speech models, inferred from
 * `googleSpeechProviderOptionsSchema`.
 */
export type GoogleSpeechModelOptions = InferSchema<
  typeof googleSpeechProviderOptionsSchema
>;
@@ -0,0 +1,286 @@
1
+ import type { SpeechModelV4, SharedV4Warning } from '@ai-sdk/provider';
2
+ import {
3
+ combineHeaders,
4
+ convertBase64ToUint8Array,
5
+ createJsonResponseHandler,
6
+ parseProviderOptions,
7
+ postJsonToApi,
8
+ resolve,
9
+ serializeModelOptions,
10
+ WORKFLOW_DESERIALIZE,
11
+ WORKFLOW_SERIALIZE,
12
+ type FetchFunction,
13
+ type Resolvable,
14
+ } from '@ai-sdk/provider-utils';
15
+ import { googleFailedResponseHandler } from './google-error';
16
+ import { googleSpeechResponseSchema } from './google-speech-api';
17
+ import {
18
+ googleSpeechProviderOptionsSchema,
19
+ type GoogleSpeechModelId,
20
+ } from './google-speech-model-options';
21
+
22
/**
 * Construction-time configuration for `GoogleSpeechModel`.
 */
interface GoogleSpeechModelConfig {
  /** Provider identifier exposed through the model's `provider` getter. */
  provider: string;
  /** Base URL that `/models/{modelId}:generateContent` is appended to. */
  baseURL: string;
  /** Headers (or a resolvable producing them) merged with per-call headers. */
  headers?: Resolvable<Record<string, string | undefined>>;
  /** Optional fetch implementation forwarded to the HTTP call. */
  fetch?: FetchFunction;
  _internal?: {
    /** Overrides the clock used for the response timestamp. */
    currentDate?: () => Date;
  };
}
31
+
32
// Voice used when the caller does not pass `voice`.
const DEFAULT_VOICE = 'Kore';
// Gemini TTS returns raw PCM at 24kHz when the response does not specify a rate.
// Used as the fallback when no rate can be parsed from the response mime type.
const DEFAULT_SAMPLE_RATE = 24000;
35
+
36
/**
 * Speech (text-to-speech) model that calls the Gemini `:generateContent`
 * endpoint with `responseModalities: ['AUDIO']` and returns the first inline
 * audio part, WAV-wrapped by default or as raw PCM when `outputFormat` is
 * `'pcm'`.
 */
export class GoogleSpeechModel implements SpeechModelV4 {
  readonly specificationVersion = 'v4';

  /** Serializes the model id and config for workflow persistence. */
  static [WORKFLOW_SERIALIZE](model: GoogleSpeechModel) {
    return serializeModelOptions({
      modelId: model.modelId,
      config: model.config,
    });
  }

  /** Recreates a model from previously serialized id and config. */
  static [WORKFLOW_DESERIALIZE](options: {
    modelId: GoogleSpeechModelId;
    config: GoogleSpeechModelConfig;
  }) {
    return new GoogleSpeechModel(options.modelId, options.config);
  }

  /** Provider identifier taken from the config. */
  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: GoogleSpeechModelId,
    private readonly config: GoogleSpeechModelConfig,
  ) {}

  /**
   * Translates the generic speech call options into a Gemini request body.
   *
   * @returns the request body, the warnings collected for unsupported or
   * ignored options, and the resolved output format (`'wav'` or `'pcm'`).
   */
  private async getArgs({
    text,
    voice = DEFAULT_VOICE,
    outputFormat,
    instructions,
    speed,
    language,
    providerOptions,
  }: Parameters<SpeechModelV4['doGenerate']>[0]) {
    const warnings: SharedV4Warning[] = [];

    const googleOptions = await parseProviderOptions({
      provider: 'google',
      providerOptions,
      schema: googleSpeechProviderOptionsSchema,
    });

    // Multi-speaker (provider option) takes precedence over the single voice.
    const multiSpeakerVoiceConfig = googleOptions?.multiSpeakerVoiceConfig;
    const speechConfig = multiSpeakerVoiceConfig
      ? { multiSpeakerVoiceConfig }
      : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };

    // Gemini honors natural-language style direction expressed in the prompt
    // text, so map `instructions` onto the spoken content. With multi-speaker
    // the transcript starts with speaker labels (e.g. `Joe: ...`), so prepending
    // instructions would corrupt that parsing — ignore them there (with a warning).
    // Blank instructions carry no direction; prepending them would only add a
    // stray ": " prefix to the prompt, so they are treated as absent.
    let promptText = text;
    if (instructions != null && instructions.trim() !== '') {
      if (multiSpeakerVoiceConfig) {
        warnings.push({
          type: 'unsupported',
          feature: 'instructions',
          details:
            'Google Gemini TTS ignores `instructions` when `multiSpeakerVoiceConfig` is set, ' +
            'because prepending them would break multi-speaker transcript parsing.',
        });
      } else {
        promptText = `${instructions}: ${text}`;
      }
    }

    if (speed != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'speed',
        details:
          'Google Gemini TTS models do not support the `speed` option. It was ignored.',
      });
    }

    if (language != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'language',
        details:
          'Google Gemini TTS models do not support the `language` option. ' +
          'Language is detected automatically from the input text.',
      });
    }

    // Only `wav` (default, WAV-wrapped) and `pcm` (raw) are supported.
    let resolvedOutputFormat: 'wav' | 'pcm' = 'wav';
    if (outputFormat === 'pcm') {
      resolvedOutputFormat = 'pcm';
    } else if (outputFormat != null && outputFormat !== 'wav') {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details: `Unsupported output format: ${outputFormat}. Using wav instead.`,
      });
    }

    const requestBody = {
      contents: [{ parts: [{ text: promptText }] }],
      generationConfig: {
        responseModalities: ['AUDIO'],
        speechConfig,
      },
    };

    return { requestBody, warnings, outputFormat: resolvedOutputFormat };
  }

  /**
   * Sends the speech request and returns the generated audio together with
   * warnings, request/response details and provider metadata (sample rate and
   * mime type of the returned inline audio part).
   */
  async doGenerate(
    options: Parameters<SpeechModelV4['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<SpeechModelV4['doGenerate']>>> {
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { requestBody, warnings, outputFormat } = await this.getArgs(options);

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      url: `${this.config.baseURL}/models/${this.modelId}:generateContent`,
      headers: combineHeaders(
        this.config.headers ? await resolve(this.config.headers) : undefined,
        options.headers,
      ),
      body: requestBody,
      failedResponseHandler: googleFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        googleSpeechResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    // `generateSpeech` returns a single audio result, and Gemini returns one
    // inline audio part per request, so take the first inline-data part.
    let base64Audio: string | undefined;
    let mimeType: string | undefined;
    for (const candidate of response.candidates ?? []) {
      for (const part of candidate.content?.parts ?? []) {
        if (part.inlineData?.data) {
          base64Audio = part.inlineData.data;
          mimeType = part.inlineData.mimeType ?? undefined;
          break;
        }
      }
      if (base64Audio != null) {
        break;
      }
    }

    const sampleRate = parseSampleRate(mimeType) ?? DEFAULT_SAMPLE_RATE;
    const pcm =
      base64Audio != null
        ? convertBase64ToUint8Array(base64Audio)
        : new Uint8Array(0);

    // Gemini returns headerless raw PCM (e.g. `audio/L16;rate=24000`). Unlike
    // providers that return a container format (mp3/opus/wav) directly,
    // `generateSpeech`'s `detectMediaType` can't identify raw PCM and would
    // mislabel it `audio/mp3` (not playable), so wrap it in a minimal WAV header
    // by default; `outputFormat: 'pcm'` returns the raw bytes untouched.
    // Empty audio is returned as-is so the core layer throws NoSpeechGeneratedError.
    const audio =
      outputFormat === 'pcm' || pcm.length === 0
        ? pcm
        : addWavHeader(pcm, sampleRate);

    if (outputFormat === 'pcm' && pcm.length > 0) {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details:
          `Returning raw PCM audio (signed 16-bit little-endian, mono, ${sampleRate} Hz). ` +
          'These bytes have no container header and are not directly playable; ' +
          'see providerMetadata.google for the sample rate and mime type.',
      });
    }

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody),
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
      providerMetadata: {
        google: {
          sampleRate,
          mimeType: mimeType ?? null,
        },
      },
    };
  }
}
237
+
238
/**
 * Parses the sample rate from a PCM mime type such as `audio/L16;rate=24000`.
 *
 * The `rate` parameter name is matched case-insensitively and only as a
 * standalone parameter, so e.g. `bitrate=128000` is not mistaken for it.
 * Optional whitespace around `=` and an opening quote before the value are
 * tolerated.
 *
 * @param mimeType - Mime type reported for the inline audio part, if any.
 * @returns The positive integer sample rate in Hz, or `undefined` when the
 * mime type is missing, has no `rate` parameter, or the rate is not a
 * positive safe integer (callers then fall back to their default rate).
 */
function parseSampleRate(mimeType: string | undefined): number | undefined {
  if (mimeType == null) {
    return undefined;
  }

  const match = /(?:^|[;\s])rate\s*=\s*"?(\d+)/i.exec(mimeType);
  const digits = match?.[1];
  if (digits == null) {
    return undefined;
  }

  const rate = Number.parseInt(digits, 10);
  // A zero or overflowing rate cannot describe playable audio; treat it as
  // absent so the caller's `??` fallback applies.
  return Number.isSafeInteger(rate) && rate > 0 ? rate : undefined;
}
248
+
249
/**
 * Prefixes raw signed 16-bit little-endian mono PCM with a minimal 44-byte
 * RIFF/WAVE header so the result is playable and detectable as `audio/wav`.
 *
 * @param pcm - Raw PCM sample bytes; copied unchanged after the header.
 * @param sampleRate - Sample rate in Hz written into the `fmt ` chunk.
 * @returns A new array of `44 + pcm.length` bytes.
 */
function addWavHeader(pcm: Uint8Array, sampleRate: number): Uint8Array {
  const headerBytes = 44;
  const channelCount = 1;
  const bitDepth = 16;
  const frameBytes = (channelCount * bitDepth) / 8;
  const payloadBytes = pcm.length;

  const wav = new Uint8Array(headerBytes + payloadBytes);
  const header = new DataView(wav.buffer);

  // Small little-endian writers keep the field layout below readable.
  const putTag = (at: number, tag: string): void => {
    for (let k = 0; k < tag.length; k++) {
      header.setUint8(at + k, tag.charCodeAt(k));
    }
  };
  const putU16 = (at: number, value: number): void => {
    header.setUint16(at, value, true);
  };
  const putU32 = (at: number, value: number): void => {
    header.setUint32(at, value, true);
  };

  // RIFF chunk descriptor.
  putTag(0, 'RIFF');
  putU32(4, 36 + payloadBytes);
  putTag(8, 'WAVE');

  // `fmt ` sub-chunk.
  putTag(12, 'fmt ');
  putU32(16, 16); // PCM fmt chunk size
  putU16(20, 1); // audio format = PCM
  putU16(22, channelCount);
  putU32(24, sampleRate);
  putU32(28, sampleRate * frameBytes); // byte rate
  putU16(32, frameBytes); // block align
  putU16(34, bitDepth);

  // `data` sub-chunk followed by the samples.
  putTag(36, 'data');
  putU32(40, payloadBytes);
  wav.set(pcm, headerBytes);

  return wav;
}
281
+
282
/**
 * Writes each UTF-16 code unit of `text` as one byte into `view`, starting at
 * `offset`. Intended for short ASCII tags such as `RIFF`.
 */
function writeAscii(view: DataView, offset: number, text: string): void {
  text.split('').forEach((unit, position) => {
    view.setUint8(offset + position, unit.charCodeAt(0));
  });
}
package/src/index.ts CHANGED
@@ -29,6 +29,10 @@ export type {
29
29
  /** @deprecated Use `GoogleVideoModelId` instead. */
30
30
  GoogleVideoModelId as GoogleGenerativeAIVideoModelId,
31
31
  } from './google-video-settings';
32
+ export type {
33
+ GoogleSpeechModelOptions,
34
+ GoogleSpeechModelId,
35
+ } from './google-speech-model-options';
32
36
  export type { GoogleFilesUploadOptions } from './google-files';
33
37
  export type {
34
38
  GoogleLanguageModelInteractionsOptions,