@ai-sdk/google 4.0.0-canary.74 → 4.0.0-canary.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1774,6 +1774,7 @@ The following optional provider options are available for Google embedding model
1774
1774
  | Model | Default Dimensions | Custom Dimensions | Multimodal |
1775
1775
  | ---------------------------- | ------------------ | ------------------- | ------------------- |
1776
1776
  | `gemini-embedding-001` | 3072 | <Check size={18} /> | <Cross size={18} /> |
1777
+ | `gemini-embedding-2` | 3072 | <Check size={18} /> | <Check size={18} /> |
1777
1778
  | `gemini-embedding-2-preview` | 3072 | <Check size={18} /> | <Check size={18} /> |
1778
1779
 
1779
1780
  ## Image Models
@@ -1934,3 +1935,80 @@ console.log(result.providerMetadata?.google?.groundingMetadata);
1934
1935
  2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
1935
1936
  grounding.
1936
1937
  </Note>
1938
+
1939
+ ## Speech Models
1940
+
1941
+ You can create models that call the [Gemini text-to-speech API](https://ai.google.dev/gemini-api/docs/speech-generation)
1942
+ using the `.speech()` factory method.
1943
+
1944
+ The first argument is the model id, e.g. `gemini-2.5-flash-preview-tts`.
1945
+
1946
+ ```ts
1947
+ const model = google.speech('gemini-2.5-flash-preview-tts');
1948
+ ```
1949
+
1950
+ The `voice` argument can be set to one of Gemini's [30 prebuilt voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
1951
+ (e.g. `Kore`, `Puck`, `Zephyr`, or `Charon`). Voice names are case-sensitive. When omitted, `voice` defaults to `Kore`.
1952
+
1953
+ ```ts highlight="6"
1954
+ import { experimental_generateSpeech as generateSpeech } from 'ai';
1955
+ import { google } from '@ai-sdk/google';
1956
+
1957
+ const result = await generateSpeech({
1958
+ model: google.speech('gemini-2.5-flash-preview-tts'),
1959
+ text: 'Hello, world!',
1960
+ voice: 'Kore', // Gemini voice name
1961
+ });
1962
+ ```
1963
+
1964
+ By default the generated audio is returned as a playable WAV file (`result.audio.mediaType` is
1965
+ `audio/wav`). Set `outputFormat: 'pcm'` to receive the raw signed 16-bit little-endian mono PCM
1966
+ bytes instead; the sample rate is reported in `result.providerMetadata.google.sampleRate`.
1967
+
1968
+ Gemini honors natural-language style direction. The `instructions` argument is prepended to the
1969
+ input text, so `instructions: 'Say cheerfully'` with `text: 'Hello'` sends the prompt `Say cheerfully: Hello`.
1970
+
1971
+ ### Multi-speaker audio
1972
+
1973
+ For multi-speaker dialogue, pass a `multiSpeakerVoiceConfig` through `providerOptions`. Each speaker
1974
+ name must match a name used in the input text. When set, it overrides the top-level `voice`, and `instructions` is ignored (a warning is added to `result.warnings`).
1975
+
1976
+ ```ts highlight="8-23"
1977
+ import { experimental_generateSpeech as generateSpeech } from 'ai';
1978
+ import { google, type GoogleSpeechModelOptions } from '@ai-sdk/google';
1979
+
1980
+ const result = await generateSpeech({
1981
+ model: google.speech('gemini-2.5-flash-preview-tts'),
1982
+ text: 'Joe: How are you? Jane: Doing great, thanks!',
1983
+ providerOptions: {
1984
+ google: {
1985
+ multiSpeakerVoiceConfig: {
1986
+ speakerVoiceConfigs: [
1987
+ {
1988
+ speaker: 'Joe',
1989
+ voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
1990
+ },
1991
+ {
1992
+ speaker: 'Jane',
1993
+ voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Puck' } },
1994
+ },
1995
+ ],
1996
+ },
1997
+ } satisfies GoogleSpeechModelOptions,
1998
+ },
1999
+ });
2000
+ ```
2001
+
2002
+ <Note>
2003
+ Gemini TTS models do not support the `speed` or `language` options; passing
2004
+ them adds a warning to `result.warnings`. Language is detected automatically
2005
+ from the input text.
2006
+ </Note>
2007
+
2008
+ ### Model Capabilities
2009
+
2010
+ | Model | Multi-speaker | Style via instructions |
2011
+ | ------------------------------ | ------------------- | ---------------------- |
2012
+ | `gemini-2.5-flash-preview-tts` | <Check size={18} /> | <Check size={18} /> |
2013
+ | `gemini-2.5-pro-preview-tts` | <Check size={18} /> | <Check size={18} /> |
2014
+ | `gemini-3.1-flash-tts-preview` | <Check size={18} /> | <Check size={18} /> |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ai-sdk/google",
3
- "version": "4.0.0-canary.74",
3
+ "version": "4.0.0-canary.76",
4
4
  "type": "module",
5
5
  "license": "Apache-2.0",
6
6
  "sideEffects": false,
@@ -7,6 +7,7 @@ import { z } from 'zod/v4';
7
7
 
8
8
  export type GoogleEmbeddingModelId =
9
9
  | 'gemini-embedding-001'
10
+ | 'gemini-embedding-2'
10
11
  | 'gemini-embedding-2-preview'
11
12
  | (string & {});
12
13
 
@@ -196,7 +196,7 @@ export class GoogleJSONAccumulator {
196
196
  const startIdx = this.pathStack.length - 1;
197
197
 
198
198
  for (let i = startIdx; i < targetContainer.length; i++) {
199
- const seg = targetContainer[i];
199
+ const pathSegment = targetContainer[i];
200
200
  const parentEntry = this.pathStack[this.pathStack.length - 1];
201
201
 
202
202
  if (parentEntry.childCount > 0) {
@@ -204,8 +204,8 @@ export class GoogleJSONAccumulator {
204
204
  }
205
205
  parentEntry.childCount++;
206
206
 
207
- if (typeof seg === 'string') {
208
- fragment += `${JSON.stringify(seg)}:`;
207
+ if (typeof pathSegment === 'string') {
208
+ fragment += `${JSON.stringify(pathSegment)}:`;
209
209
  }
210
210
 
211
211
  const childSeg =
@@ -214,7 +214,7 @@ export class GoogleJSONAccumulator {
214
214
 
215
215
  fragment += isArray ? '[' : '{';
216
216
 
217
- this.pathStack.push({ segment: seg, isArray, childCount: 0 });
217
+ this.pathStack.push({ segment: pathSegment, isArray, childCount: 0 });
218
218
  }
219
219
 
220
220
  return fragment;
@@ -287,9 +287,9 @@ function getNestedValue(
287
287
  segments: Array<string | number>,
288
288
  ): unknown {
289
289
  let current: unknown = obj;
290
- for (const seg of segments) {
290
+ for (const pathSegment of segments) {
291
291
  if (current == null || typeof current !== 'object') return undefined;
292
- current = (current as Record<string | number, unknown>)[seg];
292
+ current = (current as Record<string | number, unknown>)[pathSegment];
293
293
  }
294
294
  return current;
295
295
  }
@@ -307,12 +307,12 @@ function setNestedValue(
307
307
  ): void {
308
308
  let current: Record<string | number, unknown> = obj;
309
309
  for (let i = 0; i < segments.length - 1; i++) {
310
- const seg = segments[i];
310
+ const pathSegment = segments[i];
311
311
  const nextSeg = segments[i + 1];
312
- if (current[seg] == null) {
313
- current[seg] = typeof nextSeg === 'number' ? [] : {};
312
+ if (current[pathSegment] == null) {
313
+ current[pathSegment] = typeof nextSeg === 'number' ? [] : {};
314
314
  }
315
- current = current[seg] as Record<string | number, unknown>;
315
+ current = current[pathSegment] as Record<string | number, unknown>;
316
316
  }
317
317
  current[segments[segments.length - 1]] = value;
318
318
  }
@@ -37,6 +37,8 @@ export type GoogleModelId =
37
37
  | 'gemini-flash-latest'
38
38
  | 'gemini-flash-lite-latest'
39
39
  | 'deep-research-pro-preview-12-2025'
40
+ | 'deep-research-max-preview-04-2026'
41
+ | 'deep-research-preview-04-2026'
40
42
  | 'nano-banana-pro-preview'
41
43
  | 'aqa'
42
44
  // Experimental models
@@ -5,6 +5,7 @@ import type {
5
5
  ImageModelV4,
6
6
  LanguageModelV4,
7
7
  ProviderV4,
8
+ SpeechModelV4,
8
9
  } from '@ai-sdk/provider';
9
10
  import {
10
11
  generateId,
@@ -28,6 +29,8 @@ import { GoogleImageModel } from './google-image-model';
28
29
  import { GoogleFiles } from './google-files';
29
30
  import { GoogleVideoModel } from './google-video-model';
30
31
  import type { GoogleVideoModelId } from './google-video-settings';
32
+ import { GoogleSpeechModel } from './google-speech-model';
33
+ import type { GoogleSpeechModelId } from './google-speech-model-options';
31
34
  import {
32
35
  GoogleInteractionsLanguageModel,
33
36
  type GoogleInteractionsModelInput,
@@ -85,6 +88,16 @@ export interface GoogleProvider extends ProviderV4 {
85
88
  */
86
89
  videoModel(modelId: GoogleVideoModelId): Experimental_VideoModelV4;
87
90
 
91
+ /**
92
+ * Creates a model for speech generation (text-to-speech).
93
+ */
94
+ speech(modelId: GoogleSpeechModelId): SpeechModelV4;
95
+
96
+ /**
97
+ * Creates a model for speech generation (text-to-speech).
98
+ */
99
+ speechModel(modelId: GoogleSpeechModelId): SpeechModelV4;
100
+
88
101
  files(): FilesV4;
89
102
 
90
103
  /**
@@ -223,6 +236,14 @@ export function createGoogle(
223
236
  generateId: options.generateId ?? generateId,
224
237
  });
225
238
 
239
+ const createSpeechModel = (modelId: GoogleSpeechModelId) =>
240
+ new GoogleSpeechModel(modelId, {
241
+ provider: `${providerName}.speech`,
242
+ baseURL,
243
+ headers: getHeaders,
244
+ fetch: options.fetch,
245
+ });
246
+
226
247
  const createInteractionsModel = (
227
248
  modelIdOrAgent:
228
249
  | GoogleInteractionsModelId
@@ -263,6 +284,8 @@ export function createGoogle(
263
284
  provider.video = createVideoModel;
264
285
  provider.videoModel = createVideoModel;
265
286
  provider.files = createFiles;
287
+ provider.speech = createSpeechModel;
288
+ provider.speechModel = createSpeechModel;
266
289
  provider.interactions = createInteractionsModel;
267
290
  provider.tools = googleTools;
268
291
 
@@ -0,0 +1,36 @@
1
+ import { lazySchema, zodSchema } from '@ai-sdk/provider-utils';
2
+ import { z } from 'zod/v4';
3
+
4
+ /**
5
+ * Response schema for the Gemini `:generateContent` endpoint when called with
6
+ * `responseModalities: ['AUDIO']`. The generated audio is returned as base64
7
+ * encoded raw PCM in the first inline-data part.
8
+ */
9
+ export const googleSpeechResponseSchema = lazySchema(() =>
10
+ zodSchema(
11
+ z.object({
12
+ candidates: z
13
+ .array(
14
+ z.object({
15
+ content: z
16
+ .object({
17
+ parts: z
18
+ .array(
19
+ z.object({
20
+ inlineData: z
21
+ .object({
22
+ mimeType: z.string().nullish(),
23
+ data: z.string().nullish(),
24
+ })
25
+ .nullish(),
26
+ }),
27
+ )
28
+ .nullish(),
29
+ })
30
+ .nullish(),
31
+ }),
32
+ )
33
+ .nullish(),
34
+ }),
35
+ ),
36
+ );
@@ -0,0 +1,48 @@
1
+ import {
2
+ lazySchema,
3
+ zodSchema,
4
+ type InferSchema,
5
+ } from '@ai-sdk/provider-utils';
6
+ import { z } from 'zod/v4';
7
+
8
+ export type GoogleSpeechModelId =
9
+ | 'gemini-2.5-flash-preview-tts'
10
+ | 'gemini-2.5-pro-preview-tts'
11
+ | 'gemini-3.1-flash-tts-preview'
12
+ | (string & {});
13
+
14
+ const prebuiltVoiceConfigSchema = z.object({
15
+ voiceName: z.string(),
16
+ });
17
+
18
+ const voiceConfigSchema = z.object({
19
+ prebuiltVoiceConfig: prebuiltVoiceConfigSchema,
20
+ });
21
+
22
+ export const googleSpeechProviderOptionsSchema = lazySchema(() =>
23
+ zodSchema(
24
+ z.object({
25
+ /**
26
+ * Multi-speaker configuration for dialogue audio. When provided, this
27
+ * overrides the top-level `voice`. The Gemini TTS API supports up to two
28
+ * speakers; each speaker name must match a name used in the input text.
29
+ *
30
+ * https://ai.google.dev/gemini-api/docs/speech-generation#multi-speaker
31
+ */
32
+ multiSpeakerVoiceConfig: z
33
+ .object({
34
+ speakerVoiceConfigs: z.array(
35
+ z.object({
36
+ speaker: z.string(),
37
+ voiceConfig: voiceConfigSchema,
38
+ }),
39
+ ),
40
+ })
41
+ .optional(),
42
+ }),
43
+ ),
44
+ );
45
+
46
+ export type GoogleSpeechModelOptions = InferSchema<
47
+ typeof googleSpeechProviderOptionsSchema
48
+ >;
@@ -0,0 +1,286 @@
1
+ import type { SpeechModelV4, SharedV4Warning } from '@ai-sdk/provider';
2
+ import {
3
+ combineHeaders,
4
+ convertBase64ToUint8Array,
5
+ createJsonResponseHandler,
6
+ parseProviderOptions,
7
+ postJsonToApi,
8
+ resolve,
9
+ serializeModelOptions,
10
+ WORKFLOW_DESERIALIZE,
11
+ WORKFLOW_SERIALIZE,
12
+ type FetchFunction,
13
+ type Resolvable,
14
+ } from '@ai-sdk/provider-utils';
15
+ import { googleFailedResponseHandler } from './google-error';
16
+ import { googleSpeechResponseSchema } from './google-speech-api';
17
+ import {
18
+ googleSpeechProviderOptionsSchema,
19
+ type GoogleSpeechModelId,
20
+ } from './google-speech-model-options';
21
+
22
+ interface GoogleSpeechModelConfig {
23
+ provider: string;
24
+ baseURL: string;
25
+ headers?: Resolvable<Record<string, string | undefined>>;
26
+ fetch?: FetchFunction;
27
+ _internal?: {
28
+ currentDate?: () => Date;
29
+ };
30
+ }
31
+
32
+ const DEFAULT_VOICE = 'Kore';
33
+ // Gemini TTS returns raw PCM at 24kHz when the response does not specify a rate.
34
+ const DEFAULT_SAMPLE_RATE = 24000;
35
+
36
+ export class GoogleSpeechModel implements SpeechModelV4 {
37
+ readonly specificationVersion = 'v4';
38
+
39
+ static [WORKFLOW_SERIALIZE](model: GoogleSpeechModel) {
40
+ return serializeModelOptions({
41
+ modelId: model.modelId,
42
+ config: model.config,
43
+ });
44
+ }
45
+
46
+ static [WORKFLOW_DESERIALIZE](options: {
47
+ modelId: GoogleSpeechModelId;
48
+ config: GoogleSpeechModelConfig;
49
+ }) {
50
+ return new GoogleSpeechModel(options.modelId, options.config);
51
+ }
52
+
53
+ get provider(): string {
54
+ return this.config.provider;
55
+ }
56
+
57
+ constructor(
58
+ readonly modelId: GoogleSpeechModelId,
59
+ private readonly config: GoogleSpeechModelConfig,
60
+ ) {}
61
+
62
+ private async getArgs({
63
+ text,
64
+ voice = DEFAULT_VOICE,
65
+ outputFormat,
66
+ instructions,
67
+ speed,
68
+ language,
69
+ providerOptions,
70
+ }: Parameters<SpeechModelV4['doGenerate']>[0]) {
71
+ const warnings: SharedV4Warning[] = [];
72
+
73
+ const googleOptions = await parseProviderOptions({
74
+ provider: 'google',
75
+ providerOptions,
76
+ schema: googleSpeechProviderOptionsSchema,
77
+ });
78
+
79
+ // Multi-speaker (provider option) takes precedence over the single voice.
80
+ const multiSpeakerVoiceConfig = googleOptions?.multiSpeakerVoiceConfig;
81
+ const speechConfig = multiSpeakerVoiceConfig
82
+ ? { multiSpeakerVoiceConfig }
83
+ : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };
84
+
85
+ // Gemini honors natural-language style direction expressed in the prompt
86
+ // text, so map `instructions` onto the spoken content. With multi-speaker
87
+ // the transcript starts with speaker labels (e.g. `Joe: ...`), so prepending
88
+ // instructions would corrupt that parsing — ignore them there (with a warning).
89
+ let promptText = text;
90
+ if (instructions != null) {
91
+ if (multiSpeakerVoiceConfig) {
92
+ warnings.push({
93
+ type: 'unsupported',
94
+ feature: 'instructions',
95
+ details:
96
+ 'Google Gemini TTS ignores `instructions` when `multiSpeakerVoiceConfig` is set, ' +
97
+ 'because prepending them would break multi-speaker transcript parsing.',
98
+ });
99
+ } else {
100
+ promptText = `${instructions}: ${text}`;
101
+ }
102
+ }
103
+
104
+ if (speed != null) {
105
+ warnings.push({
106
+ type: 'unsupported',
107
+ feature: 'speed',
108
+ details:
109
+ 'Google Gemini TTS models do not support the `speed` option. It was ignored.',
110
+ });
111
+ }
112
+
113
+ if (language != null) {
114
+ warnings.push({
115
+ type: 'unsupported',
116
+ feature: 'language',
117
+ details:
118
+ 'Google Gemini TTS models do not support the `language` option. ' +
119
+ 'Language is detected automatically from the input text.',
120
+ });
121
+ }
122
+
123
+ // Only `wav` (default, WAV-wrapped) and `pcm` (raw) are supported.
124
+ let resolvedOutputFormat: 'wav' | 'pcm' = 'wav';
125
+ if (outputFormat === 'pcm') {
126
+ resolvedOutputFormat = 'pcm';
127
+ } else if (outputFormat != null && outputFormat !== 'wav') {
128
+ warnings.push({
129
+ type: 'unsupported',
130
+ feature: 'outputFormat',
131
+ details: `Unsupported output format: ${outputFormat}. Using wav instead.`,
132
+ });
133
+ }
134
+
135
+ const requestBody = {
136
+ contents: [{ parts: [{ text: promptText }] }],
137
+ generationConfig: {
138
+ responseModalities: ['AUDIO'],
139
+ speechConfig,
140
+ },
141
+ };
142
+
143
+ return { requestBody, warnings, outputFormat: resolvedOutputFormat };
144
+ }
145
+
146
+ async doGenerate(
147
+ options: Parameters<SpeechModelV4['doGenerate']>[0],
148
+ ): Promise<Awaited<ReturnType<SpeechModelV4['doGenerate']>>> {
149
+ const currentDate = this.config._internal?.currentDate?.() ?? new Date();
150
+ const { requestBody, warnings, outputFormat } = await this.getArgs(options);
151
+
152
+ const {
153
+ value: response,
154
+ responseHeaders,
155
+ rawValue: rawResponse,
156
+ } = await postJsonToApi({
157
+ url: `${this.config.baseURL}/models/${this.modelId}:generateContent`,
158
+ headers: combineHeaders(
159
+ this.config.headers ? await resolve(this.config.headers) : undefined,
160
+ options.headers,
161
+ ),
162
+ body: requestBody,
163
+ failedResponseHandler: googleFailedResponseHandler,
164
+ successfulResponseHandler: createJsonResponseHandler(
165
+ googleSpeechResponseSchema,
166
+ ),
167
+ abortSignal: options.abortSignal,
168
+ fetch: this.config.fetch,
169
+ });
170
+
171
+ // `generateSpeech` returns a single audio result, and Gemini returns one
172
+ // inline audio part per request, so take the first inline-data part.
173
+ let base64Audio: string | undefined;
174
+ let mimeType: string | undefined;
175
+ for (const candidate of response.candidates ?? []) {
176
+ for (const part of candidate.content?.parts ?? []) {
177
+ if (part.inlineData?.data) {
178
+ base64Audio = part.inlineData.data;
179
+ mimeType = part.inlineData.mimeType ?? undefined;
180
+ break;
181
+ }
182
+ }
183
+ if (base64Audio != null) {
184
+ break;
185
+ }
186
+ }
187
+
188
+ const sampleRate = parseSampleRate(mimeType) ?? DEFAULT_SAMPLE_RATE;
189
+ const pcm =
190
+ base64Audio != null
191
+ ? convertBase64ToUint8Array(base64Audio)
192
+ : new Uint8Array(0);
193
+
194
+ // Gemini returns headerless raw PCM (e.g. `audio/L16;rate=24000`). Unlike
195
+ // providers that return a container format (mp3/opus/wav) directly,
196
+ // `generateSpeech`'s `detectMediaType` can't identify raw PCM and would
197
+ // mislabel it `audio/mp3` (not playable), so wrap it in a minimal WAV header
198
+ // by default; `outputFormat: 'pcm'` returns the raw bytes untouched.
199
+ // Empty audio is returned as-is so the core layer throws NoSpeechGeneratedError.
200
+ const audio =
201
+ outputFormat === 'pcm' || pcm.length === 0
202
+ ? pcm
203
+ : addWavHeader(pcm, sampleRate);
204
+
205
+ if (outputFormat === 'pcm' && pcm.length > 0) {
206
+ warnings.push({
207
+ type: 'unsupported',
208
+ feature: 'outputFormat',
209
+ details:
210
+ `Returning raw PCM audio (signed 16-bit little-endian, mono, ${sampleRate} Hz). ` +
211
+ 'These bytes have no container header and are not directly playable; ' +
212
+ 'see providerMetadata.google for the sample rate and mime type.',
213
+ });
214
+ }
215
+
216
+ return {
217
+ audio,
218
+ warnings,
219
+ request: {
220
+ body: JSON.stringify(requestBody),
221
+ },
222
+ response: {
223
+ timestamp: currentDate,
224
+ modelId: this.modelId,
225
+ headers: responseHeaders,
226
+ body: rawResponse,
227
+ },
228
+ providerMetadata: {
229
+ google: {
230
+ sampleRate,
231
+ mimeType: mimeType ?? null,
232
+ },
233
+ },
234
+ };
235
+ }
236
+ }
237
+
238
+ /**
239
+ * Parses the sample rate from a PCM mime type such as `audio/L16;rate=24000`.
240
+ */
241
+ function parseSampleRate(mimeType: string | undefined): number | undefined {
242
+ if (mimeType == null) {
243
+ return undefined;
244
+ }
245
+ const match = /rate=(\d+)/.exec(mimeType);
246
+ return match ? Number.parseInt(match[1], 10) : undefined;
247
+ }
248
+
249
+ /**
250
+ * Wraps raw signed 16-bit little-endian mono PCM in a minimal 44-byte WAV
251
+ * (RIFF/WAVE) container so the output is playable and detectable as `audio/wav`.
252
+ */
253
+ function addWavHeader(pcm: Uint8Array, sampleRate: number): Uint8Array {
254
+ const numChannels = 1;
255
+ const bitsPerSample = 16;
256
+ const blockAlign = (numChannels * bitsPerSample) / 8;
257
+ const byteRate = sampleRate * blockAlign;
258
+ const dataSize = pcm.length;
259
+
260
+ const buffer = new ArrayBuffer(44 + dataSize);
261
+ const view = new DataView(buffer);
262
+
263
+ writeAscii(view, 0, 'RIFF');
264
+ view.setUint32(4, 36 + dataSize, true);
265
+ writeAscii(view, 8, 'WAVE');
266
+ writeAscii(view, 12, 'fmt ');
267
+ view.setUint32(16, 16, true); // PCM fmt chunk size
268
+ view.setUint16(20, 1, true); // audio format = PCM
269
+ view.setUint16(22, numChannels, true);
270
+ view.setUint32(24, sampleRate, true);
271
+ view.setUint32(28, byteRate, true);
272
+ view.setUint16(32, blockAlign, true);
273
+ view.setUint16(34, bitsPerSample, true);
274
+ writeAscii(view, 36, 'data');
275
+ view.setUint32(40, dataSize, true);
276
+
277
+ const out = new Uint8Array(buffer);
278
+ out.set(pcm, 44);
279
+ return out;
280
+ }
281
+
282
+ function writeAscii(view: DataView, offset: number, text: string): void {
283
+ for (let i = 0; i < text.length; i++) {
284
+ view.setUint8(offset + i, text.charCodeAt(i));
285
+ }
286
+ }
package/src/index.ts CHANGED
@@ -29,6 +29,10 @@ export type {
29
29
  /** @deprecated Use `GoogleVideoModelId` instead. */
30
30
  GoogleVideoModelId as GoogleGenerativeAIVideoModelId,
31
31
  } from './google-video-settings';
32
+ export type {
33
+ GoogleSpeechModelOptions,
34
+ GoogleSpeechModelId,
35
+ } from './google-speech-model-options';
32
36
  export type { GoogleFilesUploadOptions } from './google-files';
33
37
  export type {
34
38
  GoogleLanguageModelInteractionsOptions,
@@ -446,25 +446,25 @@ export function buildGoogleInteractionsStreamTransform({
446
446
  open.kind === 'text' ||
447
447
  open.kind === 'image')
448
448
  ) {
449
- const img = event.delta as
449
+ const imageDelta = event.delta as
450
450
  | { data?: string; mime_type?: string; uri?: string }
451
451
  | undefined;
452
452
  const google: Record<string, string> = {};
453
453
  if (interactionId != null) google.interactionId = interactionId;
454
454
  const providerMetadata =
455
455
  Object.keys(google).length > 0 ? { google } : undefined;
456
- if (img?.data != null && img.data.length > 0) {
456
+ if (imageDelta?.data != null && imageDelta.data.length > 0) {
457
457
  controller.enqueue({
458
458
  type: 'file',
459
- mediaType: img.mime_type ?? 'image/png',
460
- data: { type: 'data', data: img.data },
459
+ mediaType: imageDelta.mime_type ?? 'image/png',
460
+ data: { type: 'data', data: imageDelta.data },
461
461
  ...(providerMetadata ? { providerMetadata } : {}),
462
462
  });
463
- } else if (img?.uri != null && img.uri.length > 0) {
463
+ } else if (imageDelta?.uri != null && imageDelta.uri.length > 0) {
464
464
  controller.enqueue({
465
465
  type: 'file',
466
- mediaType: img.mime_type ?? 'image/png',
467
- data: { type: 'url', url: new URL(img.uri) },
466
+ mediaType: imageDelta.mime_type ?? 'image/png',
467
+ data: { type: 'url', url: new URL(imageDelta.uri) },
468
468
  ...(providerMetadata ? { providerMetadata } : {}),
469
469
  });
470
470
  }