@ai-sdk/google 4.0.0-canary.74 → 4.0.0-canary.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/index.d.ts +27 -4
- package/dist/index.js +638 -366
- package/dist/index.js.map +1 -1
- package/dist/internal/index.d.ts +1 -1
- package/dist/internal/index.js +10 -10
- package/dist/internal/index.js.map +1 -1
- package/docs/15-google.mdx +78 -0
- package/package.json +1 -1
- package/src/google-embedding-model-options.ts +1 -0
- package/src/google-json-accumulator.ts +10 -10
- package/src/google-language-model-options.ts +2 -0
- package/src/google-provider.ts +23 -0
- package/src/google-speech-api.ts +36 -0
- package/src/google-speech-model-options.ts +48 -0
- package/src/google-speech-model.ts +286 -0
- package/src/index.ts +4 -0
- package/src/interactions/build-google-interactions-stream-transform.ts +7 -7
- package/src/interactions/extract-google-interactions-sources.ts +20 -13
- package/src/interactions/google-interactions-language-model.ts +50 -46
- package/src/interactions/stream-google-interactions.ts +8 -5
package/docs/15-google.mdx
CHANGED
|
@@ -1774,6 +1774,7 @@ The following optional provider options are available for Google embedding model
|
|
|
1774
1774
|
| Model | Default Dimensions | Custom Dimensions | Multimodal |
|
|
1775
1775
|
| ---------------------------- | ------------------ | ------------------- | ------------------- |
|
|
1776
1776
|
| `gemini-embedding-001` | 3072 | <Check size={18} /> | <Cross size={18} /> |
|
|
1777
|
+
| `gemini-embedding-2` | 3072 | <Check size={18} /> | <Check size={18} /> |
|
|
1777
1778
|
| `gemini-embedding-2-preview` | 3072 | <Check size={18} /> | <Check size={18} /> |
|
|
1778
1779
|
|
|
1779
1780
|
## Image Models
|
|
@@ -1934,3 +1935,80 @@ console.log(result.providerMetadata?.google?.groundingMetadata);
|
|
|
1934
1935
|
2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
|
|
1935
1936
|
grounding.
|
|
1936
1937
|
</Note>
|
|
1938
|
+
|
|
1939
|
+
## Speech Models
|
|
1940
|
+
|
|
1941
|
+
You can create models that call the [Gemini text-to-speech API](https://ai.google.dev/gemini-api/docs/speech-generation)
|
|
1942
|
+
using the `.speech()` factory method.
|
|
1943
|
+
|
|
1944
|
+
The first argument is the model id, e.g. `gemini-2.5-flash-preview-tts`.
|
|
1945
|
+
|
|
1946
|
+
```ts
|
|
1947
|
+
const model = google.speech('gemini-2.5-flash-preview-tts');
|
|
1948
|
+
```
|
|
1949
|
+
|
|
1950
|
+
The `voice` argument can be set to one of Gemini's [30 prebuilt voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
|
|
1951
|
+
e.g. `Kore`, `Puck`, `Zephyr`, or `Charon`. Voice names are case-sensitive. When omitted, `voice` defaults to `Kore`.
|
|
1952
|
+
|
|
1953
|
+
```ts highlight="6"
|
|
1954
|
+
import { experimental_generateSpeech as generateSpeech } from 'ai';
|
|
1955
|
+
import { google } from '@ai-sdk/google';
|
|
1956
|
+
|
|
1957
|
+
const result = await generateSpeech({
|
|
1958
|
+
model: google.speech('gemini-2.5-flash-preview-tts'),
|
|
1959
|
+
text: 'Hello, world!',
|
|
1960
|
+
voice: 'Kore', // Gemini voice name
|
|
1961
|
+
});
|
|
1962
|
+
```
|
|
1963
|
+
|
|
1964
|
+
By default the generated audio is returned as a playable WAV file (`result.audio.mediaType` is
|
|
1965
|
+
`audio/wav`). Set `outputFormat: 'pcm'` to receive the raw signed 16-bit little-endian mono PCM
|
|
1966
|
+
bytes instead; the sample rate is reported in `result.providerMetadata.google.sampleRate`.
|
|
1967
|
+
|
|
1968
|
+
Gemini honors natural-language style direction. The `instructions` argument is prepended to the
|
|
1969
|
+
spoken text, so `instructions: 'Say cheerfully'` with `text: 'Hello'` sends the prompt `Say cheerfully: Hello`.
|
|
1970
|
+
|
|
1971
|
+
### Multi-speaker audio
|
|
1972
|
+
|
|
1973
|
+
For multi-speaker dialogue, pass a `multiSpeakerVoiceConfig` through `providerOptions`. Each speaker
|
|
1974
|
+
name must match a name used in the input text. When set, it overrides the top-level `voice`, and `instructions` is ignored with a warning in `result.warnings`.
|
|
1975
|
+
|
|
1976
|
+
```ts highlight="8-23"
|
|
1977
|
+
import { experimental_generateSpeech as generateSpeech } from 'ai';
|
|
1978
|
+
import { google, type GoogleSpeechModelOptions } from '@ai-sdk/google';
|
|
1979
|
+
|
|
1980
|
+
const result = await generateSpeech({
|
|
1981
|
+
model: google.speech('gemini-2.5-flash-preview-tts'),
|
|
1982
|
+
text: 'Joe: How are you? Jane: Doing great, thanks!',
|
|
1983
|
+
providerOptions: {
|
|
1984
|
+
google: {
|
|
1985
|
+
multiSpeakerVoiceConfig: {
|
|
1986
|
+
speakerVoiceConfigs: [
|
|
1987
|
+
{
|
|
1988
|
+
speaker: 'Joe',
|
|
1989
|
+
voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
|
|
1990
|
+
},
|
|
1991
|
+
{
|
|
1992
|
+
speaker: 'Jane',
|
|
1993
|
+
voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Puck' } },
|
|
1994
|
+
},
|
|
1995
|
+
],
|
|
1996
|
+
},
|
|
1997
|
+
} satisfies GoogleSpeechModelOptions,
|
|
1998
|
+
},
|
|
1999
|
+
});
|
|
2000
|
+
```
|
|
2001
|
+
|
|
2002
|
+
<Note>
|
|
2003
|
+
Gemini TTS models do not support the `speed` or `language` options; passing
|
|
2004
|
+
them adds a warning to `result.warnings`. Language is detected automatically
|
|
2005
|
+
from the input text.
|
|
2006
|
+
</Note>
|
|
2007
|
+
|
|
2008
|
+
### Model Capabilities
|
|
2009
|
+
|
|
2010
|
+
| Model | Multi-speaker | Style via instructions |
|
|
2011
|
+
| ------------------------------ | ------------------- | ---------------------- |
|
|
2012
|
+
| `gemini-2.5-flash-preview-tts` | <Check size={18} /> | <Check size={18} /> |
|
|
2013
|
+
| `gemini-2.5-pro-preview-tts` | <Check size={18} /> | <Check size={18} /> |
|
|
2014
|
+
| `gemini-3.1-flash-tts-preview` | <Check size={18} /> | <Check size={18} /> |
|
package/package.json
CHANGED
|
@@ -196,7 +196,7 @@ export class GoogleJSONAccumulator {
|
|
|
196
196
|
const startIdx = this.pathStack.length - 1;
|
|
197
197
|
|
|
198
198
|
for (let i = startIdx; i < targetContainer.length; i++) {
|
|
199
|
-
const
|
|
199
|
+
const pathSegment = targetContainer[i];
|
|
200
200
|
const parentEntry = this.pathStack[this.pathStack.length - 1];
|
|
201
201
|
|
|
202
202
|
if (parentEntry.childCount > 0) {
|
|
@@ -204,8 +204,8 @@ export class GoogleJSONAccumulator {
|
|
|
204
204
|
}
|
|
205
205
|
parentEntry.childCount++;
|
|
206
206
|
|
|
207
|
-
if (typeof
|
|
208
|
-
fragment += `${JSON.stringify(
|
|
207
|
+
if (typeof pathSegment === 'string') {
|
|
208
|
+
fragment += `${JSON.stringify(pathSegment)}:`;
|
|
209
209
|
}
|
|
210
210
|
|
|
211
211
|
const childSeg =
|
|
@@ -214,7 +214,7 @@ export class GoogleJSONAccumulator {
|
|
|
214
214
|
|
|
215
215
|
fragment += isArray ? '[' : '{';
|
|
216
216
|
|
|
217
|
-
this.pathStack.push({ segment:
|
|
217
|
+
this.pathStack.push({ segment: pathSegment, isArray, childCount: 0 });
|
|
218
218
|
}
|
|
219
219
|
|
|
220
220
|
return fragment;
|
|
@@ -287,9 +287,9 @@ function getNestedValue(
|
|
|
287
287
|
segments: Array<string | number>,
|
|
288
288
|
): unknown {
|
|
289
289
|
let current: unknown = obj;
|
|
290
|
-
for (const
|
|
290
|
+
for (const pathSegment of segments) {
|
|
291
291
|
if (current == null || typeof current !== 'object') return undefined;
|
|
292
|
-
current = (current as Record<string | number, unknown>)[
|
|
292
|
+
current = (current as Record<string | number, unknown>)[pathSegment];
|
|
293
293
|
}
|
|
294
294
|
return current;
|
|
295
295
|
}
|
|
@@ -307,12 +307,12 @@ function setNestedValue(
|
|
|
307
307
|
): void {
|
|
308
308
|
let current: Record<string | number, unknown> = obj;
|
|
309
309
|
for (let i = 0; i < segments.length - 1; i++) {
|
|
310
|
-
const
|
|
310
|
+
const pathSegment = segments[i];
|
|
311
311
|
const nextSeg = segments[i + 1];
|
|
312
|
-
if (current[
|
|
313
|
-
current[
|
|
312
|
+
if (current[pathSegment] == null) {
|
|
313
|
+
current[pathSegment] = typeof nextSeg === 'number' ? [] : {};
|
|
314
314
|
}
|
|
315
|
-
current = current[
|
|
315
|
+
current = current[pathSegment] as Record<string | number, unknown>;
|
|
316
316
|
}
|
|
317
317
|
current[segments[segments.length - 1]] = value;
|
|
318
318
|
}
|
|
@@ -37,6 +37,8 @@ export type GoogleModelId =
|
|
|
37
37
|
| 'gemini-flash-latest'
|
|
38
38
|
| 'gemini-flash-lite-latest'
|
|
39
39
|
| 'deep-research-pro-preview-12-2025'
|
|
40
|
+
| 'deep-research-max-preview-04-2026'
|
|
41
|
+
| 'deep-research-preview-04-2026'
|
|
40
42
|
| 'nano-banana-pro-preview'
|
|
41
43
|
| 'aqa'
|
|
42
44
|
// Experimental models
|
package/src/google-provider.ts
CHANGED
|
@@ -5,6 +5,7 @@ import type {
|
|
|
5
5
|
ImageModelV4,
|
|
6
6
|
LanguageModelV4,
|
|
7
7
|
ProviderV4,
|
|
8
|
+
SpeechModelV4,
|
|
8
9
|
} from '@ai-sdk/provider';
|
|
9
10
|
import {
|
|
10
11
|
generateId,
|
|
@@ -28,6 +29,8 @@ import { GoogleImageModel } from './google-image-model';
|
|
|
28
29
|
import { GoogleFiles } from './google-files';
|
|
29
30
|
import { GoogleVideoModel } from './google-video-model';
|
|
30
31
|
import type { GoogleVideoModelId } from './google-video-settings';
|
|
32
|
+
import { GoogleSpeechModel } from './google-speech-model';
|
|
33
|
+
import type { GoogleSpeechModelId } from './google-speech-model-options';
|
|
31
34
|
import {
|
|
32
35
|
GoogleInteractionsLanguageModel,
|
|
33
36
|
type GoogleInteractionsModelInput,
|
|
@@ -85,6 +88,16 @@ export interface GoogleProvider extends ProviderV4 {
|
|
|
85
88
|
*/
|
|
86
89
|
videoModel(modelId: GoogleVideoModelId): Experimental_VideoModelV4;
|
|
87
90
|
|
|
91
|
+
/**
|
|
92
|
+
* Creates a model for speech generation (text-to-speech).
|
|
93
|
+
*/
|
|
94
|
+
speech(modelId: GoogleSpeechModelId): SpeechModelV4;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Creates a model for speech generation (text-to-speech).
|
|
98
|
+
*/
|
|
99
|
+
speechModel(modelId: GoogleSpeechModelId): SpeechModelV4;
|
|
100
|
+
|
|
88
101
|
files(): FilesV4;
|
|
89
102
|
|
|
90
103
|
/**
|
|
@@ -223,6 +236,14 @@ export function createGoogle(
|
|
|
223
236
|
generateId: options.generateId ?? generateId,
|
|
224
237
|
});
|
|
225
238
|
|
|
239
|
+
const createSpeechModel = (modelId: GoogleSpeechModelId) =>
|
|
240
|
+
new GoogleSpeechModel(modelId, {
|
|
241
|
+
provider: `${providerName}.speech`,
|
|
242
|
+
baseURL,
|
|
243
|
+
headers: getHeaders,
|
|
244
|
+
fetch: options.fetch,
|
|
245
|
+
});
|
|
246
|
+
|
|
226
247
|
const createInteractionsModel = (
|
|
227
248
|
modelIdOrAgent:
|
|
228
249
|
| GoogleInteractionsModelId
|
|
@@ -263,6 +284,8 @@ export function createGoogle(
|
|
|
263
284
|
provider.video = createVideoModel;
|
|
264
285
|
provider.videoModel = createVideoModel;
|
|
265
286
|
provider.files = createFiles;
|
|
287
|
+
provider.speech = createSpeechModel;
|
|
288
|
+
provider.speechModel = createSpeechModel;
|
|
266
289
|
provider.interactions = createInteractionsModel;
|
|
267
290
|
provider.tools = googleTools;
|
|
268
291
|
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { lazySchema, zodSchema } from '@ai-sdk/provider-utils';
|
|
2
|
+
import { z } from 'zod/v4';
|
|
3
|
+
|
|
4
|
+
/**
 * Schema for the JSON body returned by the Gemini `:generateContent` endpoint
 * when the request asks for `responseModalities: ['AUDIO']`.
 *
 * Every level (`candidates`, `content`, `parts`, `inlineData` and both of its
 * fields) is nullish, so a response that carries no audio still validates.
 * The generated audio is returned as base64 encoded raw PCM in the first
 * inline-data part.
 */
export const googleSpeechResponseSchema = lazySchema(() => {
  // The sub-schemas are assembled inside the `lazySchema` callback, exactly
  // where the equivalent nested literal would be evaluated.
  const inlineDataSchema = z.object({
    mimeType: z.string().nullish(),
    data: z.string().nullish(),
  });

  const partSchema = z.object({
    inlineData: inlineDataSchema.nullish(),
  });

  const contentSchema = z.object({
    parts: z.array(partSchema).nullish(),
  });

  const candidateSchema = z.object({
    content: contentSchema.nullish(),
  });

  return zodSchema(
    z.object({
      candidates: z.array(candidateSchema).nullish(),
    }),
  );
});
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import {
|
|
2
|
+
lazySchema,
|
|
3
|
+
zodSchema,
|
|
4
|
+
type InferSchema,
|
|
5
|
+
} from '@ai-sdk/provider-utils';
|
|
6
|
+
import { z } from 'zod/v4';
|
|
7
|
+
|
|
8
|
+
/**
 * Model ids accepted by the Google speech model. The trailing `(string & {})`
 * member keeps the type open to any other id string.
 */
export type GoogleSpeechModelId =
  | 'gemini-2.5-flash-preview-tts'
  | 'gemini-2.5-pro-preview-tts'
  | 'gemini-3.1-flash-tts-preview'
  | (string & {});

// Picks a prebuilt voice by its name.
const prebuiltVoiceConfigSchema = z.object({ voiceName: z.string() });

// Voice selection wrapper; only prebuilt voices are modelled here.
const voiceConfigSchema = z.object({
  prebuiltVoiceConfig: prebuiltVoiceConfigSchema,
});

/**
 * Schema for the `providerOptions.google` object accepted by the speech model.
 */
export const googleSpeechProviderOptionsSchema = lazySchema(() => {
  // One entry per speaker: the label used in the transcript plus its voice.
  const speakerVoiceConfigSchema = z.object({
    speaker: z.string(),
    voiceConfig: voiceConfigSchema,
  });

  const multiSpeakerVoiceConfigSchema = z.object({
    speakerVoiceConfigs: z.array(speakerVoiceConfigSchema),
  });

  return zodSchema(
    z.object({
      /**
       * Multi-speaker configuration for dialogue audio. When provided, this
       * overrides the top-level `voice`. The Gemini TTS API supports up to two
       * speakers; each speaker name must match a name used in the input text.
       *
       * https://ai.google.dev/gemini-api/docs/speech-generation#multi-speaker
       */
      multiSpeakerVoiceConfig: multiSpeakerVoiceConfigSchema.optional(),
    }),
  );
});

/** Provider options type inferred from `googleSpeechProviderOptionsSchema`. */
export type GoogleSpeechModelOptions = InferSchema<
  typeof googleSpeechProviderOptionsSchema
>;
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import type { SpeechModelV4, SharedV4Warning } from '@ai-sdk/provider';
|
|
2
|
+
import {
|
|
3
|
+
combineHeaders,
|
|
4
|
+
convertBase64ToUint8Array,
|
|
5
|
+
createJsonResponseHandler,
|
|
6
|
+
parseProviderOptions,
|
|
7
|
+
postJsonToApi,
|
|
8
|
+
resolve,
|
|
9
|
+
serializeModelOptions,
|
|
10
|
+
WORKFLOW_DESERIALIZE,
|
|
11
|
+
WORKFLOW_SERIALIZE,
|
|
12
|
+
type FetchFunction,
|
|
13
|
+
type Resolvable,
|
|
14
|
+
} from '@ai-sdk/provider-utils';
|
|
15
|
+
import { googleFailedResponseHandler } from './google-error';
|
|
16
|
+
import { googleSpeechResponseSchema } from './google-speech-api';
|
|
17
|
+
import {
|
|
18
|
+
googleSpeechProviderOptionsSchema,
|
|
19
|
+
type GoogleSpeechModelId,
|
|
20
|
+
} from './google-speech-model-options';
|
|
21
|
+
|
|
22
|
+
/**
 * Settings a `GoogleSpeechModel` is constructed with.
 */
interface GoogleSpeechModelConfig {
  /** Identifier returned by the model's `provider` getter. */
  provider: string;

  /** URL prefix; requests go to `${baseURL}/models/${modelId}:generateContent`. */
  baseURL: string;

  /** Headers (or a resolvable producing them) combined with the per-call headers. */
  headers?: Resolvable<Record<string, string | undefined>>;

  /** Optional fetch implementation forwarded to the HTTP request helper. */
  fetch?: FetchFunction;

  /** Internal hooks. */
  _internal?: {
    /** Supplies the response timestamp; `new Date()` is used when absent. */
    currentDate?: () => Date;
  };
}
|
|
31
|
+
|
|
32
|
+
/** Voice name used when the caller does not pass `voice`. */
const DEFAULT_VOICE = 'Kore';

/**
 * Sample rate in Hz assumed when the response does not specify one.
 * Gemini TTS returns raw PCM at 24kHz in that case.
 */
const DEFAULT_SAMPLE_RATE = 24000;
|
|
35
|
+
|
|
36
|
+
/**
 * Speech (text-to-speech) model that calls the Gemini `:generateContent`
 * endpoint with `responseModalities: ['AUDIO']`.
 *
 * The API answers with headerless PCM; by default the bytes are wrapped in a
 * WAV container, and `outputFormat: 'pcm'` returns them untouched.
 */
export class GoogleSpeechModel implements SpeechModelV4 {
  readonly specificationVersion = 'v4';

  /** Hands the model id and config to `serializeModelOptions`. */
  static [WORKFLOW_SERIALIZE](model: GoogleSpeechModel) {
    return serializeModelOptions({
      modelId: model.modelId,
      config: model.config,
    });
  }

  /** Rebuilds a model from a previously serialized id/config pair. */
  static [WORKFLOW_DESERIALIZE](options: {
    modelId: GoogleSpeechModelId;
    config: GoogleSpeechModelConfig;
  }) {
    return new GoogleSpeechModel(options.modelId, options.config);
  }

  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: GoogleSpeechModelId,
    private readonly config: GoogleSpeechModelConfig,
  ) {}

  /**
   * Translates the generic speech call options into a Gemini request body.
   *
   * @returns the request body, the warnings collected for options that were
   * ignored, and the resolved output format (`'wav'` unless `'pcm'` was asked).
   */
  private async getArgs({
    text,
    voice = DEFAULT_VOICE,
    outputFormat,
    instructions,
    speed,
    language,
    providerOptions,
  }: Parameters<SpeechModelV4['doGenerate']>[0]) {
    const warnings: SharedV4Warning[] = [];

    const googleOptions = await parseProviderOptions({
      provider: 'google',
      providerOptions,
      schema: googleSpeechProviderOptionsSchema,
    });

    // Multi-speaker (provider option) takes precedence over the single voice.
    const multiSpeakerVoiceConfig = googleOptions?.multiSpeakerVoiceConfig;
    const speechConfig = multiSpeakerVoiceConfig
      ? { multiSpeakerVoiceConfig }
      : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };

    // Gemini honors natural-language style direction expressed in the prompt
    // text, so map `instructions` onto the spoken content. With multi-speaker
    // the transcript starts with speaker labels (e.g. `Joe: ...`), so prepending
    // instructions would corrupt that parsing — ignore them there (with a warning).
    //
    // Blank instructions carry no direction; prepending them would only make
    // the prompt start with a stray ": ", so they are treated as absent.
    const hasInstructions =
      instructions != null && instructions.trim().length > 0;

    let promptText = text;
    if (hasInstructions) {
      if (multiSpeakerVoiceConfig) {
        warnings.push({
          type: 'unsupported',
          feature: 'instructions',
          details:
            'Google Gemini TTS ignores `instructions` when `multiSpeakerVoiceConfig` is set, ' +
            'because prepending them would break multi-speaker transcript parsing.',
        });
      } else {
        promptText = `${instructions}: ${text}`;
      }
    }

    if (speed != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'speed',
        details:
          'Google Gemini TTS models do not support the `speed` option. It was ignored.',
      });
    }

    if (language != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'language',
        details:
          'Google Gemini TTS models do not support the `language` option. ' +
          'Language is detected automatically from the input text.',
      });
    }

    // Only `wav` (default, WAV-wrapped) and `pcm` (raw) are supported.
    let resolvedOutputFormat: 'wav' | 'pcm' = 'wav';
    if (outputFormat === 'pcm') {
      resolvedOutputFormat = 'pcm';
    } else if (outputFormat != null && outputFormat !== 'wav') {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details: `Unsupported output format: ${outputFormat}. Using wav instead.`,
      });
    }

    const requestBody = {
      contents: [{ parts: [{ text: promptText }] }],
      generationConfig: {
        responseModalities: ['AUDIO'],
        speechConfig,
      },
    };

    return { requestBody, warnings, outputFormat: resolvedOutputFormat };
  }

  /**
   * Generates speech for the given options.
   *
   * Posts the request to `${baseURL}/models/${modelId}:generateContent`, takes
   * the first inline-data part that carries data, and returns it WAV-wrapped
   * (default) or as raw PCM (`outputFormat: 'pcm'`). The sample rate and the
   * reported mime type are exposed under `providerMetadata.google`.
   */
  async doGenerate(
    options: Parameters<SpeechModelV4['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<SpeechModelV4['doGenerate']>>> {
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { requestBody, warnings, outputFormat } = await this.getArgs(options);

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      url: `${this.config.baseURL}/models/${this.modelId}:generateContent`,
      headers: combineHeaders(
        this.config.headers ? await resolve(this.config.headers) : undefined,
        options.headers,
      ),
      body: requestBody,
      failedResponseHandler: googleFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        googleSpeechResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    // `generateSpeech` returns a single audio result, and Gemini returns one
    // inline audio part per request, so take the first inline-data part that
    // actually carries data (candidates and parts are scanned in order).
    const audioPart = (response.candidates ?? [])
      .flatMap(candidate => candidate.content?.parts ?? [])
      .find(part => part.inlineData?.data);
    const base64Audio = audioPart?.inlineData?.data ?? undefined;
    const mimeType = audioPart?.inlineData?.mimeType ?? undefined;

    const sampleRate = parseSampleRate(mimeType) ?? DEFAULT_SAMPLE_RATE;
    const pcm =
      base64Audio != null
        ? convertBase64ToUint8Array(base64Audio)
        : new Uint8Array(0);

    // Gemini returns headerless raw PCM (e.g. `audio/L16;rate=24000`). Unlike
    // providers that return a container format (mp3/opus/wav) directly,
    // `generateSpeech`'s `detectMediaType` can't identify raw PCM and would
    // mislabel it `audio/mp3` (not playable), so wrap it in a minimal WAV header
    // by default; `outputFormat: 'pcm'` returns the raw bytes untouched.
    // Empty audio is returned as-is so the core layer throws NoSpeechGeneratedError.
    const audio =
      outputFormat === 'pcm' || pcm.length === 0
        ? pcm
        : addWavHeader(pcm, sampleRate);

    if (outputFormat === 'pcm' && pcm.length > 0) {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details:
          `Returning raw PCM audio (signed 16-bit little-endian, mono, ${sampleRate} Hz). ` +
          'These bytes have no container header and are not directly playable; ' +
          'see providerMetadata.google for the sample rate and mime type.',
      });
    }

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody),
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
      providerMetadata: {
        google: {
          sampleRate,
          mimeType: mimeType ?? null,
        },
      },
    };
  }
}
|
|
237
|
+
|
|
238
|
+
/**
 * Parses the sample rate from a PCM mime type such as `audio/L16;rate=24000`.
 *
 * The `rate` parameter name is matched case-insensitively and only as a
 * standalone name, so a parameter like `bitrate=128000` is not mistaken for it.
 * An optional opening quote around the value is tolerated.
 *
 * @param mimeType - mime type reported for the audio part, if any.
 * @returns the sample rate in Hz, or `undefined` when `mimeType` is missing,
 * has no `rate` parameter, or the rate is zero or does not fit the 32-bit
 * sample-rate field of a WAV header.
 */
function parseSampleRate(mimeType: string | undefined): number | undefined {
  if (mimeType == null) {
    return undefined;
  }

  // The lookbehind rejects names that merely end in "rate" (e.g. `bitrate`).
  const match = /(?<![\w-])rate\s*=\s*"?(\d+)/i.exec(mimeType);
  if (match == null) {
    return undefined;
  }

  const rate = Number.parseInt(match[1], 10);

  // A zero or oversized rate would produce an unusable header; report
  // "unknown" instead so the caller can fall back to its default.
  return rate > 0 && rate <= 0xffffffff ? rate : undefined;
}
|
|
248
|
+
|
|
249
|
+
/**
 * Prefixes raw signed 16-bit little-endian mono PCM with a minimal 44-byte
 * RIFF/WAVE header so the result is playable and detectable as `audio/wav`.
 *
 * @param pcm - headerless PCM bytes; copied unchanged after the header.
 * @param sampleRate - sample rate in Hz written into the header.
 * @returns a new array holding the header followed by `pcm`.
 */
function addWavHeader(pcm: Uint8Array, sampleRate: number): Uint8Array {
  const headerBytes = 44;
  const channelCount = 1;
  const sampleBits = 16;
  const frameBytes = (channelCount * sampleBits) / 8;
  const payloadBytes = pcm.length;

  const wav = new Uint8Array(headerBytes + payloadBytes);
  const header = new DataView(wav.buffer);

  // RIFF container chunk; its size field excludes the 8 bytes of id + size.
  writeAscii(header, 0, 'RIFF');
  header.setUint32(4, headerBytes - 8 + payloadBytes, true);
  writeAscii(header, 8, 'WAVE');

  // "fmt " sub-chunk describing the sample layout.
  writeAscii(header, 12, 'fmt ');
  header.setUint32(16, 16, true); // PCM fmt chunk size
  header.setUint16(20, 1, true); // audio format = PCM
  header.setUint16(22, channelCount, true);
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, sampleRate * frameBytes, true); // byte rate
  header.setUint16(32, frameBytes, true); // block align
  header.setUint16(34, sampleBits, true);

  // "data" sub-chunk followed by the samples themselves.
  writeAscii(header, 36, 'data');
  header.setUint32(40, payloadBytes, true);
  wav.set(pcm, headerBytes);

  return wav;
}
|
|
281
|
+
|
|
282
|
+
/**
 * Writes `text` into `view` as one byte per character, starting at `offset`.
 *
 * @param view - destination view.
 * @param offset - byte offset of the first character.
 * @param text - characters to write; must all be 7-bit ASCII.
 * @throws RangeError when `text` contains a non-ASCII character (nothing is
 * written in that case), or from `DataView` when the write runs past the end
 * of the view.
 */
function writeAscii(view: DataView, offset: number, text: string): void {
  // `setUint8` keeps only the low 8 bits of its value, so a wider code unit
  // would be stored as a different byte without any error. Reject it up front.
  for (let i = 0; i < text.length; i++) {
    if (text.charCodeAt(i) > 0x7f) {
      throw new RangeError(
        `writeAscii: non-ASCII character at index ${i} of ${JSON.stringify(text)}`,
      );
    }
  }

  for (let i = 0; i < text.length; i++) {
    view.setUint8(offset + i, text.charCodeAt(i));
  }
}
|
package/src/index.ts
CHANGED
|
@@ -29,6 +29,10 @@ export type {
|
|
|
29
29
|
/** @deprecated Use `GoogleVideoModelId` instead. */
|
|
30
30
|
GoogleVideoModelId as GoogleGenerativeAIVideoModelId,
|
|
31
31
|
} from './google-video-settings';
|
|
32
|
+
export type {
|
|
33
|
+
GoogleSpeechModelOptions,
|
|
34
|
+
GoogleSpeechModelId,
|
|
35
|
+
} from './google-speech-model-options';
|
|
32
36
|
export type { GoogleFilesUploadOptions } from './google-files';
|
|
33
37
|
export type {
|
|
34
38
|
GoogleLanguageModelInteractionsOptions,
|
|
@@ -446,25 +446,25 @@ export function buildGoogleInteractionsStreamTransform({
|
|
|
446
446
|
open.kind === 'text' ||
|
|
447
447
|
open.kind === 'image')
|
|
448
448
|
) {
|
|
449
|
-
const
|
|
449
|
+
const imageDelta = event.delta as
|
|
450
450
|
| { data?: string; mime_type?: string; uri?: string }
|
|
451
451
|
| undefined;
|
|
452
452
|
const google: Record<string, string> = {};
|
|
453
453
|
if (interactionId != null) google.interactionId = interactionId;
|
|
454
454
|
const providerMetadata =
|
|
455
455
|
Object.keys(google).length > 0 ? { google } : undefined;
|
|
456
|
-
if (
|
|
456
|
+
if (imageDelta?.data != null && imageDelta.data.length > 0) {
|
|
457
457
|
controller.enqueue({
|
|
458
458
|
type: 'file',
|
|
459
|
-
mediaType:
|
|
460
|
-
data: { type: 'data', data:
|
|
459
|
+
mediaType: imageDelta.mime_type ?? 'image/png',
|
|
460
|
+
data: { type: 'data', data: imageDelta.data },
|
|
461
461
|
...(providerMetadata ? { providerMetadata } : {}),
|
|
462
462
|
});
|
|
463
|
-
} else if (
|
|
463
|
+
} else if (imageDelta?.uri != null && imageDelta.uri.length > 0) {
|
|
464
464
|
controller.enqueue({
|
|
465
465
|
type: 'file',
|
|
466
|
-
mediaType:
|
|
467
|
-
data: { type: 'url', url: new URL(
|
|
466
|
+
mediaType: imageDelta.mime_type ?? 'image/png',
|
|
467
|
+
data: { type: 'url', url: new URL(imageDelta.uri) },
|
|
468
468
|
...(providerMetadata ? { providerMetadata } : {}),
|
|
469
469
|
});
|
|
470
470
|
}
|