@ai-sdk/google 4.0.0-canary.75 → 4.0.0-canary.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/index.d.ts +25 -2
- package/dist/index.js +557 -290
- package/dist/index.js.map +1 -1
- package/docs/15-google.mdx +77 -0
- package/package.json +1 -1
- package/src/google-provider.ts +23 -0
- package/src/google-speech-api.ts +36 -0
- package/src/google-speech-model-options.ts +48 -0
- package/src/google-speech-model.ts +286 -0
- package/src/index.ts +4 -0
package/docs/15-google.mdx
CHANGED
|
@@ -1935,3 +1935,80 @@ console.log(result.providerMetadata?.google?.groundingMetadata);
|
|
|
1935
1935
|
2K, 4K via `providerOptions.google.imageConfig.imageSize`), and Google Search
|
|
1936
1936
|
grounding.
|
|
1937
1937
|
</Note>
|
|
1938
|
+
|
|
1939
|
+
## Speech Models
|
|
1940
|
+
|
|
1941
|
+
You can create models that call the [Gemini text-to-speech API](https://ai.google.dev/gemini-api/docs/speech-generation)
|
|
1942
|
+
using the `.speech()` factory method.
|
|
1943
|
+
|
|
1944
|
+
The first argument is the model id e.g. `gemini-2.5-flash-preview-tts`.
|
|
1945
|
+
|
|
1946
|
+
```ts
|
|
1947
|
+
const model = google.speech('gemini-2.5-flash-preview-tts');
|
|
1948
|
+
```
|
|
1949
|
+
|
|
1950
|
+
The `voice` argument can be set to one of Gemini's [30 prebuilt voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
|
|
1951
|
+
e.g. `Kore`, `Puck`, `Zephyr`, or `Charon`. Voice names are case-sensitive. It defaults to `Kore`.
|
|
1952
|
+
|
|
1953
|
+
```ts highlight="6"
|
|
1954
|
+
import { experimental_generateSpeech as generateSpeech } from 'ai';
|
|
1955
|
+
import { google } from '@ai-sdk/google';
|
|
1956
|
+
|
|
1957
|
+
const result = await generateSpeech({
|
|
1958
|
+
model: google.speech('gemini-2.5-flash-preview-tts'),
|
|
1959
|
+
text: 'Hello, world!',
|
|
1960
|
+
voice: 'Kore', // Gemini voice name
|
|
1961
|
+
});
|
|
1962
|
+
```
|
|
1963
|
+
|
|
1964
|
+
By default the generated audio is returned as a playable WAV file (`result.audio.mediaType` is
|
|
1965
|
+
`audio/wav`). Set `outputFormat: 'pcm'` to receive the raw signed 16-bit little-endian mono PCM
|
|
1966
|
+
bytes instead; the sample rate is reported in `result.providerMetadata.google.sampleRate`.
|
|
1967
|
+
|
|
1968
|
+
Gemini honors natural-language style direction. The `instructions` argument is prepended to the
|
|
1969
|
+
spoken text, so `instructions: 'Say cheerfully'` with `text: 'Hello'` sends the prompt `Say cheerfully: Hello`.
|
|
1970
|
+
|
|
1971
|
+
### Multi-speaker audio
|
|
1972
|
+
|
|
1973
|
+
For multi-speaker dialogue, pass a `multiSpeakerVoiceConfig` through `providerOptions`. Each speaker
|
|
1974
|
+
name must match a name used in the input text. When set, it overrides the top-level `voice`, and `instructions` is ignored (a warning is added to `result.warnings`).
|
|
1975
|
+
|
|
1976
|
+
```ts highlight="8-23"
|
|
1977
|
+
import { experimental_generateSpeech as generateSpeech } from 'ai';
|
|
1978
|
+
import { google, type GoogleSpeechModelOptions } from '@ai-sdk/google';
|
|
1979
|
+
|
|
1980
|
+
const result = await generateSpeech({
|
|
1981
|
+
model: google.speech('gemini-2.5-flash-preview-tts'),
|
|
1982
|
+
text: 'Joe: How are you? Jane: Doing great, thanks!',
|
|
1983
|
+
providerOptions: {
|
|
1984
|
+
google: {
|
|
1985
|
+
multiSpeakerVoiceConfig: {
|
|
1986
|
+
speakerVoiceConfigs: [
|
|
1987
|
+
{
|
|
1988
|
+
speaker: 'Joe',
|
|
1989
|
+
voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Kore' } },
|
|
1990
|
+
},
|
|
1991
|
+
{
|
|
1992
|
+
speaker: 'Jane',
|
|
1993
|
+
voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Puck' } },
|
|
1994
|
+
},
|
|
1995
|
+
],
|
|
1996
|
+
},
|
|
1997
|
+
} satisfies GoogleSpeechModelOptions,
|
|
1998
|
+
},
|
|
1999
|
+
});
|
|
2000
|
+
```
|
|
2001
|
+
|
|
2002
|
+
<Note>
|
|
2003
|
+
Gemini TTS models do not support the `speed` or `language` options; passing
|
|
2004
|
+
them adds a warning to `result.warnings`. Language is detected automatically
|
|
2005
|
+
from the input text.
|
|
2006
|
+
</Note>
|
|
2007
|
+
|
|
2008
|
+
### Model Capabilities
|
|
2009
|
+
|
|
2010
|
+
| Model | Multi-speaker | Style via instructions |
|
|
2011
|
+
| ------------------------------ | ------------------- | ---------------------- |
|
|
2012
|
+
| `gemini-2.5-flash-preview-tts` | <Check size={18} /> | <Check size={18} /> |
|
|
2013
|
+
| `gemini-2.5-pro-preview-tts` | <Check size={18} /> | <Check size={18} /> |
|
|
2014
|
+
| `gemini-3.1-flash-tts-preview` | <Check size={18} /> | <Check size={18} /> |
|
package/package.json
CHANGED
package/src/google-provider.ts
CHANGED
|
@@ -5,6 +5,7 @@ import type {
|
|
|
5
5
|
ImageModelV4,
|
|
6
6
|
LanguageModelV4,
|
|
7
7
|
ProviderV4,
|
|
8
|
+
SpeechModelV4,
|
|
8
9
|
} from '@ai-sdk/provider';
|
|
9
10
|
import {
|
|
10
11
|
generateId,
|
|
@@ -28,6 +29,8 @@ import { GoogleImageModel } from './google-image-model';
|
|
|
28
29
|
import { GoogleFiles } from './google-files';
|
|
29
30
|
import { GoogleVideoModel } from './google-video-model';
|
|
30
31
|
import type { GoogleVideoModelId } from './google-video-settings';
|
|
32
|
+
import { GoogleSpeechModel } from './google-speech-model';
|
|
33
|
+
import type { GoogleSpeechModelId } from './google-speech-model-options';
|
|
31
34
|
import {
|
|
32
35
|
GoogleInteractionsLanguageModel,
|
|
33
36
|
type GoogleInteractionsModelInput,
|
|
@@ -85,6 +88,16 @@ export interface GoogleProvider extends ProviderV4 {
|
|
|
85
88
|
*/
|
|
86
89
|
videoModel(modelId: GoogleVideoModelId): Experimental_VideoModelV4;
|
|
87
90
|
|
|
91
|
+
/**
|
|
92
|
+
* Creates a model for speech generation (text-to-speech).
|
|
93
|
+
*/
|
|
94
|
+
speech(modelId: GoogleSpeechModelId): SpeechModelV4;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Creates a model for speech generation (text-to-speech).
|
|
98
|
+
*/
|
|
99
|
+
speechModel(modelId: GoogleSpeechModelId): SpeechModelV4;
|
|
100
|
+
|
|
88
101
|
files(): FilesV4;
|
|
89
102
|
|
|
90
103
|
/**
|
|
@@ -223,6 +236,14 @@ export function createGoogle(
|
|
|
223
236
|
generateId: options.generateId ?? generateId,
|
|
224
237
|
});
|
|
225
238
|
|
|
239
|
+
const createSpeechModel = (modelId: GoogleSpeechModelId) =>
|
|
240
|
+
new GoogleSpeechModel(modelId, {
|
|
241
|
+
provider: `${providerName}.speech`,
|
|
242
|
+
baseURL,
|
|
243
|
+
headers: getHeaders,
|
|
244
|
+
fetch: options.fetch,
|
|
245
|
+
});
|
|
246
|
+
|
|
226
247
|
const createInteractionsModel = (
|
|
227
248
|
modelIdOrAgent:
|
|
228
249
|
| GoogleInteractionsModelId
|
|
@@ -263,6 +284,8 @@ export function createGoogle(
|
|
|
263
284
|
provider.video = createVideoModel;
|
|
264
285
|
provider.videoModel = createVideoModel;
|
|
265
286
|
provider.files = createFiles;
|
|
287
|
+
provider.speech = createSpeechModel;
|
|
288
|
+
provider.speechModel = createSpeechModel;
|
|
266
289
|
provider.interactions = createInteractionsModel;
|
|
267
290
|
provider.tools = googleTools;
|
|
268
291
|
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { lazySchema, zodSchema } from '@ai-sdk/provider-utils';
|
|
2
|
+
import { z } from 'zod/v4';
|
|
3
|
+
|
|
4
|
+
/**
 * Shape of a Gemini `:generateContent` response when the request asks for
 * `responseModalities: ['AUDIO']`.
 *
 * The audio arrives as base64 encoded raw PCM inside an inline-data part.
 * Every level of the structure is nullish, so consumers must guard each step
 * when walking `candidates -> content -> parts -> inlineData`.
 */
export const googleSpeechResponseSchema = lazySchema(() => {
  // Innermost payload: the base64 audio plus its mime type.
  const inlineDataSchema = z.object({
    mimeType: z.string().nullish(),
    data: z.string().nullish(),
  });

  const partSchema = z.object({
    inlineData: inlineDataSchema.nullish(),
  });

  const contentSchema = z.object({
    parts: z.array(partSchema).nullish(),
  });

  const candidateSchema = z.object({
    content: contentSchema.nullish(),
  });

  return zodSchema(
    z.object({
      candidates: z.array(candidateSchema).nullish(),
    }),
  );
});
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import {
|
|
2
|
+
lazySchema,
|
|
3
|
+
zodSchema,
|
|
4
|
+
type InferSchema,
|
|
5
|
+
} from '@ai-sdk/provider-utils';
|
|
6
|
+
import { z } from 'zod/v4';
|
|
7
|
+
|
|
8
|
+
/**
 * Model ids accepted by the speech factory. The listed literals drive editor
 * completion; the trailing `string & {}` keeps any other id assignable.
 */
export type GoogleSpeechModelId =
  | 'gemini-2.5-flash-preview-tts'
  | 'gemini-2.5-pro-preview-tts'
  | 'gemini-3.1-flash-tts-preview'
  | (string & {});

// Voice selection wrapper: a prebuilt voice referenced by name.
const voiceConfigSchema = z.object({
  prebuiltVoiceConfig: z.object({
    voiceName: z.string(),
  }),
});

// One dialogue participant: the speaker label plus the voice to use for it.
const speakerVoiceConfigSchema = z.object({
  speaker: z.string(),
  voiceConfig: voiceConfigSchema,
});

/**
 * Schema for the `providerOptions.google` object of a speech request.
 */
export const googleSpeechProviderOptionsSchema = lazySchema(() =>
  zodSchema(
    z.object({
      /**
       * Multi-speaker configuration for dialogue audio. When provided, this
       * overrides the top-level `voice`. The Gemini TTS API supports up to two
       * speakers; each speaker name must match a name used in the input text.
       *
       * https://ai.google.dev/gemini-api/docs/speech-generation#multi-speaker
       */
      multiSpeakerVoiceConfig: z
        .object({
          speakerVoiceConfigs: z.array(speakerVoiceConfigSchema),
        })
        .optional(),
    }),
  ),
);

/** Provider options type inferred from {@link googleSpeechProviderOptionsSchema}. */
export type GoogleSpeechModelOptions = InferSchema<
  typeof googleSpeechProviderOptionsSchema
>;
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import type { SpeechModelV4, SharedV4Warning } from '@ai-sdk/provider';
|
|
2
|
+
import {
|
|
3
|
+
combineHeaders,
|
|
4
|
+
convertBase64ToUint8Array,
|
|
5
|
+
createJsonResponseHandler,
|
|
6
|
+
parseProviderOptions,
|
|
7
|
+
postJsonToApi,
|
|
8
|
+
resolve,
|
|
9
|
+
serializeModelOptions,
|
|
10
|
+
WORKFLOW_DESERIALIZE,
|
|
11
|
+
WORKFLOW_SERIALIZE,
|
|
12
|
+
type FetchFunction,
|
|
13
|
+
type Resolvable,
|
|
14
|
+
} from '@ai-sdk/provider-utils';
|
|
15
|
+
import { googleFailedResponseHandler } from './google-error';
|
|
16
|
+
import { googleSpeechResponseSchema } from './google-speech-api';
|
|
17
|
+
import {
|
|
18
|
+
googleSpeechProviderOptionsSchema,
|
|
19
|
+
type GoogleSpeechModelId,
|
|
20
|
+
} from './google-speech-model-options';
|
|
21
|
+
|
|
22
|
+
/**
 * Construction-time settings for {@link GoogleSpeechModel}.
 */
interface GoogleSpeechModelConfig {
  /** Identifier returned by the model's `provider` getter. */
  provider: string;
  /** Base URL that `/models/<modelId>:generateContent` is appended to. */
  baseURL: string;
  /** Headers (or a resolvable yielding them) merged into every request. */
  headers?: Resolvable<Record<string, string | undefined>>;
  /** Optional fetch implementation handed to the HTTP helper. */
  fetch?: FetchFunction;
  /** Internal hooks; `currentDate` overrides the response timestamp source. */
  _internal?: {
    currentDate?: () => Date;
  };
}

// Voice used when the caller does not pass `voice`.
const DEFAULT_VOICE = 'Kore';
// Gemini TTS returns raw PCM at 24kHz when the response does not specify a rate.
const DEFAULT_SAMPLE_RATE = 24000;

/**
 * Speech (text-to-speech) model backed by the Gemini `:generateContent`
 * endpoint with `responseModalities: ['AUDIO']`.
 */
export class GoogleSpeechModel implements SpeechModelV4 {
  readonly specificationVersion = 'v4';

  /** Serializes the model id and config for workflow persistence. */
  static [WORKFLOW_SERIALIZE](model: GoogleSpeechModel) {
    return serializeModelOptions({
      modelId: model.modelId,
      config: model.config,
    });
  }

  /** Rebuilds a model from the data produced by the serialize hook. */
  static [WORKFLOW_DESERIALIZE](options: {
    modelId: GoogleSpeechModelId;
    config: GoogleSpeechModelConfig;
  }) {
    return new GoogleSpeechModel(options.modelId, options.config);
  }

  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: GoogleSpeechModelId,
    private readonly config: GoogleSpeechModelConfig,
  ) {}

  /**
   * Translates the generic speech call options into a Gemini request body.
   *
   * @returns the request body, the warnings collected for options that were
   * ignored, and the resolved output format (`'wav'` unless `'pcm'` was
   * requested).
   */
  private async getArgs({
    text,
    voice = DEFAULT_VOICE,
    outputFormat,
    instructions,
    speed,
    language,
    providerOptions,
  }: Parameters<SpeechModelV4['doGenerate']>[0]) {
    const warnings: SharedV4Warning[] = [];

    const googleOptions = await parseProviderOptions({
      provider: 'google',
      providerOptions,
      schema: googleSpeechProviderOptionsSchema,
    });

    // Multi-speaker (provider option) takes precedence over the single voice.
    const multiSpeakerVoiceConfig = googleOptions?.multiSpeakerVoiceConfig;
    const speechConfig = multiSpeakerVoiceConfig
      ? { multiSpeakerVoiceConfig }
      : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };

    // Gemini honors natural-language style direction expressed in the prompt
    // text, so map `instructions` onto the spoken content. With multi-speaker
    // the transcript starts with speaker labels (e.g. `Joe: ...`), so prepending
    // instructions would corrupt that parsing — ignore them there (with a warning).
    //
    // Blank instructions carry no direction: treating them as absent avoids a
    // prompt that begins with a stray ": " and a pointless multi-speaker warning.
    const hasInstructions = instructions != null && instructions.trim() !== '';
    let promptText = text;
    if (hasInstructions) {
      if (multiSpeakerVoiceConfig) {
        warnings.push({
          type: 'unsupported',
          feature: 'instructions',
          details:
            'Google Gemini TTS ignores `instructions` when `multiSpeakerVoiceConfig` is set, ' +
            'because prepending them would break multi-speaker transcript parsing.',
        });
      } else {
        promptText = `${instructions}: ${text}`;
      }
    }

    if (speed != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'speed',
        details:
          'Google Gemini TTS models do not support the `speed` option. It was ignored.',
      });
    }

    if (language != null) {
      warnings.push({
        type: 'unsupported',
        feature: 'language',
        details:
          'Google Gemini TTS models do not support the `language` option. ' +
          'Language is detected automatically from the input text.',
      });
    }

    // Only `wav` (default, WAV-wrapped) and `pcm` (raw) are supported.
    let resolvedOutputFormat: 'wav' | 'pcm' = 'wav';
    if (outputFormat === 'pcm') {
      resolvedOutputFormat = 'pcm';
    } else if (outputFormat != null && outputFormat !== 'wav') {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details: `Unsupported output format: ${outputFormat}. Using wav instead.`,
      });
    }

    const requestBody = {
      contents: [{ parts: [{ text: promptText }] }],
      generationConfig: {
        responseModalities: ['AUDIO'],
        speechConfig,
      },
    };

    return { requestBody, warnings, outputFormat: resolvedOutputFormat };
  }

  /**
   * Sends the speech request and returns the generated audio.
   *
   * The audio is WAV-wrapped by default, or raw PCM when `outputFormat` is
   * `'pcm'`. When the response holds no inline audio, an empty byte array is
   * returned. `providerMetadata.google` reports the sample rate and the mime
   * type taken from the response (`null` when absent).
   */
  async doGenerate(
    options: Parameters<SpeechModelV4['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<SpeechModelV4['doGenerate']>>> {
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { requestBody, warnings, outputFormat } = await this.getArgs(options);

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      url: `${this.config.baseURL}/models/${this.modelId}:generateContent`,
      headers: combineHeaders(
        this.config.headers ? await resolve(this.config.headers) : undefined,
        options.headers,
      ),
      body: requestBody,
      failedResponseHandler: googleFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        googleSpeechResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    // `generateSpeech` returns a single audio result, and Gemini returns one
    // inline audio part per request, so take the first inline-data part.
    let base64Audio: string | undefined;
    let mimeType: string | undefined;
    for (const candidate of response.candidates ?? []) {
      for (const part of candidate.content?.parts ?? []) {
        if (part.inlineData?.data) {
          base64Audio = part.inlineData.data;
          mimeType = part.inlineData.mimeType ?? undefined;
          break;
        }
      }
      if (base64Audio != null) {
        break;
      }
    }

    const sampleRate = parseSampleRate(mimeType) ?? DEFAULT_SAMPLE_RATE;
    const pcm =
      base64Audio != null
        ? convertBase64ToUint8Array(base64Audio)
        : new Uint8Array(0);

    // Gemini returns headerless raw PCM (e.g. `audio/L16;rate=24000`). Unlike
    // providers that return a container format (mp3/opus/wav) directly,
    // `generateSpeech`'s `detectMediaType` can't identify raw PCM and would
    // mislabel it `audio/mp3` (not playable), so wrap it in a minimal WAV header
    // by default; `outputFormat: 'pcm'` returns the raw bytes untouched.
    // Empty audio is returned as-is so the core layer throws NoSpeechGeneratedError.
    const audio =
      outputFormat === 'pcm' || pcm.length === 0
        ? pcm
        : addWavHeader(pcm, sampleRate);

    // Raw PCM was explicitly requested: tell the caller how to interpret it.
    if (outputFormat === 'pcm' && pcm.length > 0) {
      warnings.push({
        type: 'unsupported',
        feature: 'outputFormat',
        details:
          `Returning raw PCM audio (signed 16-bit little-endian, mono, ${sampleRate} Hz). ` +
          'These bytes have no container header and are not directly playable; ' +
          'see providerMetadata.google for the sample rate and mime type.',
      });
    }

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody),
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
      providerMetadata: {
        google: {
          sampleRate,
          mimeType: mimeType ?? null,
        },
      },
    };
  }
}
|
|
237
|
+
|
|
238
|
+
/**
 * Parses the sample rate from a PCM mime type such as `audio/L16;rate=24000`.
 *
 * The `rate` parameter is matched case-insensitively and only as a whole
 * parameter name, so `bitrate=...` is never mistaken for it. Whitespace after
 * the `;` separator is tolerated.
 *
 * @param mimeType mime type reported with the audio part, if any.
 * @returns the positive integer sample rate, or `undefined` when the mime type
 * is missing, carries no `rate` parameter, or the rate is not a positive
 * integer (callers then fall back to their default).
 */
function parseSampleRate(mimeType: string | undefined): number | undefined {
  if (mimeType == null) {
    return undefined;
  }

  // Anchor on the start or a parameter separator so the name must be exactly `rate`.
  const match = /(?:^|[;\s])rate=(\d+)/i.exec(mimeType);
  const digits = match?.[1];
  if (digits == null) {
    return undefined;
  }

  const rate = Number.parseInt(digits, 10);
  // A zero or unrepresentable rate would produce an unusable WAV header.
  return Number.isSafeInteger(rate) && rate > 0 ? rate : undefined;
}
|
|
248
|
+
|
|
249
|
+
/**
 * Wraps raw signed 16-bit little-endian mono PCM in a minimal 44-byte WAV
 * (RIFF/WAVE) container so the output is playable and detectable as `audio/wav`.
 *
 * RIFF chunks are word-aligned: when the payload has an odd number of bytes a
 * single zero pad byte follows it. The pad byte is counted in the RIFF size
 * but not in the data chunk size. Even-length payloads are emitted without
 * any padding.
 *
 * @param pcm raw PCM bytes to wrap; not modified.
 * @param sampleRate sample rate in Hz written into the `fmt ` chunk.
 * @returns a new byte array holding the header followed by the PCM payload.
 */
function addWavHeader(pcm: Uint8Array, sampleRate: number): Uint8Array {
  const headerSize = 44;
  const numChannels = 1;
  const bitsPerSample = 16;
  const blockAlign = (numChannels * bitsPerSample) / 8;
  const byteRate = sampleRate * blockAlign;
  const dataSize = pcm.length;
  const padSize = dataSize % 2;

  // ArrayBuffer is zero-initialized, so the optional pad byte is already 0.
  const buffer = new ArrayBuffer(headerSize + dataSize + padSize);
  const view = new DataView(buffer);

  writeAscii(view, 0, 'RIFF');
  // RIFF size = everything after this field: 36 header bytes + payload + pad.
  view.setUint32(4, 36 + dataSize + padSize, true);
  writeAscii(view, 8, 'WAVE');
  writeAscii(view, 12, 'fmt ');
  view.setUint32(16, 16, true); // PCM fmt chunk size
  view.setUint16(20, 1, true); // audio format = PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeAscii(view, 36, 'data');
  view.setUint32(40, dataSize, true); // excludes the pad byte

  const out = new Uint8Array(buffer);
  out.set(pcm, headerSize);
  return out;
}

/**
 * Writes `text` into `view` starting at `offset`, one byte per character
 * (each character's UTF-16 code unit is stored with `setUint8`).
 */
function writeAscii(view: DataView, offset: number, text: string): void {
  for (let i = 0; i < text.length; i++) {
    view.setUint8(offset + i, text.charCodeAt(i));
  }
}
|
package/src/index.ts
CHANGED
|
@@ -29,6 +29,10 @@ export type {
|
|
|
29
29
|
/** @deprecated Use `GoogleVideoModelId` instead. */
|
|
30
30
|
GoogleVideoModelId as GoogleGenerativeAIVideoModelId,
|
|
31
31
|
} from './google-video-settings';
|
|
32
|
+
export type {
|
|
33
|
+
GoogleSpeechModelOptions,
|
|
34
|
+
GoogleSpeechModelId,
|
|
35
|
+
} from './google-speech-model-options';
|
|
32
36
|
export type { GoogleFilesUploadOptions } from './google-files';
|
|
33
37
|
export type {
|
|
34
38
|
GoogleLanguageModelInteractionsOptions,
|