@ai-sdk/google 4.0.0-canary.78 → 4.0.0-canary.79
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dist/index.d.ts +29 -2
- package/dist/index.js +416 -9
- package/dist/index.js.map +1 -1
- package/docs/15-google.mdx +26 -0
- package/package.json +3 -3
- package/src/google-provider.ts +33 -0
- package/src/index.ts +2 -0
- package/src/realtime/google-realtime-event-mapper.ts +383 -0
- package/src/realtime/google-realtime-model-options.ts +3 -0
- package/src/realtime/google-realtime-model.ts +160 -0
- package/src/realtime/index.ts +2 -0
package/docs/15-google.mdx
CHANGED
|
@@ -1065,6 +1065,32 @@ The following Zod features are known to not work with Google:
|
|
|
1065
1065
|
available provider model ID as a string if needed.
|
|
1066
1066
|
</Note>
|
|
1067
1067
|
|
|
1068
|
+
## Realtime Models
|
|
1069
|
+
|
|
1070
|
+
<Note type="warning">Realtime is an experimental feature.</Note>
|
|
1071
|
+
|
|
1072
|
+
You can create models that call the [Gemini Live API](https://ai.google.dev/gemini-api/docs/live)
|
|
1073
|
+
using the `.experimental_realtime()` factory method.
|
|
1074
|
+
|
|
1075
|
+
```ts
|
|
1076
|
+
import { google } from '@ai-sdk/google';
|
|
1077
|
+
|
|
1078
|
+
const model = google.experimental_realtime('gemini-3.1-flash-live-preview');
|
|
1079
|
+
```
|
|
1080
|
+
|
|
1081
|
+
Realtime sessions run in the browser and require a short-lived token created on
|
|
1082
|
+
your server with `google.experimental_realtime.getToken()`:
|
|
1083
|
+
|
|
1084
|
+
```ts
|
|
1085
|
+
const token = await google.experimental_realtime.getToken({
|
|
1086
|
+
model: 'gemini-3.1-flash-live-preview',
|
|
1087
|
+
});
|
|
1088
|
+
```
|
|
1089
|
+
|
|
1090
|
+
Google realtime models may require provider-specific audio formats, depending
|
|
1091
|
+
on the model and modality. See [Realtime](/docs/ai-sdk-core/realtime) for the
|
|
1092
|
+
complete setup and tool calling pattern.
|
|
1093
|
+
|
|
1068
1094
|
## Interactions API
|
|
1069
1095
|
|
|
1070
1096
|
The [Gemini Interactions API](https://ai.google.dev/gemini-api/docs/interactions)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ai-sdk/google",
|
|
3
|
-
"version": "4.0.0-canary.
|
|
3
|
+
"version": "4.0.0-canary.79",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"sideEffects": false,
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
}
|
|
36
36
|
},
|
|
37
37
|
"dependencies": {
|
|
38
|
-
"@ai-sdk/provider": "4.0.0-canary.
|
|
39
|
-
"@ai-sdk/provider-utils": "5.0.0-canary.
|
|
38
|
+
"@ai-sdk/provider": "4.0.0-canary.18",
|
|
39
|
+
"@ai-sdk/provider-utils": "5.0.0-canary.46"
|
|
40
40
|
},
|
|
41
41
|
"devDependencies": {
|
|
42
42
|
"@types/node": "22.19.19",
|
package/src/google-provider.ts
CHANGED
|
@@ -5,6 +5,8 @@ import type {
|
|
|
5
5
|
ImageModelV4,
|
|
6
6
|
LanguageModelV4,
|
|
7
7
|
ProviderV4,
|
|
8
|
+
Experimental_RealtimeFactoryV4 as RealtimeFactoryV4,
|
|
9
|
+
Experimental_RealtimeFactoryV4GetTokenOptions as RealtimeFactoryV4GetTokenOptions,
|
|
8
10
|
SpeechModelV4,
|
|
9
11
|
} from '@ai-sdk/provider';
|
|
10
12
|
import {
|
|
@@ -37,6 +39,7 @@ import {
|
|
|
37
39
|
} from './interactions/google-interactions-language-model';
|
|
38
40
|
import type { GoogleInteractionsModelId } from './interactions/google-interactions-language-model-options';
|
|
39
41
|
import type { GoogleInteractionsAgentName } from './interactions/google-interactions-agent';
|
|
42
|
+
import { GoogleRealtimeModel } from './realtime/google-realtime-model';
|
|
40
43
|
|
|
41
44
|
export interface GoogleProvider extends ProviderV4 {
|
|
42
45
|
(modelId: GoogleModelId): LanguageModelV4;
|
|
@@ -115,6 +118,8 @@ export interface GoogleProvider extends ProviderV4 {
|
|
|
115
118
|
| { managedAgent: string },
|
|
116
119
|
): LanguageModelV4;
|
|
117
120
|
|
|
121
|
+
experimental_realtime: RealtimeFactoryV4;
|
|
122
|
+
|
|
118
123
|
tools: typeof googleTools;
|
|
119
124
|
}
|
|
120
125
|
|
|
@@ -236,6 +241,14 @@ export function createGoogle(
|
|
|
236
241
|
generateId: options.generateId ?? generateId,
|
|
237
242
|
});
|
|
238
243
|
|
|
244
|
+
// Builds a realtime model bound to this provider instance's base URL,
// header resolver and optional custom fetch.
const createRealtimeModel = (modelId: string) => {
  const realtimeProviderName = `${providerName}.realtime`;

  return new GoogleRealtimeModel(modelId, {
    provider: realtimeProviderName,
    baseURL,
    headers: getHeaders,
    fetch: options.fetch,
  });
};
|
|
251
|
+
|
|
239
252
|
const createSpeechModel = (modelId: GoogleSpeechModelId) =>
|
|
240
253
|
new GoogleSpeechModel(modelId, {
|
|
241
254
|
provider: `${providerName}.speech`,
|
|
@@ -244,6 +257,25 @@ export function createGoogle(
|
|
|
244
257
|
fetch: options.fetch,
|
|
245
258
|
});
|
|
246
259
|
|
|
260
|
+
// Callable factory: invoking it creates a realtime model, while the attached
// `getToken` helper creates a client secret for the requested model and
// returns only its `token`, `url` and `expiresAt` fields.
const experimentalRealtimeFactory = Object.assign(
  (modelId: string) => createRealtimeModel(modelId),
  {
    getToken: async ({
      model,
      sessionConfig,
      expiresAfterSeconds,
    }: RealtimeFactoryV4GetTokenOptions) => {
      const { token, url, expiresAt } = await createRealtimeModel(
        model,
      ).doCreateClientSecret({ sessionConfig, expiresAfterSeconds });

      return { token, url, expiresAt };
    },
  },
) as RealtimeFactoryV4;
|
|
278
|
+
|
|
247
279
|
const createInteractionsModel = (
|
|
248
280
|
modelIdOrAgent:
|
|
249
281
|
| GoogleInteractionsModelId
|
|
@@ -283,6 +315,7 @@ export function createGoogle(
|
|
|
283
315
|
provider.imageModel = createImageModel;
|
|
284
316
|
provider.video = createVideoModel;
|
|
285
317
|
provider.videoModel = createVideoModel;
|
|
318
|
+
provider.experimental_realtime = experimentalRealtimeFactory;
|
|
286
319
|
provider.files = createFiles;
|
|
287
320
|
provider.speech = createSpeechModel;
|
|
288
321
|
provider.speechModel = createSpeechModel;
|
package/src/index.ts
CHANGED
|
@@ -54,5 +54,7 @@ export type {
|
|
|
54
54
|
/** @deprecated Use `GoogleProviderSettings` instead. */
|
|
55
55
|
GoogleProviderSettings as GoogleGenerativeAIProviderSettings,
|
|
56
56
|
} from './google-provider';
|
|
57
|
+
export { GoogleRealtimeModel as Experimental_GoogleRealtimeModel } from './realtime/google-realtime-model';
|
|
58
|
+
export type { GoogleRealtimeModelConfig as Experimental_GoogleRealtimeModelConfig } from './realtime/google-realtime-model';
|
|
57
59
|
|
|
58
60
|
export { VERSION } from './version';
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
Experimental_RealtimeModelV4 as RealtimeModelV4,
|
|
3
|
+
Experimental_RealtimeModelV4ClientEvent as RealtimeModelV4ClientEvent,
|
|
4
|
+
Experimental_RealtimeModelV4FunctionCallOutput as RealtimeModelV4FunctionCallOutput,
|
|
5
|
+
Experimental_RealtimeModelV4ServerEvent as RealtimeModelV4ServerEvent,
|
|
6
|
+
Experimental_RealtimeModelV4SessionConfig as RealtimeModelV4SessionConfig,
|
|
7
|
+
} from '@ai-sdk/provider';
|
|
8
|
+
import { safeParseJSON } from '@ai-sdk/provider-utils';
|
|
9
|
+
import { convertJSONSchemaToOpenAPISchema } from '../convert-json-schema-to-openapi-schema';
|
|
10
|
+
import { getModelPath } from '../get-model-path';
|
|
11
|
+
|
|
12
|
+
/** A single function call requested by the model inside a `toolCall` message. */
type GoogleRealtimeFunctionCall = {
  id: string;
  name: string;
  /** Call arguments; treated as `{}` by the mapper when absent. */
  args?: Record<string, unknown>;
};

/** Transcription payload shared by input and output transcription fields. */
type GoogleRealtimeTranscription = { text?: string };

/** One part of a model turn: inline (audio) data and/or text. */
type GoogleRealtimeModelTurnPart = {
  inlineData?: { data?: string };
  text?: string;
};

/** The `serverContent` payload of a wire event, as read by the mapper. */
type GoogleRealtimeServerContent = {
  interrupted?: boolean;
  modelTurn?: { parts?: GoogleRealtimeModelTurnPart[] };
  outputTranscription?: GoogleRealtimeTranscription;
  inputTranscription?: GoogleRealtimeTranscription;
  turnComplete?: boolean;
};

/**
 * Subset of the server message fields that the mapper inspects. All fields
 * are optional because the mapper probes them one by one.
 */
type GoogleRealtimeWireEvent = {
  setupComplete?: unknown;
  toolCall?: { functionCalls?: GoogleRealtimeFunctionCall[] };
  toolCallCancellation?: unknown;
  serverContent?: GoogleRealtimeServerContent;
  inputTranscription?: GoogleRealtimeTranscription;
};
|
|
40
|
+
|
|
41
|
+
/**
 * Stateful event mapper for Google's Gemini Live API.
 *
 * Unlike OpenAI/xAI, Google's events don't have response/item IDs and
 * a single message can contain multiple pieces of data. This class
 * tracks turn state to generate consistent synthetic IDs
 * (`google-resp-N`, `google-item-N`, `google-input-N`).
 */
export class GoogleRealtimeEventMapper {
  /** Index of the current turn; embedded in every synthetic ID. */
  private turnCounter = 0;
  /** Whether the current turn produced audio, text or an output transcript. */
  private hasAudio = false;
  private hasText = false;
  private hasTranscript = false;
  /** Set by `turnComplete`; the counter advances lazily on the next response. */
  private turnClosed = false;
  /** Sample rate advertised in the mimeType of outgoing audio blobs. */
  private inputAudioRate = 16000;

  private get responseId(): string {
    return `google-resp-${this.turnCounter}`;
  }

  private get itemId(): string {
    return `google-item-${this.turnCounter}`;
  }

  /**
   * Rolls over to the next turn lazily, only once new model content actually
   * arrives. `turnComplete` merely marks the current turn closed; the counter
   * is not advanced until the next response begins. This keeps a transcript
   * that arrives shortly after `turnComplete` attached to the turn it belongs
   * to, since Google delivers transcription independently with no guaranteed
   * ordering relative to `turnComplete`.
   */
  private beginTurnIfClosed(): void {
    if (!this.turnClosed) return;
    this.turnCounter++;
    this.hasAudio = false;
    this.hasText = false;
    this.hasTranscript = false;
    this.turnClosed = false;
  }

  /**
   * Maps one raw server message to normalized server event(s).
   *
   * Fields are probed in a fixed order (`setupComplete`, `toolCall`,
   * `toolCallCancellation`, `serverContent`, top-level `inputTranscription`);
   * the first one present decides the result. Anything else becomes a
   * `custom` event whose `rawType` is the first key of the message.
   *
   * @param raw - The decoded server message.
   * @returns A single event, or an array of events. A `toolCall` message
   * always yields an array (possibly empty).
   */
  parseServerEvent(
    raw: unknown,
  ): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[] {
    // Guard against non-object payloads (null, undefined, primitives):
    // dereferencing them below would throw instead of yielding an event.
    if (typeof raw !== 'object' || raw === null) {
      return { type: 'custom', rawType: 'unknown', raw };
    }

    const data = raw as GoogleRealtimeWireEvent;

    if (data.setupComplete != null) {
      return { type: 'session-created', raw };
    }

    if (data.toolCall != null) {
      // A tool call is model output, so it starts the next turn if needed.
      this.beginTurnIfClosed();
      const functionCalls = data.toolCall.functionCalls ?? [];
      return functionCalls.flatMap(functionCall => {
        // Google delivers the arguments whole, so emit one delta carrying the
        // full JSON followed immediately by the matching done event.
        const args = JSON.stringify(functionCall.args ?? {});
        return [
          {
            type: 'function-call-arguments-delta' as const,
            responseId: this.responseId,
            itemId: this.itemId,
            callId: functionCall.id,
            delta: args,
            raw,
          },
          {
            type: 'function-call-arguments-done' as const,
            responseId: this.responseId,
            itemId: this.itemId,
            callId: functionCall.id,
            name: functionCall.name,
            arguments: args,
            raw,
          },
        ];
      });
    }

    if (data.toolCallCancellation != null) {
      return {
        type: 'custom',
        rawType: 'toolCallCancellation',
        raw,
      };
    }

    if (data.serverContent != null) {
      return this.parseServerContent(data.serverContent, raw);
    }

    if (data.inputTranscription?.text != null) {
      // NOTE(review): the ID uses the current counter, which is only advanced
      // when model content arrives; input transcribed after `turnComplete`
      // therefore reuses the previous turn's index — confirm this is intended.
      return {
        type: 'input-transcription-completed',
        itemId: `google-input-${this.turnCounter}`,
        transcript: data.inputTranscription.text,
        raw,
      };
    }

    // Fall back to the first key; an empty message has none.
    return {
      type: 'custom',
      rawType: Object.keys(data)[0] ?? 'unknown',
      raw,
    };
  }

  /**
   * Expands a `serverContent` payload into events, in this order:
   * interruption, per-part audio/text deltas, output transcript delta,
   * input transcription, then the `*-done`/`response-done` events when
   * `turnComplete` is set.
   */
  private parseServerContent(
    serverContent: GoogleRealtimeServerContent,
    raw: unknown,
  ): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[] {
    const events: RealtimeModelV4ServerEvent[] = [];

    if (serverContent.interrupted) {
      events.push({
        type: 'speech-started',
        raw,
      });
    }

    if (serverContent.modelTurn?.parts) {
      // New model response content marks the start of the next turn.
      this.beginTurnIfClosed();
      for (const part of serverContent.modelTurn.parts) {
        if (part.inlineData?.data) {
          this.hasAudio = true;
          events.push({
            type: 'audio-delta',
            responseId: this.responseId,
            itemId: this.itemId,
            delta: part.inlineData.data,
            raw,
          });
        }
        if (part.text) {
          this.hasText = true;
          events.push({
            type: 'text-delta',
            responseId: this.responseId,
            itemId: this.itemId,
            delta: part.text,
            raw,
          });
        }
      }
    }

    if (serverContent.outputTranscription?.text) {
      // Deliberately does not begin a new turn: a late transcript stays
      // attached to the turn that was just closed.
      this.hasTranscript = true;
      events.push({
        type: 'audio-transcript-delta',
        responseId: this.responseId,
        itemId: this.itemId,
        delta: serverContent.outputTranscription.text,
        raw,
      });
    }

    if (serverContent.inputTranscription?.text) {
      events.push({
        type: 'input-transcription-completed',
        itemId: `google-input-${this.turnCounter}`,
        transcript: serverContent.inputTranscription.text,
        raw,
      });
    }

    if (serverContent.turnComplete) {
      // Only close the streams that actually produced content this turn.
      if (this.hasAudio) {
        events.push({
          type: 'audio-done',
          responseId: this.responseId,
          itemId: this.itemId,
          raw,
        });
      }
      if (this.hasText) {
        events.push({
          type: 'text-done',
          responseId: this.responseId,
          itemId: this.itemId,
          raw,
        });
      }
      if (this.hasTranscript) {
        events.push({
          type: 'audio-transcript-done',
          responseId: this.responseId,
          itemId: this.itemId,
          raw,
        });
      }
      events.push({
        type: 'response-done',
        responseId: this.responseId,
        status: 'completed',
        raw,
      });
      // Mark the turn closed but defer advancing the counter until the next
      // response actually begins (see `beginTurnIfClosed`).
      this.turnClosed = true;
    }

    if (events.length === 0) {
      return { type: 'custom', rawType: 'serverContent', raw };
    }

    return events.length === 1 ? events[0] : events;
  }

  /**
   * Converts a normalized client event into the Google wire message.
   *
   * @param event - The normalized client event.
   * @param modelId - Model ID used when building the `setup` payload.
   * @returns The wire message, or `null` for events this mapper does not
   * send (audio commit/clear, response create/cancel, item truncate,
   * audio-message items).
   */
  serializeClientEvent(
    event: RealtimeModelV4ClientEvent,
    modelId: string,
  ): ReturnType<RealtimeModelV4['serializeClientEvent']> {
    switch (event.type) {
      case 'session-update':
        // Capture the configured capture rate so input audio blobs advertise
        // the real rate. Google accepts any rate as long as the blob's mimeType
        // matches; a mismatched label corrupts custom-rate audio.
        if (event.config.inputAudioFormat?.rate != null) {
          this.inputAudioRate = event.config.inputAudioFormat.rate;
        }
        return {
          setup: buildGoogleSessionConfig(event.config, modelId),
        };

      case 'input-audio-append':
        return {
          realtimeInput: {
            audio: {
              data: event.audio,
              mimeType: `audio/pcm;rate=${this.inputAudioRate}`,
            },
          },
        };

      case 'input-audio-commit':
      case 'input-audio-clear':
      case 'response-create':
      case 'response-cancel':
      case 'conversation-item-truncate':
        return null;

      case 'conversation-item-create': {
        const item = event.item;
        switch (item.type) {
          case 'text-message':
            return {
              realtimeInput: {
                text: item.text,
              },
            };
          case 'function-call-output':
            return serializeFunctionCallOutput(item);
          case 'audio-message':
            return null;
        }
        break;
      }
    }

    return null;
  }
}
|
|
298
|
+
|
|
299
|
+
/**
 * Builds the `toolResponse` wire message for a completed function call.
 *
 * The tool output is parsed as JSON. A parsed JSON object is sent as the
 * response unchanged. Any other value — a non-object JSON value, or text that
 * is not valid JSON — is wrapped as `{ output: value }` so the tool result is
 * still delivered as an object instead of being dropped.
 *
 * @param item - The function call output item (call ID, function name and
 * serialized output).
 * @returns The wire message containing a single function response.
 */
async function serializeFunctionCallOutput(
  item: RealtimeModelV4FunctionCallOutput,
): Promise<unknown> {
  const parseResult = await safeParseJSON({ text: item.output });

  // Fall back to the raw output text when it is not valid JSON.
  const parsed: unknown = parseResult.success ? parseResult.value : item.output;

  const isJsonObject =
    typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
  const response = isJsonObject ? parsed : { output: parsed };

  return {
    toolResponse: {
      functionResponses: [
        {
          id: item.callId,
          name: item.name,
          response,
        },
      ],
    },
  };
}
|
|
317
|
+
|
|
318
|
+
/**
 * Translates a normalized realtime session config into Google's setup
 * payload. The result is used both as the `setup` client message and as the
 * `bidiGenerateContentSetup` payload for auth token creation.
 *
 * @param config - Normalized session config; may be `undefined`, in which
 * case only the model and the default `AUDIO` response modality are set.
 * @param modelId - Model ID, converted with `getModelPath`.
 * @returns The setup object. Entries of `config.providerOptions` are copied
 * onto it last, so they override any generated top-level key.
 */
export function buildGoogleSessionConfig(
  config: RealtimeModelV4SessionConfig | undefined,
  modelId: string,
): Record<string, unknown> {
  const model = getModelPath(modelId);

  // Response modalities are upper-cased for Google; audio is the default.
  const responseModalities =
    config?.outputModalities == null
      ? ['AUDIO']
      : config.outputModalities.map(modality => modality.toUpperCase());

  const generationConfig: Record<string, unknown> = { responseModalities };

  if (config?.voice != null) {
    const prebuiltVoiceConfig = { voiceName: config.voice };
    generationConfig.speechConfig = {
      voiceConfig: { prebuiltVoiceConfig },
    };
  }

  const setup: Record<string, unknown> = { model, generationConfig };

  if (config?.instructions != null) {
    setup.systemInstruction = { parts: [{ text: config.instructions }] };
  }

  // All tools are declared together in a single tool entry.
  const toolDefinitions = config?.tools ?? [];
  if (toolDefinitions.length > 0) {
    const functionDeclarations = toolDefinitions.map(tool => ({
      name: tool.name,
      description: tool.description,
      parameters: convertJSONSchemaToOpenAPISchema(tool.parameters),
    }));
    setup.tools = [{ functionDeclarations }];
  }

  // Transcription is switched on by the presence of an (empty) object.
  if (config?.inputAudioTranscription != null) {
    setup.inputAudioTranscription = {};
  }
  if (config?.outputAudioTranscription != null) {
    setup.outputAudioTranscription = {};
  }

  if (config?.providerOptions != null) {
    Object.assign(setup, config.providerOptions);
  }

  return setup;
}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
Experimental_RealtimeModelV4 as RealtimeModelV4,
|
|
3
|
+
Experimental_RealtimeModelV4ClientEvent as RealtimeModelV4ClientEvent,
|
|
4
|
+
Experimental_RealtimeModelV4ClientSecretOptions as RealtimeModelV4ClientSecretOptions,
|
|
5
|
+
Experimental_RealtimeModelV4ClientSecretResult as RealtimeModelV4ClientSecretResult,
|
|
6
|
+
Experimental_RealtimeModelV4ServerEvent as RealtimeModelV4ServerEvent,
|
|
7
|
+
Experimental_RealtimeModelV4SessionConfig as RealtimeModelV4SessionConfig,
|
|
8
|
+
} from '@ai-sdk/provider';
|
|
9
|
+
import type { FetchFunction } from '@ai-sdk/provider-utils';
|
|
10
|
+
import {
|
|
11
|
+
GoogleRealtimeEventMapper,
|
|
12
|
+
buildGoogleSessionConfig,
|
|
13
|
+
} from './google-realtime-event-mapper';
|
|
14
|
+
|
|
15
|
+
/**
 * Fully qualified service method name that `getWebSocketURL` appends after
 * `/ws/` when building the realtime WebSocket URL.
 */
const realtimeWebSocketPath = [
  'google.ai.generativelanguage.v1alpha',
  'GenerativeService',
  'BidiGenerateContentConstrained',
].join('.');
|
|
17
|
+
|
|
18
|
+
/**
 * Parses the configured base URL and removes a trailing API version segment
 * (`v1beta` or `v1alpha`), so callers can append their own versioned or
 * WebSocket path.
 *
 * A trailing slash after the version segment (e.g. `.../v1beta/`) is
 * tolerated. URLs that do not end in a version segment are returned with
 * their path untouched.
 *
 * @param baseURL - Absolute base URL of the API.
 * @returns A new `URL` without the trailing version segment.
 * @throws TypeError if `baseURL` is not a valid absolute URL.
 */
function getRealtimeBaseURL(baseURL: string): URL {
  const url = new URL(baseURL);
  const pathSegments = url.pathname.split('/');

  // Ignore empty segments produced by trailing slashes so that
  // `/v1beta/` is recognized the same way as `/v1beta`.
  while (
    pathSegments.length > 1 &&
    pathSegments[pathSegments.length - 1] === ''
  ) {
    pathSegments.pop();
  }

  const version = pathSegments[pathSegments.length - 1];

  if (version === 'v1beta' || version === 'v1alpha') {
    pathSegments.pop();
    // Joining a lone leading '' yields an empty string; fall back to root.
    url.pathname = pathSegments.join('/') || '/';
  }

  return url;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Derives the `v1alpha/auth_tokens` endpoint URL from the configured base
 * URL, regardless of which API version the base URL itself ends in.
 */
function getAuthTokensURL(baseURL: string): string {
  const tokensURL = getRealtimeBaseURL(baseURL);
  // Drop one trailing slash so the joined path has no empty segment.
  const basePath = tokensURL.pathname.replace(/\/$/, '');
  tokensURL.pathname = [basePath, 'v1alpha', 'auth_tokens'].join('/');
  return tokensURL.toString();
}
|
|
36
|
+
|
|
37
|
+
/**
 * Derives the realtime WebSocket URL from the configured base URL:
 * `https:` becomes `wss:`, any other protocol becomes `ws:`, and the path
 * ends in `/ws/<realtimeWebSocketPath>`.
 */
function getWebSocketURL(baseURL: string): string {
  const socketURL = getRealtimeBaseURL(baseURL);

  if (socketURL.protocol === 'https:') {
    socketURL.protocol = 'wss:';
  } else {
    socketURL.protocol = 'ws:';
  }

  // Drop one trailing slash so the joined path has no empty segment.
  const basePath = socketURL.pathname.replace(/\/$/, '');
  socketURL.pathname = [basePath, 'ws', realtimeWebSocketPath].join('/');

  return socketURL.toString();
}
|
|
43
|
+
|
|
44
|
+
/** Settings a `GoogleRealtimeModel` is constructed with. */
export type GoogleRealtimeModelConfig = {
  /** Provider identifier exposed as the model's `provider` property. */
  provider: string;

  /** API base URL from which the auth-token and WebSocket URLs are derived. */
  baseURL: string;

  /**
   * Resolves request headers. Token creation reads the `x-goog-api-key`
   * entry from the result.
   */
  headers: () => Record<string, string | undefined>;

  /** Custom fetch implementation; the global `fetch` is used when omitted. */
  fetch?: FetchFunction;
};
|
|
50
|
+
|
|
51
|
+
/**
 * Realtime model backed by Google's Gemini Live API.
 *
 * Creates short-lived auth tokens, builds the WebSocket connection config
 * and delegates event translation to a per-instance
 * `GoogleRealtimeEventMapper` (which is stateful, so events of one session
 * should go through the same model instance).
 */
export class GoogleRealtimeModel implements RealtimeModelV4 {
  readonly specificationVersion = 'v4' as const;
  readonly provider: string;
  readonly modelId: string;

  private readonly config: GoogleRealtimeModelConfig;
  private readonly mapper = new GoogleRealtimeEventMapper();

  constructor(modelId: string, config: GoogleRealtimeModelConfig) {
    this.modelId = modelId;
    this.provider = config.provider;
    this.config = config;
  }

  /**
   * Requests an auth token from the `auth_tokens` endpoint.
   *
   * @param options - Optional session config to bind to the token and the
   * number of seconds during which the token may open a session
   * (defaults to 60).
   * @returns The token name, the WebSocket URL to connect to, and the token
   * expiry as Unix seconds (`undefined` when the response carries no
   * parseable `expireTime`).
   * @throws Error when no `x-goog-api-key` header is configured, when the
   * request fails, or when the response contains no token name.
   */
  async doCreateClientSecret(
    options: RealtimeModelV4ClientSecretOptions,
  ): Promise<RealtimeModelV4ClientSecretResult> {
    const fetchFn = this.config.fetch ?? fetch;
    const headers = this.config.headers();
    const apiKey = headers['x-goog-api-key'];

    if (!apiKey) {
      throw new Error(
        'Google Generative AI API key is required for realtime token creation.',
      );
    }

    // `newSessionExpireTime` controls how long the token can be used to *open*
    // a session — the window callers actually care about — so map
    // `expiresAfterSeconds` to it (Google otherwise defaults it to ~60s).
    // `expireTime` is the overall token lifetime and must be >=
    // `newSessionExpireTime`, so extend it to leave room for the opened session
    // to run.
    const now = Date.now();
    const openWindowMs = (options.expiresAfterSeconds ?? 60) * 1000;
    const newSessionExpireTime = new Date(now + openWindowMs).toISOString();
    const expireTime = new Date(
      now + openWindowMs + 30 * 60 * 1000,
    ).toISOString();

    const setupPayload = buildGoogleSessionConfig(
      options.sessionConfig,
      this.modelId,
    );

    const response = await fetchFn(
      `${getAuthTokensURL(this.config.baseURL)}?key=${encodeURIComponent(apiKey)}`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          // `uses: 0` means no limit is applied to how many times the token can
          // start a session (per the AuthToken spec). An unset value would
          // default to 1, which breaks WebSocket reconnects within the session.
          uses: 0,
          expireTime,
          newSessionExpireTime,
          bidiGenerateContentSetup: setupPayload,
        }),
      },
    );

    if (!response.ok) {
      const text = await response.text();
      throw new Error(
        `Google realtime auth token request failed: ${response.status} ${text}`,
      );
    }

    const data = (await response.json()) as {
      name?: unknown;
      expireTime?: unknown;
    } | null;

    // A successful status with no usable token would otherwise surface later
    // as an opaque WebSocket authentication failure.
    const tokenName = data?.name;
    if (typeof tokenName !== 'string' || tokenName.length === 0) {
      throw new Error(
        'Google realtime auth token response did not include a token name.',
      );
    }

    // Missing or unparseable timestamps map to `undefined` rather than NaN.
    const expiresAtMs =
      typeof data?.expireTime === 'string'
        ? Date.parse(data.expireTime)
        : Number.NaN;

    return {
      token: tokenName,
      url: getWebSocketURL(this.config.baseURL),
      expiresAt: Number.isNaN(expiresAtMs)
        ? undefined
        : Math.floor(expiresAtMs / 1000),
    };
  }

  /**
   * Builds the WebSocket connection config by attaching the token as the
   * `access_token` query parameter.
   *
   * @param options - Token and WebSocket URL from `doCreateClientSecret`.
   * @returns The URL to connect to; no sub-protocols are set.
   */
  getWebSocketConfig(options: { token: string; url: string }): {
    url: string;
    protocols?: string[];
  } {
    // Extend an existing query string instead of starting a second one.
    const separator = options.url.includes('?') ? '&' : '?';
    return {
      url: `${options.url}${separator}access_token=${encodeURIComponent(options.token)}`,
    };
  }

  /** Maps a raw server message to normalized event(s) via the mapper. */
  parseServerEvent(
    raw: unknown,
  ): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[] {
    return this.mapper.parseServerEvent(raw);
  }

  /** Converts a normalized client event to the wire format for this model. */
  serializeClientEvent(
    event: RealtimeModelV4ClientEvent,
  ): ReturnType<RealtimeModelV4['serializeClientEvent']> {
    return this.mapper.serializeClientEvent(event, this.modelId);
  }

  /** Builds the Google setup payload for this model from a session config. */
  buildSessionConfig(
    config: RealtimeModelV4SessionConfig,
  ): Record<string, unknown> {
    return buildGoogleSessionConfig(config, this.modelId);
  }
}
|