@livekit/agents-plugin-google 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +89 -0
- package/dist/beta/gemini_tts.cjs +239 -0
- package/dist/beta/gemini_tts.cjs.map +1 -0
- package/dist/beta/gemini_tts.d.cts +47 -0
- package/dist/beta/gemini_tts.d.ts +47 -0
- package/dist/beta/gemini_tts.d.ts.map +1 -0
- package/dist/beta/gemini_tts.js +221 -0
- package/dist/beta/gemini_tts.js.map +1 -0
- package/dist/beta/gemini_tts.test.cjs +9 -0
- package/dist/beta/gemini_tts.test.cjs.map +1 -0
- package/dist/beta/gemini_tts.test.d.cts +2 -0
- package/dist/beta/gemini_tts.test.d.ts +2 -0
- package/dist/beta/gemini_tts.test.d.ts.map +1 -0
- package/dist/beta/gemini_tts.test.js +8 -0
- package/dist/beta/gemini_tts.test.js.map +1 -0
- package/dist/beta/index.cjs +42 -0
- package/dist/beta/index.cjs.map +1 -0
- package/dist/beta/index.d.cts +3 -0
- package/dist/beta/index.d.ts +3 -0
- package/dist/beta/index.d.ts.map +1 -0
- package/dist/beta/index.js +7 -0
- package/dist/beta/index.js.map +1 -0
- package/dist/beta/realtime/api_proto.cjs +17 -0
- package/dist/beta/realtime/api_proto.cjs.map +1 -0
- package/dist/beta/realtime/api_proto.d.cts +26 -0
- package/dist/beta/realtime/api_proto.d.ts +26 -0
- package/dist/beta/realtime/api_proto.d.ts.map +1 -0
- package/dist/beta/realtime/api_proto.js +1 -0
- package/dist/beta/realtime/api_proto.js.map +1 -0
- package/dist/beta/realtime/index.cjs +29 -0
- package/dist/beta/realtime/index.cjs.map +1 -0
- package/dist/beta/realtime/index.d.cts +3 -0
- package/dist/beta/realtime/index.d.ts +3 -0
- package/dist/beta/realtime/index.d.ts.map +1 -0
- package/dist/beta/realtime/index.js +5 -0
- package/dist/beta/realtime/index.js.map +1 -0
- package/dist/beta/realtime/realtime_api.cjs +993 -0
- package/dist/beta/realtime/realtime_api.cjs.map +1 -0
- package/dist/beta/realtime/realtime_api.d.cts +267 -0
- package/dist/beta/realtime/realtime_api.d.ts +267 -0
- package/dist/beta/realtime/realtime_api.d.ts.map +1 -0
- package/dist/beta/realtime/realtime_api.js +974 -0
- package/dist/beta/realtime/realtime_api.js.map +1 -0
- package/dist/index.cjs +58 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/llm.cjs +381 -0
- package/dist/llm.cjs.map +1 -0
- package/dist/llm.d.cts +82 -0
- package/dist/llm.d.ts +82 -0
- package/dist/llm.d.ts.map +1 -0
- package/dist/llm.js +362 -0
- package/dist/llm.js.map +1 -0
- package/dist/llm.test.cjs +8 -0
- package/dist/llm.test.cjs.map +1 -0
- package/dist/llm.test.d.cts +2 -0
- package/dist/llm.test.d.ts +2 -0
- package/dist/llm.test.d.ts.map +1 -0
- package/dist/llm.test.js +7 -0
- package/dist/llm.test.js.map +1 -0
- package/dist/models.cjs +17 -0
- package/dist/models.cjs.map +1 -0
- package/dist/models.d.cts +5 -0
- package/dist/models.d.ts +5 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +1 -0
- package/dist/models.js.map +1 -0
- package/dist/tools.cjs +17 -0
- package/dist/tools.cjs.map +1 -0
- package/dist/tools.d.cts +3 -0
- package/dist/tools.d.ts +3 -0
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +1 -0
- package/dist/tools.js.map +1 -0
- package/dist/utils.cjs +137 -0
- package/dist/utils.cjs.map +1 -0
- package/dist/utils.d.cts +14 -0
- package/dist/utils.d.ts +14 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +112 -0
- package/dist/utils.js.map +1 -0
- package/package.json +56 -0
- package/src/beta/gemini_tts.test.ts +11 -0
- package/src/beta/gemini_tts.ts +309 -0
- package/src/beta/index.ts +6 -0
- package/src/beta/realtime/api_proto.ts +41 -0
- package/src/beta/realtime/index.ts +5 -0
- package/src/beta/realtime/realtime_api.ts +1440 -0
- package/src/index.ts +20 -0
- package/src/llm.test.ts +10 -0
- package/src/llm.ts +463 -0
- package/src/models.ts +100 -0
- package/src/tools.ts +6 -0
- package/src/utils.ts +157 -0
|
@@ -0,0 +1,993 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
// Public surface of this module, exported through the CJS interop shim above.
var realtime_api_exports = {};
__export(realtime_api_exports, {
  DEFAULT_IMAGE_ENCODE_OPTIONS: () => DEFAULT_IMAGE_ENCODE_OPTIONS,
  RealtimeModel: () => RealtimeModel,
  RealtimeSession: () => RealtimeSession
});
module.exports = __toCommonJS(realtime_api_exports);
var types = __toESM(require("@google/genai"), 1);
var import_genai = require("@google/genai");
var import_agents = require("@livekit/agents");
var import_mutex = require("@livekit/mutex");
var import_rtc_node = require("@livekit/rtc-node");
var import_async = require("@std/async");
var import_tools = require("../../tools.cjs");
var import_utils = require("../../utils.cjs");
// Gemini Live audio formats: 16 kHz mono input, 24 kHz mono output.
const INPUT_AUDIO_SAMPLE_RATE = 16e3;
const INPUT_AUDIO_CHANNELS = 1;
const OUTPUT_AUDIO_SAMPLE_RATE = 24e3;
const OUTPUT_AUDIO_CHANNELS = 1;
// Defaults applied when encoding video frames sent to the model.
const DEFAULT_IMAGE_ENCODE_OPTIONS = {
  format: "JPEG",
  quality: 75,
  resizeOptions: {
    width: 1024,
    height: 1024,
    strategy: "scale_aspect_fit"
  }
};
/**
 * Compare two Sets for equal membership.
 *
 * @param a - first set
 * @param b - second set
 * @returns true when both sets contain exactly the same elements
 */
function setsEqual(a, b) {
  if (a.size !== b.size) {
    return false;
  }
  for (const element of a) {
    if (!b.has(element)) {
      return false;
    }
  }
  return true;
}
/**
 * Gemini Live realtime model. Resolves credentials and model defaults from
 * options or standard Google environment variables, and holds the shared
 * configuration object used by every session it creates.
 */
class RealtimeModel extends import_agents.llm.RealtimeModel {
  /** @internal */
  _options;
  constructor(options = {}) {
    var _a, _b;
    // `null` disables transcription entirely; `undefined` means "use defaults" ({}).
    const inputAudioTranscription = options.inputAudioTranscription === void 0 ? {} : options.inputAudioTranscription;
    const outputAudioTranscription = options.outputAudioTranscription === void 0 ? {} : options.outputAudioTranscription;
    // Server-side turn detection is on unless automatic activity detection
    // was explicitly disabled in realtimeInputConfig.
    let serverTurnDetection = true;
    if ((_b = (_a = options.realtimeInputConfig) == null ? void 0 : _a.automaticActivityDetection) == null ? void 0 : _b.disabled) {
      serverTurnDetection = false;
    }
    super({
      messageTruncation: false,
      turnDetection: serverTurnDetection,
      userTranscription: inputAudioTranscription !== null,
      autoToolReplyGeneration: true
    });
    // Credentials fall back to the standard Google environment variables.
    const apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
    const project = options.project || process.env.GOOGLE_CLOUD_PROJECT;
    const location = options.location || process.env.GOOGLE_CLOUD_LOCATION || "us-central1";
    const vertexai = options.vertexai ?? false;
    // Vertex AI and the Gemini API expose different default Live models.
    const defaultModel = vertexai ? "gemini-2.0-flash-exp" : "gemini-2.0-flash-live-001";
    this._options = {
      model: options.model || defaultModel,
      apiKey,
      voice: options.voice || "Puck",
      language: options.language,
      responseModalities: options.modalities || [import_genai.Modality.AUDIO],
      vertexai,
      project,
      location,
      candidateCount: options.candidateCount || 1,
      temperature: options.temperature,
      maxOutputTokens: options.maxOutputTokens,
      topP: options.topP,
      topK: options.topK,
      presencePenalty: options.presencePenalty,
      frequencyPenalty: options.frequencyPenalty,
      instructions: options.instructions,
      inputAudioTranscription: inputAudioTranscription || void 0,
      outputAudioTranscription: outputAudioTranscription || void 0,
      imageEncodeOptions: options.imageEncodeOptions || DEFAULT_IMAGE_ENCODE_OPTIONS,
      connOptions: options.connOptions || import_agents.DEFAULT_API_CONNECT_OPTIONS,
      httpOptions: options.httpOptions,
      enableAffectiveDialog: options.enableAffectiveDialog,
      proactivity: options.proactivity,
      realtimeInputConfig: options.realtimeInputConfig,
      contextWindowCompression: options.contextWindowCompression,
      apiVersion: options.apiVersion,
      geminiTools: options.geminiTools
    };
  }
  /**
   * Create a new realtime session
   */
  session() {
    return new RealtimeSession(this);
  }
  /**
   * Update model options
   * Mutates the shared `_options` object; sessions hold a reference to it
   * (see RealtimeSession constructor), so changes are visible to them.
   */
  updateOptions(options) {
    if (options.voice !== void 0) {
      this._options.voice = options.voice;
    }
    if (options.temperature !== void 0) {
      this._options.temperature = options.temperature;
    }
  }
  /**
   * Close the model and cleanup resources
   * No-op: the model itself holds no connection; sessions manage their own.
   */
  async close() {
  }
}
class RealtimeSession extends import_agents.llm.RealtimeSession {
  // Tool map and chat context mirrored locally so they survive reconnects.
  _tools = {};
  _chatCtx = import_agents.llm.ChatContext.empty();
  // Shared reference to the parent RealtimeModel's options.
  options;
  // Gemini function declarations derived from `_tools`.
  geminiDeclarations = [];
  // Outgoing client events; drained by sendTask().
  messageChannel = new import_agents.Queue();
  inputResampler;
  inputResamplerInputRate;
  instructions;
  currentGeneration;
  // Re-chunks input audio into fixed-size frames (see constructor).
  bstream;
  // Google-specific properties
  activeSession;
  // Set to request a reconnect; #mainTask tears down and reconnects when set.
  sessionShouldClose = new import_agents.Event();
  responseCreatedFutures = {};
  pendingGenerationFut;
  // Handle used to resume a Live session after reconnect.
  sessionResumptionHandle;
  // True while a manual user-activity window is open.
  inUserActivity = false;
  // Guards reads/writes of `activeSession` across concurrent tasks.
  sessionLock = new import_mutex.Mutex();
  numRetries = 0;
  hasReceivedAudioInput = false;
  #client;
  #task;
  #logger = (0, import_agents.log)();
  #closed = false;
  /**
   * @param realtimeModel - parent model whose `_options` object is shared
   *   (by reference) with this session.
   */
  constructor(realtimeModel) {
    super(realtimeModel);
    this.options = realtimeModel._options;
    // Chunk input audio into 50ms frames (16000 / 20 samples per frame).
    this.bstream = new import_agents.AudioByteStream(
      INPUT_AUDIO_SAMPLE_RATE,
      INPUT_AUDIO_CHANNELS,
      INPUT_AUDIO_SAMPLE_RATE / 20
    );
    const { apiKey, project, location, vertexai, enableAffectiveDialog, proactivity } = this.options;
    // Affective dialog and proactivity require the v1alpha API surface.
    const apiVersion = !this.options.apiVersion && (enableAffectiveDialog || proactivity) ? "v1alpha" : this.options.apiVersion;
    const httpOptions = {
      ...this.options.httpOptions,
      apiVersion,
      timeout: this.options.connOptions.timeoutMs
    };
    // Vertex AI authenticates via project/location; Gemini API via API key.
    const clientOptions = vertexai ? {
      vertexai: true,
      project,
      location,
      httpOptions
    } : {
      apiKey,
      httpOptions
    };
    this.#client = new import_genai.GoogleGenAI(clientOptions);
    // Connection/reconnect loop that runs for the session's lifetime.
    this.#task = this.#mainTask();
  }
  /** Close the underlying Gemini Live connection, if any (best-effort). */
  async closeActiveSession() {
    const unlock = await this.sessionLock.lock();
    if (this.activeSession) {
      try {
        await this.activeSession.close();
      } catch (error) {
        // Best-effort: log and continue tearing down.
        this.#logger.warn({ error }, "Error closing Gemini session");
      } finally {
        this.activeSession = void 0;
      }
    }
    unlock();
  }
  /**
   * Request a reconnect of the Live session (idempotent). Also swaps in a
   * fresh message queue so stale events are not replayed on the new session.
   */
  markRestartNeeded() {
    if (!this.sessionShouldClose.isSet) {
      this.sessionShouldClose.set();
      // Drop queued client events; they belong to the old session.
      this.messageChannel = new import_agents.Queue();
    }
  }
getToolResultsForRealtime(ctx, vertexai) {
|
|
207
|
+
const toolResponses = [];
|
|
208
|
+
for (const item of ctx.items) {
|
|
209
|
+
if (item.type === "function_call_output") {
|
|
210
|
+
const response = {
|
|
211
|
+
id: item.callId,
|
|
212
|
+
name: item.name,
|
|
213
|
+
response: { output: item.output }
|
|
214
|
+
};
|
|
215
|
+
if (!vertexai) {
|
|
216
|
+
response.id = item.callId;
|
|
217
|
+
}
|
|
218
|
+
toolResponses.push(response);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
return toolResponses.length > 0 ? { functionResponses: toolResponses } : void 0;
|
|
222
|
+
}
|
|
223
|
+
updateOptions(options) {
|
|
224
|
+
let shouldRestart = false;
|
|
225
|
+
if (options.voice !== void 0 && this.options.voice !== options.voice) {
|
|
226
|
+
this.options.voice = options.voice;
|
|
227
|
+
shouldRestart = true;
|
|
228
|
+
}
|
|
229
|
+
if (options.temperature !== void 0 && this.options.temperature !== options.temperature) {
|
|
230
|
+
this.options.temperature = options.temperature;
|
|
231
|
+
shouldRestart = true;
|
|
232
|
+
}
|
|
233
|
+
if (shouldRestart) {
|
|
234
|
+
this.markRestartNeeded();
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
  /**
   * Set the system instructions; restarts the session when they change
   * (instructions are part of the connect config). Note: also restarts when
   * no instructions were previously set.
   */
  async updateInstructions(instructions) {
    if (this.options.instructions === void 0 || this.options.instructions !== instructions) {
      this.options.instructions = instructions;
      this.markRestartNeeded();
    }
  }
  /**
   * Sync the session's chat context with `chatCtx`. Newly created items are
   * forwarded to the live session; removals are unsupported by Gemini Live
   * and only produce a warning.
   */
  async updateChatCtx(chatCtx) {
    const unlock = await this.sessionLock.lock();
    try {
      // Not connected yet: adopt the context locally; #mainTask replays it
      // onto the session once connected.
      if (!this.activeSession) {
        this._chatCtx = chatCtx.copy();
        return;
      }
    } finally {
      unlock();
    }
    const diffOps = import_agents.llm.computeChatCtxDiff(this._chatCtx, chatCtx);
    if (diffOps.toRemove.length > 0) {
      this.#logger.warn("Gemini Live does not support removing messages");
    }
    // Collect only the newly created items for transmission.
    const appendCtx = import_agents.llm.ChatContext.empty();
    for (const [, itemId] of diffOps.toCreate) {
      const item = chatCtx.getById(itemId);
      if (item) {
        appendCtx.items.push(item);
      }
    }
    if (appendCtx.items.length > 0) {
      // Function calls are excluded from content turns; their outputs go
      // out separately as a tool_response event below.
      const [turns] = await appendCtx.copy({
        excludeFunctionCall: true
      }).toProviderFormat("google", false);
      const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
      if (turns.length > 0) {
        this.sendClientEvent({
          type: "content",
          value: {
            turns,
            turnComplete: false
          }
        });
      }
      if (toolResults) {
        this.sendClientEvent({
          type: "tool_response",
          value: toolResults
        });
      }
    }
    this._chatCtx = chatCtx.copy();
  }
  /**
   * Register the callable tools. Restarts the session only when the set of
   * tool names changed (tool declarations are part of the connect config).
   */
  async updateTools(tools) {
    const newDeclarations = (0, import_utils.toFunctionDeclarations)(tools);
    const currentToolNames = new Set(this.geminiDeclarations.map((f) => f.name));
    const newToolNames = new Set(newDeclarations.map((f) => f.name));
    if (!setsEqual(currentToolNames, newToolNames)) {
      this.geminiDeclarations = newDeclarations;
      this._tools = tools;
      this.markRestartNeeded();
    }
  }
  /** Snapshot copy of the locally mirrored chat context. */
  get chatCtx() {
    return this._chatCtx.copy();
  }
get tools() {
|
|
301
|
+
return { ...this._tools };
|
|
302
|
+
}
|
|
303
|
+
  /** True when automatic (server-side) activity detection is disabled. */
  get manualActivityDetection() {
    var _a, _b;
    return ((_b = (_a = this.options.realtimeInputConfig) == null ? void 0 : _a.automaticActivityDetection) == null ? void 0 : _b.disabled) ?? false;
  }
  /**
   * Queue an input audio frame for the model: resample, re-chunk via
   * `bstream`, base64-encode as PCM, and enqueue as realtime_input events.
   */
  pushAudio(frame) {
    this.hasReceivedAudioInput = true;
    for (const f of this.resampleAudio(frame)) {
      for (const nf of this.bstream.write(f.data.buffer)) {
        // NOTE(review): `nf.data.buffer` is the entire underlying
        // ArrayBuffer; assumes the typed-array view covers it exactly
        // (no byteOffset/length slack) — confirm with AudioByteStream.
        const realtimeInput = {
          mediaChunks: [
            {
              mimeType: "audio/pcm",
              data: Buffer.from(nf.data.buffer).toString("base64")
            }
          ]
        };
        this.sendClientEvent({
          type: "realtime_input",
          value: realtimeInput
        });
      }
    }
  }
  /** Video input is a no-op in this implementation. */
  pushVideo(_) {
  }
  /** Enqueue a client event; sendTask() delivers it to the live session. */
  sendClientEvent(event) {
    this.messageChannel.put(event);
  }
async generateReply(instructions) {
|
|
332
|
+
if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
|
|
333
|
+
this.#logger.warn(
|
|
334
|
+
"generateReply called while another generation is pending, cancelling previous."
|
|
335
|
+
);
|
|
336
|
+
this.pendingGenerationFut.reject(new Error("Superseded by new generate_reply call"));
|
|
337
|
+
}
|
|
338
|
+
const fut = new import_agents.Future();
|
|
339
|
+
this.pendingGenerationFut = fut;
|
|
340
|
+
if (this.inUserActivity) {
|
|
341
|
+
this.sendClientEvent({
|
|
342
|
+
type: "realtime_input",
|
|
343
|
+
value: {
|
|
344
|
+
activityEnd: {}
|
|
345
|
+
}
|
|
346
|
+
});
|
|
347
|
+
this.inUserActivity = false;
|
|
348
|
+
}
|
|
349
|
+
const turns = [];
|
|
350
|
+
if (instructions !== void 0) {
|
|
351
|
+
turns.push({
|
|
352
|
+
parts: [{ text: instructions }],
|
|
353
|
+
role: "model"
|
|
354
|
+
});
|
|
355
|
+
}
|
|
356
|
+
turns.push({
|
|
357
|
+
parts: [{ text: "." }],
|
|
358
|
+
role: "user"
|
|
359
|
+
});
|
|
360
|
+
this.sendClientEvent({
|
|
361
|
+
type: "content",
|
|
362
|
+
value: {
|
|
363
|
+
turns,
|
|
364
|
+
turnComplete: true
|
|
365
|
+
}
|
|
366
|
+
});
|
|
367
|
+
const timeoutHandle = setTimeout(() => {
|
|
368
|
+
if (!fut.done) {
|
|
369
|
+
fut.reject(new Error("generateReply timed out waiting for generation_created event."));
|
|
370
|
+
if (this.pendingGenerationFut === fut) {
|
|
371
|
+
this.pendingGenerationFut = void 0;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
}, 5e3);
|
|
375
|
+
fut.await.finally(() => clearTimeout(timeoutHandle));
|
|
376
|
+
return fut.await;
|
|
377
|
+
}
|
|
378
|
+
  /**
   * In manual-activity mode, signal the start of user activity to the
   * model. No-op when server-side VAD is active or a window is already open.
   */
  startUserActivity() {
    if (!this.manualActivityDetection) {
      return;
    }
    if (!this.inUserActivity) {
      this.inUserActivity = true;
      this.sendClientEvent({
        type: "realtime_input",
        value: {
          activityStart: {}
        }
      });
    }
  }
  /**
   * Interrupt the model's current response by starting user activity.
   * Skipped when the session is configured with NO_INTERRUPTION handling.
   */
  async interrupt() {
    var _a;
    if (((_a = this.options.realtimeInputConfig) == null ? void 0 : _a.activityHandling) === import_genai.ActivityHandling.NO_INTERRUPTION) {
      return;
    }
    this.startUserActivity();
  }
  /** Not supported by Gemini Live; logs a warning and returns. */
  async truncate(_options) {
    this.#logger.warn("truncate is not supported by the Google Realtime API.");
  }
  /**
   * Shut the session down: stop the reconnect loop, close the live
   * connection, and fail all outstanding futures.
   */
  async close() {
    // NOTE(review): super.close() is not awaited — confirm the base class
    // close is synchronous or that ordering with the steps below is safe.
    super.close();
    this.#closed = true;
    this.sessionShouldClose.set();
    await this.closeActiveSession();
    if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
      this.pendingGenerationFut.reject(new Error("Session closed"));
    }
    for (const fut of Object.values(this.responseCreatedFutures)) {
      if (!fut.done) {
        fut.reject(new Error("Session closed before response created"));
      }
    }
    this.responseCreatedFutures = {};
    if (this.currentGeneration) {
      // Flush and close the in-flight generation's streams.
      this.markCurrentGenerationDone();
    }
  }
  /**
   * Connection loop: connects to Gemini Live, replays the chat context,
   * runs the outgoing pump, and reconnects with retry/backoff until the
   * session is closed.
   */
  async #mainTask() {
    const maxRetries = this.options.connOptions.maxRetry;
    while (!this.#closed) {
      // Start each iteration from a clean slate.
      await this.closeActiveSession();
      this.sessionShouldClose.clear();
      const config = this.buildConnectConfig();
      try {
        this.#logger.debug("Connecting to Gemini Realtime API...");
        const sessionOpened = new import_agents.Event();
        const session = await this.#client.live.connect({
          model: this.options.model,
          callbacks: {
            onopen: () => sessionOpened.set(),
            onmessage: (message) => {
              this.onReceiveMessage(session, message);
            },
            onerror: (error) => {
              // Transport error: request a reconnect unless one is pending.
              this.#logger.error("Gemini Live session error:", error);
              if (!this.sessionShouldClose.isSet) {
                this.markRestartNeeded();
              }
            },
            onclose: (event) => {
              this.#logger.debug("Gemini Live session closed:", event.code, event.reason);
              this.markCurrentGenerationDone();
            }
          },
          config
        });
        await sessionOpened.wait();
        const unlock = await this.sessionLock.lock();
        try {
          this.activeSession = session;
          // Replay the locally held chat context onto the fresh session
          // (function calls are carried separately as tool responses).
          const [turns] = await this._chatCtx.copy({
            excludeFunctionCall: true
          }).toProviderFormat("google", false);
          if (turns.length > 0) {
            await session.sendClientContent({
              turns,
              turnComplete: false
            });
          }
        } finally {
          unlock();
        }
        const sendTask = import_agents.Task.from((controller) => this.sendTask(session, controller));
        // Resolves when a restart is requested or the task is aborted.
        const restartWaitTask = import_agents.Task.from(({ signal }) => {
          const abortEvent = new import_agents.Event();
          signal.addEventListener("abort", () => abortEvent.set());
          return Promise.race([this.sessionShouldClose.wait(), abortEvent.wait()]);
        });
        await Promise.race([sendTask.result, restartWaitTask.result]);
        if (!restartWaitTask.done && this.#closed) {
          break;
        }
        await (0, import_agents.cancelAndWait)([sendTask, restartWaitTask], 2e3);
      } catch (error) {
        this.#logger.error(`Gemini Realtime API error: ${error}`);
        if (this.#closed) break;
        if (maxRetries === 0) {
          this.emitError(error, false);
          throw new import_agents.APIConnectionError({
            message: "Failed to connect to Gemini Live"
          });
        }
        if (this.numRetries >= maxRetries) {
          this.emitError(error, false);
          throw new import_agents.APIConnectionError({
            message: `Failed to connect to Gemini Live after ${maxRetries} attempts`
          });
        }
        // NOTE(review): `numRetries === 100` reads like a sentinel for an
        // immediate retry; confirm the intended condition (a first-attempt
        // fast path would normally be `=== 0`).
        const retryInterval = this.numRetries === 100 ? 0 : this.options.connOptions.retryIntervalMs;
        this.#logger.warn(
          {
            attempt: this.numRetries,
            maxRetries
          },
          `Gemini Realtime API connection failed, retrying in ${retryInterval}ms`
        );
        await (0, import_async.delay)(retryInterval);
        this.numRetries++;
      } finally {
        await this.closeActiveSession();
      }
    }
  }
  /**
   * Outgoing pump: drains `messageChannel` and forwards each client event
   * to the live session until close, restart, or abort.
   */
  async sendTask(session, controller) {
    try {
      while (!this.#closed && !this.sessionShouldClose.isSet && !controller.signal.aborted) {
        const msg = await this.messageChannel.get();
        if (controller.signal.aborted) break;
        // Drop out if the session was swapped/closed while we waited.
        const unlock = await this.sessionLock.lock();
        try {
          if (this.sessionShouldClose.isSet || this.activeSession !== session) {
            break;
          }
        } finally {
          unlock();
        }
        switch (msg.type) {
          case "content":
            const { turns, turnComplete } = msg.value;
            this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
            await session.sendClientContent({
              turns,
              turnComplete: turnComplete ?? true
            });
            break;
          case "tool_response":
            const { functionResponses } = msg.value;
            if (functionResponses) {
              this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
              await session.sendToolResponse({
                functionResponses
              });
            }
            break;
          case "realtime_input":
            // Media chunks are intentionally not logged (too noisy).
            const { mediaChunks, activityStart, activityEnd } = msg.value;
            if (mediaChunks) {
              for (const mediaChunk of mediaChunks) {
                await session.sendRealtimeInput({ media: mediaChunk });
              }
            }
            if (activityStart) await session.sendRealtimeInput({ activityStart });
            if (activityEnd) await session.sendRealtimeInput({ activityEnd });
            break;
          default:
            this.#logger.warn(`Warning: Received unhandled message type: ${msg.type}`);
            break;
        }
      }
    } catch (e) {
      // A send failure on a live session triggers a reconnect.
      if (!this.sessionShouldClose.isSet) {
        this.#logger.error(`Error in send task: ${e}`);
        this.markRestartNeeded();
      }
    } finally {
      this.#logger.debug(
        {
          closed: this.#closed,
          sessionShouldClose: this.sessionShouldClose.isSet,
          aborted: controller.signal.aborted
        },
        "send task finished."
      );
    }
  }
  /**
   * Handle one server message: start a new generation when needed, then
   * dispatch server content, tool calls, usage metadata, and goAway.
   */
  async onReceiveMessage(session, response) {
    var _a, _b, _c;
    // Audio-bearing messages are not logged in full (too noisy).
    const hasAudioData = (_c = (_b = (_a = response.serverContent) == null ? void 0 : _a.modelTurn) == null ? void 0 : _b.parts) == null ? void 0 : _c.some(
      (part) => {
        var _a2;
        return (_a2 = part.inlineData) == null ? void 0 : _a2.data;
      }
    );
    if (!hasAudioData) {
      this.#logger.debug(`(server) <- ${JSON.stringify(this.loggableServerMessage(response))}`);
    }
    // Ignore messages that arrive for a stale or replaced session.
    const unlock = await this.sessionLock.lock();
    try {
      if (this.sessionShouldClose.isSet || this.activeSession !== session) {
        this.#logger.debug("onReceiveMessage: Session changed or closed, stopping receive.");
        return;
      }
    } finally {
      unlock();
    }
    if ((!this.currentGeneration || this.currentGeneration._done) && (response.serverContent || response.toolCall)) {
      this.startNewGeneration();
    }
    // Track the latest resumable handle for use on reconnect.
    if (response.sessionResumptionUpdate) {
      if (response.sessionResumptionUpdate.resumable && response.sessionResumptionUpdate.newHandle) {
        this.sessionResumptionHandle = response.sessionResumptionUpdate.newHandle;
      }
    }
    try {
      if (response.serverContent) {
        this.handleServerContent(response.serverContent);
      }
      if (response.toolCall) {
        this.handleToolCall(response.toolCall);
      }
      if (response.toolCallCancellation) {
        this.handleToolCallCancellation(response.toolCallCancellation);
      }
      if (response.usageMetadata) {
        this.handleUsageMetadata(response.usageMetadata);
      }
      if (response.goAway) {
        this.handleGoAway(response.goAway);
      }
      // Any successfully handled message resets the retry counter.
      if (this.numRetries > 0) {
        this.numRetries = 0;
      }
    } catch (e) {
      if (!this.sessionShouldClose.isSet) {
        this.#logger.error(`Error in onReceiveMessage: ${e}`);
        this.markRestartNeeded();
      }
    }
  }
/// Truncate large base64/audio payloads for logging to avoid flooding logs
|
|
623
|
+
truncateString(data, maxLength = 30) {
|
|
624
|
+
return data.length > maxLength ? `${data.slice(0, maxLength)}\u2026` : data;
|
|
625
|
+
}
|
|
626
|
+
  /** Shallow copy of `event` with base64 media payloads truncated for logs. */
  loggableClientEvent(event, maxLength = 30) {
    var _a;
    const obj = { ...event };
    if (obj.type === "realtime_input" && ((_a = obj.value) == null ? void 0 : _a.mediaChunks)) {
      // Rebuild value/mediaChunks so the original event is never mutated.
      obj.value = {
        ...obj.value,
        mediaChunks: obj.value.mediaChunks.map(
          (mc) => ({
            ...mc,
            data: typeof mc.data === "string" ? this.truncateString(mc.data, maxLength) : mc.data
          })
        )
      };
    }
    return obj;
  }
  /** Shallow copy of `message` with inline (audio) data truncated for logs. */
  loggableServerMessage(message, maxLength = 30) {
    const obj = { ...message };
    if (obj.serverContent && obj.serverContent.modelTurn && Array.isArray(obj.serverContent.modelTurn.parts)) {
      // Clone each level of the nested path before rewriting parts, so the
      // original message stays untouched (spread is shallow).
      obj.serverContent = { ...obj.serverContent };
      obj.serverContent.modelTurn = { ...obj.serverContent.modelTurn };
      obj.serverContent.modelTurn.parts = obj.serverContent.modelTurn.parts.map((part) => {
        var _a;
        if (((_a = part == null ? void 0 : part.inlineData) == null ? void 0 : _a.data) && typeof part.inlineData.data === "string") {
          return {
            ...part,
            inlineData: {
              ...part.inlineData,
              data: this.truncateString(part.inlineData.data, maxLength)
            }
          };
        }
        return part;
      });
    }
    return obj;
  }
  /**
   * Finalize the in-flight generation: emit the final input transcription,
   * persist transcripts into the chat context, and close all generation
   * streams. Idempotent (no-op when already done or absent).
   */
  markCurrentGenerationDone() {
    if (!this.currentGeneration || this.currentGeneration._done) {
      return;
    }
    this.handleInputSpeechStopped();
    const gen = this.currentGeneration;
    if (gen.inputTranscription) {
      this.emit("input_audio_transcription_completed", {
        itemId: gen.inputId,
        transcript: gen.inputTranscription,
        isFinal: true
      });
      this._chatCtx.addMessage({
        role: "user",
        content: gen.inputTranscription,
        id: gen.inputId
      });
    }
    if (gen.outputText) {
      this._chatCtx.addMessage({
        role: "assistant",
        content: gen.outputText,
        id: gen.responseId
      });
    }
    // Without output transcription configured, write an empty sentinel so
    // downstream consumers still receive a (then closed) text stream.
    if (this.options.outputAudioTranscription === void 0) {
      gen.textChannel.write("");
    }
    gen.textChannel.close();
    gen.audioChannel.close();
    gen.functionChannel.close();
    gen.messageChannel.close();
    gen._done = true;
  }
  /** Emit a structured "error" event to session listeners. */
  emitError(error, recoverable) {
    this.emit("error", {
      timestamp: Date.now(),
      // TODO(brian): add label to realtime model
      label: "google_realtime",
      error,
      recoverable
    });
  }
buildConnectConfig() {
|
|
707
|
+
const opts = this.options;
|
|
708
|
+
const config = {
|
|
709
|
+
responseModalities: opts.responseModalities,
|
|
710
|
+
systemInstruction: opts.instructions ? {
|
|
711
|
+
parts: [{ text: opts.instructions }]
|
|
712
|
+
} : void 0,
|
|
713
|
+
speechConfig: {
|
|
714
|
+
voiceConfig: {
|
|
715
|
+
prebuiltVoiceConfig: {
|
|
716
|
+
voiceName: opts.voice
|
|
717
|
+
}
|
|
718
|
+
},
|
|
719
|
+
languageCode: opts.language
|
|
720
|
+
},
|
|
721
|
+
tools: [
|
|
722
|
+
{
|
|
723
|
+
functionDeclarations: this.geminiDeclarations,
|
|
724
|
+
...this.options.geminiTools
|
|
725
|
+
}
|
|
726
|
+
],
|
|
727
|
+
inputAudioTranscription: opts.inputAudioTranscription,
|
|
728
|
+
outputAudioTranscription: opts.outputAudioTranscription,
|
|
729
|
+
sessionResumption: {
|
|
730
|
+
handle: this.sessionResumptionHandle
|
|
731
|
+
}
|
|
732
|
+
};
|
|
733
|
+
if (opts.temperature !== void 0) {
|
|
734
|
+
config.temperature = opts.temperature;
|
|
735
|
+
}
|
|
736
|
+
if (opts.maxOutputTokens !== void 0) {
|
|
737
|
+
config.maxOutputTokens = opts.maxOutputTokens;
|
|
738
|
+
}
|
|
739
|
+
if (opts.topP !== void 0) {
|
|
740
|
+
config.topP = opts.topP;
|
|
741
|
+
}
|
|
742
|
+
if (opts.topK !== void 0) {
|
|
743
|
+
config.topK = opts.topK;
|
|
744
|
+
}
|
|
745
|
+
if (opts.proactivity !== void 0) {
|
|
746
|
+
config.proactivity = { proactiveAudio: opts.proactivity };
|
|
747
|
+
}
|
|
748
|
+
if (opts.enableAffectiveDialog !== void 0) {
|
|
749
|
+
config.enableAffectiveDialog = opts.enableAffectiveDialog;
|
|
750
|
+
}
|
|
751
|
+
if (opts.realtimeInputConfig !== void 0) {
|
|
752
|
+
config.realtimeInputConfig = opts.realtimeInputConfig;
|
|
753
|
+
}
|
|
754
|
+
if (opts.contextWindowCompression !== void 0) {
|
|
755
|
+
config.contextWindowCompression = opts.contextWindowCompression;
|
|
756
|
+
}
|
|
757
|
+
return config;
|
|
758
|
+
}
|
|
759
|
+
// Starts a fresh generation turn: finalizes any still-active generation,
// allocates new stream channels, publishes the first message on the message
// channel, and emits `generation_created`.
startNewGeneration() {
  if (this.currentGeneration && !this.currentGeneration._done) {
    this.#logger.warn("Starting new generation while another is active. Finalizing previous.");
    this.markCurrentGenerationDone();
  }
  // "GR_" prefixes the response (assistant) id; "GI_" the input (user) id.
  const responseId = (0, import_agents.shortuuid)("GR_");
  this.currentGeneration = {
    messageChannel: import_agents.stream.createStreamChannel(),
    functionChannel: import_agents.stream.createStreamChannel(),
    responseId,
    inputId: (0, import_agents.shortuuid)("GI_"),
    textChannel: import_agents.stream.createStreamChannel(),
    audioChannel: import_agents.stream.createStreamChannel(),
    inputTranscription: "",
    outputText: "",
    _createdTimestamp: Date.now(),
    _done: false
  };
  // Sessions without an AUDIO response modality never produce audio frames,
  // so the audio channel is closed up front.
  if (!this.options.responseModalities.includes(import_genai.Modality.AUDIO)) {
    this.currentGeneration.audioChannel.close();
  }
  this.currentGeneration.messageChannel.write({
    messageId: responseId,
    textStream: this.currentGeneration.textChannel.stream(),
    audioStream: this.currentGeneration.audioChannel.stream()
  });
  const generationEvent = {
    messageStream: this.currentGeneration.messageChannel.stream(),
    functionStream: this.currentGeneration.functionChannel.stream(),
    userInitiated: false
  };
  // A pending, unresolved future presumably means a caller explicitly
  // requested a reply — resolve it with this generation and mark it
  // user-initiated. Otherwise the model started on its own, which is
  // treated as user speech starting. (NOTE(review): confirm against the
  // generateReply implementation, which is outside this chunk.)
  if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
    generationEvent.userInitiated = true;
    this.pendingGenerationFut.resolve(generationEvent);
    this.pendingGenerationFut = void 0;
  } else {
    this.handleInputSpeechStarted();
  }
  this.emit("generation_created", generationEvent);
}
handleInputSpeechStarted() {
|
|
800
|
+
this.emit("input_speech_started", {});
|
|
801
|
+
}
|
|
802
|
+
handleInputSpeechStopped() {
|
|
803
|
+
this.emit("input_speech_stopped", {
|
|
804
|
+
userTranscriptionEnabled: false
|
|
805
|
+
});
|
|
806
|
+
}
|
|
807
|
+
handleServerContent(serverContent) {
|
|
808
|
+
if (!this.currentGeneration) {
|
|
809
|
+
this.#logger.warn("received server content but no active generation.");
|
|
810
|
+
return;
|
|
811
|
+
}
|
|
812
|
+
const gen = this.currentGeneration;
|
|
813
|
+
if (serverContent.modelTurn) {
|
|
814
|
+
const turn = serverContent.modelTurn;
|
|
815
|
+
for (const part of turn.parts || []) {
|
|
816
|
+
if (part.text) {
|
|
817
|
+
gen.outputText += part.text;
|
|
818
|
+
gen.textChannel.write(part.text);
|
|
819
|
+
}
|
|
820
|
+
if (part.inlineData) {
|
|
821
|
+
if (!gen._firstTokenTimestamp) {
|
|
822
|
+
gen._firstTokenTimestamp = Date.now();
|
|
823
|
+
}
|
|
824
|
+
try {
|
|
825
|
+
if (!part.inlineData.data) {
|
|
826
|
+
throw new Error("frameData is not bytes");
|
|
827
|
+
}
|
|
828
|
+
const binaryString = atob(part.inlineData.data);
|
|
829
|
+
const len = binaryString.length;
|
|
830
|
+
const bytes = new Uint8Array(len);
|
|
831
|
+
for (let i = 0; i < len; i++) {
|
|
832
|
+
bytes[i] = binaryString.charCodeAt(i);
|
|
833
|
+
}
|
|
834
|
+
const int16Array = new Int16Array(bytes.buffer);
|
|
835
|
+
const audioFrame = new import_rtc_node.AudioFrame(
|
|
836
|
+
int16Array,
|
|
837
|
+
OUTPUT_AUDIO_SAMPLE_RATE,
|
|
838
|
+
OUTPUT_AUDIO_CHANNELS,
|
|
839
|
+
int16Array.length / OUTPUT_AUDIO_CHANNELS
|
|
840
|
+
);
|
|
841
|
+
gen.audioChannel.write(audioFrame);
|
|
842
|
+
} catch (error) {
|
|
843
|
+
this.#logger.error("Error processing audio data:", error);
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
if (serverContent.inputTranscription && serverContent.inputTranscription.text) {
|
|
849
|
+
let text = serverContent.inputTranscription.text;
|
|
850
|
+
if (gen.inputTranscription === "") {
|
|
851
|
+
text = text.trimStart();
|
|
852
|
+
}
|
|
853
|
+
gen.inputTranscription += text;
|
|
854
|
+
this.emit("input_audio_transcription_completed", {
|
|
855
|
+
itemId: gen.inputId,
|
|
856
|
+
transcript: gen.inputTranscription,
|
|
857
|
+
isFinal: false
|
|
858
|
+
});
|
|
859
|
+
}
|
|
860
|
+
if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
|
|
861
|
+
const text = serverContent.outputTranscription.text;
|
|
862
|
+
gen.outputText += text;
|
|
863
|
+
gen.textChannel.write(text);
|
|
864
|
+
}
|
|
865
|
+
if (serverContent.generationComplete || serverContent.turnComplete) {
|
|
866
|
+
gen._completedTimestamp = Date.now();
|
|
867
|
+
}
|
|
868
|
+
if (serverContent.interrupted) {
|
|
869
|
+
this.handleInputSpeechStarted();
|
|
870
|
+
}
|
|
871
|
+
if (serverContent.turnComplete) {
|
|
872
|
+
this.markCurrentGenerationDone();
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
handleToolCall(toolCall) {
|
|
876
|
+
if (!this.currentGeneration) {
|
|
877
|
+
this.#logger.warn("received tool call but no active generation.");
|
|
878
|
+
return;
|
|
879
|
+
}
|
|
880
|
+
const gen = this.currentGeneration;
|
|
881
|
+
for (const fc of toolCall.functionCalls || []) {
|
|
882
|
+
gen.functionChannel.write({
|
|
883
|
+
callId: fc.id || (0, import_agents.shortuuid)("fnc-call-"),
|
|
884
|
+
name: fc.name,
|
|
885
|
+
args: fc.args ? JSON.stringify(fc.args) : ""
|
|
886
|
+
});
|
|
887
|
+
}
|
|
888
|
+
this.markCurrentGenerationDone();
|
|
889
|
+
}
|
|
890
|
+
handleToolCallCancellation(cancellation) {
|
|
891
|
+
this.#logger.warn(
|
|
892
|
+
{
|
|
893
|
+
functionCallIds: cancellation.ids
|
|
894
|
+
},
|
|
895
|
+
"server cancelled tool calls"
|
|
896
|
+
);
|
|
897
|
+
}
|
|
898
|
+
handleUsageMetadata(usage) {
|
|
899
|
+
if (!this.currentGeneration) {
|
|
900
|
+
this.#logger.debug("Received usage metadata but no active generation");
|
|
901
|
+
return;
|
|
902
|
+
}
|
|
903
|
+
const gen = this.currentGeneration;
|
|
904
|
+
const createdTimestamp = gen._createdTimestamp;
|
|
905
|
+
const firstTokenTimestamp = gen._firstTokenTimestamp;
|
|
906
|
+
const completedTimestamp = gen._completedTimestamp || Date.now();
|
|
907
|
+
const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
|
|
908
|
+
const duration = (completedTimestamp - createdTimestamp) / 1e3;
|
|
909
|
+
const inputTokens = usage.promptTokenCount || 0;
|
|
910
|
+
const outputTokens = usage.responseTokenCount || 0;
|
|
911
|
+
const totalTokens = usage.totalTokenCount || 0;
|
|
912
|
+
const realtimeMetrics = {
|
|
913
|
+
type: "realtime_model_metrics",
|
|
914
|
+
timestamp: createdTimestamp / 1e3,
|
|
915
|
+
requestId: gen.responseId,
|
|
916
|
+
ttft,
|
|
917
|
+
duration,
|
|
918
|
+
cancelled: gen._done && !gen._completedTimestamp,
|
|
919
|
+
label: "google_realtime",
|
|
920
|
+
inputTokens,
|
|
921
|
+
outputTokens,
|
|
922
|
+
totalTokens,
|
|
923
|
+
tokensPerSecond: duration > 0 ? outputTokens / duration : 0,
|
|
924
|
+
inputTokenDetails: {
|
|
925
|
+
...this.tokenDetailsMap(usage.promptTokensDetails),
|
|
926
|
+
cachedTokens: (usage.cacheTokensDetails || []).reduce(
|
|
927
|
+
(sum, detail) => sum + (detail.tokenCount || 0),
|
|
928
|
+
0
|
|
929
|
+
),
|
|
930
|
+
cachedTokensDetails: this.tokenDetailsMap(usage.cacheTokensDetails)
|
|
931
|
+
},
|
|
932
|
+
outputTokenDetails: this.tokenDetailsMap(usage.responseTokensDetails)
|
|
933
|
+
};
|
|
934
|
+
this.emit("metrics_collected", realtimeMetrics);
|
|
935
|
+
}
|
|
936
|
+
tokenDetailsMap(tokenDetails) {
|
|
937
|
+
const tokenDetailsMap = { audioTokens: 0, textTokens: 0, imageTokens: 0 };
|
|
938
|
+
if (!tokenDetails) {
|
|
939
|
+
return tokenDetailsMap;
|
|
940
|
+
}
|
|
941
|
+
for (const tokenDetail of tokenDetails) {
|
|
942
|
+
if (!tokenDetail.tokenCount) {
|
|
943
|
+
continue;
|
|
944
|
+
}
|
|
945
|
+
if (tokenDetail.modality === types.MediaModality.AUDIO) {
|
|
946
|
+
tokenDetailsMap.audioTokens += tokenDetail.tokenCount;
|
|
947
|
+
} else if (tokenDetail.modality === types.MediaModality.TEXT) {
|
|
948
|
+
tokenDetailsMap.textTokens += tokenDetail.tokenCount;
|
|
949
|
+
} else if (tokenDetail.modality === types.MediaModality.IMAGE) {
|
|
950
|
+
tokenDetailsMap.imageTokens += tokenDetail.tokenCount;
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
return tokenDetailsMap;
|
|
954
|
+
}
|
|
955
|
+
handleGoAway(goAway) {
|
|
956
|
+
this.#logger.warn({ timeLeft: goAway.timeLeft }, "Gemini server indicates disconnection soon.");
|
|
957
|
+
this.sessionShouldClose.set();
|
|
958
|
+
}
|
|
959
|
+
// Intentionally a no-op: this session exposes commitAudio for interface
// parity, but performs no client-side commit step. (NOTE(review): presumably
// the Live API handles commit boundaries server-side — confirm.)
async commitAudio() {
}
// Intentionally a no-op: this session exposes clearAudio for interface
// parity, but has no client-side audio buffer to clear. (NOTE(review):
// confirm against the RealtimeSession base interface.)
async clearAudio() {
}
*resampleAudio(frame) {
|
|
964
|
+
if (this.inputResampler) {
|
|
965
|
+
if (frame.sampleRate !== this.inputResamplerInputRate) {
|
|
966
|
+
this.inputResampler = void 0;
|
|
967
|
+
this.inputResamplerInputRate = void 0;
|
|
968
|
+
}
|
|
969
|
+
}
|
|
970
|
+
if (this.inputResampler === void 0 && (frame.sampleRate !== INPUT_AUDIO_SAMPLE_RATE || frame.channels !== INPUT_AUDIO_CHANNELS)) {
|
|
971
|
+
this.inputResampler = new import_rtc_node.AudioResampler(
|
|
972
|
+
frame.sampleRate,
|
|
973
|
+
INPUT_AUDIO_SAMPLE_RATE,
|
|
974
|
+
INPUT_AUDIO_CHANNELS
|
|
975
|
+
);
|
|
976
|
+
this.inputResamplerInputRate = frame.sampleRate;
|
|
977
|
+
}
|
|
978
|
+
if (this.inputResampler) {
|
|
979
|
+
for (const resampledFrame of this.inputResampler.push(frame)) {
|
|
980
|
+
yield resampledFrame;
|
|
981
|
+
}
|
|
982
|
+
} else {
|
|
983
|
+
yield frame;
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
988
|
+
0 && (module.exports = {
|
|
989
|
+
DEFAULT_IMAGE_ENCODE_OPTIONS,
|
|
990
|
+
RealtimeModel,
|
|
991
|
+
RealtimeSession
|
|
992
|
+
});
|
|
993
|
+
//# sourceMappingURL=realtime_api.cjs.map
|