@livekit/agents-plugin-google 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99):
  1. package/LICENSE +201 -0
  2. package/README.md +89 -0
  3. package/dist/beta/gemini_tts.cjs +239 -0
  4. package/dist/beta/gemini_tts.cjs.map +1 -0
  5. package/dist/beta/gemini_tts.d.cts +47 -0
  6. package/dist/beta/gemini_tts.d.ts +47 -0
  7. package/dist/beta/gemini_tts.d.ts.map +1 -0
  8. package/dist/beta/gemini_tts.js +221 -0
  9. package/dist/beta/gemini_tts.js.map +1 -0
  10. package/dist/beta/gemini_tts.test.cjs +9 -0
  11. package/dist/beta/gemini_tts.test.cjs.map +1 -0
  12. package/dist/beta/gemini_tts.test.d.cts +2 -0
  13. package/dist/beta/gemini_tts.test.d.ts +2 -0
  14. package/dist/beta/gemini_tts.test.d.ts.map +1 -0
  15. package/dist/beta/gemini_tts.test.js +8 -0
  16. package/dist/beta/gemini_tts.test.js.map +1 -0
  17. package/dist/beta/index.cjs +42 -0
  18. package/dist/beta/index.cjs.map +1 -0
  19. package/dist/beta/index.d.cts +3 -0
  20. package/dist/beta/index.d.ts +3 -0
  21. package/dist/beta/index.d.ts.map +1 -0
  22. package/dist/beta/index.js +7 -0
  23. package/dist/beta/index.js.map +1 -0
  24. package/dist/beta/realtime/api_proto.cjs +17 -0
  25. package/dist/beta/realtime/api_proto.cjs.map +1 -0
  26. package/dist/beta/realtime/api_proto.d.cts +26 -0
  27. package/dist/beta/realtime/api_proto.d.ts +26 -0
  28. package/dist/beta/realtime/api_proto.d.ts.map +1 -0
  29. package/dist/beta/realtime/api_proto.js +1 -0
  30. package/dist/beta/realtime/api_proto.js.map +1 -0
  31. package/dist/beta/realtime/index.cjs +29 -0
  32. package/dist/beta/realtime/index.cjs.map +1 -0
  33. package/dist/beta/realtime/index.d.cts +3 -0
  34. package/dist/beta/realtime/index.d.ts +3 -0
  35. package/dist/beta/realtime/index.d.ts.map +1 -0
  36. package/dist/beta/realtime/index.js +5 -0
  37. package/dist/beta/realtime/index.js.map +1 -0
  38. package/dist/beta/realtime/realtime_api.cjs +993 -0
  39. package/dist/beta/realtime/realtime_api.cjs.map +1 -0
  40. package/dist/beta/realtime/realtime_api.d.cts +267 -0
  41. package/dist/beta/realtime/realtime_api.d.ts +267 -0
  42. package/dist/beta/realtime/realtime_api.d.ts.map +1 -0
  43. package/dist/beta/realtime/realtime_api.js +974 -0
  44. package/dist/beta/realtime/realtime_api.js.map +1 -0
  45. package/dist/index.cjs +58 -0
  46. package/dist/index.cjs.map +1 -0
  47. package/dist/index.d.cts +4 -0
  48. package/dist/index.d.ts +4 -0
  49. package/dist/index.d.ts.map +1 -0
  50. package/dist/index.js +20 -0
  51. package/dist/index.js.map +1 -0
  52. package/dist/llm.cjs +381 -0
  53. package/dist/llm.cjs.map +1 -0
  54. package/dist/llm.d.cts +82 -0
  55. package/dist/llm.d.ts +82 -0
  56. package/dist/llm.d.ts.map +1 -0
  57. package/dist/llm.js +362 -0
  58. package/dist/llm.js.map +1 -0
  59. package/dist/llm.test.cjs +8 -0
  60. package/dist/llm.test.cjs.map +1 -0
  61. package/dist/llm.test.d.cts +2 -0
  62. package/dist/llm.test.d.ts +2 -0
  63. package/dist/llm.test.d.ts.map +1 -0
  64. package/dist/llm.test.js +7 -0
  65. package/dist/llm.test.js.map +1 -0
  66. package/dist/models.cjs +17 -0
  67. package/dist/models.cjs.map +1 -0
  68. package/dist/models.d.cts +5 -0
  69. package/dist/models.d.ts +5 -0
  70. package/dist/models.d.ts.map +1 -0
  71. package/dist/models.js +1 -0
  72. package/dist/models.js.map +1 -0
  73. package/dist/tools.cjs +17 -0
  74. package/dist/tools.cjs.map +1 -0
  75. package/dist/tools.d.cts +3 -0
  76. package/dist/tools.d.ts +3 -0
  77. package/dist/tools.d.ts.map +1 -0
  78. package/dist/tools.js +1 -0
  79. package/dist/tools.js.map +1 -0
  80. package/dist/utils.cjs +137 -0
  81. package/dist/utils.cjs.map +1 -0
  82. package/dist/utils.d.cts +14 -0
  83. package/dist/utils.d.ts +14 -0
  84. package/dist/utils.d.ts.map +1 -0
  85. package/dist/utils.js +112 -0
  86. package/dist/utils.js.map +1 -0
  87. package/package.json +56 -0
  88. package/src/beta/gemini_tts.test.ts +11 -0
  89. package/src/beta/gemini_tts.ts +309 -0
  90. package/src/beta/index.ts +6 -0
  91. package/src/beta/realtime/api_proto.ts +41 -0
  92. package/src/beta/realtime/index.ts +5 -0
  93. package/src/beta/realtime/realtime_api.ts +1440 -0
  94. package/src/index.ts +20 -0
  95. package/src/llm.test.ts +10 -0
  96. package/src/llm.ts +463 -0
  97. package/src/models.ts +100 -0
  98. package/src/tools.ts +6 -0
  99. package/src/utils.ts +157 -0
@@ -0,0 +1,993 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+ var realtime_api_exports = {};
30
+ __export(realtime_api_exports, {
31
+ DEFAULT_IMAGE_ENCODE_OPTIONS: () => DEFAULT_IMAGE_ENCODE_OPTIONS,
32
+ RealtimeModel: () => RealtimeModel,
33
+ RealtimeSession: () => RealtimeSession
34
+ });
35
+ module.exports = __toCommonJS(realtime_api_exports);
36
+ var types = __toESM(require("@google/genai"), 1);
37
+ var import_genai = require("@google/genai");
38
+ var import_agents = require("@livekit/agents");
39
+ var import_mutex = require("@livekit/mutex");
40
+ var import_rtc_node = require("@livekit/rtc-node");
41
+ var import_async = require("@std/async");
42
+ var import_tools = require("../../tools.cjs");
43
+ var import_utils = require("../../utils.cjs");
44
+ const INPUT_AUDIO_SAMPLE_RATE = 16e3;
45
+ const INPUT_AUDIO_CHANNELS = 1;
46
+ const OUTPUT_AUDIO_SAMPLE_RATE = 24e3;
47
+ const OUTPUT_AUDIO_CHANNELS = 1;
48
+ const DEFAULT_IMAGE_ENCODE_OPTIONS = {
49
+ format: "JPEG",
50
+ quality: 75,
51
+ resizeOptions: {
52
+ width: 1024,
53
+ height: 1024,
54
+ strategy: "scale_aspect_fit"
55
+ }
56
+ };
57
+ function setsEqual(a, b) {
58
+ return a.size === b.size && [...a].every((x) => b.has(x));
59
+ }
60
+ class RealtimeModel extends import_agents.llm.RealtimeModel {
61
+ /** @internal */
62
+ _options;
63
+ constructor(options = {}) {
64
+ var _a, _b;
65
+ const inputAudioTranscription = options.inputAudioTranscription === void 0 ? {} : options.inputAudioTranscription;
66
+ const outputAudioTranscription = options.outputAudioTranscription === void 0 ? {} : options.outputAudioTranscription;
67
+ let serverTurnDetection = true;
68
+ if ((_b = (_a = options.realtimeInputConfig) == null ? void 0 : _a.automaticActivityDetection) == null ? void 0 : _b.disabled) {
69
+ serverTurnDetection = false;
70
+ }
71
+ super({
72
+ messageTruncation: false,
73
+ turnDetection: serverTurnDetection,
74
+ userTranscription: inputAudioTranscription !== null,
75
+ autoToolReplyGeneration: true
76
+ });
77
+ const apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
78
+ const project = options.project || process.env.GOOGLE_CLOUD_PROJECT;
79
+ const location = options.location || process.env.GOOGLE_CLOUD_LOCATION || "us-central1";
80
+ const vertexai = options.vertexai ?? false;
81
+ const defaultModel = vertexai ? "gemini-2.0-flash-exp" : "gemini-2.0-flash-live-001";
82
+ this._options = {
83
+ model: options.model || defaultModel,
84
+ apiKey,
85
+ voice: options.voice || "Puck",
86
+ language: options.language,
87
+ responseModalities: options.modalities || [import_genai.Modality.AUDIO],
88
+ vertexai,
89
+ project,
90
+ location,
91
+ candidateCount: options.candidateCount || 1,
92
+ temperature: options.temperature,
93
+ maxOutputTokens: options.maxOutputTokens,
94
+ topP: options.topP,
95
+ topK: options.topK,
96
+ presencePenalty: options.presencePenalty,
97
+ frequencyPenalty: options.frequencyPenalty,
98
+ instructions: options.instructions,
99
+ inputAudioTranscription: inputAudioTranscription || void 0,
100
+ outputAudioTranscription: outputAudioTranscription || void 0,
101
+ imageEncodeOptions: options.imageEncodeOptions || DEFAULT_IMAGE_ENCODE_OPTIONS,
102
+ connOptions: options.connOptions || import_agents.DEFAULT_API_CONNECT_OPTIONS,
103
+ httpOptions: options.httpOptions,
104
+ enableAffectiveDialog: options.enableAffectiveDialog,
105
+ proactivity: options.proactivity,
106
+ realtimeInputConfig: options.realtimeInputConfig,
107
+ contextWindowCompression: options.contextWindowCompression,
108
+ apiVersion: options.apiVersion,
109
+ geminiTools: options.geminiTools
110
+ };
111
+ }
112
+ /**
113
+ * Create a new realtime session
114
+ */
115
+ session() {
116
+ return new RealtimeSession(this);
117
+ }
118
+ /**
119
+ * Update model options
120
+ */
121
+ updateOptions(options) {
122
+ if (options.voice !== void 0) {
123
+ this._options.voice = options.voice;
124
+ }
125
+ if (options.temperature !== void 0) {
126
+ this._options.temperature = options.temperature;
127
+ }
128
+ }
129
+ /**
130
+ * Close the model and cleanup resources
131
+ */
132
+ async close() {
133
+ }
134
+ }
135
+ class RealtimeSession extends import_agents.llm.RealtimeSession {
136
+ _tools = {};
137
+ _chatCtx = import_agents.llm.ChatContext.empty();
138
+ options;
139
+ geminiDeclarations = [];
140
+ messageChannel = new import_agents.Queue();
141
+ inputResampler;
142
+ inputResamplerInputRate;
143
+ instructions;
144
+ currentGeneration;
145
+ bstream;
146
+ // Google-specific properties
147
+ activeSession;
148
+ sessionShouldClose = new import_agents.Event();
149
+ responseCreatedFutures = {};
150
+ pendingGenerationFut;
151
+ sessionResumptionHandle;
152
+ inUserActivity = false;
153
+ sessionLock = new import_mutex.Mutex();
154
+ numRetries = 0;
155
+ hasReceivedAudioInput = false;
156
+ #client;
157
+ #task;
158
+ #logger = (0, import_agents.log)();
159
+ #closed = false;
160
+ constructor(realtimeModel) {
161
+ super(realtimeModel);
162
+ this.options = realtimeModel._options;
163
+ this.bstream = new import_agents.AudioByteStream(
164
+ INPUT_AUDIO_SAMPLE_RATE,
165
+ INPUT_AUDIO_CHANNELS,
166
+ INPUT_AUDIO_SAMPLE_RATE / 20
167
+ );
168
+ const { apiKey, project, location, vertexai, enableAffectiveDialog, proactivity } = this.options;
169
+ const apiVersion = !this.options.apiVersion && (enableAffectiveDialog || proactivity) ? "v1alpha" : this.options.apiVersion;
170
+ const httpOptions = {
171
+ ...this.options.httpOptions,
172
+ apiVersion,
173
+ timeout: this.options.connOptions.timeoutMs
174
+ };
175
+ const clientOptions = vertexai ? {
176
+ vertexai: true,
177
+ project,
178
+ location,
179
+ httpOptions
180
+ } : {
181
+ apiKey,
182
+ httpOptions
183
+ };
184
+ this.#client = new import_genai.GoogleGenAI(clientOptions);
185
+ this.#task = this.#mainTask();
186
+ }
187
+ async closeActiveSession() {
188
+ const unlock = await this.sessionLock.lock();
189
+ if (this.activeSession) {
190
+ try {
191
+ await this.activeSession.close();
192
+ } catch (error) {
193
+ this.#logger.warn({ error }, "Error closing Gemini session");
194
+ } finally {
195
+ this.activeSession = void 0;
196
+ }
197
+ }
198
+ unlock();
199
+ }
200
+ markRestartNeeded() {
201
+ if (!this.sessionShouldClose.isSet) {
202
+ this.sessionShouldClose.set();
203
+ this.messageChannel = new import_agents.Queue();
204
+ }
205
+ }
206
+ getToolResultsForRealtime(ctx, vertexai) {
207
+ const toolResponses = [];
208
+ for (const item of ctx.items) {
209
+ if (item.type === "function_call_output") {
210
+ const response = {
211
+ id: item.callId,
212
+ name: item.name,
213
+ response: { output: item.output }
214
+ };
215
+ if (!vertexai) {
216
+ response.id = item.callId;
217
+ }
218
+ toolResponses.push(response);
219
+ }
220
+ }
221
+ return toolResponses.length > 0 ? { functionResponses: toolResponses } : void 0;
222
+ }
223
+ updateOptions(options) {
224
+ let shouldRestart = false;
225
+ if (options.voice !== void 0 && this.options.voice !== options.voice) {
226
+ this.options.voice = options.voice;
227
+ shouldRestart = true;
228
+ }
229
+ if (options.temperature !== void 0 && this.options.temperature !== options.temperature) {
230
+ this.options.temperature = options.temperature;
231
+ shouldRestart = true;
232
+ }
233
+ if (shouldRestart) {
234
+ this.markRestartNeeded();
235
+ }
236
+ }
237
+ async updateInstructions(instructions) {
238
+ if (this.options.instructions === void 0 || this.options.instructions !== instructions) {
239
+ this.options.instructions = instructions;
240
+ this.markRestartNeeded();
241
+ }
242
+ }
243
+ async updateChatCtx(chatCtx) {
244
+ const unlock = await this.sessionLock.lock();
245
+ try {
246
+ if (!this.activeSession) {
247
+ this._chatCtx = chatCtx.copy();
248
+ return;
249
+ }
250
+ } finally {
251
+ unlock();
252
+ }
253
+ const diffOps = import_agents.llm.computeChatCtxDiff(this._chatCtx, chatCtx);
254
+ if (diffOps.toRemove.length > 0) {
255
+ this.#logger.warn("Gemini Live does not support removing messages");
256
+ }
257
+ const appendCtx = import_agents.llm.ChatContext.empty();
258
+ for (const [, itemId] of diffOps.toCreate) {
259
+ const item = chatCtx.getById(itemId);
260
+ if (item) {
261
+ appendCtx.items.push(item);
262
+ }
263
+ }
264
+ if (appendCtx.items.length > 0) {
265
+ const [turns] = await appendCtx.copy({
266
+ excludeFunctionCall: true
267
+ }).toProviderFormat("google", false);
268
+ const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
269
+ if (turns.length > 0) {
270
+ this.sendClientEvent({
271
+ type: "content",
272
+ value: {
273
+ turns,
274
+ turnComplete: false
275
+ }
276
+ });
277
+ }
278
+ if (toolResults) {
279
+ this.sendClientEvent({
280
+ type: "tool_response",
281
+ value: toolResults
282
+ });
283
+ }
284
+ }
285
+ this._chatCtx = chatCtx.copy();
286
+ }
287
+ async updateTools(tools) {
288
+ const newDeclarations = (0, import_utils.toFunctionDeclarations)(tools);
289
+ const currentToolNames = new Set(this.geminiDeclarations.map((f) => f.name));
290
+ const newToolNames = new Set(newDeclarations.map((f) => f.name));
291
+ if (!setsEqual(currentToolNames, newToolNames)) {
292
+ this.geminiDeclarations = newDeclarations;
293
+ this._tools = tools;
294
+ this.markRestartNeeded();
295
+ }
296
+ }
297
+ get chatCtx() {
298
+ return this._chatCtx.copy();
299
+ }
300
+ get tools() {
301
+ return { ...this._tools };
302
+ }
303
+ get manualActivityDetection() {
304
+ var _a, _b;
305
+ return ((_b = (_a = this.options.realtimeInputConfig) == null ? void 0 : _a.automaticActivityDetection) == null ? void 0 : _b.disabled) ?? false;
306
+ }
307
+ pushAudio(frame) {
308
+ this.hasReceivedAudioInput = true;
309
+ for (const f of this.resampleAudio(frame)) {
310
+ for (const nf of this.bstream.write(f.data.buffer)) {
311
+ const realtimeInput = {
312
+ mediaChunks: [
313
+ {
314
+ mimeType: "audio/pcm",
315
+ data: Buffer.from(nf.data.buffer).toString("base64")
316
+ }
317
+ ]
318
+ };
319
+ this.sendClientEvent({
320
+ type: "realtime_input",
321
+ value: realtimeInput
322
+ });
323
+ }
324
+ }
325
+ }
326
+ pushVideo(_) {
327
+ }
328
+ sendClientEvent(event) {
329
+ this.messageChannel.put(event);
330
+ }
331
+ async generateReply(instructions) {
332
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
333
+ this.#logger.warn(
334
+ "generateReply called while another generation is pending, cancelling previous."
335
+ );
336
+ this.pendingGenerationFut.reject(new Error("Superseded by new generate_reply call"));
337
+ }
338
+ const fut = new import_agents.Future();
339
+ this.pendingGenerationFut = fut;
340
+ if (this.inUserActivity) {
341
+ this.sendClientEvent({
342
+ type: "realtime_input",
343
+ value: {
344
+ activityEnd: {}
345
+ }
346
+ });
347
+ this.inUserActivity = false;
348
+ }
349
+ const turns = [];
350
+ if (instructions !== void 0) {
351
+ turns.push({
352
+ parts: [{ text: instructions }],
353
+ role: "model"
354
+ });
355
+ }
356
+ turns.push({
357
+ parts: [{ text: "." }],
358
+ role: "user"
359
+ });
360
+ this.sendClientEvent({
361
+ type: "content",
362
+ value: {
363
+ turns,
364
+ turnComplete: true
365
+ }
366
+ });
367
+ const timeoutHandle = setTimeout(() => {
368
+ if (!fut.done) {
369
+ fut.reject(new Error("generateReply timed out waiting for generation_created event."));
370
+ if (this.pendingGenerationFut === fut) {
371
+ this.pendingGenerationFut = void 0;
372
+ }
373
+ }
374
+ }, 5e3);
375
+ fut.await.finally(() => clearTimeout(timeoutHandle));
376
+ return fut.await;
377
+ }
378
+ startUserActivity() {
379
+ if (!this.manualActivityDetection) {
380
+ return;
381
+ }
382
+ if (!this.inUserActivity) {
383
+ this.inUserActivity = true;
384
+ this.sendClientEvent({
385
+ type: "realtime_input",
386
+ value: {
387
+ activityStart: {}
388
+ }
389
+ });
390
+ }
391
+ }
392
+ async interrupt() {
393
+ var _a;
394
+ if (((_a = this.options.realtimeInputConfig) == null ? void 0 : _a.activityHandling) === import_genai.ActivityHandling.NO_INTERRUPTION) {
395
+ return;
396
+ }
397
+ this.startUserActivity();
398
+ }
399
+ async truncate(_options) {
400
+ this.#logger.warn("truncate is not supported by the Google Realtime API.");
401
+ }
402
+ async close() {
403
+ super.close();
404
+ this.#closed = true;
405
+ this.sessionShouldClose.set();
406
+ await this.closeActiveSession();
407
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
408
+ this.pendingGenerationFut.reject(new Error("Session closed"));
409
+ }
410
+ for (const fut of Object.values(this.responseCreatedFutures)) {
411
+ if (!fut.done) {
412
+ fut.reject(new Error("Session closed before response created"));
413
+ }
414
+ }
415
+ this.responseCreatedFutures = {};
416
+ if (this.currentGeneration) {
417
+ this.markCurrentGenerationDone();
418
+ }
419
+ }
420
+ async #mainTask() {
421
+ const maxRetries = this.options.connOptions.maxRetry;
422
+ while (!this.#closed) {
423
+ await this.closeActiveSession();
424
+ this.sessionShouldClose.clear();
425
+ const config = this.buildConnectConfig();
426
+ try {
427
+ this.#logger.debug("Connecting to Gemini Realtime API...");
428
+ const sessionOpened = new import_agents.Event();
429
+ const session = await this.#client.live.connect({
430
+ model: this.options.model,
431
+ callbacks: {
432
+ onopen: () => sessionOpened.set(),
433
+ onmessage: (message) => {
434
+ this.onReceiveMessage(session, message);
435
+ },
436
+ onerror: (error) => {
437
+ this.#logger.error("Gemini Live session error:", error);
438
+ if (!this.sessionShouldClose.isSet) {
439
+ this.markRestartNeeded();
440
+ }
441
+ },
442
+ onclose: (event) => {
443
+ this.#logger.debug("Gemini Live session closed:", event.code, event.reason);
444
+ this.markCurrentGenerationDone();
445
+ }
446
+ },
447
+ config
448
+ });
449
+ await sessionOpened.wait();
450
+ const unlock = await this.sessionLock.lock();
451
+ try {
452
+ this.activeSession = session;
453
+ const [turns] = await this._chatCtx.copy({
454
+ excludeFunctionCall: true
455
+ }).toProviderFormat("google", false);
456
+ if (turns.length > 0) {
457
+ await session.sendClientContent({
458
+ turns,
459
+ turnComplete: false
460
+ });
461
+ }
462
+ } finally {
463
+ unlock();
464
+ }
465
+ const sendTask = import_agents.Task.from((controller) => this.sendTask(session, controller));
466
+ const restartWaitTask = import_agents.Task.from(({ signal }) => {
467
+ const abortEvent = new import_agents.Event();
468
+ signal.addEventListener("abort", () => abortEvent.set());
469
+ return Promise.race([this.sessionShouldClose.wait(), abortEvent.wait()]);
470
+ });
471
+ await Promise.race([sendTask.result, restartWaitTask.result]);
472
+ if (!restartWaitTask.done && this.#closed) {
473
+ break;
474
+ }
475
+ await (0, import_agents.cancelAndWait)([sendTask, restartWaitTask], 2e3);
476
+ } catch (error) {
477
+ this.#logger.error(`Gemini Realtime API error: ${error}`);
478
+ if (this.#closed) break;
479
+ if (maxRetries === 0) {
480
+ this.emitError(error, false);
481
+ throw new import_agents.APIConnectionError({
482
+ message: "Failed to connect to Gemini Live"
483
+ });
484
+ }
485
+ if (this.numRetries >= maxRetries) {
486
+ this.emitError(error, false);
487
+ throw new import_agents.APIConnectionError({
488
+ message: `Failed to connect to Gemini Live after ${maxRetries} attempts`
489
+ });
490
+ }
491
+ const retryInterval = this.numRetries === 100 ? 0 : this.options.connOptions.retryIntervalMs;
492
+ this.#logger.warn(
493
+ {
494
+ attempt: this.numRetries,
495
+ maxRetries
496
+ },
497
+ `Gemini Realtime API connection failed, retrying in ${retryInterval}ms`
498
+ );
499
+ await (0, import_async.delay)(retryInterval);
500
+ this.numRetries++;
501
+ } finally {
502
+ await this.closeActiveSession();
503
+ }
504
+ }
505
+ }
506
+ async sendTask(session, controller) {
507
+ try {
508
+ while (!this.#closed && !this.sessionShouldClose.isSet && !controller.signal.aborted) {
509
+ const msg = await this.messageChannel.get();
510
+ if (controller.signal.aborted) break;
511
+ const unlock = await this.sessionLock.lock();
512
+ try {
513
+ if (this.sessionShouldClose.isSet || this.activeSession !== session) {
514
+ break;
515
+ }
516
+ } finally {
517
+ unlock();
518
+ }
519
+ switch (msg.type) {
520
+ case "content":
521
+ const { turns, turnComplete } = msg.value;
522
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
523
+ await session.sendClientContent({
524
+ turns,
525
+ turnComplete: turnComplete ?? true
526
+ });
527
+ break;
528
+ case "tool_response":
529
+ const { functionResponses } = msg.value;
530
+ if (functionResponses) {
531
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
532
+ await session.sendToolResponse({
533
+ functionResponses
534
+ });
535
+ }
536
+ break;
537
+ case "realtime_input":
538
+ const { mediaChunks, activityStart, activityEnd } = msg.value;
539
+ if (mediaChunks) {
540
+ for (const mediaChunk of mediaChunks) {
541
+ await session.sendRealtimeInput({ media: mediaChunk });
542
+ }
543
+ }
544
+ if (activityStart) await session.sendRealtimeInput({ activityStart });
545
+ if (activityEnd) await session.sendRealtimeInput({ activityEnd });
546
+ break;
547
+ default:
548
+ this.#logger.warn(`Warning: Received unhandled message type: ${msg.type}`);
549
+ break;
550
+ }
551
+ }
552
+ } catch (e) {
553
+ if (!this.sessionShouldClose.isSet) {
554
+ this.#logger.error(`Error in send task: ${e}`);
555
+ this.markRestartNeeded();
556
+ }
557
+ } finally {
558
+ this.#logger.debug(
559
+ {
560
+ closed: this.#closed,
561
+ sessionShouldClose: this.sessionShouldClose.isSet,
562
+ aborted: controller.signal.aborted
563
+ },
564
+ "send task finished."
565
+ );
566
+ }
567
+ }
568
+ async onReceiveMessage(session, response) {
569
+ var _a, _b, _c;
570
+ const hasAudioData = (_c = (_b = (_a = response.serverContent) == null ? void 0 : _a.modelTurn) == null ? void 0 : _b.parts) == null ? void 0 : _c.some(
571
+ (part) => {
572
+ var _a2;
573
+ return (_a2 = part.inlineData) == null ? void 0 : _a2.data;
574
+ }
575
+ );
576
+ if (!hasAudioData) {
577
+ this.#logger.debug(`(server) <- ${JSON.stringify(this.loggableServerMessage(response))}`);
578
+ }
579
+ const unlock = await this.sessionLock.lock();
580
+ try {
581
+ if (this.sessionShouldClose.isSet || this.activeSession !== session) {
582
+ this.#logger.debug("onReceiveMessage: Session changed or closed, stopping receive.");
583
+ return;
584
+ }
585
+ } finally {
586
+ unlock();
587
+ }
588
+ if ((!this.currentGeneration || this.currentGeneration._done) && (response.serverContent || response.toolCall)) {
589
+ this.startNewGeneration();
590
+ }
591
+ if (response.sessionResumptionUpdate) {
592
+ if (response.sessionResumptionUpdate.resumable && response.sessionResumptionUpdate.newHandle) {
593
+ this.sessionResumptionHandle = response.sessionResumptionUpdate.newHandle;
594
+ }
595
+ }
596
+ try {
597
+ if (response.serverContent) {
598
+ this.handleServerContent(response.serverContent);
599
+ }
600
+ if (response.toolCall) {
601
+ this.handleToolCall(response.toolCall);
602
+ }
603
+ if (response.toolCallCancellation) {
604
+ this.handleToolCallCancellation(response.toolCallCancellation);
605
+ }
606
+ if (response.usageMetadata) {
607
+ this.handleUsageMetadata(response.usageMetadata);
608
+ }
609
+ if (response.goAway) {
610
+ this.handleGoAway(response.goAway);
611
+ }
612
+ if (this.numRetries > 0) {
613
+ this.numRetries = 0;
614
+ }
615
+ } catch (e) {
616
+ if (!this.sessionShouldClose.isSet) {
617
+ this.#logger.error(`Error in onReceiveMessage: ${e}`);
618
+ this.markRestartNeeded();
619
+ }
620
+ }
621
+ }
622
+ /// Truncate large base64/audio payloads for logging to avoid flooding logs
623
+ truncateString(data, maxLength = 30) {
624
+ return data.length > maxLength ? `${data.slice(0, maxLength)}\u2026` : data;
625
+ }
626
+ loggableClientEvent(event, maxLength = 30) {
627
+ var _a;
628
+ const obj = { ...event };
629
+ if (obj.type === "realtime_input" && ((_a = obj.value) == null ? void 0 : _a.mediaChunks)) {
630
+ obj.value = {
631
+ ...obj.value,
632
+ mediaChunks: obj.value.mediaChunks.map(
633
+ (mc) => ({
634
+ ...mc,
635
+ data: typeof mc.data === "string" ? this.truncateString(mc.data, maxLength) : mc.data
636
+ })
637
+ )
638
+ };
639
+ }
640
+ return obj;
641
+ }
642
+ loggableServerMessage(message, maxLength = 30) {
643
+ const obj = { ...message };
644
+ if (obj.serverContent && obj.serverContent.modelTurn && Array.isArray(obj.serverContent.modelTurn.parts)) {
645
+ obj.serverContent = { ...obj.serverContent };
646
+ obj.serverContent.modelTurn = { ...obj.serverContent.modelTurn };
647
+ obj.serverContent.modelTurn.parts = obj.serverContent.modelTurn.parts.map((part) => {
648
+ var _a;
649
+ if (((_a = part == null ? void 0 : part.inlineData) == null ? void 0 : _a.data) && typeof part.inlineData.data === "string") {
650
+ return {
651
+ ...part,
652
+ inlineData: {
653
+ ...part.inlineData,
654
+ data: this.truncateString(part.inlineData.data, maxLength)
655
+ }
656
+ };
657
+ }
658
+ return part;
659
+ });
660
+ }
661
+ return obj;
662
+ }
663
+ markCurrentGenerationDone() {
664
+ if (!this.currentGeneration || this.currentGeneration._done) {
665
+ return;
666
+ }
667
+ this.handleInputSpeechStopped();
668
+ const gen = this.currentGeneration;
669
+ if (gen.inputTranscription) {
670
+ this.emit("input_audio_transcription_completed", {
671
+ itemId: gen.inputId,
672
+ transcript: gen.inputTranscription,
673
+ isFinal: true
674
+ });
675
+ this._chatCtx.addMessage({
676
+ role: "user",
677
+ content: gen.inputTranscription,
678
+ id: gen.inputId
679
+ });
680
+ }
681
+ if (gen.outputText) {
682
+ this._chatCtx.addMessage({
683
+ role: "assistant",
684
+ content: gen.outputText,
685
+ id: gen.responseId
686
+ });
687
+ }
688
+ if (this.options.outputAudioTranscription === void 0) {
689
+ gen.textChannel.write("");
690
+ }
691
+ gen.textChannel.close();
692
+ gen.audioChannel.close();
693
+ gen.functionChannel.close();
694
+ gen.messageChannel.close();
695
+ gen._done = true;
696
+ }
697
+ emitError(error, recoverable) {
698
+ this.emit("error", {
699
+ timestamp: Date.now(),
700
+ // TODO(brian): add label to realtime model
701
+ label: "google_realtime",
702
+ error,
703
+ recoverable
704
+ });
705
+ }
706
+ buildConnectConfig() {
707
+ const opts = this.options;
708
+ const config = {
709
+ responseModalities: opts.responseModalities,
710
+ systemInstruction: opts.instructions ? {
711
+ parts: [{ text: opts.instructions }]
712
+ } : void 0,
713
+ speechConfig: {
714
+ voiceConfig: {
715
+ prebuiltVoiceConfig: {
716
+ voiceName: opts.voice
717
+ }
718
+ },
719
+ languageCode: opts.language
720
+ },
721
+ tools: [
722
+ {
723
+ functionDeclarations: this.geminiDeclarations,
724
+ ...this.options.geminiTools
725
+ }
726
+ ],
727
+ inputAudioTranscription: opts.inputAudioTranscription,
728
+ outputAudioTranscription: opts.outputAudioTranscription,
729
+ sessionResumption: {
730
+ handle: this.sessionResumptionHandle
731
+ }
732
+ };
733
+ if (opts.temperature !== void 0) {
734
+ config.temperature = opts.temperature;
735
+ }
736
+ if (opts.maxOutputTokens !== void 0) {
737
+ config.maxOutputTokens = opts.maxOutputTokens;
738
+ }
739
+ if (opts.topP !== void 0) {
740
+ config.topP = opts.topP;
741
+ }
742
+ if (opts.topK !== void 0) {
743
+ config.topK = opts.topK;
744
+ }
745
+ if (opts.proactivity !== void 0) {
746
+ config.proactivity = { proactiveAudio: opts.proactivity };
747
+ }
748
+ if (opts.enableAffectiveDialog !== void 0) {
749
+ config.enableAffectiveDialog = opts.enableAffectiveDialog;
750
+ }
751
+ if (opts.realtimeInputConfig !== void 0) {
752
+ config.realtimeInputConfig = opts.realtimeInputConfig;
753
+ }
754
+ if (opts.contextWindowCompression !== void 0) {
755
+ config.contextWindowCompression = opts.contextWindowCompression;
756
+ }
757
+ return config;
758
+ }
759
+ startNewGeneration() {
760
+ if (this.currentGeneration && !this.currentGeneration._done) {
761
+ this.#logger.warn("Starting new generation while another is active. Finalizing previous.");
762
+ this.markCurrentGenerationDone();
763
+ }
764
+ const responseId = (0, import_agents.shortuuid)("GR_");
765
+ this.currentGeneration = {
766
+ messageChannel: import_agents.stream.createStreamChannel(),
767
+ functionChannel: import_agents.stream.createStreamChannel(),
768
+ responseId,
769
+ inputId: (0, import_agents.shortuuid)("GI_"),
770
+ textChannel: import_agents.stream.createStreamChannel(),
771
+ audioChannel: import_agents.stream.createStreamChannel(),
772
+ inputTranscription: "",
773
+ outputText: "",
774
+ _createdTimestamp: Date.now(),
775
+ _done: false
776
+ };
777
+ if (!this.options.responseModalities.includes(import_genai.Modality.AUDIO)) {
778
+ this.currentGeneration.audioChannel.close();
779
+ }
780
+ this.currentGeneration.messageChannel.write({
781
+ messageId: responseId,
782
+ textStream: this.currentGeneration.textChannel.stream(),
783
+ audioStream: this.currentGeneration.audioChannel.stream()
784
+ });
785
+ const generationEvent = {
786
+ messageStream: this.currentGeneration.messageChannel.stream(),
787
+ functionStream: this.currentGeneration.functionChannel.stream(),
788
+ userInitiated: false
789
+ };
790
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
791
+ generationEvent.userInitiated = true;
792
+ this.pendingGenerationFut.resolve(generationEvent);
793
+ this.pendingGenerationFut = void 0;
794
+ } else {
795
+ this.handleInputSpeechStarted();
796
+ }
797
+ this.emit("generation_created", generationEvent);
798
+ }
799
+ handleInputSpeechStarted() {
800
+ this.emit("input_speech_started", {});
801
+ }
802
// Notify listeners that user speech stopped. userTranscriptionEnabled is
// hard-coded to false here; input transcriptions are delivered separately via
// `input_audio_transcription_completed` (see handleServerContent).
handleInputSpeechStopped() {
  this.emit("input_speech_stopped", {
    userTranscriptionEnabled: false
  });
}
807
// Dispatch one server-content message into the active generation: model
// text/audio parts, input/output transcriptions, and turn-lifecycle flags
// (interrupted / generationComplete / turnComplete).
handleServerContent(serverContent) {
  if (!this.currentGeneration) {
    this.#logger.warn("received server content but no active generation.");
    return;
  }
  const gen = this.currentGeneration;
  if (serverContent.modelTurn) {
    const turn = serverContent.modelTurn;
    for (const part of turn.parts || []) {
      if (part.text) {
        gen.outputText += part.text;
        gen.textChannel.write(part.text);
      }
      if (part.inlineData) {
        // First-token timestamp (used for TTFT in handleUsageMetadata) is
        // set on the first audio chunk only; text parts do not set it.
        if (!gen._firstTokenTimestamp) {
          gen._firstTokenTimestamp = Date.now();
        }
        try {
          if (!part.inlineData.data) {
            throw new Error("frameData is not bytes");
          }
          // Base64 -> raw bytes -> 16-bit samples. Assumes the payload is
          // PCM16 in platform byte order (Int16Array view) — little-endian
          // on all mainstream targets; an odd byte count throws below and
          // is caught/logged rather than crashing the session.
          const binaryString = atob(part.inlineData.data);
          const len = binaryString.length;
          const bytes = new Uint8Array(len);
          for (let i = 0; i < len; i++) {
            bytes[i] = binaryString.charCodeAt(i);
          }
          const int16Array = new Int16Array(bytes.buffer);
          const audioFrame = new import_rtc_node.AudioFrame(
            int16Array,
            OUTPUT_AUDIO_SAMPLE_RATE,
            OUTPUT_AUDIO_CHANNELS,
            int16Array.length / OUTPUT_AUDIO_CHANNELS
          );
          gen.audioChannel.write(audioFrame);
        } catch (error) {
          this.#logger.error("Error processing audio data:", error);
        }
      }
    }
  }
  if (serverContent.inputTranscription && serverContent.inputTranscription.text) {
    let text = serverContent.inputTranscription.text;
    // Trim only the very first chunk so the accumulated transcript has no
    // leading whitespace.
    if (gen.inputTranscription === "") {
      text = text.trimStart();
    }
    gen.inputTranscription += text;
    // Emitted per chunk with isFinal=false; `transcript` carries the full
    // accumulated text so far, not just this delta.
    this.emit("input_audio_transcription_completed", {
      itemId: gen.inputId,
      transcript: gen.inputTranscription,
      isFinal: false
    });
  }
  if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
    const text = serverContent.outputTranscription.text;
    gen.outputText += text;
    gen.textChannel.write(text);
  }
  if (serverContent.generationComplete || serverContent.turnComplete) {
    gen._completedTimestamp = Date.now();
  }
  if (serverContent.interrupted) {
    // Server-side interruption is surfaced as the start of user speech.
    this.handleInputSpeechStarted();
  }
  if (serverContent.turnComplete) {
    this.markCurrentGenerationDone();
  }
}
875
+ handleToolCall(toolCall) {
876
+ if (!this.currentGeneration) {
877
+ this.#logger.warn("received tool call but no active generation.");
878
+ return;
879
+ }
880
+ const gen = this.currentGeneration;
881
+ for (const fc of toolCall.functionCalls || []) {
882
+ gen.functionChannel.write({
883
+ callId: fc.id || (0, import_agents.shortuuid)("fnc-call-"),
884
+ name: fc.name,
885
+ args: fc.args ? JSON.stringify(fc.args) : ""
886
+ });
887
+ }
888
+ this.markCurrentGenerationDone();
889
+ }
890
+ handleToolCallCancellation(cancellation) {
891
+ this.#logger.warn(
892
+ {
893
+ functionCallIds: cancellation.ids
894
+ },
895
+ "server cancelled tool calls"
896
+ );
897
+ }
898
+ handleUsageMetadata(usage) {
899
+ if (!this.currentGeneration) {
900
+ this.#logger.debug("Received usage metadata but no active generation");
901
+ return;
902
+ }
903
+ const gen = this.currentGeneration;
904
+ const createdTimestamp = gen._createdTimestamp;
905
+ const firstTokenTimestamp = gen._firstTokenTimestamp;
906
+ const completedTimestamp = gen._completedTimestamp || Date.now();
907
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
908
+ const duration = (completedTimestamp - createdTimestamp) / 1e3;
909
+ const inputTokens = usage.promptTokenCount || 0;
910
+ const outputTokens = usage.responseTokenCount || 0;
911
+ const totalTokens = usage.totalTokenCount || 0;
912
+ const realtimeMetrics = {
913
+ type: "realtime_model_metrics",
914
+ timestamp: createdTimestamp / 1e3,
915
+ requestId: gen.responseId,
916
+ ttft,
917
+ duration,
918
+ cancelled: gen._done && !gen._completedTimestamp,
919
+ label: "google_realtime",
920
+ inputTokens,
921
+ outputTokens,
922
+ totalTokens,
923
+ tokensPerSecond: duration > 0 ? outputTokens / duration : 0,
924
+ inputTokenDetails: {
925
+ ...this.tokenDetailsMap(usage.promptTokensDetails),
926
+ cachedTokens: (usage.cacheTokensDetails || []).reduce(
927
+ (sum, detail) => sum + (detail.tokenCount || 0),
928
+ 0
929
+ ),
930
+ cachedTokensDetails: this.tokenDetailsMap(usage.cacheTokensDetails)
931
+ },
932
+ outputTokenDetails: this.tokenDetailsMap(usage.responseTokensDetails)
933
+ };
934
+ this.emit("metrics_collected", realtimeMetrics);
935
+ }
936
+ tokenDetailsMap(tokenDetails) {
937
+ const tokenDetailsMap = { audioTokens: 0, textTokens: 0, imageTokens: 0 };
938
+ if (!tokenDetails) {
939
+ return tokenDetailsMap;
940
+ }
941
+ for (const tokenDetail of tokenDetails) {
942
+ if (!tokenDetail.tokenCount) {
943
+ continue;
944
+ }
945
+ if (tokenDetail.modality === types.MediaModality.AUDIO) {
946
+ tokenDetailsMap.audioTokens += tokenDetail.tokenCount;
947
+ } else if (tokenDetail.modality === types.MediaModality.TEXT) {
948
+ tokenDetailsMap.textTokens += tokenDetail.tokenCount;
949
+ } else if (tokenDetail.modality === types.MediaModality.IMAGE) {
950
+ tokenDetailsMap.imageTokens += tokenDetail.tokenCount;
951
+ }
952
+ }
953
+ return tokenDetailsMap;
954
+ }
955
+ handleGoAway(goAway) {
956
+ this.#logger.warn({ timeLeft: goAway.timeLeft }, "Gemini server indicates disconnection soon.");
957
+ this.sessionShouldClose.set();
958
+ }
959
// Intentionally a no-op in this implementation — presumably input-audio
// commit is handled server-side (VAD). TODO confirm against the base-class
// contract for commitAudio.
async commitAudio() {
}
961
// Intentionally a no-op in this implementation — no client-side input-audio
// buffer is cleared here. NOTE(review): confirm callers do not rely on this
// flushing any state.
async clearAudio() {
}
963
+ *resampleAudio(frame) {
964
+ if (this.inputResampler) {
965
+ if (frame.sampleRate !== this.inputResamplerInputRate) {
966
+ this.inputResampler = void 0;
967
+ this.inputResamplerInputRate = void 0;
968
+ }
969
+ }
970
+ if (this.inputResampler === void 0 && (frame.sampleRate !== INPUT_AUDIO_SAMPLE_RATE || frame.channels !== INPUT_AUDIO_CHANNELS)) {
971
+ this.inputResampler = new import_rtc_node.AudioResampler(
972
+ frame.sampleRate,
973
+ INPUT_AUDIO_SAMPLE_RATE,
974
+ INPUT_AUDIO_CHANNELS
975
+ );
976
+ this.inputResamplerInputRate = frame.sampleRate;
977
+ }
978
+ if (this.inputResampler) {
979
+ for (const resampledFrame of this.inputResampler.push(frame)) {
980
+ yield resampledFrame;
981
+ }
982
+ } else {
983
+ yield frame;
984
+ }
985
+ }
986
+ }
987
+ // Annotate the CommonJS export names for ESM import in node:
988
+ 0 && (module.exports = {
989
+ DEFAULT_IMAGE_ENCODE_OPTIONS,
990
+ RealtimeModel,
991
+ RealtimeSession
992
+ });
993
+ //# sourceMappingURL=realtime_api.cjs.map