@livekit/agents-plugin-google 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/LICENSE +201 -0
  2. package/README.md +89 -0
  3. package/dist/beta/gemini_tts.cjs +239 -0
  4. package/dist/beta/gemini_tts.cjs.map +1 -0
  5. package/dist/beta/gemini_tts.d.cts +47 -0
  6. package/dist/beta/gemini_tts.d.ts +47 -0
  7. package/dist/beta/gemini_tts.d.ts.map +1 -0
  8. package/dist/beta/gemini_tts.js +221 -0
  9. package/dist/beta/gemini_tts.js.map +1 -0
  10. package/dist/beta/gemini_tts.test.cjs +9 -0
  11. package/dist/beta/gemini_tts.test.cjs.map +1 -0
  12. package/dist/beta/gemini_tts.test.d.cts +2 -0
  13. package/dist/beta/gemini_tts.test.d.ts +2 -0
  14. package/dist/beta/gemini_tts.test.d.ts.map +1 -0
  15. package/dist/beta/gemini_tts.test.js +8 -0
  16. package/dist/beta/gemini_tts.test.js.map +1 -0
  17. package/dist/beta/index.cjs +42 -0
  18. package/dist/beta/index.cjs.map +1 -0
  19. package/dist/beta/index.d.cts +3 -0
  20. package/dist/beta/index.d.ts +3 -0
  21. package/dist/beta/index.d.ts.map +1 -0
  22. package/dist/beta/index.js +7 -0
  23. package/dist/beta/index.js.map +1 -0
  24. package/dist/beta/realtime/api_proto.cjs +17 -0
  25. package/dist/beta/realtime/api_proto.cjs.map +1 -0
  26. package/dist/beta/realtime/api_proto.d.cts +26 -0
  27. package/dist/beta/realtime/api_proto.d.ts +26 -0
  28. package/dist/beta/realtime/api_proto.d.ts.map +1 -0
  29. package/dist/beta/realtime/api_proto.js +1 -0
  30. package/dist/beta/realtime/api_proto.js.map +1 -0
  31. package/dist/beta/realtime/index.cjs +29 -0
  32. package/dist/beta/realtime/index.cjs.map +1 -0
  33. package/dist/beta/realtime/index.d.cts +3 -0
  34. package/dist/beta/realtime/index.d.ts +3 -0
  35. package/dist/beta/realtime/index.d.ts.map +1 -0
  36. package/dist/beta/realtime/index.js +5 -0
  37. package/dist/beta/realtime/index.js.map +1 -0
  38. package/dist/beta/realtime/realtime_api.cjs +993 -0
  39. package/dist/beta/realtime/realtime_api.cjs.map +1 -0
  40. package/dist/beta/realtime/realtime_api.d.cts +267 -0
  41. package/dist/beta/realtime/realtime_api.d.ts +267 -0
  42. package/dist/beta/realtime/realtime_api.d.ts.map +1 -0
  43. package/dist/beta/realtime/realtime_api.js +974 -0
  44. package/dist/beta/realtime/realtime_api.js.map +1 -0
  45. package/dist/index.cjs +58 -0
  46. package/dist/index.cjs.map +1 -0
  47. package/dist/index.d.cts +4 -0
  48. package/dist/index.d.ts +4 -0
  49. package/dist/index.d.ts.map +1 -0
  50. package/dist/index.js +20 -0
  51. package/dist/index.js.map +1 -0
  52. package/dist/llm.cjs +381 -0
  53. package/dist/llm.cjs.map +1 -0
  54. package/dist/llm.d.cts +82 -0
  55. package/dist/llm.d.ts +82 -0
  56. package/dist/llm.d.ts.map +1 -0
  57. package/dist/llm.js +362 -0
  58. package/dist/llm.js.map +1 -0
  59. package/dist/llm.test.cjs +8 -0
  60. package/dist/llm.test.cjs.map +1 -0
  61. package/dist/llm.test.d.cts +2 -0
  62. package/dist/llm.test.d.ts +2 -0
  63. package/dist/llm.test.d.ts.map +1 -0
  64. package/dist/llm.test.js +7 -0
  65. package/dist/llm.test.js.map +1 -0
  66. package/dist/models.cjs +17 -0
  67. package/dist/models.cjs.map +1 -0
  68. package/dist/models.d.cts +5 -0
  69. package/dist/models.d.ts +5 -0
  70. package/dist/models.d.ts.map +1 -0
  71. package/dist/models.js +1 -0
  72. package/dist/models.js.map +1 -0
  73. package/dist/tools.cjs +17 -0
  74. package/dist/tools.cjs.map +1 -0
  75. package/dist/tools.d.cts +3 -0
  76. package/dist/tools.d.ts +3 -0
  77. package/dist/tools.d.ts.map +1 -0
  78. package/dist/tools.js +1 -0
  79. package/dist/tools.js.map +1 -0
  80. package/dist/utils.cjs +137 -0
  81. package/dist/utils.cjs.map +1 -0
  82. package/dist/utils.d.cts +14 -0
  83. package/dist/utils.d.ts +14 -0
  84. package/dist/utils.d.ts.map +1 -0
  85. package/dist/utils.js +112 -0
  86. package/dist/utils.js.map +1 -0
  87. package/package.json +56 -0
  88. package/src/beta/gemini_tts.test.ts +11 -0
  89. package/src/beta/gemini_tts.ts +309 -0
  90. package/src/beta/index.ts +6 -0
  91. package/src/beta/realtime/api_proto.ts +41 -0
  92. package/src/beta/realtime/index.ts +5 -0
  93. package/src/beta/realtime/realtime_api.ts +1440 -0
  94. package/src/index.ts +20 -0
  95. package/src/llm.test.ts +10 -0
  96. package/src/llm.ts +463 -0
  97. package/src/models.ts +100 -0
  98. package/src/tools.ts +6 -0
  99. package/src/utils.ts +157 -0
@@ -0,0 +1,974 @@
1
+ import * as types from "@google/genai";
2
+ import {
3
+ ActivityHandling,
4
+ GoogleGenAI,
5
+ Modality
6
+ } from "@google/genai";
7
+ import {
8
+ APIConnectionError,
9
+ AudioByteStream,
10
+ DEFAULT_API_CONNECT_OPTIONS,
11
+ Event,
12
+ Future,
13
+ Queue,
14
+ Task,
15
+ cancelAndWait,
16
+ llm,
17
+ log,
18
+ shortuuid,
19
+ stream
20
+ } from "@livekit/agents";
21
+ import { Mutex } from "@livekit/mutex";
22
+ import { AudioFrame, AudioResampler } from "@livekit/rtc-node";
23
+ import { delay } from "@std/async";
24
+ import {} from "../../tools.js";
25
+ import { toFunctionDeclarations } from "../../utils.js";
26
// Gemini Live expects 16 kHz mono PCM input and produces 24 kHz mono PCM output.
const INPUT_AUDIO_SAMPLE_RATE = 16000;
const INPUT_AUDIO_CHANNELS = 1;
const OUTPUT_AUDIO_SAMPLE_RATE = 24000;
const OUTPUT_AUDIO_CHANNELS = 1;

// Defaults applied when the caller does not supply image encoding options.
const DEFAULT_IMAGE_ENCODE_OPTIONS = {
  format: "JPEG",
  quality: 75,
  resizeOptions: {
    width: 1024,
    height: 1024,
    strategy: "scale_aspect_fit",
  },
};
39
/**
 * Whether two Sets contain exactly the same members.
 * @param {Set<*>} a
 * @param {Set<*>} b
 * @returns {boolean}
 */
function setsEqual(a, b) {
  if (a.size !== b.size) {
    return false;
  }
  for (const member of a) {
    if (!b.has(member)) {
      return false;
    }
  }
  return true;
}
42
/**
 * Gemini Live realtime model.
 * Resolves and stores configuration (API key / Vertex project, voice, sampling
 * parameters, transcription settings); connections are owned by the sessions
 * created via {@link RealtimeModel#session}.
 */
class RealtimeModel extends llm.RealtimeModel {
  /** @internal */
  _options;
  constructor(options = {}) {
    // `undefined` means "use provider defaults" ({}); an explicit `null`
    // disables transcription for that direction.
    const inputAudioTranscription =
      options.inputAudioTranscription === undefined ? {} : options.inputAudioTranscription;
    const outputAudioTranscription =
      options.outputAudioTranscription === undefined ? {} : options.outputAudioTranscription;
    // Server-side turn detection stays on unless automatic activity detection
    // was explicitly disabled in the realtime input config.
    const serverTurnDetection = !options.realtimeInputConfig?.automaticActivityDetection?.disabled;
    super({
      messageTruncation: false,
      turnDetection: serverTurnDetection,
      userTranscription: inputAudioTranscription !== null,
      autoToolReplyGeneration: true,
    });
    // Credentials fall back to the standard Google environment variables.
    const apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
    const project = options.project || process.env.GOOGLE_CLOUD_PROJECT;
    const location = options.location || process.env.GOOGLE_CLOUD_LOCATION || "us-central1";
    const vertexai = options.vertexai ?? false;
    // Vertex AI and the Gemini API expose the live model under different names.
    const defaultModel = vertexai ? "gemini-2.0-flash-exp" : "gemini-2.0-flash-live-001";
    this._options = {
      model: options.model || defaultModel,
      apiKey,
      voice: options.voice || "Puck",
      language: options.language,
      responseModalities: options.modalities || [Modality.AUDIO],
      vertexai,
      project,
      location,
      candidateCount: options.candidateCount || 1,
      temperature: options.temperature,
      maxOutputTokens: options.maxOutputTokens,
      topP: options.topP,
      topK: options.topK,
      presencePenalty: options.presencePenalty,
      frequencyPenalty: options.frequencyPenalty,
      instructions: options.instructions,
      inputAudioTranscription: inputAudioTranscription || undefined,
      outputAudioTranscription: outputAudioTranscription || undefined,
      imageEncodeOptions: options.imageEncodeOptions || DEFAULT_IMAGE_ENCODE_OPTIONS,
      connOptions: options.connOptions || DEFAULT_API_CONNECT_OPTIONS,
      httpOptions: options.httpOptions,
      enableAffectiveDialog: options.enableAffectiveDialog,
      proactivity: options.proactivity,
      realtimeInputConfig: options.realtimeInputConfig,
      contextWindowCompression: options.contextWindowCompression,
      apiVersion: options.apiVersion,
      geminiTools: options.geminiTools,
    };
  }
  /**
   * Create a new realtime session bound to this model's options.
   */
  session() {
    return new RealtimeSession(this);
  }
  /**
   * Update mutable model options (voice and temperature only).
   */
  updateOptions(options) {
    if (options.voice !== undefined) {
      this._options.voice = options.voice;
    }
    if (options.temperature !== undefined) {
      this._options.temperature = options.temperature;
    }
  }
  /**
   * Close the model. No resources are held at the model level; sessions own
   * their own connections.
   */
  async close() {}
}
117
class RealtimeSession extends llm.RealtimeSession {
  // Tools currently registered with the session.
  _tools = {};
  // Local mirror of the conversation; re-sent to the server on (re)connect.
  _chatCtx = llm.ChatContext.empty();
  // Resolved options copied from the owning RealtimeModel.
  options;
  // Gemini function declarations derived from `_tools`.
  geminiDeclarations = [];
  // Outbound client events, drained by `sendTask`.
  messageChannel = new Queue();
  // Resampler state for input audio (used by resampleAudio; implementation
  // beyond this chunk).
  inputResampler;
  inputResamplerInputRate;
  instructions;
  // Bookkeeping for the in-flight generation, if any.
  currentGeneration;
  // Chunks input audio into fixed-size frames before base64 encoding.
  bstream;
  // Google-specific properties
  // Live session handle; undefined while disconnected.
  activeSession;
  // Set to request teardown/reconnect of the active session.
  sessionShouldClose = new Event();
  responseCreatedFutures = {};
  // Resolved when the server acks a generateReply request.
  pendingGenerationFut;
  // Handle used to resume the server-side session after a reconnect.
  sessionResumptionHandle;
  // True while a manual activityStart has been sent without a matching activityEnd.
  inUserActivity = false;
  // Guards `activeSession` across send/receive/close paths.
  sessionLock = new Mutex();
  numRetries = 0;
  hasReceivedAudioInput = false;
  #client;
  // Long-running connection loop, started in the constructor.
  #task;
  #logger = log();
  #closed = false;
142
// Build the GenAI client (Vertex AI vs API-key mode) and start the
// connection loop. Input audio is chunked into 50 ms frames
// (INPUT_AUDIO_SAMPLE_RATE / 20 samples per frame).
constructor(realtimeModel) {
  super(realtimeModel);
  this.options = realtimeModel._options;
  this.bstream = new AudioByteStream(
    INPUT_AUDIO_SAMPLE_RATE,
    INPUT_AUDIO_CHANNELS,
    INPUT_AUDIO_SAMPLE_RATE / 20
  );
  const { apiKey, project, location, vertexai, enableAffectiveDialog, proactivity } = this.options;
  // Affective dialog / proactivity force v1alpha unless the caller pinned a
  // version — presumably these features are only exposed there; confirm
  // against the @google/genai docs.
  const apiVersion = !this.options.apiVersion && (enableAffectiveDialog || proactivity) ? "v1alpha" : this.options.apiVersion;
  const httpOptions = {
    ...this.options.httpOptions,
    apiVersion,
    timeout: this.options.connOptions.timeoutMs
  };
  // Vertex AI authenticates via project/location; the Gemini API via API key.
  const clientOptions = vertexai ? {
    vertexai: true,
    project,
    location,
    httpOptions
  } : {
    apiKey,
    httpOptions
  };
  this.#client = new GoogleGenAI(clientOptions);
  // Fire-and-forget: the loop runs until close() sets #closed.
  this.#task = this.#mainTask();
}
169
+ async closeActiveSession() {
170
+ const unlock = await this.sessionLock.lock();
171
+ if (this.activeSession) {
172
+ try {
173
+ await this.activeSession.close();
174
+ } catch (error) {
175
+ this.#logger.warn({ error }, "Error closing Gemini session");
176
+ } finally {
177
+ this.activeSession = void 0;
178
+ }
179
+ }
180
+ unlock();
181
+ }
182
+ markRestartNeeded() {
183
+ if (!this.sessionShouldClose.isSet) {
184
+ this.sessionShouldClose.set();
185
+ this.messageChannel = new Queue();
186
+ }
187
+ }
188
+ getToolResultsForRealtime(ctx, vertexai) {
189
+ const toolResponses = [];
190
+ for (const item of ctx.items) {
191
+ if (item.type === "function_call_output") {
192
+ const response = {
193
+ id: item.callId,
194
+ name: item.name,
195
+ response: { output: item.output }
196
+ };
197
+ if (!vertexai) {
198
+ response.id = item.callId;
199
+ }
200
+ toolResponses.push(response);
201
+ }
202
+ }
203
+ return toolResponses.length > 0 ? { functionResponses: toolResponses } : void 0;
204
+ }
205
+ updateOptions(options) {
206
+ let shouldRestart = false;
207
+ if (options.voice !== void 0 && this.options.voice !== options.voice) {
208
+ this.options.voice = options.voice;
209
+ shouldRestart = true;
210
+ }
211
+ if (options.temperature !== void 0 && this.options.temperature !== options.temperature) {
212
+ this.options.temperature = options.temperature;
213
+ shouldRestart = true;
214
+ }
215
+ if (shouldRestart) {
216
+ this.markRestartNeeded();
217
+ }
218
+ }
219
+ async updateInstructions(instructions) {
220
+ if (this.options.instructions === void 0 || this.options.instructions !== instructions) {
221
+ this.options.instructions = instructions;
222
+ this.markRestartNeeded();
223
+ }
224
+ }
225
// Sync a new chat context to the server. With no active session the context
// is just stored (it is replayed on connect); otherwise only the diff vs the
// current context is sent. Removals are unsupported by Gemini Live.
async updateChatCtx(chatCtx) {
  const unlock = await this.sessionLock.lock();
  try {
    if (!this.activeSession) {
      this._chatCtx = chatCtx.copy();
      return;
    }
  } finally {
    unlock();
  }
  const diffOps = llm.computeChatCtxDiff(this._chatCtx, chatCtx);
  if (diffOps.toRemove.length > 0) {
    this.#logger.warn("Gemini Live does not support removing messages");
  }
  // Gather only the newly created items into a scratch context.
  const appendCtx = llm.ChatContext.empty();
  for (const [, itemId] of diffOps.toCreate) {
    const item = chatCtx.getById(itemId);
    if (item) {
      appendCtx.items.push(item);
    }
  }
  if (appendCtx.items.length > 0) {
    // Function calls are excluded from turns; their outputs travel separately
    // as a tool_response event.
    const [turns] = await appendCtx.copy({
      excludeFunctionCall: true
    }).toProviderFormat("google", false);
    const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
    if (turns.length > 0) {
      // turnComplete=false: this is history, not a generation trigger.
      this.sendClientEvent({
        type: "content",
        value: {
          turns,
          turnComplete: false
        }
      });
    }
    if (toolResults) {
      this.sendClientEvent({
        type: "tool_response",
        value: toolResults
      });
    }
  }
  this._chatCtx = chatCtx.copy();
}
269
+ async updateTools(tools) {
270
+ const newDeclarations = toFunctionDeclarations(tools);
271
+ const currentToolNames = new Set(this.geminiDeclarations.map((f) => f.name));
272
+ const newToolNames = new Set(newDeclarations.map((f) => f.name));
273
+ if (!setsEqual(currentToolNames, newToolNames)) {
274
+ this.geminiDeclarations = newDeclarations;
275
+ this._tools = tools;
276
+ this.markRestartNeeded();
277
+ }
278
+ }
279
// Snapshot of the conversation; a copy so callers cannot mutate session state.
get chatCtx() {
  return this._chatCtx.copy();
}
282
// Shallow copy of the registered tools.
get tools() {
  return { ...this._tools };
}
285
+ get manualActivityDetection() {
286
+ var _a, _b;
287
+ return ((_b = (_a = this.options.realtimeInputConfig) == null ? void 0 : _a.automaticActivityDetection) == null ? void 0 : _b.disabled) ?? false;
288
+ }
289
// Feed one input audio frame: resample to the 16 kHz session rate, chunk via
// bstream into fixed 50 ms frames, then base64-encode as audio/pcm chunks for
// the realtime input channel.
pushAudio(frame) {
  this.hasReceivedAudioInput = true;
  for (const f of this.resampleAudio(frame)) {
    // NOTE(review): `.data.buffer` passes the whole ArrayBuffer — assumes the
    // frame's samples start at byteOffset 0 and span the buffer; verify
    // against AudioFrame's layout.
    for (const nf of this.bstream.write(f.data.buffer)) {
      const realtimeInput = {
        mediaChunks: [
          {
            mimeType: "audio/pcm",
            data: Buffer.from(nf.data.buffer).toString("base64")
          }
        ]
      };
      this.sendClientEvent({
        type: "realtime_input",
        value: realtimeInput
      });
    }
  }
}
308
// Video input is not implemented in this session; frames are dropped.
pushVideo(_) {
}
310
// Enqueue an outbound client event; sendTask drains the queue onto the session.
sendClientEvent(event) {
  this.messageChannel.put(event);
}
313
+ async generateReply(instructions) {
314
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
315
+ this.#logger.warn(
316
+ "generateReply called while another generation is pending, cancelling previous."
317
+ );
318
+ this.pendingGenerationFut.reject(new Error("Superseded by new generate_reply call"));
319
+ }
320
+ const fut = new Future();
321
+ this.pendingGenerationFut = fut;
322
+ if (this.inUserActivity) {
323
+ this.sendClientEvent({
324
+ type: "realtime_input",
325
+ value: {
326
+ activityEnd: {}
327
+ }
328
+ });
329
+ this.inUserActivity = false;
330
+ }
331
+ const turns = [];
332
+ if (instructions !== void 0) {
333
+ turns.push({
334
+ parts: [{ text: instructions }],
335
+ role: "model"
336
+ });
337
+ }
338
+ turns.push({
339
+ parts: [{ text: "." }],
340
+ role: "user"
341
+ });
342
+ this.sendClientEvent({
343
+ type: "content",
344
+ value: {
345
+ turns,
346
+ turnComplete: true
347
+ }
348
+ });
349
+ const timeoutHandle = setTimeout(() => {
350
+ if (!fut.done) {
351
+ fut.reject(new Error("generateReply timed out waiting for generation_created event."));
352
+ if (this.pendingGenerationFut === fut) {
353
+ this.pendingGenerationFut = void 0;
354
+ }
355
+ }
356
+ }, 5e3);
357
+ fut.await.finally(() => clearTimeout(timeoutHandle));
358
+ return fut.await;
359
+ }
360
+ startUserActivity() {
361
+ if (!this.manualActivityDetection) {
362
+ return;
363
+ }
364
+ if (!this.inUserActivity) {
365
+ this.inUserActivity = true;
366
+ this.sendClientEvent({
367
+ type: "realtime_input",
368
+ value: {
369
+ activityStart: {}
370
+ }
371
+ });
372
+ }
373
+ }
374
// Interrupt the model by signalling user activity — unless the session is
// configured to never interrupt on activity.
async interrupt() {
  var _a;
  if (((_a = this.options.realtimeInputConfig) == null ? void 0 : _a.activityHandling) === ActivityHandling.NO_INTERRUPTION) {
    return;
  }
  this.startUserActivity();
}
381
// Message truncation is not available on Gemini Live; log and ignore.
async truncate(_options) {
  this.#logger.warn("truncate is not supported by the Google Realtime API.");
}
384
// Shut the session down: stop the main loop, close the live connection, and
// reject every outstanding future so callers don't hang.
async close() {
  // NOTE(review): super.close() is not awaited — confirm the base class close
  // is synchronous or that its result is intentionally ignored.
  super.close();
  this.#closed = true;
  this.sessionShouldClose.set();
  await this.closeActiveSession();
  if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
    this.pendingGenerationFut.reject(new Error("Session closed"));
  }
  for (const fut of Object.values(this.responseCreatedFutures)) {
    if (!fut.done) {
      fut.reject(new Error("Session closed before response created"));
    }
  }
  this.responseCreatedFutures = {};
  // Flush any in-flight generation so its streams terminate.
  if (this.currentGeneration) {
    this.markCurrentGenerationDone();
  }
}
402
// Connection loop: connect, replay chat history, pump outbound messages, and
// reconnect (with retry limits) whenever the session errors or a restart is
// requested via sessionShouldClose.
async #mainTask() {
  const maxRetries = this.options.connOptions.maxRetry;
  while (!this.#closed) {
    await this.closeActiveSession();
    this.sessionShouldClose.clear();
    const config = this.buildConnectConfig();
    try {
      this.#logger.debug("Connecting to Gemini Realtime API...");
      const sessionOpened = new Event();
      const session = await this.#client.live.connect({
        model: this.options.model,
        callbacks: {
          onopen: () => sessionOpened.set(),
          onmessage: (message) => {
            // Not awaited: message handling runs concurrently with this loop.
            this.onReceiveMessage(session, message);
          },
          onerror: (error) => {
            this.#logger.error("Gemini Live session error:", error);
            if (!this.sessionShouldClose.isSet) {
              this.markRestartNeeded();
            }
          },
          onclose: (event) => {
            this.#logger.debug("Gemini Live session closed:", event.code, event.reason);
            this.markCurrentGenerationDone();
          }
        },
        config
      });
      await sessionOpened.wait();
      const unlock = await this.sessionLock.lock();
      try {
        this.activeSession = session;
        // Replay accumulated history into the fresh session (tool calls are
        // excluded; see updateChatCtx for how tool results are sent).
        const [turns] = await this._chatCtx.copy({
          excludeFunctionCall: true
        }).toProviderFormat("google", false);
        if (turns.length > 0) {
          await session.sendClientContent({
            turns,
            turnComplete: false
          });
        }
      } finally {
        unlock();
      }
      const sendTask = Task.from((controller) => this.sendTask(session, controller));
      // Completes when a restart is requested or the task itself is aborted.
      const restartWaitTask = Task.from(({ signal }) => {
        const abortEvent = new Event();
        signal.addEventListener("abort", () => abortEvent.set());
        return Promise.race([this.sessionShouldClose.wait(), abortEvent.wait()]);
      });
      await Promise.race([sendTask.result, restartWaitTask.result]);
      if (!restartWaitTask.done && this.#closed) {
        break;
      }
      await cancelAndWait([sendTask, restartWaitTask], 2e3);
    } catch (error) {
      this.#logger.error(`Gemini Realtime API error: ${error}`);
      if (this.#closed) break;
      if (maxRetries === 0) {
        // Retries disabled: surface the failure immediately.
        this.emitError(error, false);
        throw new APIConnectionError({
          message: "Failed to connect to Gemini Live"
        });
      }
      if (this.numRetries >= maxRetries) {
        this.emitError(error, false);
        throw new APIConnectionError({
          message: `Failed to connect to Gemini Live after ${maxRetries} attempts`
        });
      }
      // NOTE(review): `=== 100` looks suspicious — presumably intended to make
      // an early retry immediate (`=== 0`?); confirm against upstream.
      const retryInterval = this.numRetries === 100 ? 0 : this.options.connOptions.retryIntervalMs;
      this.#logger.warn(
        {
          attempt: this.numRetries,
          maxRetries
        },
        `Gemini Realtime API connection failed, retrying in ${retryInterval}ms`
      );
      await delay(retryInterval);
      this.numRetries++;
    } finally {
      await this.closeActiveSession();
    }
  }
}
488
// Drain the outbound message queue onto one live session. Exits when the
// session is superseded, a restart is requested, the session closes, or the
// task is aborted.
async sendTask(session, controller) {
  try {
    while (!this.#closed && !this.sessionShouldClose.isSet && !controller.signal.aborted) {
      const msg = await this.messageChannel.get();
      if (controller.signal.aborted) break;
      // Re-check under the lock that we still own the active session.
      const unlock = await this.sessionLock.lock();
      try {
        if (this.sessionShouldClose.isSet || this.activeSession !== session) {
          break;
        }
      } finally {
        unlock();
      }
      switch (msg.type) {
        case "content":
          const { turns, turnComplete } = msg.value;
          this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
          await session.sendClientContent({
            turns,
            turnComplete: turnComplete ?? true
          });
          break;
        case "tool_response":
          const { functionResponses } = msg.value;
          if (functionResponses) {
            this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
            await session.sendToolResponse({
              functionResponses
            });
          }
          break;
        case "realtime_input":
          // Audio chunks are not logged (see loggableClientEvent truncation).
          const { mediaChunks, activityStart, activityEnd } = msg.value;
          if (mediaChunks) {
            for (const mediaChunk of mediaChunks) {
              await session.sendRealtimeInput({ media: mediaChunk });
            }
          }
          if (activityStart) await session.sendRealtimeInput({ activityStart });
          if (activityEnd) await session.sendRealtimeInput({ activityEnd });
          break;
        default:
          this.#logger.warn(`Warning: Received unhandled message type: ${msg.type}`);
          break;
      }
    }
  } catch (e) {
    // A send failure on a still-current session means the connection is bad:
    // request a restart rather than crashing the loop.
    if (!this.sessionShouldClose.isSet) {
      this.#logger.error(`Error in send task: ${e}`);
      this.markRestartNeeded();
    }
  } finally {
    this.#logger.debug(
      {
        closed: this.#closed,
        sessionShouldClose: this.sessionShouldClose.isSet,
        aborted: controller.signal.aborted
      },
      "send task finished."
    );
  }
}
550
// Handle one server message: start a generation lazily when content/tool
// calls arrive, record session-resumption handles, and dispatch each payload
// section to its handler. Messages for stale sessions are dropped.
async onReceiveMessage(session, response) {
  var _a, _b, _c;
  // Audio payloads are too large/noisy to log even truncated.
  const hasAudioData = (_c = (_b = (_a = response.serverContent) == null ? void 0 : _a.modelTurn) == null ? void 0 : _b.parts) == null ? void 0 : _c.some(
    (part) => {
      var _a2;
      return (_a2 = part.inlineData) == null ? void 0 : _a2.data;
    }
  );
  if (!hasAudioData) {
    this.#logger.debug(`(server) <- ${JSON.stringify(this.loggableServerMessage(response))}`);
  }
  const unlock = await this.sessionLock.lock();
  try {
    if (this.sessionShouldClose.isSet || this.activeSession !== session) {
      this.#logger.debug("onReceiveMessage: Session changed or closed, stopping receive.");
      return;
    }
  } finally {
    unlock();
  }
  // Server content / tool calls with no live generation start one implicitly.
  if ((!this.currentGeneration || this.currentGeneration._done) && (response.serverContent || response.toolCall)) {
    this.startNewGeneration();
  }
  if (response.sessionResumptionUpdate) {
    if (response.sessionResumptionUpdate.resumable && response.sessionResumptionUpdate.newHandle) {
      this.sessionResumptionHandle = response.sessionResumptionUpdate.newHandle;
    }
  }
  try {
    if (response.serverContent) {
      this.handleServerContent(response.serverContent);
    }
    if (response.toolCall) {
      this.handleToolCall(response.toolCall);
    }
    if (response.toolCallCancellation) {
      this.handleToolCallCancellation(response.toolCallCancellation);
    }
    if (response.usageMetadata) {
      this.handleUsageMetadata(response.usageMetadata);
    }
    if (response.goAway) {
      this.handleGoAway(response.goAway);
    }
    // Any successfully processed message proves the connection is healthy.
    if (this.numRetries > 0) {
      this.numRetries = 0;
    }
  } catch (e) {
    if (!this.sessionShouldClose.isSet) {
      this.#logger.error(`Error in onReceiveMessage: ${e}`);
      this.markRestartNeeded();
    }
  }
}
604
+ /// Truncate large base64/audio payloads for logging to avoid flooding logs
605
+ truncateString(data, maxLength = 30) {
606
+ return data.length > maxLength ? `${data.slice(0, maxLength)}\u2026` : data;
607
+ }
608
// Shallow-copy a client event with base64 media chunk data truncated, so
// audio payloads don't flood the debug logs. The original event is untouched.
loggableClientEvent(event, maxLength = 30) {
  var _a;
  const obj = { ...event };
  if (obj.type === "realtime_input" && ((_a = obj.value) == null ? void 0 : _a.mediaChunks)) {
    obj.value = {
      ...obj.value,
      mediaChunks: obj.value.mediaChunks.map(
        (mc) => ({
          ...mc,
          data: typeof mc.data === "string" ? this.truncateString(mc.data, maxLength) : mc.data
        })
      )
    };
  }
  return obj;
}
624
// Copy a server message with inline (audio) part data truncated for logging.
// Each nested level is re-copied so the original message is never mutated.
loggableServerMessage(message, maxLength = 30) {
  const obj = { ...message };
  if (obj.serverContent && obj.serverContent.modelTurn && Array.isArray(obj.serverContent.modelTurn.parts)) {
    obj.serverContent = { ...obj.serverContent };
    obj.serverContent.modelTurn = { ...obj.serverContent.modelTurn };
    obj.serverContent.modelTurn.parts = obj.serverContent.modelTurn.parts.map((part) => {
      var _a;
      if (((_a = part == null ? void 0 : part.inlineData) == null ? void 0 : _a.data) && typeof part.inlineData.data === "string") {
        return {
          ...part,
          inlineData: {
            ...part.inlineData,
            data: this.truncateString(part.inlineData.data, maxLength)
          }
        };
      }
      return part;
    });
  }
  return obj;
}
645
// Finalize the in-flight generation: emit the final input transcription,
// append user/assistant messages to the local chat context, close all
// generation streams, and mark it done. Idempotent.
markCurrentGenerationDone() {
  if (!this.currentGeneration || this.currentGeneration._done) {
    return;
  }
  this.handleInputSpeechStopped();
  const gen = this.currentGeneration;
  if (gen.inputTranscription) {
    this.emit("input_audio_transcription_completed", {
      itemId: gen.inputId,
      transcript: gen.inputTranscription,
      isFinal: true
    });
    this._chatCtx.addMessage({
      role: "user",
      content: gen.inputTranscription,
      id: gen.inputId
    });
  }
  if (gen.outputText) {
    this._chatCtx.addMessage({
      role: "assistant",
      content: gen.outputText,
      id: gen.responseId
    });
  }
  // With output transcription disabled nothing was ever written to the text
  // stream; write an empty chunk so downstream consumers see a terminated stream.
  if (this.options.outputAudioTranscription === void 0) {
    gen.textChannel.write("");
  }
  gen.textChannel.close();
  gen.audioChannel.close();
  gen.functionChannel.close();
  gen.messageChannel.close();
  gen._done = true;
}
679
// Surface a realtime-model error event to listeners.
emitError(error, recoverable) {
  this.emit("error", {
    timestamp: Date.now(),
    // TODO(brian): add label to realtime model
    label: "google_realtime",
    error,
    recoverable
  });
}
688
// Assemble the Live API connect config from session options. Optional
// sampling/feature fields are only set when defined so provider defaults
// apply otherwise.
buildConnectConfig() {
  const opts = this.options;
  const config = {
    responseModalities: opts.responseModalities,
    systemInstruction: opts.instructions ? {
      parts: [{ text: opts.instructions }]
    } : void 0,
    speechConfig: {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: opts.voice
        }
      },
      languageCode: opts.language
    },
    // Function declarations plus any extra Gemini-native tools (e.g. search).
    tools: [
      {
        functionDeclarations: this.geminiDeclarations,
        ...this.options.geminiTools
      }
    ],
    inputAudioTranscription: opts.inputAudioTranscription,
    outputAudioTranscription: opts.outputAudioTranscription,
    // Resume the previous server-side session after reconnects when possible.
    sessionResumption: {
      handle: this.sessionResumptionHandle
    }
  };
  if (opts.temperature !== void 0) {
    config.temperature = opts.temperature;
  }
  if (opts.maxOutputTokens !== void 0) {
    config.maxOutputTokens = opts.maxOutputTokens;
  }
  if (opts.topP !== void 0) {
    config.topP = opts.topP;
  }
  if (opts.topK !== void 0) {
    config.topK = opts.topK;
  }
  if (opts.proactivity !== void 0) {
    config.proactivity = { proactiveAudio: opts.proactivity };
  }
  if (opts.enableAffectiveDialog !== void 0) {
    config.enableAffectiveDialog = opts.enableAffectiveDialog;
  }
  if (opts.realtimeInputConfig !== void 0) {
    config.realtimeInputConfig = opts.realtimeInputConfig;
  }
  if (opts.contextWindowCompression !== void 0) {
    config.contextWindowCompression = opts.contextWindowCompression;
  }
  return config;
}
741
// Begin tracking a new server generation: finalize any active one, allocate
// stream channels, and emit "generation_created". If a generateReply call is
// pending the generation is marked user-initiated and its future resolved;
// otherwise it is treated as speech-triggered.
startNewGeneration() {
  if (this.currentGeneration && !this.currentGeneration._done) {
    this.#logger.warn("Starting new generation while another is active. Finalizing previous.");
    this.markCurrentGenerationDone();
  }
  const responseId = shortuuid("GR_");
  this.currentGeneration = {
    messageChannel: stream.createStreamChannel(),
    functionChannel: stream.createStreamChannel(),
    responseId,
    inputId: shortuuid("GI_"),
    textChannel: stream.createStreamChannel(),
    audioChannel: stream.createStreamChannel(),
    inputTranscription: "",
    outputText: "",
    _createdTimestamp: Date.now(),
    _done: false
  };
  // Text-only sessions get an immediately-closed audio stream.
  if (!this.options.responseModalities.includes(Modality.AUDIO)) {
    this.currentGeneration.audioChannel.close();
  }
  this.currentGeneration.messageChannel.write({
    messageId: responseId,
    textStream: this.currentGeneration.textChannel.stream(),
    audioStream: this.currentGeneration.audioChannel.stream()
  });
  const generationEvent = {
    messageStream: this.currentGeneration.messageChannel.stream(),
    functionStream: this.currentGeneration.functionChannel.stream(),
    userInitiated: false
  };
  if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
    generationEvent.userInitiated = true;
    this.pendingGenerationFut.resolve(generationEvent);
    this.pendingGenerationFut = void 0;
  } else {
    this.handleInputSpeechStarted();
  }
  this.emit("generation_created", generationEvent);
}
781
// Notify listeners that user speech started (also used on interruption).
handleInputSpeechStarted() {
  this.emit("input_speech_started", {});
}
784
// Notify listeners that user speech ended.
handleInputSpeechStopped() {
  this.emit("input_speech_stopped", {
    userTranscriptionEnabled: false
  });
}
789
// Route one serverContent payload into the current generation: model text,
// inline PCM audio (base64 -> Int16 AudioFrame), input/output transcription
// deltas, interruption, and turn completion.
handleServerContent(serverContent) {
  if (!this.currentGeneration) {
    this.#logger.warn("received server content but no active generation.");
    return;
  }
  const gen = this.currentGeneration;
  if (serverContent.modelTurn) {
    const turn = serverContent.modelTurn;
    for (const part of turn.parts || []) {
      if (part.text) {
        gen.outputText += part.text;
        gen.textChannel.write(part.text);
      }
      if (part.inlineData) {
        // First audio byte marks time-to-first-token for metrics.
        if (!gen._firstTokenTimestamp) {
          gen._firstTokenTimestamp = Date.now();
        }
        try {
          if (!part.inlineData.data) {
            throw new Error("frameData is not bytes");
          }
          // Decode base64 to raw bytes, then view as 16-bit PCM samples.
          const binaryString = atob(part.inlineData.data);
          const len = binaryString.length;
          const bytes = new Uint8Array(len);
          for (let i = 0; i < len; i++) {
            bytes[i] = binaryString.charCodeAt(i);
          }
          const int16Array = new Int16Array(bytes.buffer);
          const audioFrame = new AudioFrame(
            int16Array,
            OUTPUT_AUDIO_SAMPLE_RATE,
            OUTPUT_AUDIO_CHANNELS,
            int16Array.length / OUTPUT_AUDIO_CHANNELS
          );
          gen.audioChannel.write(audioFrame);
        } catch (error) {
          this.#logger.error("Error processing audio data:", error);
        }
      }
    }
  }
  if (serverContent.inputTranscription && serverContent.inputTranscription.text) {
    let text = serverContent.inputTranscription.text;
    // The first delta often arrives with leading whitespace; strip it once.
    if (gen.inputTranscription === "") {
      text = text.trimStart();
    }
    gen.inputTranscription += text;
    this.emit("input_audio_transcription_completed", {
      itemId: gen.inputId,
      transcript: gen.inputTranscription,
      isFinal: false
    });
  }
  if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
    const text = serverContent.outputTranscription.text;
    gen.outputText += text;
    gen.textChannel.write(text);
  }
  if (serverContent.generationComplete || serverContent.turnComplete) {
    gen._completedTimestamp = Date.now();
  }
  if (serverContent.interrupted) {
    this.handleInputSpeechStarted();
  }
  if (serverContent.turnComplete) {
    this.markCurrentGenerationDone();
  }
}
857
+ handleToolCall(toolCall) {
858
+ if (!this.currentGeneration) {
859
+ this.#logger.warn("received tool call but no active generation.");
860
+ return;
861
+ }
862
+ const gen = this.currentGeneration;
863
+ for (const fc of toolCall.functionCalls || []) {
864
+ gen.functionChannel.write({
865
+ callId: fc.id || shortuuid("fnc-call-"),
866
+ name: fc.name,
867
+ args: fc.args ? JSON.stringify(fc.args) : ""
868
+ });
869
+ }
870
+ this.markCurrentGenerationDone();
871
+ }
872
+ handleToolCallCancellation(cancellation) {
873
+ this.#logger.warn(
874
+ {
875
+ functionCallIds: cancellation.ids
876
+ },
877
+ "server cancelled tool calls"
878
+ );
879
+ }
880
+ handleUsageMetadata(usage) {
881
+ if (!this.currentGeneration) {
882
+ this.#logger.debug("Received usage metadata but no active generation");
883
+ return;
884
+ }
885
+ const gen = this.currentGeneration;
886
+ const createdTimestamp = gen._createdTimestamp;
887
+ const firstTokenTimestamp = gen._firstTokenTimestamp;
888
+ const completedTimestamp = gen._completedTimestamp || Date.now();
889
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
890
+ const duration = (completedTimestamp - createdTimestamp) / 1e3;
891
+ const inputTokens = usage.promptTokenCount || 0;
892
+ const outputTokens = usage.responseTokenCount || 0;
893
+ const totalTokens = usage.totalTokenCount || 0;
894
+ const realtimeMetrics = {
895
+ type: "realtime_model_metrics",
896
+ timestamp: createdTimestamp / 1e3,
897
+ requestId: gen.responseId,
898
+ ttft,
899
+ duration,
900
+ cancelled: gen._done && !gen._completedTimestamp,
901
+ label: "google_realtime",
902
+ inputTokens,
903
+ outputTokens,
904
+ totalTokens,
905
+ tokensPerSecond: duration > 0 ? outputTokens / duration : 0,
906
+ inputTokenDetails: {
907
+ ...this.tokenDetailsMap(usage.promptTokensDetails),
908
+ cachedTokens: (usage.cacheTokensDetails || []).reduce(
909
+ (sum, detail) => sum + (detail.tokenCount || 0),
910
+ 0
911
+ ),
912
+ cachedTokensDetails: this.tokenDetailsMap(usage.cacheTokensDetails)
913
+ },
914
+ outputTokenDetails: this.tokenDetailsMap(usage.responseTokensDetails)
915
+ };
916
+ this.emit("metrics_collected", realtimeMetrics);
917
+ }
918
+ tokenDetailsMap(tokenDetails) {
919
+ const tokenDetailsMap = { audioTokens: 0, textTokens: 0, imageTokens: 0 };
920
+ if (!tokenDetails) {
921
+ return tokenDetailsMap;
922
+ }
923
+ for (const tokenDetail of tokenDetails) {
924
+ if (!tokenDetail.tokenCount) {
925
+ continue;
926
+ }
927
+ if (tokenDetail.modality === types.MediaModality.AUDIO) {
928
+ tokenDetailsMap.audioTokens += tokenDetail.tokenCount;
929
+ } else if (tokenDetail.modality === types.MediaModality.TEXT) {
930
+ tokenDetailsMap.textTokens += tokenDetail.tokenCount;
931
+ } else if (tokenDetail.modality === types.MediaModality.IMAGE) {
932
+ tokenDetailsMap.imageTokens += tokenDetail.tokenCount;
933
+ }
934
+ }
935
+ return tokenDetailsMap;
936
+ }
937
+ handleGoAway(goAway) {
938
+ this.#logger.warn({ timeLeft: goAway.timeLeft }, "Gemini server indicates disconnection soon.");
939
+ this.sessionShouldClose.set();
940
+ }
941
  // Intentionally a no-op. NOTE(review): presumably the Gemini Live API
  // commits input audio automatically (server-side turn detection), so this
  // exists only to satisfy the RealtimeSession interface — confirm.
  async commitAudio() {
  }
943
  // Intentionally a no-op. NOTE(review): presumably there is no client-side
  // input-audio buffer to clear for the Gemini Live API; kept to satisfy the
  // RealtimeSession interface — confirm.
  async clearAudio() {
  }
945
+ *resampleAudio(frame) {
946
+ if (this.inputResampler) {
947
+ if (frame.sampleRate !== this.inputResamplerInputRate) {
948
+ this.inputResampler = void 0;
949
+ this.inputResamplerInputRate = void 0;
950
+ }
951
+ }
952
+ if (this.inputResampler === void 0 && (frame.sampleRate !== INPUT_AUDIO_SAMPLE_RATE || frame.channels !== INPUT_AUDIO_CHANNELS)) {
953
+ this.inputResampler = new AudioResampler(
954
+ frame.sampleRate,
955
+ INPUT_AUDIO_SAMPLE_RATE,
956
+ INPUT_AUDIO_CHANNELS
957
+ );
958
+ this.inputResamplerInputRate = frame.sampleRate;
959
+ }
960
+ if (this.inputResampler) {
961
+ for (const resampledFrame of this.inputResampler.push(frame)) {
962
+ yield resampledFrame;
963
+ }
964
+ } else {
965
+ yield frame;
966
+ }
967
+ }
968
+ }
969
// Public surface of this generated bundle.
export {
  DEFAULT_IMAGE_ENCODE_OPTIONS,
  RealtimeModel,
  RealtimeSession
};
//# sourceMappingURL=realtime_api.js.map