@livekit/agents-plugin-google 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/LICENSE +201 -0
  2. package/README.md +89 -0
  3. package/dist/beta/gemini_tts.cjs +239 -0
  4. package/dist/beta/gemini_tts.cjs.map +1 -0
  5. package/dist/beta/gemini_tts.d.cts +47 -0
  6. package/dist/beta/gemini_tts.d.ts +47 -0
  7. package/dist/beta/gemini_tts.d.ts.map +1 -0
  8. package/dist/beta/gemini_tts.js +221 -0
  9. package/dist/beta/gemini_tts.js.map +1 -0
  10. package/dist/beta/gemini_tts.test.cjs +9 -0
  11. package/dist/beta/gemini_tts.test.cjs.map +1 -0
  12. package/dist/beta/gemini_tts.test.d.cts +2 -0
  13. package/dist/beta/gemini_tts.test.d.ts +2 -0
  14. package/dist/beta/gemini_tts.test.d.ts.map +1 -0
  15. package/dist/beta/gemini_tts.test.js +8 -0
  16. package/dist/beta/gemini_tts.test.js.map +1 -0
  17. package/dist/beta/index.cjs +42 -0
  18. package/dist/beta/index.cjs.map +1 -0
  19. package/dist/beta/index.d.cts +3 -0
  20. package/dist/beta/index.d.ts +3 -0
  21. package/dist/beta/index.d.ts.map +1 -0
  22. package/dist/beta/index.js +7 -0
  23. package/dist/beta/index.js.map +1 -0
  24. package/dist/beta/realtime/api_proto.cjs +17 -0
  25. package/dist/beta/realtime/api_proto.cjs.map +1 -0
  26. package/dist/beta/realtime/api_proto.d.cts +26 -0
  27. package/dist/beta/realtime/api_proto.d.ts +26 -0
  28. package/dist/beta/realtime/api_proto.d.ts.map +1 -0
  29. package/dist/beta/realtime/api_proto.js +1 -0
  30. package/dist/beta/realtime/api_proto.js.map +1 -0
  31. package/dist/beta/realtime/index.cjs +29 -0
  32. package/dist/beta/realtime/index.cjs.map +1 -0
  33. package/dist/beta/realtime/index.d.cts +3 -0
  34. package/dist/beta/realtime/index.d.ts +3 -0
  35. package/dist/beta/realtime/index.d.ts.map +1 -0
  36. package/dist/beta/realtime/index.js +5 -0
  37. package/dist/beta/realtime/index.js.map +1 -0
  38. package/dist/beta/realtime/realtime_api.cjs +993 -0
  39. package/dist/beta/realtime/realtime_api.cjs.map +1 -0
  40. package/dist/beta/realtime/realtime_api.d.cts +267 -0
  41. package/dist/beta/realtime/realtime_api.d.ts +267 -0
  42. package/dist/beta/realtime/realtime_api.d.ts.map +1 -0
  43. package/dist/beta/realtime/realtime_api.js +974 -0
  44. package/dist/beta/realtime/realtime_api.js.map +1 -0
  45. package/dist/index.cjs +58 -0
  46. package/dist/index.cjs.map +1 -0
  47. package/dist/index.d.cts +4 -0
  48. package/dist/index.d.ts +4 -0
  49. package/dist/index.d.ts.map +1 -0
  50. package/dist/index.js +20 -0
  51. package/dist/index.js.map +1 -0
  52. package/dist/llm.cjs +381 -0
  53. package/dist/llm.cjs.map +1 -0
  54. package/dist/llm.d.cts +82 -0
  55. package/dist/llm.d.ts +82 -0
  56. package/dist/llm.d.ts.map +1 -0
  57. package/dist/llm.js +362 -0
  58. package/dist/llm.js.map +1 -0
  59. package/dist/llm.test.cjs +8 -0
  60. package/dist/llm.test.cjs.map +1 -0
  61. package/dist/llm.test.d.cts +2 -0
  62. package/dist/llm.test.d.ts +2 -0
  63. package/dist/llm.test.d.ts.map +1 -0
  64. package/dist/llm.test.js +7 -0
  65. package/dist/llm.test.js.map +1 -0
  66. package/dist/models.cjs +17 -0
  67. package/dist/models.cjs.map +1 -0
  68. package/dist/models.d.cts +5 -0
  69. package/dist/models.d.ts +5 -0
  70. package/dist/models.d.ts.map +1 -0
  71. package/dist/models.js +1 -0
  72. package/dist/models.js.map +1 -0
  73. package/dist/tools.cjs +17 -0
  74. package/dist/tools.cjs.map +1 -0
  75. package/dist/tools.d.cts +3 -0
  76. package/dist/tools.d.ts +3 -0
  77. package/dist/tools.d.ts.map +1 -0
  78. package/dist/tools.js +1 -0
  79. package/dist/tools.js.map +1 -0
  80. package/dist/utils.cjs +137 -0
  81. package/dist/utils.cjs.map +1 -0
  82. package/dist/utils.d.cts +14 -0
  83. package/dist/utils.d.ts +14 -0
  84. package/dist/utils.d.ts.map +1 -0
  85. package/dist/utils.js +112 -0
  86. package/dist/utils.js.map +1 -0
  87. package/package.json +56 -0
  88. package/src/beta/gemini_tts.test.ts +11 -0
  89. package/src/beta/gemini_tts.ts +309 -0
  90. package/src/beta/index.ts +6 -0
  91. package/src/beta/realtime/api_proto.ts +41 -0
  92. package/src/beta/realtime/index.ts +5 -0
  93. package/src/beta/realtime/realtime_api.ts +1440 -0
  94. package/src/index.ts +20 -0
  95. package/src/llm.test.ts +10 -0
  96. package/src/llm.ts +463 -0
  97. package/src/models.ts +100 -0
  98. package/src/tools.ts +6 -0
  99. package/src/utils.ts +157 -0
@@ -0,0 +1,1440 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { Session } from '@google/genai';
5
+ import * as types from '@google/genai';
6
+ import {
7
+ ActivityHandling,
8
+ type AudioTranscriptionConfig,
9
+ type ContextWindowCompressionConfig,
10
+ GoogleGenAI,
11
+ type HttpOptions,
12
+ Modality,
13
+ type RealtimeInputConfig,
14
+ } from '@google/genai';
15
+ import type { APIConnectOptions } from '@livekit/agents';
16
+ import {
17
+ APIConnectionError,
18
+ AudioByteStream,
19
+ DEFAULT_API_CONNECT_OPTIONS,
20
+ Event,
21
+ Future,
22
+ Queue,
23
+ Task,
24
+ cancelAndWait,
25
+ llm,
26
+ log,
27
+ shortuuid,
28
+ stream,
29
+ } from '@livekit/agents';
30
+ import { Mutex } from '@livekit/mutex';
31
+ import { AudioFrame, AudioResampler, type VideoFrame } from '@livekit/rtc-node';
32
+ import { delay } from '@std/async';
33
+ import { type LLMTools } from '../../tools.js';
34
+ import { toFunctionDeclarations } from '../../utils.js';
35
+ import type * as api_proto from './api_proto.js';
36
+ import type { LiveAPIModels, Voice } from './api_proto.js';
37
+
38
+ // Input audio constants (matching Python)
39
+ const INPUT_AUDIO_SAMPLE_RATE = 16000;
40
+ const INPUT_AUDIO_CHANNELS = 1;
41
+
42
+ // Output audio constants (matching Python)
43
+ const OUTPUT_AUDIO_SAMPLE_RATE = 24000;
44
+ const OUTPUT_AUDIO_CHANNELS = 1;
45
+
46
+ /**
47
+ * Default image encoding options for Google Realtime API
48
+ */
49
+ export const DEFAULT_IMAGE_ENCODE_OPTIONS = {
50
+ format: 'JPEG' as const,
51
+ quality: 75,
52
+ resizeOptions: {
53
+ width: 1024,
54
+ height: 1024,
55
+ strategy: 'scale_aspect_fit' as const,
56
+ },
57
+ };
58
+
59
+ /**
60
+ * Input transcription result
61
+ */
62
+ export interface InputTranscription {
63
+ itemId: string;
64
+ transcript: string;
65
+ }
66
+
67
+ /**
68
+ * Helper function to check if two sets are equal
69
+ */
70
+ function setsEqual<T>(a: Set<T>, b: Set<T>): boolean {
71
+ return a.size === b.size && [...a].every((x) => b.has(x));
72
+ }
73
+
74
+ /**
75
+ * Internal realtime options for Google Realtime API
76
+ */
77
+ interface RealtimeOptions {
78
+ model: LiveAPIModels | string;
79
+ apiKey?: string;
80
+ voice: Voice | string;
81
+ language?: string;
82
+ responseModalities: Modality[];
83
+ vertexai: boolean;
84
+ project?: string;
85
+ location?: string;
86
+ candidateCount: number;
87
+ temperature?: number;
88
+ maxOutputTokens?: number;
89
+ topP?: number;
90
+ topK?: number;
91
+ presencePenalty?: number;
92
+ frequencyPenalty?: number;
93
+ instructions?: string;
94
+ inputAudioTranscription?: AudioTranscriptionConfig;
95
+ outputAudioTranscription?: AudioTranscriptionConfig;
96
+ imageEncodeOptions?: typeof DEFAULT_IMAGE_ENCODE_OPTIONS;
97
+ connOptions: APIConnectOptions;
98
+ httpOptions?: HttpOptions;
99
+ enableAffectiveDialog?: boolean;
100
+ proactivity?: boolean;
101
+ realtimeInputConfig?: RealtimeInputConfig;
102
+ contextWindowCompression?: ContextWindowCompressionConfig;
103
+ apiVersion?: string;
104
+ geminiTools?: LLMTools;
105
+ }
106
+
107
+ /**
108
+ * Response generation tracking
109
+ */
110
+ interface ResponseGeneration {
111
+ messageChannel: stream.StreamChannel<llm.MessageGeneration>;
112
+ functionChannel: stream.StreamChannel<llm.FunctionCall>;
113
+
114
+ inputId: string;
115
+ responseId: string;
116
+ textChannel: stream.StreamChannel<string>;
117
+ audioChannel: stream.StreamChannel<AudioFrame>;
118
+
119
+ inputTranscription: string;
120
+ outputText: string;
121
+
122
+ /** @internal */
123
+ _createdTimestamp: number;
124
+ /** @internal */
125
+ _firstTokenTimestamp?: number;
126
+ /** @internal */
127
+ _completedTimestamp?: number;
128
+ /** @internal */
129
+ _done: boolean;
130
+ }
131
+
132
+ /**
133
+ * Google Realtime Model for real-time voice conversations with Gemini models
134
+ */
135
+ export class RealtimeModel extends llm.RealtimeModel {
136
+ /** @internal */
137
+ _options: RealtimeOptions;
138
+
139
+ constructor(
140
+ options: {
141
+ /**
142
+ * Initial system instructions for the model
143
+ */
144
+ instructions?: string;
145
+
146
+ /**
147
+ * The name of the model to use
148
+ */
149
+ model?: LiveAPIModels | string;
150
+
151
+ /**
152
+ * Google Gemini API key. If not provided, will attempt to read from GOOGLE_API_KEY environment variable
153
+ */
154
+ apiKey?: string;
155
+
156
+ /**
157
+ * Voice setting for audio outputs
158
+ */
159
+ voice?: Voice | string;
160
+
161
+ /**
162
+ * The language (BCP-47 Code) to use for the API
163
+ * See https://ai.google.dev/gemini-api/docs/live#supported-languages
164
+ */
165
+ language?: string;
166
+
167
+ /**
168
+ * Modalities to use, such as [Modality.TEXT, Modality.AUDIO]
169
+ */
170
+ modalities?: Modality[];
171
+
172
+ /**
173
+ * Whether to use VertexAI for the API
174
+ */
175
+ vertexai?: boolean;
176
+
177
+ /**
178
+ * The project ID to use for the API (for VertexAI)
179
+ */
180
+ project?: string;
181
+
182
+ /**
183
+ * The location to use for the API (for VertexAI)
184
+ */
185
+ location?: string;
186
+
187
+ /**
188
+ * The number of candidate responses to generate
189
+ */
190
+ candidateCount?: number;
191
+
192
+ /**
193
+ * Sampling temperature for response generation
194
+ */
195
+ temperature?: number;
196
+
197
+ /**
198
+ * Maximum number of tokens in the response
199
+ */
200
+ maxOutputTokens?: number;
201
+
202
+ /**
203
+ * The top-p value for response generation
204
+ */
205
+ topP?: number;
206
+
207
+ /**
208
+ * The top-k value for response generation
209
+ */
210
+ topK?: number;
211
+
212
+ /**
213
+ * The presence penalty for response generation
214
+ */
215
+ presencePenalty?: number;
216
+
217
+ /**
218
+ * The frequency penalty for response generation
219
+ */
220
+ frequencyPenalty?: number;
221
+
222
+ /**
223
+ * The configuration for input audio transcription
224
+ */
225
+ inputAudioTranscription?: AudioTranscriptionConfig | null;
226
+
227
+ /**
228
+ * The configuration for output audio transcription
229
+ */
230
+ outputAudioTranscription?: AudioTranscriptionConfig | null;
231
+
232
+ /**
233
+ * The configuration for image encoding
234
+ */
235
+ imageEncodeOptions?: typeof DEFAULT_IMAGE_ENCODE_OPTIONS;
236
+
237
+ /**
238
+ * Whether to enable affective dialog
239
+ */
240
+ enableAffectiveDialog?: boolean;
241
+
242
+ /**
243
+ * Whether to enable proactive audio
244
+ */
245
+ proactivity?: boolean;
246
+
247
+ /**
248
+ * The configuration for realtime input
249
+ */
250
+ realtimeInputConfig?: RealtimeInputConfig;
251
+
252
+ /**
253
+ * The configuration for context window compression
254
+ */
255
+ contextWindowCompression?: ContextWindowCompressionConfig;
256
+
257
+ /**
258
+ * API version to use
259
+ */
260
+ apiVersion?: string;
261
+
262
+ /**
263
+ * The configuration for the API connection
264
+ */
265
+ connOptions?: APIConnectOptions;
266
+
267
+ /**
268
+ * HTTP options for API requests
269
+ */
270
+ httpOptions?: HttpOptions;
271
+
272
+ /**
273
+ * Gemini-specific tools to use for the session
274
+ */
275
+ geminiTools?: LLMTools;
276
+ } = {},
277
+ ) {
278
+ const inputAudioTranscription =
279
+ options.inputAudioTranscription === undefined ? {} : options.inputAudioTranscription;
280
+ const outputAudioTranscription =
281
+ options.outputAudioTranscription === undefined ? {} : options.outputAudioTranscription;
282
+
283
+ let serverTurnDetection = true;
284
+ if (options.realtimeInputConfig?.automaticActivityDetection?.disabled) {
285
+ serverTurnDetection = false;
286
+ }
287
+
288
+ super({
289
+ messageTruncation: false,
290
+ turnDetection: serverTurnDetection,
291
+ userTranscription: inputAudioTranscription !== null,
292
+ autoToolReplyGeneration: true,
293
+ });
294
+
295
+ // Environment variable fallbacks
296
+ const apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
297
+ const project = options.project || process.env.GOOGLE_CLOUD_PROJECT;
298
+ const location = options.location || process.env.GOOGLE_CLOUD_LOCATION || 'us-central1';
299
+ const vertexai = options.vertexai ?? false;
300
+
301
+ // Model selection based on API type
302
+ const defaultModel = vertexai ? 'gemini-2.0-flash-exp' : 'gemini-2.0-flash-live-001';
303
+
304
+ this._options = {
305
+ model: options.model || defaultModel,
306
+ apiKey,
307
+ voice: options.voice || 'Puck',
308
+ language: options.language,
309
+ responseModalities: options.modalities || [Modality.AUDIO],
310
+ vertexai,
311
+ project,
312
+ location,
313
+ candidateCount: options.candidateCount || 1,
314
+ temperature: options.temperature,
315
+ maxOutputTokens: options.maxOutputTokens,
316
+ topP: options.topP,
317
+ topK: options.topK,
318
+ presencePenalty: options.presencePenalty,
319
+ frequencyPenalty: options.frequencyPenalty,
320
+ instructions: options.instructions,
321
+ inputAudioTranscription: inputAudioTranscription || undefined,
322
+ outputAudioTranscription: outputAudioTranscription || undefined,
323
+ imageEncodeOptions: options.imageEncodeOptions || DEFAULT_IMAGE_ENCODE_OPTIONS,
324
+ connOptions: options.connOptions || DEFAULT_API_CONNECT_OPTIONS,
325
+ httpOptions: options.httpOptions,
326
+ enableAffectiveDialog: options.enableAffectiveDialog,
327
+ proactivity: options.proactivity,
328
+ realtimeInputConfig: options.realtimeInputConfig,
329
+ contextWindowCompression: options.contextWindowCompression,
330
+ apiVersion: options.apiVersion,
331
+ geminiTools: options.geminiTools,
332
+ };
333
+ }
334
+
335
+ /**
336
+ * Create a new realtime session
337
+ */
338
+ session() {
339
+ return new RealtimeSession(this);
340
+ }
341
+
342
+ /**
343
+ * Update model options
344
+ */
345
+ updateOptions(options: { voice?: Voice | string; temperature?: number }): void {
346
+ if (options.voice !== undefined) {
347
+ this._options.voice = options.voice;
348
+ }
349
+ if (options.temperature !== undefined) {
350
+ this._options.temperature = options.temperature;
351
+ }
352
+
353
+ // TODO: Notify active sessions of option changes
354
+ }
355
+
356
+ /**
357
+ * Close the model and cleanup resources
358
+ */
359
+ async close(): Promise<void> {
360
+ // TODO: Implementation depends on session management
361
+ }
362
+ }
363
+
364
+ /**
365
+ * Google Realtime Session for real-time voice conversations
366
+ *
367
+ * This session provides real-time streaming capabilities with Google's Gemini models,
368
+ * supporting both text and audio modalities with function calling capabilities.
369
+ */
370
+ export class RealtimeSession extends llm.RealtimeSession {
371
+ private _tools: llm.ToolContext = {};
372
+ private _chatCtx = llm.ChatContext.empty();
373
+
374
+ private options: RealtimeOptions;
375
+ private geminiDeclarations: types.FunctionDeclaration[] = [];
376
+ private messageChannel = new Queue<api_proto.ClientEvents>();
377
+ private inputResampler?: AudioResampler;
378
+ private inputResamplerInputRate?: number;
379
+ private instructions?: string;
380
+ private currentGeneration?: ResponseGeneration;
381
+ private bstream: AudioByteStream;
382
+
383
+ // Google-specific properties
384
+ private activeSession?: Session;
385
+ private sessionShouldClose = new Event();
386
+ private responseCreatedFutures: { [id: string]: Future<llm.GenerationCreatedEvent> } = {};
387
+ private pendingGenerationFut?: Future<llm.GenerationCreatedEvent>;
388
+
389
+ private sessionResumptionHandle?: string;
390
+ private inUserActivity = false;
391
+ private sessionLock = new Mutex();
392
+ private numRetries = 0;
393
+ private hasReceivedAudioInput = false;
394
+
395
+ #client: GoogleGenAI;
396
+ #task: Promise<void>;
397
+ #logger = log();
398
+ #closed = false;
399
+
400
+ constructor(realtimeModel: RealtimeModel) {
401
+ super(realtimeModel);
402
+
403
+ this.options = realtimeModel._options;
404
+ this.bstream = new AudioByteStream(
405
+ INPUT_AUDIO_SAMPLE_RATE,
406
+ INPUT_AUDIO_CHANNELS,
407
+ INPUT_AUDIO_SAMPLE_RATE / 20,
408
+ ); // 50ms chunks
409
+
410
+ const { apiKey, project, location, vertexai, enableAffectiveDialog, proactivity } =
411
+ this.options;
412
+
413
+ const apiVersion =
414
+ !this.options.apiVersion && (enableAffectiveDialog || proactivity)
415
+ ? 'v1alpha'
416
+ : this.options.apiVersion;
417
+
418
+ const httpOptions = {
419
+ ...this.options.httpOptions,
420
+ apiVersion,
421
+ timeout: this.options.connOptions.timeoutMs,
422
+ };
423
+
424
+ const clientOptions: types.GoogleGenAIOptions = vertexai
425
+ ? {
426
+ vertexai: true,
427
+ project,
428
+ location,
429
+ httpOptions,
430
+ }
431
+ : {
432
+ apiKey,
433
+ httpOptions,
434
+ };
435
+
436
+ this.#client = new GoogleGenAI(clientOptions);
437
+ this.#task = this.#mainTask();
438
+ }
439
+
440
+ private async closeActiveSession(): Promise<void> {
441
+ const unlock = await this.sessionLock.lock();
442
+
443
+ if (this.activeSession) {
444
+ try {
445
+ await this.activeSession.close();
446
+ } catch (error) {
447
+ this.#logger.warn({ error }, 'Error closing Gemini session');
448
+ } finally {
449
+ this.activeSession = undefined;
450
+ }
451
+ }
452
+
453
+ unlock();
454
+ }
455
+
456
+ private markRestartNeeded(): void {
457
+ if (!this.sessionShouldClose.isSet) {
458
+ this.sessionShouldClose.set();
459
+ this.messageChannel = new Queue();
460
+ }
461
+ }
462
+
463
+ private getToolResultsForRealtime(
464
+ ctx: llm.ChatContext,
465
+ vertexai: boolean,
466
+ ): types.LiveClientToolResponse | undefined {
467
+ const toolResponses: types.FunctionResponse[] = [];
468
+
469
+ for (const item of ctx.items) {
470
+ if (item.type === 'function_call_output') {
471
+ const response: types.FunctionResponse = {
472
+ id: item.callId,
473
+ name: item.name,
474
+ response: { output: item.output },
475
+ };
476
+
477
+ if (!vertexai) {
478
+ response.id = item.callId;
479
+ }
480
+
481
+ toolResponses.push(response);
482
+ }
483
+ }
484
+
485
+ return toolResponses.length > 0 ? { functionResponses: toolResponses } : undefined;
486
+ }
487
+
488
+ updateOptions(options: {
489
+ voice?: Voice | string;
490
+ temperature?: number;
491
+ toolChoice?: llm.ToolChoice;
492
+ }) {
493
+ let shouldRestart = false;
494
+
495
+ if (options.voice !== undefined && this.options.voice !== options.voice) {
496
+ this.options.voice = options.voice;
497
+ shouldRestart = true;
498
+ }
499
+
500
+ if (options.temperature !== undefined && this.options.temperature !== options.temperature) {
501
+ this.options.temperature = options.temperature;
502
+ shouldRestart = true;
503
+ }
504
+
505
+ if (shouldRestart) {
506
+ this.markRestartNeeded();
507
+ }
508
+ }
509
+
510
+ async updateInstructions(instructions: string): Promise<void> {
511
+ if (this.options.instructions === undefined || this.options.instructions !== instructions) {
512
+ this.options.instructions = instructions;
513
+ this.markRestartNeeded();
514
+ }
515
+ }
516
+
517
+ async updateChatCtx(chatCtx: llm.ChatContext): Promise<void> {
518
+ const unlock = await this.sessionLock.lock();
519
+ try {
520
+ if (!this.activeSession) {
521
+ this._chatCtx = chatCtx.copy();
522
+ return;
523
+ }
524
+ } finally {
525
+ unlock();
526
+ }
527
+
528
+ const diffOps = llm.computeChatCtxDiff(this._chatCtx, chatCtx);
529
+
530
+ if (diffOps.toRemove.length > 0) {
531
+ this.#logger.warn('Gemini Live does not support removing messages');
532
+ }
533
+
534
+ const appendCtx = llm.ChatContext.empty();
535
+ for (const [, itemId] of diffOps.toCreate) {
536
+ const item = chatCtx.getById(itemId);
537
+ if (item) {
538
+ appendCtx.items.push(item);
539
+ }
540
+ }
541
+
542
+ if (appendCtx.items.length > 0) {
543
+ const [turns] = await appendCtx
544
+ .copy({
545
+ excludeFunctionCall: true,
546
+ })
547
+ .toProviderFormat('google', false);
548
+
549
+ const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
550
+
551
+ if (turns.length > 0) {
552
+ this.sendClientEvent({
553
+ type: 'content',
554
+ value: {
555
+ turns: turns as types.Content[],
556
+ turnComplete: false,
557
+ },
558
+ });
559
+ }
560
+
561
+ if (toolResults) {
562
+ this.sendClientEvent({
563
+ type: 'tool_response',
564
+ value: toolResults,
565
+ });
566
+ }
567
+ }
568
+
569
+ // since we don't have a view of the history on the server side, we'll assume
570
+ // the current state is accurate. this isn't perfect because removals aren't done.
571
+ this._chatCtx = chatCtx.copy();
572
+ }
573
+
574
+ async updateTools(tools: llm.ToolContext): Promise<void> {
575
+ const newDeclarations = toFunctionDeclarations(tools);
576
+ const currentToolNames = new Set(this.geminiDeclarations.map((f) => f.name));
577
+ const newToolNames = new Set(newDeclarations.map((f) => f.name));
578
+
579
+ if (!setsEqual(currentToolNames, newToolNames)) {
580
+ this.geminiDeclarations = newDeclarations;
581
+ this._tools = tools;
582
+ this.markRestartNeeded();
583
+ }
584
+ }
585
+
586
+ get chatCtx(): llm.ChatContext {
587
+ return this._chatCtx.copy();
588
+ }
589
+
590
+ get tools(): llm.ToolContext {
591
+ return { ...this._tools };
592
+ }
593
+
594
+ get manualActivityDetection(): boolean {
595
+ return this.options.realtimeInputConfig?.automaticActivityDetection?.disabled ?? false;
596
+ }
597
+
598
+ pushAudio(frame: AudioFrame): void {
599
+ // Track that we've received audio input
600
+ this.hasReceivedAudioInput = true;
601
+
602
+ for (const f of this.resampleAudio(frame)) {
603
+ for (const nf of this.bstream.write(f.data.buffer)) {
604
+ const realtimeInput: types.LiveClientRealtimeInput = {
605
+ mediaChunks: [
606
+ {
607
+ mimeType: 'audio/pcm',
608
+ data: Buffer.from(nf.data.buffer).toString('base64'),
609
+ },
610
+ ],
611
+ };
612
+ this.sendClientEvent({
613
+ type: 'realtime_input',
614
+ value: realtimeInput,
615
+ });
616
+ }
617
+ }
618
+ }
619
+
620
+ pushVideo(_: VideoFrame): void {
621
+ // TODO(brian): implement push video frames
622
+ }
623
+
624
+ private sendClientEvent(event: api_proto.ClientEvents) {
625
+ this.messageChannel.put(event);
626
+ }
627
+
628
+ async generateReply(instructions?: string): Promise<llm.GenerationCreatedEvent> {
629
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
630
+ this.#logger.warn(
631
+ 'generateReply called while another generation is pending, cancelling previous.',
632
+ );
633
+ this.pendingGenerationFut.reject(new Error('Superseded by new generate_reply call'));
634
+ }
635
+
636
+ const fut = new Future<llm.GenerationCreatedEvent>();
637
+ this.pendingGenerationFut = fut;
638
+
639
+ if (this.inUserActivity) {
640
+ this.sendClientEvent({
641
+ type: 'realtime_input',
642
+ value: {
643
+ activityEnd: {},
644
+ },
645
+ });
646
+ this.inUserActivity = false;
647
+ }
648
+
649
+ // Gemini requires the last message to end with user's turn
650
+ // so we need to add a placeholder user turn in order to trigger a new generation
651
+ const turns: types.Content[] = [];
652
+ if (instructions !== undefined) {
653
+ turns.push({
654
+ parts: [{ text: instructions }],
655
+ role: 'model',
656
+ });
657
+ }
658
+ turns.push({
659
+ parts: [{ text: '.' }],
660
+ role: 'user',
661
+ });
662
+
663
+ this.sendClientEvent({
664
+ type: 'content',
665
+ value: {
666
+ turns,
667
+ turnComplete: true,
668
+ },
669
+ });
670
+
671
+ const timeoutHandle = setTimeout(() => {
672
+ if (!fut.done) {
673
+ fut.reject(new Error('generateReply timed out waiting for generation_created event.'));
674
+ if (this.pendingGenerationFut === fut) {
675
+ this.pendingGenerationFut = undefined;
676
+ }
677
+ }
678
+ }, 5000);
679
+
680
+ fut.await.finally(() => clearTimeout(timeoutHandle));
681
+
682
+ return fut.await;
683
+ }
684
+
685
+ startUserActivity(): void {
686
+ if (!this.manualActivityDetection) {
687
+ return;
688
+ }
689
+
690
+ if (!this.inUserActivity) {
691
+ this.inUserActivity = true;
692
+ this.sendClientEvent({
693
+ type: 'realtime_input',
694
+ value: {
695
+ activityStart: {},
696
+ },
697
+ });
698
+ }
699
+ }
700
+
701
+ async interrupt() {
702
+ // Gemini Live treats activity start as interruption, so we rely on startUserActivity to handle it
703
+ if (this.options.realtimeInputConfig?.activityHandling === ActivityHandling.NO_INTERRUPTION) {
704
+ return;
705
+ }
706
+ this.startUserActivity();
707
+ }
708
+
709
+ async truncate(_options: { messageId: string; audioEndMs: number; audioTranscript?: string }) {
710
+ this.#logger.warn('truncate is not supported by the Google Realtime API.');
711
+ }
712
+
713
+ async close(): Promise<void> {
714
+ super.close();
715
+ this.#closed = true;
716
+
717
+ this.sessionShouldClose.set();
718
+
719
+ await this.closeActiveSession();
720
+
721
+ if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
722
+ this.pendingGenerationFut.reject(new Error('Session closed'));
723
+ }
724
+
725
+ for (const fut of Object.values(this.responseCreatedFutures)) {
726
+ if (!fut.done) {
727
+ fut.reject(new Error('Session closed before response created'));
728
+ }
729
+ }
730
+ this.responseCreatedFutures = {};
731
+
732
+ if (this.currentGeneration) {
733
+ this.markCurrentGenerationDone();
734
+ }
735
+ }
736
+
737
+ async #mainTask(): Promise<void> {
738
+ const maxRetries = this.options.connOptions.maxRetry;
739
+
740
+ while (!this.#closed) {
741
+ // previous session might not be closed yet, we'll do it here.
742
+ await this.closeActiveSession();
743
+
744
+ this.sessionShouldClose.clear();
745
+ const config = this.buildConnectConfig();
746
+
747
+ try {
748
+ this.#logger.debug('Connecting to Gemini Realtime API...');
749
+
750
+ const sessionOpened = new Event();
751
+ const session = await this.#client.live.connect({
752
+ model: this.options.model,
753
+ callbacks: {
754
+ onopen: () => sessionOpened.set(),
755
+ onmessage: (message: types.LiveServerMessage) => {
756
+ this.onReceiveMessage(session, message);
757
+ },
758
+ onerror: (error: ErrorEvent) => {
759
+ this.#logger.error('Gemini Live session error:', error);
760
+ if (!this.sessionShouldClose.isSet) {
761
+ this.markRestartNeeded();
762
+ }
763
+ },
764
+ onclose: (event: CloseEvent) => {
765
+ this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
766
+ this.markCurrentGenerationDone();
767
+ },
768
+ },
769
+ config,
770
+ });
771
+
772
+ await sessionOpened.wait();
773
+
774
+ const unlock = await this.sessionLock.lock();
775
+ try {
776
+ this.activeSession = session;
777
+
778
+ // Send existing chat context
779
+ const [turns] = await this._chatCtx
780
+ .copy({
781
+ excludeFunctionCall: true,
782
+ })
783
+ .toProviderFormat('google', false);
784
+
785
+ if (turns.length > 0) {
786
+ await session.sendClientContent({
787
+ turns,
788
+ turnComplete: false,
789
+ });
790
+ }
791
+ } finally {
792
+ unlock();
793
+ }
794
+
795
+ const sendTask = Task.from((controller) => this.sendTask(session, controller));
796
+ const restartWaitTask = Task.from(({ signal }) => {
797
+ const abortEvent = new Event();
798
+ signal.addEventListener('abort', () => abortEvent.set());
799
+ return Promise.race([this.sessionShouldClose.wait(), abortEvent.wait()]);
800
+ });
801
+
802
+ await Promise.race([sendTask.result, restartWaitTask.result]);
803
+
804
+ // TODO(brian): handle error from tasks
805
+
806
+ if (!restartWaitTask.done && this.#closed) {
807
+ break;
808
+ }
809
+
810
+ await cancelAndWait([sendTask, restartWaitTask], 2000);
811
+ } catch (error) {
812
+ this.#logger.error(`Gemini Realtime API error: ${error}`);
813
+
814
+ if (this.#closed) break;
815
+
816
+ if (maxRetries === 0) {
817
+ this.emitError(error as Error, false);
818
+ throw new APIConnectionError({
819
+ message: 'Failed to connect to Gemini Live',
820
+ });
821
+ }
822
+
823
+ if (this.numRetries >= maxRetries) {
824
+ this.emitError(error as Error, false);
825
+ throw new APIConnectionError({
826
+ message: `Failed to connect to Gemini Live after ${maxRetries} attempts`,
827
+ });
828
+ }
829
+
830
+ const retryInterval =
831
+ this.numRetries === 100 ? 0 : this.options.connOptions.retryIntervalMs;
832
+
833
+ this.#logger.warn(
834
+ {
835
+ attempt: this.numRetries,
836
+ maxRetries,
837
+ },
838
+ `Gemini Realtime API connection failed, retrying in ${retryInterval}ms`,
839
+ );
840
+
841
+ await delay(retryInterval);
842
+ this.numRetries++;
843
+ } finally {
844
+ await this.closeActiveSession();
845
+ }
846
+ }
847
+ }
848
+
849
+ private async sendTask(session: types.Session, controller: AbortController): Promise<void> {
850
+ try {
851
+ while (!this.#closed && !this.sessionShouldClose.isSet && !controller.signal.aborted) {
852
+ const msg = await this.messageChannel.get();
853
+ if (controller.signal.aborted) break;
854
+
855
+ const unlock = await this.sessionLock.lock();
856
+ try {
857
+ if (this.sessionShouldClose.isSet || this.activeSession !== session) {
858
+ break;
859
+ }
860
+ } finally {
861
+ unlock();
862
+ }
863
+
864
+ switch (msg.type) {
865
+ case 'content':
866
+ const { turns, turnComplete } = msg.value;
867
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
868
+ await session.sendClientContent({
869
+ turns,
870
+ turnComplete: turnComplete ?? true,
871
+ });
872
+ break;
873
+ case 'tool_response':
874
+ const { functionResponses } = msg.value;
875
+ if (functionResponses) {
876
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.loggableClientEvent(msg))}`);
877
+ await session.sendToolResponse({
878
+ functionResponses,
879
+ });
880
+ }
881
+ break;
882
+ case 'realtime_input':
883
+ const { mediaChunks, activityStart, activityEnd } = msg.value;
884
+ if (mediaChunks) {
885
+ for (const mediaChunk of mediaChunks) {
886
+ await session.sendRealtimeInput({ media: mediaChunk });
887
+ }
888
+ }
889
+ if (activityStart) await session.sendRealtimeInput({ activityStart });
890
+ if (activityEnd) await session.sendRealtimeInput({ activityEnd });
891
+ break;
892
+ default:
893
+ this.#logger.warn(`Warning: Received unhandled message type: ${msg.type}`);
894
+ break;
895
+ }
896
+ }
897
+ } catch (e) {
898
+ if (!this.sessionShouldClose.isSet) {
899
+ this.#logger.error(`Error in send task: ${e}`);
900
+ this.markRestartNeeded();
901
+ }
902
+ } finally {
903
+ this.#logger.debug(
904
+ {
905
+ closed: this.#closed,
906
+ sessionShouldClose: this.sessionShouldClose.isSet,
907
+ aborted: controller.signal.aborted,
908
+ },
909
+ 'send task finished.',
910
+ );
911
+ }
912
+ }
913
+
914
  /**
   * Processes a single message received from the live Gemini session.
   *
   * Under the session lock, verifies that `session` is still the active one
   * (dropping the message otherwise), lazily starts a new generation when
   * content or tool-call data arrives, then dispatches each payload section to
   * its dedicated handler.
   *
   * @param session the session this message was received on
   * @param response the decoded server message
   */
  private async onReceiveMessage(
    session: types.Session,
    response: types.LiveServerMessage,
  ): Promise<void> {
    // Skip logging verbose audio data events
    const hasAudioData = response.serverContent?.modelTurn?.parts?.some(
      (part) => part.inlineData?.data,
    );
    if (!hasAudioData) {
      this.#logger.debug(`(server) <- ${JSON.stringify(this.loggableServerMessage(response))}`);
    }
    const unlock = await this.sessionLock.lock();

    try {
      // Ignore messages that belong to a session we've already replaced or closed.
      if (this.sessionShouldClose.isSet || this.activeSession !== session) {
        this.#logger.debug('onReceiveMessage: Session changed or closed, stopping receive.');
        return;
      }
    } finally {
      unlock();
    }

    // Start a fresh generation only when the previous one is finished (or none
    // exists) and this message actually carries generation output.
    if (
      (!this.currentGeneration || this.currentGeneration._done) &&
      (response.serverContent || response.toolCall)
    ) {
      this.startNewGeneration();
    }

    // Remember the newest resumption handle so a restarted session can resume.
    if (response.sessionResumptionUpdate) {
      if (
        response.sessionResumptionUpdate.resumable &&
        response.sessionResumptionUpdate.newHandle
      ) {
        this.sessionResumptionHandle = response.sessionResumptionUpdate.newHandle;
      }
    }

    try {
      if (response.serverContent) {
        this.handleServerContent(response.serverContent);
      }

      if (response.toolCall) {
        this.handleToolCall(response.toolCall);
      }

      if (response.toolCallCancellation) {
        this.handleToolCallCancellation(response.toolCallCancellation);
      }

      if (response.usageMetadata) {
        this.handleUsageMetadata(response.usageMetadata);
      }

      if (response.goAway) {
        this.handleGoAway(response.goAway);
      }

      // A successfully handled message resets the retry counter.
      if (this.numRetries > 0) {
        this.numRetries = 0;
      }
    } catch (e) {
      // Only treat handler failures as restart-worthy while the session is
      // still meant to be open.
      if (!this.sessionShouldClose.isSet) {
        this.#logger.error(`Error in onReceiveMessage: ${e}`);
        this.markRestartNeeded();
      }
    }
  }
983
+
984
+ /// Truncate large base64/audio payloads for logging to avoid flooding logs
985
+ private truncateString(data: string, maxLength: number = 30): string {
986
+ return data.length > maxLength ? `${data.slice(0, maxLength)}…` : data;
987
+ }
988
+
989
+ private loggableClientEvent(
990
+ event: api_proto.ClientEvents,
991
+ maxLength: number = 30,
992
+ ): Record<string, unknown> {
993
+ const obj: any = { ...event };
994
+ if (obj.type === 'realtime_input' && obj.value?.mediaChunks) {
995
+ obj.value = {
996
+ ...obj.value,
997
+ mediaChunks: (obj.value.mediaChunks as Array<{ mimeType?: string; data?: string }>).map(
998
+ (mc) => ({
999
+ ...mc,
1000
+ data: typeof mc.data === 'string' ? this.truncateString(mc.data, maxLength) : mc.data,
1001
+ }),
1002
+ ),
1003
+ };
1004
+ }
1005
+ return obj;
1006
+ }
1007
+
1008
+ private loggableServerMessage(
1009
+ message: types.LiveServerMessage,
1010
+ maxLength: number = 30,
1011
+ ): Record<string, unknown> {
1012
+ const obj: any = { ...message };
1013
+ if (
1014
+ obj.serverContent &&
1015
+ obj.serverContent.modelTurn &&
1016
+ Array.isArray(obj.serverContent.modelTurn.parts)
1017
+ ) {
1018
+ obj.serverContent = { ...obj.serverContent };
1019
+ obj.serverContent.modelTurn = { ...obj.serverContent.modelTurn };
1020
+ obj.serverContent.modelTurn.parts = obj.serverContent.modelTurn.parts.map((part: any) => {
1021
+ if (part?.inlineData?.data && typeof part.inlineData.data === 'string') {
1022
+ return {
1023
+ ...part,
1024
+ inlineData: {
1025
+ ...part.inlineData,
1026
+ data: this.truncateString(part.inlineData.data, maxLength),
1027
+ },
1028
+ };
1029
+ }
1030
+ return part;
1031
+ });
1032
+ }
1033
+ return obj;
1034
+ }
1035
+
1036
  /**
   * Finalizes the active generation: emits the final input transcription,
   * records user/assistant messages into the local chat context, and closes
   * all of the generation's channels.
   *
   * Safe to call repeatedly — it is a no-op when there is no active generation
   * or it is already done.
   */
  private markCurrentGenerationDone(): void {
    if (!this.currentGeneration || this.currentGeneration._done) {
      return;
    }

    this.handleInputSpeechStopped();

    const gen = this.currentGeneration;

    // The only way we'd know that the transcription is complete is by when they are
    // done with generation
    if (gen.inputTranscription) {
      this.emit('input_audio_transcription_completed', {
        itemId: gen.inputId,
        transcript: gen.inputTranscription,
        isFinal: true,
      } as llm.InputTranscriptionCompleted);

      // since gemini doesn't give us a view of the chat history on the server side,
      // we would handle it manually here
      this._chatCtx.addMessage({
        role: 'user',
        content: gen.inputTranscription,
        id: gen.inputId,
      });
    }

    // Mirror the assistant output into the local chat context as well.
    if (gen.outputText) {
      this._chatCtx.addMessage({
        role: 'assistant',
        content: gen.outputText,
        id: gen.responseId,
      });
    }

    if (this.options.outputAudioTranscription === undefined) {
      // close the text data of transcription synchronizer
      gen.textChannel.write('');
    }

    // Close every stream so downstream consumers observe end-of-generation.
    gen.textChannel.close();
    gen.audioChannel.close();
    gen.functionChannel.close();
    gen.messageChannel.close();
    gen._done = true;
  }
1082
+
1083
+ private emitError(error: Error, recoverable: boolean): void {
1084
+ this.emit('error', {
1085
+ timestamp: Date.now(),
1086
+ // TODO(brian): add label to realtime model
1087
+ label: 'google_realtime',
1088
+ error,
1089
+ recoverable,
1090
+ });
1091
+ }
1092
+
1093
+ private buildConnectConfig(): types.LiveConnectConfig {
1094
+ const opts = this.options;
1095
+
1096
+ const config: types.LiveConnectConfig = {
1097
+ responseModalities: opts.responseModalities,
1098
+ systemInstruction: opts.instructions
1099
+ ? {
1100
+ parts: [{ text: opts.instructions }],
1101
+ }
1102
+ : undefined,
1103
+ speechConfig: {
1104
+ voiceConfig: {
1105
+ prebuiltVoiceConfig: {
1106
+ voiceName: opts.voice as Voice,
1107
+ },
1108
+ },
1109
+ languageCode: opts.language,
1110
+ },
1111
+ tools: [
1112
+ {
1113
+ functionDeclarations: this.geminiDeclarations,
1114
+ ...this.options.geminiTools,
1115
+ },
1116
+ ],
1117
+ inputAudioTranscription: opts.inputAudioTranscription,
1118
+ outputAudioTranscription: opts.outputAudioTranscription,
1119
+ sessionResumption: {
1120
+ handle: this.sessionResumptionHandle,
1121
+ },
1122
+ };
1123
+
1124
+ // Add generation fields at TOP LEVEL (NO generationConfig!)
1125
+ if (opts.temperature !== undefined) {
1126
+ config.temperature = opts.temperature;
1127
+ }
1128
+ if (opts.maxOutputTokens !== undefined) {
1129
+ config.maxOutputTokens = opts.maxOutputTokens;
1130
+ }
1131
+ if (opts.topP !== undefined) {
1132
+ config.topP = opts.topP;
1133
+ }
1134
+ if (opts.topK !== undefined) {
1135
+ config.topK = opts.topK;
1136
+ }
1137
+
1138
+ if (opts.proactivity !== undefined) {
1139
+ config.proactivity = { proactiveAudio: opts.proactivity };
1140
+ }
1141
+
1142
+ if (opts.enableAffectiveDialog !== undefined) {
1143
+ config.enableAffectiveDialog = opts.enableAffectiveDialog;
1144
+ }
1145
+
1146
+ if (opts.realtimeInputConfig !== undefined) {
1147
+ config.realtimeInputConfig = opts.realtimeInputConfig;
1148
+ }
1149
+
1150
+ if (opts.contextWindowCompression !== undefined) {
1151
+ config.contextWindowCompression = opts.contextWindowCompression;
1152
+ }
1153
+
1154
+ return config;
1155
+ }
1156
+
1157
  /**
   * Begins a new generation: finalizes any still-active one, allocates fresh
   * message/function/text/audio channels, and emits `generation_created`.
   *
   * If a pending generation future exists (a caller explicitly requested a
   * response), it is resolved with the new event and marked user-initiated;
   * otherwise an `input_speech_started` event is emitted first to interrupt
   * any ongoing playout.
   */
  private startNewGeneration(): void {
    if (this.currentGeneration && !this.currentGeneration._done) {
      this.#logger.warn('Starting new generation while another is active. Finalizing previous.');
      this.markCurrentGenerationDone();
    }

    const responseId = shortuuid('GR_');
    this.currentGeneration = {
      messageChannel: stream.createStreamChannel<llm.MessageGeneration>(),
      functionChannel: stream.createStreamChannel<llm.FunctionCall>(),
      responseId,
      inputId: shortuuid('GI_'),
      textChannel: stream.createStreamChannel<string>(),
      audioChannel: stream.createStreamChannel<AudioFrame>(),
      inputTranscription: '',
      outputText: '',
      _createdTimestamp: Date.now(),
      _done: false,
    };

    // Close audio stream if audio output is not supported by the model
    if (!this.options.responseModalities.includes(Modality.AUDIO)) {
      this.currentGeneration.audioChannel.close();
    }

    // Publish the single message of this generation with its text/audio streams.
    this.currentGeneration.messageChannel.write({
      messageId: responseId,
      textStream: this.currentGeneration.textChannel.stream(),
      audioStream: this.currentGeneration.audioChannel.stream(),
    });

    const generationEvent: llm.GenerationCreatedEvent = {
      messageStream: this.currentGeneration.messageChannel.stream(),
      functionStream: this.currentGeneration.functionChannel.stream(),
      userInitiated: false,
    };

    if (this.pendingGenerationFut && !this.pendingGenerationFut.done) {
      generationEvent.userInitiated = true;
      this.pendingGenerationFut.resolve(generationEvent);
      this.pendingGenerationFut = undefined;
    } else {
      // emit input_speech_started event before starting an agent initiated generation
      // to interrupt the previous audio playout if any
      this.handleInputSpeechStarted();
    }

    this.emit('generation_created', generationEvent);
  }
1206
+
1207
+ private handleInputSpeechStarted(): void {
1208
+ this.emit('input_speech_started', {} as llm.InputSpeechStartedEvent);
1209
+ }
1210
+
1211
+ private handleInputSpeechStopped(): void {
1212
+ this.emit('input_speech_stopped', {
1213
+ userTranscriptionEnabled: false,
1214
+ } as llm.InputSpeechStoppedEvent);
1215
+ }
1216
+
1217
+ private handleServerContent(serverContent: types.LiveServerContent): void {
1218
+ if (!this.currentGeneration) {
1219
+ this.#logger.warn('received server content but no active generation.');
1220
+ return;
1221
+ }
1222
+
1223
+ const gen = this.currentGeneration;
1224
+
1225
+ if (serverContent.modelTurn) {
1226
+ const turn = serverContent.modelTurn;
1227
+
1228
+ for (const part of turn.parts || []) {
1229
+ if (part.text) {
1230
+ gen.outputText += part.text;
1231
+ gen.textChannel.write(part.text);
1232
+ }
1233
+
1234
+ if (part.inlineData) {
1235
+ if (!gen._firstTokenTimestamp) {
1236
+ gen._firstTokenTimestamp = Date.now();
1237
+ }
1238
+
1239
+ try {
1240
+ if (!part.inlineData.data) {
1241
+ throw new Error('frameData is not bytes');
1242
+ }
1243
+
1244
+ const binaryString = atob(part.inlineData.data);
1245
+ const len = binaryString.length;
1246
+ const bytes = new Uint8Array(len);
1247
+ for (let i = 0; i < len; i++) {
1248
+ bytes[i] = binaryString.charCodeAt(i);
1249
+ }
1250
+
1251
+ const int16Array = new Int16Array(bytes.buffer);
1252
+ const audioFrame = new AudioFrame(
1253
+ int16Array,
1254
+ OUTPUT_AUDIO_SAMPLE_RATE,
1255
+ OUTPUT_AUDIO_CHANNELS,
1256
+ int16Array.length / OUTPUT_AUDIO_CHANNELS,
1257
+ );
1258
+
1259
+ gen.audioChannel.write(audioFrame);
1260
+ } catch (error) {
1261
+ this.#logger.error('Error processing audio data:', error);
1262
+ }
1263
+ }
1264
+ }
1265
+ }
1266
+
1267
+ if (serverContent.inputTranscription && serverContent.inputTranscription.text) {
1268
+ let text = serverContent.inputTranscription.text;
1269
+
1270
+ if (gen.inputTranscription === '') {
1271
+ text = text.trimStart();
1272
+ }
1273
+
1274
+ gen.inputTranscription += text;
1275
+ this.emit('input_audio_transcription_completed', {
1276
+ itemId: gen.inputId,
1277
+ transcript: gen.inputTranscription,
1278
+ isFinal: false,
1279
+ } as llm.InputTranscriptionCompleted);
1280
+ }
1281
+
1282
+ if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
1283
+ const text = serverContent.outputTranscription.text;
1284
+ gen.outputText += text;
1285
+ gen.textChannel.write(text);
1286
+ }
1287
+
1288
+ if (serverContent.generationComplete || serverContent.turnComplete) {
1289
+ gen._completedTimestamp = Date.now();
1290
+ }
1291
+
1292
+ if (serverContent.interrupted) {
1293
+ this.handleInputSpeechStarted();
1294
+ }
1295
+
1296
+ if (serverContent.turnComplete) {
1297
+ this.markCurrentGenerationDone();
1298
+ }
1299
+ }
1300
+
1301
+ private handleToolCall(toolCall: types.LiveServerToolCall): void {
1302
+ if (!this.currentGeneration) {
1303
+ this.#logger.warn('received tool call but no active generation.');
1304
+ return;
1305
+ }
1306
+
1307
+ const gen = this.currentGeneration;
1308
+
1309
+ for (const fc of toolCall.functionCalls || []) {
1310
+ gen.functionChannel.write({
1311
+ callId: fc.id || shortuuid('fnc-call-'),
1312
+ name: fc.name,
1313
+ args: fc.args ? JSON.stringify(fc.args) : '',
1314
+ } as llm.FunctionCall);
1315
+ }
1316
+
1317
+ this.markCurrentGenerationDone();
1318
+ }
1319
+
1320
+ private handleToolCallCancellation(cancellation: types.LiveServerToolCallCancellation): void {
1321
+ this.#logger.warn(
1322
+ {
1323
+ functionCallIds: cancellation.ids,
1324
+ },
1325
+ 'server cancelled tool calls',
1326
+ );
1327
+ }
1328
+
1329
+ private handleUsageMetadata(usage: types.UsageMetadata): void {
1330
+ if (!this.currentGeneration) {
1331
+ this.#logger.debug('Received usage metadata but no active generation');
1332
+ return;
1333
+ }
1334
+
1335
+ const gen = this.currentGeneration;
1336
+ const createdTimestamp = gen._createdTimestamp;
1337
+ const firstTokenTimestamp = gen._firstTokenTimestamp;
1338
+ const completedTimestamp = gen._completedTimestamp || Date.now();
1339
+
1340
+ // Calculate metrics
1341
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
1342
+ const duration = (completedTimestamp - createdTimestamp) / 1000; // Convert to seconds
1343
+
1344
+ const inputTokens = usage.promptTokenCount || 0;
1345
+ const outputTokens = usage.responseTokenCount || 0;
1346
+ const totalTokens = usage.totalTokenCount || 0;
1347
+
1348
+ const realtimeMetrics = {
1349
+ type: 'realtime_model_metrics',
1350
+ timestamp: createdTimestamp / 1000,
1351
+ requestId: gen.responseId,
1352
+ ttft,
1353
+ duration,
1354
+ cancelled: gen._done && !gen._completedTimestamp,
1355
+ label: 'google_realtime',
1356
+ inputTokens,
1357
+ outputTokens,
1358
+ totalTokens,
1359
+ tokensPerSecond: duration > 0 ? outputTokens / duration : 0,
1360
+ inputTokenDetails: {
1361
+ ...this.tokenDetailsMap(usage.promptTokensDetails),
1362
+ cachedTokens: (usage.cacheTokensDetails || []).reduce(
1363
+ (sum, detail) => sum + (detail.tokenCount || 0),
1364
+ 0,
1365
+ ),
1366
+ cachedTokensDetails: this.tokenDetailsMap(usage.cacheTokensDetails),
1367
+ },
1368
+ outputTokenDetails: this.tokenDetailsMap(usage.responseTokensDetails),
1369
+ };
1370
+
1371
+ this.emit('metrics_collected', realtimeMetrics);
1372
+ }
1373
+
1374
+ private tokenDetailsMap(tokenDetails: types.ModalityTokenCount[] | undefined): {
1375
+ audioTokens: number;
1376
+ textTokens: number;
1377
+ imageTokens: number;
1378
+ } {
1379
+ const tokenDetailsMap = { audioTokens: 0, textTokens: 0, imageTokens: 0 };
1380
+ if (!tokenDetails) {
1381
+ return tokenDetailsMap;
1382
+ }
1383
+
1384
+ for (const tokenDetail of tokenDetails) {
1385
+ if (!tokenDetail.tokenCount) {
1386
+ continue;
1387
+ }
1388
+
1389
+ if (tokenDetail.modality === types.MediaModality.AUDIO) {
1390
+ tokenDetailsMap.audioTokens += tokenDetail.tokenCount;
1391
+ } else if (tokenDetail.modality === types.MediaModality.TEXT) {
1392
+ tokenDetailsMap.textTokens += tokenDetail.tokenCount;
1393
+ } else if (tokenDetail.modality === types.MediaModality.IMAGE) {
1394
+ tokenDetailsMap.imageTokens += tokenDetail.tokenCount;
1395
+ }
1396
+ }
1397
+ return tokenDetailsMap;
1398
+ }
1399
+
1400
+ private handleGoAway(goAway: types.LiveServerGoAway): void {
1401
+ this.#logger.warn({ timeLeft: goAway.timeLeft }, 'Gemini server indicates disconnection soon.');
1402
+ // TODO(brian): this isn't a seamless reconnection just yet
1403
+ this.sessionShouldClose.set();
1404
+ }
1405
+
1406
  // Intentionally a no-op: audio is forwarded to the session as it arrives
  // (see the realtime_input handling in the send task), so there is nothing
  // buffered to commit.
  async commitAudio() {}
1407
+
1408
  // Intentionally a no-op — NOTE(review): appears no client-side audio buffer
  // is kept that could be cleared; confirm against the session input path.
  async clearAudio() {}
1409
+
1410
+ private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
1411
+ if (this.inputResampler) {
1412
+ if (frame.sampleRate !== this.inputResamplerInputRate) {
1413
+ // input audio changed to a different sample rate
1414
+ this.inputResampler = undefined;
1415
+ this.inputResamplerInputRate = undefined;
1416
+ }
1417
+ }
1418
+
1419
+ if (
1420
+ this.inputResampler === undefined &&
1421
+ (frame.sampleRate !== INPUT_AUDIO_SAMPLE_RATE || frame.channels !== INPUT_AUDIO_CHANNELS)
1422
+ ) {
1423
+ this.inputResampler = new AudioResampler(
1424
+ frame.sampleRate,
1425
+ INPUT_AUDIO_SAMPLE_RATE,
1426
+ INPUT_AUDIO_CHANNELS,
1427
+ );
1428
+ this.inputResamplerInputRate = frame.sampleRate;
1429
+ }
1430
+
1431
+ if (this.inputResampler) {
1432
+ // TODO(brian): flush the resampler when the input source is changed
1433
+ for (const resampledFrame of this.inputResampler.push(frame)) {
1434
+ yield resampledFrame;
1435
+ }
1436
+ } else {
1437
+ yield frame;
1438
+ }
1439
+ }
1440
+ }