@livekit/agents-plugin-google 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ import {
15
15
  import type { APIConnectOptions } from '@livekit/agents';
16
16
  import {
17
17
  APIConnectionError,
18
+ APIStatusError,
18
19
  AudioByteStream,
19
20
  DEFAULT_API_CONNECT_OPTIONS,
20
21
  Event,
@@ -45,6 +46,8 @@ const OUTPUT_AUDIO_CHANNELS = 1;
45
46
 
46
47
  const LK_GOOGLE_DEBUG = Number(process.env.LK_GOOGLE_DEBUG ?? 0);
47
48
 
49
+ // WebSocket close codes (RFC 6455)
50
+ const WS_CLOSE_NORMAL = 1000;
48
51
  /**
49
52
  * Default image encoding options for Google Realtime API
50
53
  */
@@ -410,6 +413,8 @@ export class RealtimeSession extends llm.RealtimeSession {
410
413
  private sessionLock = new Mutex();
411
414
  private numRetries = 0;
412
415
  private hasReceivedAudioInput = false;
416
+ private pendingInterruptText = false;
417
+ private earlyCompletionPending = false;
413
418
 
414
419
  #client: GoogleGenAI;
415
420
  #task: Promise<void>;
@@ -468,6 +473,8 @@ export class RealtimeSession extends llm.RealtimeSession {
468
473
  this.activeSession = undefined;
469
474
  }
470
475
  }
476
+ this.earlyCompletionPending = false;
477
+ this.pendingInterruptText = false;
471
478
 
472
479
  unlock();
473
480
  }
@@ -568,6 +575,27 @@ export class RealtimeSession extends llm.RealtimeSession {
568
575
  const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
569
576
 
570
577
  if (turns.length > 0) {
578
+ const shouldSendRealtimeText = this.pendingInterruptText;
579
+
580
+ if (shouldSendRealtimeText) {
581
+ for (const turn of turns as types.Content[]) {
582
+ if (turn.role !== 'user') continue;
583
+ // Realtime text drives live activity/interrupts
584
+ // A `content` client event with `turnComplete: true` alone does not reliably preempt a streaming response in Gemini Live.
585
+ const text = (turn.parts || [])
586
+ .map((part) => (part as { text?: string }).text)
587
+ .filter((value): value is string => !!value)
588
+ .join('');
589
+ if (text) {
590
+ this.sendClientEvent({
591
+ type: 'realtime_input',
592
+ value: { text },
593
+ });
594
+ this.pendingInterruptText = false;
595
+ }
596
+ }
597
+ }
598
+
571
599
  this.sendClientEvent({
572
600
  type: 'content',
573
601
  value: {
@@ -717,11 +745,25 @@ export class RealtimeSession extends llm.RealtimeSession {
717
745
  }
718
746
  }
719
747
 
748
+ private generationHasOutput(gen: ResponseGeneration): boolean {
749
+ return Boolean(gen.outputText) || gen._firstTokenTimestamp !== undefined;
750
+ }
751
+
720
752
  async interrupt() {
721
753
  // Gemini Live treats activity start as interruption, so we rely on startUserActivity to handle it
722
754
  if (this.options.realtimeInputConfig?.activityHandling === ActivityHandling.NO_INTERRUPTION) {
755
+ if (LK_GOOGLE_DEBUG) {
756
+ this.#logger.debug('interrupt skipped (activityHandling = NO_INTERRUPTION)');
757
+ }
723
758
  return;
724
759
  }
760
+ if (this.currentGeneration && !this.currentGeneration._done) {
761
+ this.pendingInterruptText = true;
762
+ if (this.generationHasOutput(this.currentGeneration)) {
763
+ this.earlyCompletionPending = true;
764
+ this.markCurrentGenerationDone();
765
+ }
766
+ }
725
767
  this.startUserActivity();
726
768
  }
727
769
 
@@ -774,6 +816,8 @@ export class RealtimeSession extends llm.RealtimeSession {
774
816
  onmessage: (message: types.LiveServerMessage) => {
775
817
  this.onReceiveMessage(session, message);
776
818
  },
819
+ // onerror is called for network-level errors (connection refused, DNS failure, TLS errors).
820
+ // Application-level errors (e.g., invalid model name) come through onclose with error codes.
777
821
  onerror: (error: ErrorEvent) => {
778
822
  this.#logger.error('Gemini Live session error:', error);
779
823
  if (!this.sessionShouldClose.isSet) {
@@ -781,7 +825,33 @@ export class RealtimeSession extends llm.RealtimeSession {
781
825
  }
782
826
  },
783
827
  onclose: (event: CloseEvent) => {
784
- this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
828
+ // Surface WebSocket close errors to the user instead of silently swallowing them
829
+ if (event.code !== WS_CLOSE_NORMAL) {
830
+ // Note: WebSocket close reasons are limited to 123 bytes by RFC 6455,
831
+ // so Google's error messages may be truncated at the protocol level
832
+ const isTruncated = event.reason && event.reason.length >= 120;
833
+ const truncationNote = isTruncated
834
+ ? ' (message may be truncated - check model name and API permissions)'
835
+ : '';
836
+ const errorMsg = event.reason || `WebSocket closed with code ${event.code}`;
837
+ this.#logger.error(`Gemini Live session error: ${errorMsg}${truncationNote}`);
838
+
839
+ this.emitError(
840
+ new APIStatusError({
841
+ message: `${errorMsg}${truncationNote}`,
842
+ options: {
843
+ statusCode: event.code,
844
+ retryable: false,
845
+ body: event.reason
846
+ ? { reason: event.reason, code: event.code, truncated: isTruncated }
847
+ : null,
848
+ },
849
+ }),
850
+ false,
851
+ );
852
+ } else {
853
+ this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
854
+ }
785
855
  this.markCurrentGenerationDone();
786
856
  },
787
857
  },
@@ -903,12 +973,15 @@ export class RealtimeSession extends llm.RealtimeSession {
903
973
  }
904
974
  break;
905
975
  case 'realtime_input':
906
- const { mediaChunks, activityStart, activityEnd } = msg.value;
976
+ const { mediaChunks, activityStart, activityEnd, text } = msg.value;
907
977
  if (mediaChunks) {
908
978
  for (const mediaChunk of mediaChunks) {
909
979
  await session.sendRealtimeInput({ media: mediaChunk });
910
980
  }
911
981
  }
982
+ if (text) {
983
+ await session.sendRealtimeInput({ text });
984
+ }
912
985
  if (activityStart) await session.sendRealtimeInput({ activityStart });
913
986
  if (activityEnd) await session.sendRealtimeInput({ activityEnd });
914
987
  break;
@@ -960,7 +1033,6 @@ export class RealtimeSession extends llm.RealtimeSession {
960
1033
 
961
1034
  const shouldStartNewGeneration =
962
1035
  !this.currentGeneration || this.currentGeneration._done || !!this.pendingGenerationFut;
963
-
964
1036
  if (shouldStartNewGeneration) {
965
1037
  if (response.serverContent?.interrupted) {
966
1038
  // Two cases when an interrupted event is sent without an active generation:
@@ -1295,7 +1367,9 @@ export class RealtimeSession extends llm.RealtimeSession {
1295
1367
 
1296
1368
  const gen = this.currentGeneration;
1297
1369
 
1298
- if (serverContent.modelTurn) {
1370
+ const discardOutput = this.earlyCompletionPending;
1371
+
1372
+ if (serverContent.modelTurn && !discardOutput) {
1299
1373
  const turn = serverContent.modelTurn;
1300
1374
 
1301
1375
  for (const part of turn.parts || []) {
@@ -1357,7 +1431,11 @@ export class RealtimeSession extends llm.RealtimeSession {
1357
1431
  } as llm.InputTranscriptionCompleted);
1358
1432
  }
1359
1433
 
1360
- if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
1434
+ if (
1435
+ !discardOutput &&
1436
+ serverContent.outputTranscription &&
1437
+ serverContent.outputTranscription.text
1438
+ ) {
1361
1439
  const text = serverContent.outputTranscription.text;
1362
1440
  gen.outputText += text;
1363
1441
  gen.textChannel.write(text);
@@ -1371,9 +1449,18 @@ export class RealtimeSession extends llm.RealtimeSession {
1371
1449
  this.handleInputSpeechStarted();
1372
1450
  }
1373
1451
 
1374
- if (serverContent.turnComplete) {
1452
+ if (serverContent.turnComplete && !this.earlyCompletionPending) {
1375
1453
  this.markCurrentGenerationDone();
1376
1454
  }
1455
+
1456
+ // Assume Gemini emits turnComplete/generationComplete before any new generation content.
1457
+ // We keep discarding until that signal to avoid old stream spillover after interrupts.
1458
+ if (
1459
+ this.earlyCompletionPending &&
1460
+ (serverContent.turnComplete || serverContent.generationComplete)
1461
+ ) {
1462
+ this.earlyCompletionPending = false;
1463
+ }
1377
1464
  }
1378
1465
 
1379
1466
  private handleToolCall(toolCall: types.LiveServerToolCall): void {
@@ -1529,6 +1616,9 @@ export class RealtimeSession extends llm.RealtimeSession {
1529
1616
  }
1530
1617
 
1531
1618
  private isNewGeneration(response: types.LiveServerMessage) {
1619
+ if (this.earlyCompletionPending) {
1620
+ return false;
1621
+ }
1532
1622
  if (response.toolCall) {
1533
1623
  return true;
1534
1624
  }