npm - @livekit/agents-plugin-google - Versions diffs - 1.0.39 → 1.0.41 - Mend

@livekit/agents-plugin-google 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/beta/realtime/realtime_api.cjs +67 -5
package/dist/beta/realtime/realtime_api.cjs.map +1 -1
package/dist/beta/realtime/realtime_api.d.cts +3 -0
package/dist/beta/realtime/realtime_api.d.ts +3 -0
package/dist/beta/realtime/realtime_api.d.ts.map +1 -1
package/dist/beta/realtime/realtime_api.js +68 -5
package/dist/beta/realtime/realtime_api.js.map +1 -1
package/package.json +5 -5
package/src/beta/realtime/realtime_api.ts +96 -6

package/src/beta/realtime/realtime_api.ts CHANGED Viewed

@@ -15,6 +15,7 @@ import {
 import type { APIConnectOptions } from '@livekit/agents';
 import {
   APIConnectionError,
+  APIStatusError,
   AudioByteStream,
   DEFAULT_API_CONNECT_OPTIONS,
   Event,
@@ -45,6 +46,8 @@ const OUTPUT_AUDIO_CHANNELS = 1;
 const LK_GOOGLE_DEBUG = Number(process.env.LK_GOOGLE_DEBUG ?? 0);
+// WebSocket close codes (RFC 6455)
+const WS_CLOSE_NORMAL = 1000;
 /**
  * Default image encoding options for Google Realtime API
  */
@@ -410,6 +413,8 @@ export class RealtimeSession extends llm.RealtimeSession {
   private sessionLock = new Mutex();
   private numRetries = 0;
   private hasReceivedAudioInput = false;
+  private pendingInterruptText = false;
+  private earlyCompletionPending = false;
   #client: GoogleGenAI;
   #task: Promise<void>;
@@ -468,6 +473,8 @@ export class RealtimeSession extends llm.RealtimeSession {
         this.activeSession = undefined;
       }
     }
+    this.earlyCompletionPending = false;
+    this.pendingInterruptText = false;
     unlock();
   }
@@ -568,6 +575,27 @@ export class RealtimeSession extends llm.RealtimeSession {
       const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
       if (turns.length > 0) {
+        const shouldSendRealtimeText = this.pendingInterruptText;
+        if (shouldSendRealtimeText) {
+          for (const turn of turns as types.Content[]) {
+            if (turn.role !== 'user') continue;
+            // Realtime text drives live activity/interrupts.
+            // A 'content' client event with turnComplete: true does not, on its own, reliably preempt a streaming response in Gemini Live.
+            const text = (turn.parts || [])
+              .map((part) => (part as { text?: string }).text)
+              .filter((value): value is string => !!value)
+              .join('');
+            if (text) {
+              this.sendClientEvent({
+                type: 'realtime_input',
+                value: { text },
+              });
+              this.pendingInterruptText = false;
+            }
+          }
+        }
         this.sendClientEvent({
           type: 'content',
           value: {
@@ -717,11 +745,25 @@ export class RealtimeSession extends llm.RealtimeSession {
     }
   }
+  private generationHasOutput(gen: ResponseGeneration): boolean {
+    return Boolean(gen.outputText) || gen._firstTokenTimestamp !== undefined;
+  }
   async interrupt() {
     // Gemini Live treats activity start as interruption, so we rely on startUserActivity to handle it
     if (this.options.realtimeInputConfig?.activityHandling === ActivityHandling.NO_INTERRUPTION) {
+      if (LK_GOOGLE_DEBUG) {
+        this.#logger.debug('interrupt skipped (activityHandling = NO_INTERRUPTION)');
+      }
       return;
     }
+    if (this.currentGeneration && !this.currentGeneration._done) {
+      this.pendingInterruptText = true;
+      if (this.generationHasOutput(this.currentGeneration)) {
+        this.earlyCompletionPending = true;
+        this.markCurrentGenerationDone();
+      }
+    }
     this.startUserActivity();
   }
@@ -774,6 +816,8 @@ export class RealtimeSession extends llm.RealtimeSession {
             onmessage: (message: types.LiveServerMessage) => {
               this.onReceiveMessage(session, message);
             },
+            // onerror is called for network-level errors (connection refused, DNS failure, TLS errors).
+            // Application-level errors (e.g., invalid model name) come through onclose with error codes.
             onerror: (error: ErrorEvent) => {
               this.#logger.error('Gemini Live session error:', error);
               if (!this.sessionShouldClose.isSet) {
@@ -781,7 +825,33 @@ export class RealtimeSession extends llm.RealtimeSession {
               }
             },
             onclose: (event: CloseEvent) => {
-              this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
+              // Surface WebSocket close errors to the user instead of silently swallowing them
+              if (event.code !== WS_CLOSE_NORMAL) {
+                // Note: WebSocket close reasons are limited to 123 bytes by RFC 6455,
+                // so Google's error messages may be truncated at the protocol level
+                const isTruncated = event.reason && event.reason.length >= 120;
+                const truncationNote = isTruncated
+                  ? ' (message may be truncated - check model name and API permissions)'
+                  : '';
+                const errorMsg = event.reason || `WebSocket closed with code ${event.code}`;
+                this.#logger.error(`Gemini Live session error: ${errorMsg}${truncationNote}`);
+                this.emitError(
+                  new APIStatusError({
+                    message: `${errorMsg}${truncationNote}`,
+                    options: {
+                      statusCode: event.code,
+                      retryable: false,
+                      body: event.reason
+                        ? { reason: event.reason, code: event.code, truncated: isTruncated }
+                        : null,
+                    },
+                  }),
+                  false,
+                );
+              } else {
+                this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
+              }
               this.markCurrentGenerationDone();
             },
           },
@@ -903,12 +973,15 @@ export class RealtimeSession extends llm.RealtimeSession {
             }
             break;
           case 'realtime_input':
-            const { mediaChunks, activityStart, activityEnd } = msg.value;
+            const { mediaChunks, activityStart, activityEnd, text } = msg.value;
             if (mediaChunks) {
               for (const mediaChunk of mediaChunks) {
                 await session.sendRealtimeInput({ media: mediaChunk });
               }
             }
+            if (text) {
+              await session.sendRealtimeInput({ text });
+            }
             if (activityStart) await session.sendRealtimeInput({ activityStart });
             if (activityEnd) await session.sendRealtimeInput({ activityEnd });
             break;
@@ -960,7 +1033,6 @@ export class RealtimeSession extends llm.RealtimeSession {
     const shouldStartNewGeneration =
       !this.currentGeneration || this.currentGeneration._done || !!this.pendingGenerationFut;
     if (shouldStartNewGeneration) {
       if (response.serverContent?.interrupted) {
         // Two cases when an interrupted event is sent without an active generation:
@@ -1295,7 +1367,9 @@ export class RealtimeSession extends llm.RealtimeSession {
     const gen = this.currentGeneration;
-    if (serverContent.modelTurn) {
+    const discardOutput = this.earlyCompletionPending;
+    if (serverContent.modelTurn && !discardOutput) {
       const turn = serverContent.modelTurn;
       for (const part of turn.parts || []) {
@@ -1357,7 +1431,11 @@ export class RealtimeSession extends llm.RealtimeSession {
       } as llm.InputTranscriptionCompleted);
     }
-    if (serverContent.outputTranscription && serverContent.outputTranscription.text) {
+    if (
+      !discardOutput &&
+      serverContent.outputTranscription &&
+      serverContent.outputTranscription.text
+    ) {
       const text = serverContent.outputTranscription.text;
       gen.outputText += text;
       gen.textChannel.write(text);
@@ -1371,9 +1449,18 @@ export class RealtimeSession extends llm.RealtimeSession {
       this.handleInputSpeechStarted();
     }
-    if (serverContent.turnComplete) {
+    if (serverContent.turnComplete && !this.earlyCompletionPending) {
       this.markCurrentGenerationDone();
     }
+    // Assume Gemini emits turnComplete/generationComplete before any new generation content.
+    // We keep discarding until that signal to avoid old stream spillover after interrupts.
+    if (
+      this.earlyCompletionPending &&
+      (serverContent.turnComplete || serverContent.generationComplete)
+    ) {
+      this.earlyCompletionPending = false;
+    }
   }
   private handleToolCall(toolCall: types.LiveServerToolCall): void {
@@ -1529,6 +1616,9 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
   private isNewGeneration(response: types.LiveServerMessage) {
+    if (this.earlyCompletionPending) {
+      return false;
+    }
     if (response.toolCall) {
       return true;
     }