@livekit/agents-plugin-google 1.0.39 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/beta/realtime/realtime_api.cjs +67 -5
- package/dist/beta/realtime/realtime_api.cjs.map +1 -1
- package/dist/beta/realtime/realtime_api.d.cts +3 -0
- package/dist/beta/realtime/realtime_api.d.ts +3 -0
- package/dist/beta/realtime/realtime_api.d.ts.map +1 -1
- package/dist/beta/realtime/realtime_api.js +68 -5
- package/dist/beta/realtime/realtime_api.js.map +1 -1
- package/package.json +5 -5
- package/src/beta/realtime/realtime_api.ts +96 -6
|
@@ -15,6 +15,7 @@ import {
|
|
|
15
15
|
import type { APIConnectOptions } from '@livekit/agents';
|
|
16
16
|
import {
|
|
17
17
|
APIConnectionError,
|
|
18
|
+
APIStatusError,
|
|
18
19
|
AudioByteStream,
|
|
19
20
|
DEFAULT_API_CONNECT_OPTIONS,
|
|
20
21
|
Event,
|
|
@@ -45,6 +46,8 @@ const OUTPUT_AUDIO_CHANNELS = 1;
|
|
|
45
46
|
|
|
46
47
|
const LK_GOOGLE_DEBUG = Number(process.env.LK_GOOGLE_DEBUG ?? 0);
|
|
47
48
|
|
|
49
|
+
// WebSocket close codes (RFC 6455)
|
|
50
|
+
const WS_CLOSE_NORMAL = 1000;
|
|
48
51
|
/**
|
|
49
52
|
* Default image encoding options for Google Realtime API
|
|
50
53
|
*/
|
|
@@ -410,6 +413,8 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
410
413
|
private sessionLock = new Mutex();
|
|
411
414
|
private numRetries = 0;
|
|
412
415
|
private hasReceivedAudioInput = false;
|
|
416
|
+
private pendingInterruptText = false;
|
|
417
|
+
private earlyCompletionPending = false;
|
|
413
418
|
|
|
414
419
|
#client: GoogleGenAI;
|
|
415
420
|
#task: Promise<void>;
|
|
@@ -468,6 +473,8 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
468
473
|
this.activeSession = undefined;
|
|
469
474
|
}
|
|
470
475
|
}
|
|
476
|
+
this.earlyCompletionPending = false;
|
|
477
|
+
this.pendingInterruptText = false;
|
|
471
478
|
|
|
472
479
|
unlock();
|
|
473
480
|
}
|
|
@@ -568,6 +575,27 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
568
575
|
const toolResults = this.getToolResultsForRealtime(appendCtx, this.options.vertexai);
|
|
569
576
|
|
|
570
577
|
if (turns.length > 0) {
|
|
578
|
+
const shouldSendRealtimeText = this.pendingInterruptText;
|
|
579
|
+
|
|
580
|
+
if (shouldSendRealtimeText) {
|
|
581
|
+
for (const turn of turns as types.Content[]) {
|
|
582
|
+
if (turn.role !== 'user') continue;
|
|
583
|
+
// Send the user's text as realtime input: realtime text drives live activity/interrupts.
|
|
584
|
+
// A `content` event with `turnComplete: true` alone does not reliably preempt a streaming response in Gemini Live.
|
|
585
|
+
const text = (turn.parts || [])
|
|
586
|
+
.map((part) => (part as { text?: string }).text)
|
|
587
|
+
.filter((value): value is string => !!value)
|
|
588
|
+
.join('');
|
|
589
|
+
if (text) {
|
|
590
|
+
this.sendClientEvent({
|
|
591
|
+
type: 'realtime_input',
|
|
592
|
+
value: { text },
|
|
593
|
+
});
|
|
594
|
+
this.pendingInterruptText = false;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
|
|
571
599
|
this.sendClientEvent({
|
|
572
600
|
type: 'content',
|
|
573
601
|
value: {
|
|
@@ -717,11 +745,25 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
717
745
|
}
|
|
718
746
|
}
|
|
719
747
|
|
|
748
|
+
private generationHasOutput(gen: ResponseGeneration): boolean {
|
|
749
|
+
return Boolean(gen.outputText) || gen._firstTokenTimestamp !== undefined;
|
|
750
|
+
}
|
|
751
|
+
|
|
720
752
|
async interrupt() {
|
|
721
753
|
// Gemini Live treats activity start as interruption, so we rely on startUserActivity to handle it
|
|
722
754
|
if (this.options.realtimeInputConfig?.activityHandling === ActivityHandling.NO_INTERRUPTION) {
|
|
755
|
+
if (LK_GOOGLE_DEBUG) {
|
|
756
|
+
this.#logger.debug('interrupt skipped (activityHandling = NO_INTERRUPTION)');
|
|
757
|
+
}
|
|
723
758
|
return;
|
|
724
759
|
}
|
|
760
|
+
if (this.currentGeneration && !this.currentGeneration._done) {
|
|
761
|
+
this.pendingInterruptText = true;
|
|
762
|
+
if (this.generationHasOutput(this.currentGeneration)) {
|
|
763
|
+
this.earlyCompletionPending = true;
|
|
764
|
+
this.markCurrentGenerationDone();
|
|
765
|
+
}
|
|
766
|
+
}
|
|
725
767
|
this.startUserActivity();
|
|
726
768
|
}
|
|
727
769
|
|
|
@@ -774,6 +816,8 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
774
816
|
onmessage: (message: types.LiveServerMessage) => {
|
|
775
817
|
this.onReceiveMessage(session, message);
|
|
776
818
|
},
|
|
819
|
+
// onerror is called for network-level errors (connection refused, DNS failure, TLS errors).
|
|
820
|
+
// Application-level errors (e.g., invalid model name) come through onclose with error codes.
|
|
777
821
|
onerror: (error: ErrorEvent) => {
|
|
778
822
|
this.#logger.error('Gemini Live session error:', error);
|
|
779
823
|
if (!this.sessionShouldClose.isSet) {
|
|
@@ -781,7 +825,33 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
781
825
|
}
|
|
782
826
|
},
|
|
783
827
|
onclose: (event: CloseEvent) => {
|
|
784
|
-
|
|
828
|
+
// Surface WebSocket close errors to the user instead of silently swallowing them
|
|
829
|
+
if (event.code !== WS_CLOSE_NORMAL) {
|
|
830
|
+
// Note: WebSocket close reasons are limited to 123 bytes by RFC 6455,
|
|
831
|
+
// so Google's error messages may be truncated at the protocol level
|
|
832
|
+
const isTruncated = event.reason && event.reason.length >= 120;
|
|
833
|
+
const truncationNote = isTruncated
|
|
834
|
+
? ' (message may be truncated - check model name and API permissions)'
|
|
835
|
+
: '';
|
|
836
|
+
const errorMsg = event.reason || `WebSocket closed with code ${event.code}`;
|
|
837
|
+
this.#logger.error(`Gemini Live session error: ${errorMsg}${truncationNote}`);
|
|
838
|
+
|
|
839
|
+
this.emitError(
|
|
840
|
+
new APIStatusError({
|
|
841
|
+
message: `${errorMsg}${truncationNote}`,
|
|
842
|
+
options: {
|
|
843
|
+
statusCode: event.code,
|
|
844
|
+
retryable: false,
|
|
845
|
+
body: event.reason
|
|
846
|
+
? { reason: event.reason, code: event.code, truncated: isTruncated }
|
|
847
|
+
: null,
|
|
848
|
+
},
|
|
849
|
+
}),
|
|
850
|
+
false,
|
|
851
|
+
);
|
|
852
|
+
} else {
|
|
853
|
+
this.#logger.debug('Gemini Live session closed:', event.code, event.reason);
|
|
854
|
+
}
|
|
785
855
|
this.markCurrentGenerationDone();
|
|
786
856
|
},
|
|
787
857
|
},
|
|
@@ -903,12 +973,15 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
903
973
|
}
|
|
904
974
|
break;
|
|
905
975
|
case 'realtime_input':
|
|
906
|
-
const { mediaChunks, activityStart, activityEnd } = msg.value;
|
|
976
|
+
const { mediaChunks, activityStart, activityEnd, text } = msg.value;
|
|
907
977
|
if (mediaChunks) {
|
|
908
978
|
for (const mediaChunk of mediaChunks) {
|
|
909
979
|
await session.sendRealtimeInput({ media: mediaChunk });
|
|
910
980
|
}
|
|
911
981
|
}
|
|
982
|
+
if (text) {
|
|
983
|
+
await session.sendRealtimeInput({ text });
|
|
984
|
+
}
|
|
912
985
|
if (activityStart) await session.sendRealtimeInput({ activityStart });
|
|
913
986
|
if (activityEnd) await session.sendRealtimeInput({ activityEnd });
|
|
914
987
|
break;
|
|
@@ -960,7 +1033,6 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
960
1033
|
|
|
961
1034
|
const shouldStartNewGeneration =
|
|
962
1035
|
!this.currentGeneration || this.currentGeneration._done || !!this.pendingGenerationFut;
|
|
963
|
-
|
|
964
1036
|
if (shouldStartNewGeneration) {
|
|
965
1037
|
if (response.serverContent?.interrupted) {
|
|
966
1038
|
// Two cases when an interrupted event is sent without an active generation:
|
|
@@ -1295,7 +1367,9 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1295
1367
|
|
|
1296
1368
|
const gen = this.currentGeneration;
|
|
1297
1369
|
|
|
1298
|
-
|
|
1370
|
+
const discardOutput = this.earlyCompletionPending;
|
|
1371
|
+
|
|
1372
|
+
if (serverContent.modelTurn && !discardOutput) {
|
|
1299
1373
|
const turn = serverContent.modelTurn;
|
|
1300
1374
|
|
|
1301
1375
|
for (const part of turn.parts || []) {
|
|
@@ -1357,7 +1431,11 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1357
1431
|
} as llm.InputTranscriptionCompleted);
|
|
1358
1432
|
}
|
|
1359
1433
|
|
|
1360
|
-
if (
|
|
1434
|
+
if (
|
|
1435
|
+
!discardOutput &&
|
|
1436
|
+
serverContent.outputTranscription &&
|
|
1437
|
+
serverContent.outputTranscription.text
|
|
1438
|
+
) {
|
|
1361
1439
|
const text = serverContent.outputTranscription.text;
|
|
1362
1440
|
gen.outputText += text;
|
|
1363
1441
|
gen.textChannel.write(text);
|
|
@@ -1371,9 +1449,18 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1371
1449
|
this.handleInputSpeechStarted();
|
|
1372
1450
|
}
|
|
1373
1451
|
|
|
1374
|
-
if (serverContent.turnComplete) {
|
|
1452
|
+
if (serverContent.turnComplete && !this.earlyCompletionPending) {
|
|
1375
1453
|
this.markCurrentGenerationDone();
|
|
1376
1454
|
}
|
|
1455
|
+
|
|
1456
|
+
// Assume Gemini emits turnComplete/generationComplete before any new generation content.
|
|
1457
|
+
// We keep discarding until that signal to avoid old stream spillover after interrupts.
|
|
1458
|
+
if (
|
|
1459
|
+
this.earlyCompletionPending &&
|
|
1460
|
+
(serverContent.turnComplete || serverContent.generationComplete)
|
|
1461
|
+
) {
|
|
1462
|
+
this.earlyCompletionPending = false;
|
|
1463
|
+
}
|
|
1377
1464
|
}
|
|
1378
1465
|
|
|
1379
1466
|
private handleToolCall(toolCall: types.LiveServerToolCall): void {
|
|
@@ -1529,6 +1616,9 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1529
1616
|
}
|
|
1530
1617
|
|
|
1531
1618
|
private isNewGeneration(response: types.LiveServerMessage) {
|
|
1619
|
+
if (this.earlyCompletionPending) {
|
|
1620
|
+
return false;
|
|
1621
|
+
}
|
|
1532
1622
|
if (response.toolCall) {
|
|
1533
1623
|
return true;
|
|
1534
1624
|
}
|