@livekit/agents 1.1.0-dev.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +2 -0
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +2 -0
- package/dist/cli.js.map +1 -1
- package/dist/constants.cjs +3 -0
- package/dist/constants.cjs.map +1 -1
- package/dist/constants.d.cts +1 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +2 -0
- package/dist/constants.js.map +1 -1
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/index.cjs +12 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +13 -13
- package/dist/index.d.ts +13 -13
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -10
- package/dist/index.js.map +1 -1
- package/dist/inference/interruption/defaults.cjs +1 -1
- package/dist/inference/interruption/defaults.cjs.map +1 -1
- package/dist/inference/interruption/defaults.d.cts +1 -1
- package/dist/inference/interruption/defaults.d.ts +1 -1
- package/dist/inference/interruption/defaults.d.ts.map +1 -1
- package/dist/inference/interruption/defaults.js +1 -1
- package/dist/inference/interruption/defaults.js.map +1 -1
- package/dist/inference/interruption/http_transport.cjs +44 -28
- package/dist/inference/interruption/http_transport.cjs.map +1 -1
- package/dist/inference/interruption/http_transport.d.ts.map +1 -1
- package/dist/inference/interruption/http_transport.js +45 -29
- package/dist/inference/interruption/http_transport.js.map +1 -1
- package/dist/inference/interruption/interruption_detector.cjs +22 -5
- package/dist/inference/interruption/interruption_detector.cjs.map +1 -1
- package/dist/inference/interruption/interruption_detector.d.cts +2 -2
- package/dist/inference/interruption/interruption_detector.d.ts +2 -2
- package/dist/inference/interruption/interruption_detector.d.ts.map +1 -1
- package/dist/inference/interruption/interruption_detector.js +22 -5
- package/dist/inference/interruption/interruption_detector.js.map +1 -1
- package/dist/inference/interruption/interruption_stream.cjs +4 -4
- package/dist/inference/interruption/interruption_stream.cjs.map +1 -1
- package/dist/inference/interruption/interruption_stream.js +4 -4
- package/dist/inference/interruption/interruption_stream.js.map +1 -1
- package/dist/inference/interruption/types.cjs.map +1 -1
- package/dist/inference/interruption/types.d.cts +2 -2
- package/dist/inference/interruption/types.d.ts +2 -2
- package/dist/inference/interruption/types.d.ts.map +1 -1
- package/dist/inference/interruption/ws_transport.cjs +60 -47
- package/dist/inference/interruption/ws_transport.cjs.map +1 -1
- package/dist/inference/interruption/ws_transport.d.ts.map +1 -1
- package/dist/inference/interruption/ws_transport.js +60 -47
- package/dist/inference/interruption/ws_transport.js.map +1 -1
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +1 -1
- package/dist/inference/llm.d.ts +1 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +20 -12
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +3 -2
- package/dist/inference/stt.d.ts +3 -2
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +20 -12
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +14 -0
- package/dist/inference/stt.test.cjs.map +1 -1
- package/dist/inference/stt.test.js +14 -0
- package/dist/inference/stt.test.js.map +1 -1
- package/dist/inference/tts.cjs +13 -4
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +8 -1
- package/dist/inference/tts.d.ts +8 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +13 -4
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +10 -0
- package/dist/inference/tts.test.cjs.map +1 -1
- package/dist/inference/tts.test.js +10 -0
- package/dist/inference/tts.test.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +41 -23
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +41 -23
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +1 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.js +1 -1
- package/dist/job.js.map +1 -1
- package/dist/language.cjs +394 -0
- package/dist/language.cjs.map +1 -0
- package/dist/language.d.cts +15 -0
- package/dist/language.d.ts +15 -0
- package/dist/language.d.ts.map +1 -0
- package/dist/language.js +363 -0
- package/dist/language.js.map +1 -0
- package/dist/language.test.cjs +43 -0
- package/dist/language.test.cjs.map +1 -0
- package/dist/language.test.js +49 -0
- package/dist/language.test.js.map +1 -0
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +6 -2
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +6 -2
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +2 -1
- package/dist/stt/stt.d.ts +2 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/utils.cjs +15 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +8 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +13 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent.cjs +14 -17
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +10 -11
- package/dist/voice/agent.d.ts +10 -11
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +15 -18
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +194 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +195 -1
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +116 -39
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +2 -0
- package/dist/voice/agent_activity.d.ts +2 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +117 -40
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_activity.test.cjs +135 -0
- package/dist/voice/agent_activity.test.cjs.map +1 -0
- package/dist/voice/agent_activity.test.js +134 -0
- package/dist/voice/agent_activity.test.js.map +1 -0
- package/dist/voice/agent_session.cjs +38 -38
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +65 -56
- package/dist/voice/agent_session.d.ts +65 -56
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +37 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +106 -52
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +4 -2
- package/dist/voice/audio_recognition.d.ts +4 -2
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +106 -52
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/audio_recognition_span.test.cjs +84 -22
- package/dist/voice/audio_recognition_span.test.cjs.map +1 -1
- package/dist/voice/audio_recognition_span.test.js +90 -23
- package/dist/voice/audio_recognition_span.test.js.map +1 -1
- package/dist/voice/events.cjs +1 -1
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +4 -3
- package/dist/voice/events.d.ts +4 -3
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +1 -1
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/index.cjs +9 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +10 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/remote_session.cjs +922 -0
- package/dist/voice/remote_session.cjs.map +1 -0
- package/dist/voice/remote_session.d.cts +108 -0
- package/dist/voice/remote_session.d.ts +108 -0
- package/dist/voice/remote_session.d.ts.map +1 -0
- package/dist/voice/remote_session.js +887 -0
- package/dist/voice/remote_session.js.map +1 -0
- package/dist/voice/report.cjs +11 -10
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.d.cts +5 -3
- package/dist/voice/report.d.ts +5 -3
- package/dist/voice/report.d.ts.map +1 -1
- package/dist/voice/report.js +11 -10
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +15 -0
- package/dist/voice/report.test.cjs.map +1 -1
- package/dist/voice/report.test.js +15 -0
- package/dist/voice/report.test.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +39 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +3 -1
- package/dist/voice/room_io/room_io.d.ts +3 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +40 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/turn_config/interruption.cjs.map +1 -1
- package/dist/voice/turn_config/interruption.d.cts +1 -1
- package/dist/voice/turn_config/interruption.d.ts +1 -1
- package/dist/voice/turn_config/interruption.d.ts.map +1 -1
- package/dist/voice/turn_config/interruption.js.map +1 -1
- package/dist/voice/turn_config/utils.cjs +95 -35
- package/dist/voice/turn_config/utils.cjs.map +1 -1
- package/dist/voice/turn_config/utils.d.cts +17 -5
- package/dist/voice/turn_config/utils.d.ts +17 -5
- package/dist/voice/turn_config/utils.d.ts.map +1 -1
- package/dist/voice/turn_config/utils.js +93 -35
- package/dist/voice/turn_config/utils.js.map +1 -1
- package/dist/voice/turn_config/utils.test.cjs +83 -41
- package/dist/voice/turn_config/utils.test.cjs.map +1 -1
- package/dist/voice/turn_config/utils.test.js +84 -42
- package/dist/voice/turn_config/utils.test.js.map +1 -1
- package/dist/worker.cjs +6 -29
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +6 -19
- package/dist/worker.js.map +1 -1
- package/package.json +3 -2
- package/src/cli.ts +2 -0
- package/src/constants.ts +1 -0
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/index.ts +13 -15
- package/src/inference/interruption/defaults.ts +1 -1
- package/src/inference/interruption/http_transport.ts +49 -30
- package/src/inference/interruption/interruption_detector.ts +22 -6
- package/src/inference/interruption/interruption_stream.ts +4 -4
- package/src/inference/interruption/types.ts +2 -2
- package/src/inference/interruption/ws_transport.ts +63 -59
- package/src/inference/llm.ts +3 -1
- package/src/inference/stt.test.ts +17 -0
- package/src/inference/stt.ts +22 -14
- package/src/inference/tts.test.ts +12 -0
- package/src/inference/tts.ts +22 -6
- package/src/ipc/job_proc_lazy_main.ts +44 -24
- package/src/job.ts +1 -1
- package/src/language.test.ts +62 -0
- package/src/language.ts +380 -0
- package/src/llm/index.ts +2 -0
- package/src/stream/deferred_stream.ts +5 -1
- package/src/stt/stt.ts +2 -1
- package/src/utils.ts +20 -0
- package/src/voice/agent.test.ts +208 -1
- package/src/voice/agent.ts +21 -22
- package/src/voice/agent_activity.test.ts +194 -0
- package/src/voice/agent_activity.ts +161 -43
- package/src/voice/agent_session.ts +103 -92
- package/src/voice/audio_recognition.ts +124 -61
- package/src/voice/audio_recognition_span.test.ts +115 -35
- package/src/voice/events.ts +4 -3
- package/src/voice/index.ts +10 -1
- package/src/voice/remote_session.ts +1083 -0
- package/src/voice/report.test.ts +22 -3
- package/src/voice/report.ts +31 -14
- package/src/voice/room_io/room_io.ts +52 -2
- package/src/voice/turn_config/interruption.ts +1 -1
- package/src/voice/turn_config/utils.test.ts +91 -43
- package/src/voice/turn_config/utils.ts +120 -56
- package/src/worker.ts +34 -50
- package/dist/voice/client_events.cjs +0 -554
- package/dist/voice/client_events.cjs.map +0 -1
- package/dist/voice/client_events.d.cts +0 -195
- package/dist/voice/client_events.d.ts +0 -195
- package/dist/voice/client_events.d.ts.map +0 -1
- package/dist/voice/client_events.js +0 -548
- package/dist/voice/client_events.js.map +0 -1
- package/dist/voice/wire_format.cjs +0 -798
- package/dist/voice/wire_format.cjs.map +0 -1
- package/dist/voice/wire_format.d.cts +0 -5503
- package/dist/voice/wire_format.d.ts +0 -5503
- package/dist/voice/wire_format.d.ts.map +0 -1
- package/dist/voice/wire_format.js +0 -728
- package/dist/voice/wire_format.js.map +0 -1
- package/src/voice/client_events.ts +0 -838
- package/src/voice/wire_format.ts +0 -827
|
@@ -45,7 +45,7 @@ import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
|
|
|
45
45
|
import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
|
|
46
46
|
import { splitWords } from '../tokenize/basic/word.js';
|
|
47
47
|
import { TTS, type TTSError } from '../tts/tts.js';
|
|
48
|
-
import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
|
|
48
|
+
import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js';
|
|
49
49
|
import { VAD, type VADEvent } from '../vad.js';
|
|
50
50
|
import type { Agent, ModelSettings } from './agent.js';
|
|
51
51
|
import {
|
|
@@ -152,10 +152,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
152
152
|
this.onError(ev);
|
|
153
153
|
|
|
154
154
|
private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
|
|
155
|
-
this.agentSession.emit(AgentSessionEventTypes.
|
|
155
|
+
this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
|
|
156
156
|
};
|
|
157
157
|
|
|
158
158
|
private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
|
|
159
|
+
this.agentSession._usageCollector.collect(ev);
|
|
159
160
|
this.agentSession.emit(
|
|
160
161
|
AgentSessionEventTypes.MetricsCollected,
|
|
161
162
|
createMetricsCollectedEvent({ metrics: ev }),
|
|
@@ -165,6 +166,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
165
166
|
private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
|
|
166
167
|
const errorEvent = createErrorEvent(ev, this.interruptionDetector);
|
|
167
168
|
this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
|
|
169
|
+
|
|
170
|
+
if (!ev.recoverable) {
|
|
171
|
+
this.agentSession._onError(ev);
|
|
172
|
+
this.fallbackToVadInterruption();
|
|
173
|
+
return;
|
|
174
|
+
}
|
|
175
|
+
|
|
168
176
|
this.agentSession._onError(ev);
|
|
169
177
|
};
|
|
170
178
|
|
|
@@ -390,8 +398,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
390
398
|
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
|
|
391
399
|
turnDetectionMode: this.turnDetectionMode,
|
|
392
400
|
interruptionDetection: this.interruptionDetector,
|
|
393
|
-
minEndpointingDelay:
|
|
394
|
-
|
|
401
|
+
minEndpointingDelay:
|
|
402
|
+
this.agent.turnHandling?.endpointing?.minDelay ??
|
|
403
|
+
this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
|
|
404
|
+
maxEndpointingDelay:
|
|
405
|
+
this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
406
|
+
this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
|
|
395
407
|
rootSpanContext: this.agentSession.rootSpanContext,
|
|
396
408
|
sttModel: this.stt?.label,
|
|
397
409
|
sttProvider: this.getSttProvider(),
|
|
@@ -464,8 +476,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
464
476
|
}
|
|
465
477
|
|
|
466
478
|
get allowInterruptions(): boolean {
|
|
467
|
-
|
|
468
|
-
|
|
479
|
+
return (
|
|
480
|
+
this.agent.turnHandling?.interruption?.enabled ??
|
|
481
|
+
this.agentSession.sessionOptions.turnHandling.interruption.enabled
|
|
482
|
+
);
|
|
469
483
|
}
|
|
470
484
|
|
|
471
485
|
get useTtsAlignedTranscript(): boolean {
|
|
@@ -474,10 +488,27 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
474
488
|
}
|
|
475
489
|
|
|
476
490
|
get turnDetection(): TurnDetectionMode | undefined {
|
|
477
|
-
|
|
478
|
-
|
|
491
|
+
return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection;
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
get turnHandling() {
|
|
495
|
+
return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
|
|
479
496
|
}
|
|
480
497
|
|
|
498
|
+
// get minEndpointingDelay(): number {
|
|
499
|
+
// return (
|
|
500
|
+
// this.agent.turnHandling?.endpointing?.minDelay ??
|
|
501
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
|
|
502
|
+
// );
|
|
503
|
+
// }
|
|
504
|
+
|
|
505
|
+
// get maxEndpointingDelay(): number {
|
|
506
|
+
// return (
|
|
507
|
+
// this.agent.turnHandling?.endpointing?.maxDelay ??
|
|
508
|
+
// this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
|
|
509
|
+
// );
|
|
510
|
+
// }
|
|
511
|
+
|
|
481
512
|
get toolCtx(): ToolContext {
|
|
482
513
|
return this.agent.toolCtx;
|
|
483
514
|
}
|
|
@@ -569,16 +600,21 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
569
600
|
});
|
|
570
601
|
|
|
571
602
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
572
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
573
|
-
.pipeThrough(aecWarmupAudioFilter)
|
|
574
|
-
.tee();
|
|
575
603
|
|
|
576
|
-
if (this.realtimeSession) {
|
|
604
|
+
if (this.realtimeSession && this.audioRecognition) {
|
|
605
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
606
|
+
.pipeThrough(aecWarmupAudioFilter)
|
|
607
|
+
.tee();
|
|
577
608
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
578
|
-
}
|
|
579
|
-
|
|
580
|
-
if (this.audioRecognition) {
|
|
581
609
|
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
|
|
610
|
+
} else if (this.realtimeSession) {
|
|
611
|
+
this.realtimeSession.setInputAudioStream(
|
|
612
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
613
|
+
);
|
|
614
|
+
} else if (this.audioRecognition) {
|
|
615
|
+
this.audioRecognition.setInputAudioStream(
|
|
616
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
617
|
+
);
|
|
582
618
|
}
|
|
583
619
|
}
|
|
584
620
|
|
|
@@ -693,6 +729,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
693
729
|
}
|
|
694
730
|
}
|
|
695
731
|
|
|
732
|
+
this.agentSession._usageCollector.collect(ev);
|
|
733
|
+
|
|
696
734
|
this.agentSession.emit(
|
|
697
735
|
AgentSessionEventTypes.MetricsCollected,
|
|
698
736
|
createMetricsCollectedEvent({ metrics: ev }),
|
|
@@ -828,7 +866,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
828
866
|
// Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
|
|
829
867
|
speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
|
|
830
868
|
}
|
|
831
|
-
this.agentSession._updateUserState('speaking',
|
|
869
|
+
this.agentSession._updateUserState('speaking', {
|
|
870
|
+
lastSpeakingTime: speechStartTime,
|
|
871
|
+
otelContext: otelContext.active(),
|
|
872
|
+
});
|
|
832
873
|
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
|
|
833
874
|
// Pass speechStartTime as the absolute startedAt timestamp.
|
|
834
875
|
this.audioRecognition.onStartOfOverlapSpeech(
|
|
@@ -852,7 +893,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
852
893
|
this.agentSession._userSpeakingSpan,
|
|
853
894
|
);
|
|
854
895
|
}
|
|
855
|
-
this.agentSession._updateUserState('listening',
|
|
896
|
+
this.agentSession._updateUserState('listening', {
|
|
897
|
+
lastSpeakingTime: speechEndTime,
|
|
898
|
+
otelContext: otelContext.active(),
|
|
899
|
+
});
|
|
856
900
|
}
|
|
857
901
|
|
|
858
902
|
onVADInferenceDone(ev: VADEvent): void {
|
|
@@ -861,7 +905,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
861
905
|
return;
|
|
862
906
|
}
|
|
863
907
|
|
|
864
|
-
if (
|
|
908
|
+
if (
|
|
909
|
+
ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration
|
|
910
|
+
) {
|
|
865
911
|
this.interruptByAudioActivity();
|
|
866
912
|
}
|
|
867
913
|
}
|
|
@@ -887,7 +933,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
887
933
|
// - This ensures consistent behavior across all interruption scenarios
|
|
888
934
|
if (
|
|
889
935
|
this.stt &&
|
|
890
|
-
this.agentSession.
|
|
936
|
+
this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 &&
|
|
891
937
|
this.audioRecognition
|
|
892
938
|
) {
|
|
893
939
|
const text = this.audioRecognition.currentTranscript;
|
|
@@ -899,7 +945,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
899
945
|
|
|
900
946
|
// Only allow interruption if word count meets or exceeds minInterruptionWords
|
|
901
947
|
// This applies to all cases: empty strings, partial speech, and full speech
|
|
902
|
-
if (wordCount < this.agentSession.
|
|
948
|
+
if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
|
|
903
949
|
return;
|
|
904
950
|
}
|
|
905
951
|
}
|
|
@@ -924,7 +970,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
924
970
|
this.restoreInterruptionByAudioActivity();
|
|
925
971
|
this.interruptByAudioActivity();
|
|
926
972
|
if (this.audioRecognition) {
|
|
927
|
-
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.
|
|
973
|
+
this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
|
|
928
974
|
}
|
|
929
975
|
}
|
|
930
976
|
|
|
@@ -982,7 +1028,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
982
1028
|
|
|
983
1029
|
onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
|
|
984
1030
|
if (
|
|
985
|
-
!this.agentSession.
|
|
1031
|
+
!this.agentSession.sessionOptions.preemptiveGeneration ||
|
|
986
1032
|
this.schedulingPaused ||
|
|
987
1033
|
(this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
|
|
988
1034
|
!(this.llm instanceof LLM)
|
|
@@ -1099,16 +1145,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1099
1145
|
this._currentSpeech &&
|
|
1100
1146
|
this._currentSpeech.allowInterruptions &&
|
|
1101
1147
|
!this._currentSpeech.interrupted &&
|
|
1102
|
-
this.agentSession.
|
|
1148
|
+
this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0
|
|
1103
1149
|
) {
|
|
1104
1150
|
const wordCount = splitWords(info.newTranscript, true).length;
|
|
1105
|
-
if (wordCount < this.agentSession.
|
|
1151
|
+
if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
|
|
1106
1152
|
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
|
|
1107
1153
|
this.cancelPreemptiveGeneration();
|
|
1108
1154
|
this.logger.info(
|
|
1109
1155
|
{
|
|
1110
1156
|
wordCount,
|
|
1111
|
-
minInterruptionWords:
|
|
1157
|
+
minInterruptionWords:
|
|
1158
|
+
this.agentSession.sessionOptions.turnHandling.interruption.minWords,
|
|
1112
1159
|
},
|
|
1113
1160
|
'skipping user input, word count below minimum interruption threshold',
|
|
1114
1161
|
);
|
|
@@ -1148,9 +1195,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1148
1195
|
throw new Error('Speech queue is empty');
|
|
1149
1196
|
}
|
|
1150
1197
|
const speechHandle = heapItem[2];
|
|
1198
|
+
|
|
1199
|
+
// Skip speech handles that were already interrupted/done before being
|
|
1200
|
+
// picked up from the queue (e.g. interrupted during shutdown before the
|
|
1201
|
+
// main loop had a chance to process them). Calling _authorizeGeneration
|
|
1202
|
+
// on a done handle would create a generation Future that nobody resolves,
|
|
1203
|
+
// causing the main loop to hang forever.
|
|
1204
|
+
if (speechHandle.interrupted || speechHandle.done()) {
|
|
1205
|
+
continue;
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1151
1208
|
this._currentSpeech = speechHandle;
|
|
1152
1209
|
speechHandle._authorizeGeneration();
|
|
1153
|
-
await speechHandle._waitForGeneration();
|
|
1210
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
|
|
1154
1211
|
this._currentSpeech = undefined;
|
|
1155
1212
|
}
|
|
1156
1213
|
|
|
@@ -1344,7 +1401,24 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1344
1401
|
|
|
1345
1402
|
this.realtimeSession?.interrupt();
|
|
1346
1403
|
|
|
1347
|
-
if (
|
|
1404
|
+
if (force) {
|
|
1405
|
+
// Force-interrupt (used during shutdown): cancel all speech tasks so they
|
|
1406
|
+
// don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
|
|
1407
|
+
// when the room is disconnected). Mark the current speech as done immediately
|
|
1408
|
+
// so the interrupt future resolves without waiting for tasks to finish.
|
|
1409
|
+
// Clear the queue so mainTask doesn't dequeue already-interrupted handles
|
|
1410
|
+
// and hang on _waitForGeneration() (the generation future created by
|
|
1411
|
+
// _authorizeGeneration would never resolve since _markDone is a no-op
|
|
1412
|
+
// once doneFut is already settled).
|
|
1413
|
+
for (const task of this.speechTasks) {
|
|
1414
|
+
task.cancel();
|
|
1415
|
+
}
|
|
1416
|
+
if (currentSpeech && !currentSpeech.done()) {
|
|
1417
|
+
currentSpeech._markDone();
|
|
1418
|
+
}
|
|
1419
|
+
this.speechQueue.clear();
|
|
1420
|
+
future.resolve();
|
|
1421
|
+
} else if (currentSpeech === undefined) {
|
|
1348
1422
|
future.resolve();
|
|
1349
1423
|
} else {
|
|
1350
1424
|
currentSpeech.addDoneCallback(() => {
|
|
@@ -1942,9 +2016,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1942
2016
|
}
|
|
1943
2017
|
|
|
1944
2018
|
replyAbortController.abort();
|
|
1945
|
-
await
|
|
1946
|
-
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
|
|
1947
|
-
);
|
|
2019
|
+
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1948
2020
|
|
|
1949
2021
|
let forwardedText = textOut?.text || '';
|
|
1950
2022
|
|
|
@@ -2038,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2038
2110
|
if (toolOutput.output.length === 0) return;
|
|
2039
2111
|
|
|
2040
2112
|
// important: no agent output should be used after this point
|
|
2041
|
-
const { maxToolSteps } = this.agentSession.
|
|
2113
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
2042
2114
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
2043
2115
|
this.logger.warn(
|
|
2044
2116
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -2505,7 +2577,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2505
2577
|
}
|
|
2506
2578
|
|
|
2507
2579
|
// important: no agent output should be used after this point
|
|
2508
|
-
const { maxToolSteps } = this.agentSession.
|
|
2580
|
+
const { maxToolSteps } = this.agentSession.sessionOptions;
|
|
2509
2581
|
if (speechHandle.numSteps >= maxToolSteps) {
|
|
2510
2582
|
this.logger.warn(
|
|
2511
2583
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
@@ -2793,16 +2865,20 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2793
2865
|
const unlock = await this.lock.lock();
|
|
2794
2866
|
try {
|
|
2795
2867
|
this.cancelPreemptiveGeneration();
|
|
2868
|
+
|
|
2869
|
+
await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
2870
|
+
|
|
2871
|
+
if (this._currentSpeech && !this._currentSpeech.done()) {
|
|
2872
|
+
this._currentSpeech._markDone();
|
|
2873
|
+
}
|
|
2874
|
+
|
|
2796
2875
|
await this._closeSessionResources();
|
|
2797
2876
|
|
|
2798
2877
|
if (this._mainTask) {
|
|
2799
2878
|
await this._mainTask.cancelAndWait();
|
|
2800
2879
|
}
|
|
2801
2880
|
if (this.interruptionDetector) {
|
|
2802
|
-
this.interruptionDetector.off(
|
|
2803
|
-
'user_overlapping_speech',
|
|
2804
|
-
this.onInterruptionOverlappingSpeech,
|
|
2805
|
-
);
|
|
2881
|
+
this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2806
2882
|
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2807
2883
|
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
2808
2884
|
}
|
|
@@ -2814,8 +2890,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2814
2890
|
}
|
|
2815
2891
|
|
|
2816
2892
|
private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
|
|
2817
|
-
const
|
|
2818
|
-
|
|
2893
|
+
const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
|
|
2894
|
+
const sessionInterruptionDetection = this.agentSession.interruptionDetection;
|
|
2819
2895
|
if (
|
|
2820
2896
|
!(
|
|
2821
2897
|
this.stt &&
|
|
@@ -2827,25 +2903,43 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2827
2903
|
!(this.llm instanceof RealtimeModel)
|
|
2828
2904
|
)
|
|
2829
2905
|
) {
|
|
2830
|
-
if (
|
|
2906
|
+
if (
|
|
2907
|
+
agentInterruptionDetection === 'adaptive' ||
|
|
2908
|
+
sessionInterruptionDetection === 'adaptive'
|
|
2909
|
+
) {
|
|
2831
2910
|
this.logger.warn(
|
|
2832
2911
|
"interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
|
|
2833
2912
|
);
|
|
2834
|
-
return undefined;
|
|
2835
2913
|
}
|
|
2914
|
+
return undefined;
|
|
2915
|
+
}
|
|
2916
|
+
|
|
2917
|
+
if (!this.allowInterruptions) {
|
|
2918
|
+
return undefined;
|
|
2919
|
+
}
|
|
2920
|
+
|
|
2921
|
+
if (agentInterruptionDetection === 'vad') {
|
|
2922
|
+
return undefined;
|
|
2923
|
+
}
|
|
2924
|
+
|
|
2925
|
+
if (sessionInterruptionDetection === 'vad') {
|
|
2926
|
+
return undefined;
|
|
2836
2927
|
}
|
|
2837
2928
|
|
|
2838
2929
|
if (
|
|
2839
|
-
|
|
2840
|
-
|
|
2930
|
+
agentInterruptionDetection === undefined &&
|
|
2931
|
+
sessionInterruptionDetection === undefined &&
|
|
2932
|
+
!isHosted() &&
|
|
2933
|
+
!isDevMode()
|
|
2841
2934
|
) {
|
|
2935
|
+
this.logger.info('adaptive interruption is disabled by default in production mode');
|
|
2842
2936
|
return undefined;
|
|
2843
2937
|
}
|
|
2844
2938
|
|
|
2845
2939
|
try {
|
|
2846
2940
|
const detector = new AdaptiveInterruptionDetector();
|
|
2847
2941
|
|
|
2848
|
-
detector.on('
|
|
2942
|
+
detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2849
2943
|
detector.on('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2850
2944
|
detector.on('error', this.onInterruptionError);
|
|
2851
2945
|
|
|
@@ -2860,6 +2954,30 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2860
2954
|
this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
|
|
2861
2955
|
}
|
|
2862
2956
|
|
|
2957
|
+
private fallbackToVadInterruption(): void {
|
|
2958
|
+
if (!this.isInterruptionDetectionEnabled) return;
|
|
2959
|
+
|
|
2960
|
+
this.isInterruptionDetectionEnabled = false;
|
|
2961
|
+
this.restoreInterruptionByAudioActivity();
|
|
2962
|
+
|
|
2963
|
+
if (this.interruptionDetector) {
|
|
2964
|
+
this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
|
|
2965
|
+
this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
|
|
2966
|
+
this.interruptionDetector.off('error', this.onInterruptionError);
|
|
2967
|
+
this.interruptionDetector = undefined;
|
|
2968
|
+
}
|
|
2969
|
+
|
|
2970
|
+
if (this.audioRecognition) {
|
|
2971
|
+
this.audioRecognition.disableInterruptionDetection().catch((err) => {
|
|
2972
|
+
this.logger.warn({ err }, 'error while disabling interruption detection');
|
|
2973
|
+
});
|
|
2974
|
+
}
|
|
2975
|
+
|
|
2976
|
+
this.logger.warn(
|
|
2977
|
+
'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption',
|
|
2978
|
+
);
|
|
2979
|
+
}
|
|
2980
|
+
|
|
2863
2981
|
private async _closeSessionResources(): Promise<void> {
|
|
2864
2982
|
// Unregister event handlers to prevent duplicate metrics
|
|
2865
2983
|
if (this.llm instanceof LLM) {
|