@livekit/agents 1.0.36-dev.0 → 1.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +0 -1
- package/dist/index.d.ts +0 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +0 -1
- package/dist/index.js.map +1 -1
- package/dist/inference/utils.cjs +2 -15
- package/dist/inference/utils.cjs.map +1 -1
- package/dist/inference/utils.d.cts +0 -1
- package/dist/inference/utils.d.ts +0 -1
- package/dist/inference/utils.d.ts.map +1 -1
- package/dist/inference/utils.js +1 -13
- package/dist/inference/utils.js.map +1 -1
- package/dist/stream/stream_channel.cjs +0 -3
- package/dist/stream/stream_channel.cjs.map +1 -1
- package/dist/stream/stream_channel.d.cts +2 -3
- package/dist/stream/stream_channel.d.ts +2 -3
- package/dist/stream/stream_channel.d.ts.map +1 -1
- package/dist/stream/stream_channel.js +0 -3
- package/dist/stream/stream_channel.js.map +1 -1
- package/dist/telemetry/trace_types.cjs +0 -15
- package/dist/telemetry/trace_types.cjs.map +1 -1
- package/dist/telemetry/trace_types.d.cts +0 -5
- package/dist/telemetry/trace_types.d.ts +0 -5
- package/dist/telemetry/trace_types.d.ts.map +1 -1
- package/dist/telemetry/trace_types.js +0 -10
- package/dist/telemetry/trace_types.js.map +1 -1
- package/dist/voice/agent_activity.cjs +19 -68
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +0 -14
- package/dist/voice/agent_activity.d.ts +0 -14
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +19 -68
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +65 -37
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +25 -4
- package/dist/voice/agent_session.d.ts +25 -4
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +65 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +2 -124
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +1 -32
- package/dist/voice/audio_recognition.d.ts +1 -32
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +2 -127
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/index.cjs +14 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +3 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +1 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +1 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +12 -3
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +12 -2
- package/dist/voice/speech_handle.d.ts +12 -2
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +10 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/index.cjs +54 -0
- package/dist/voice/testing/index.cjs.map +1 -0
- package/dist/voice/testing/index.d.cts +20 -0
- package/dist/voice/testing/index.d.ts +20 -0
- package/dist/voice/testing/index.d.ts.map +1 -0
- package/dist/voice/testing/index.js +33 -0
- package/dist/voice/testing/index.js.map +1 -0
- package/dist/voice/testing/run_result.cjs +766 -0
- package/dist/voice/testing/run_result.cjs.map +1 -0
- package/dist/voice/testing/run_result.d.cts +374 -0
- package/dist/voice/testing/run_result.d.ts +374 -0
- package/dist/voice/testing/run_result.d.ts.map +1 -0
- package/dist/voice/testing/run_result.js +739 -0
- package/dist/voice/testing/run_result.js.map +1 -0
- package/dist/{inference/interruption/index.cjs → voice/testing/types.cjs} +24 -12
- package/dist/voice/testing/types.cjs.map +1 -0
- package/dist/voice/testing/types.d.cts +83 -0
- package/dist/voice/testing/types.d.ts +83 -0
- package/dist/voice/testing/types.d.ts.map +1 -0
- package/dist/voice/testing/types.js +19 -0
- package/dist/voice/testing/types.js.map +1 -0
- package/package.json +3 -4
- package/src/index.ts +0 -2
- package/src/inference/utils.ts +0 -15
- package/src/stream/stream_channel.ts +2 -6
- package/src/telemetry/trace_types.ts +0 -7
- package/src/voice/agent_activity.ts +24 -83
- package/src/voice/agent_session.ts +74 -49
- package/src/voice/audio_recognition.ts +1 -161
- package/src/voice/index.ts +1 -0
- package/src/voice/room_io/room_io.ts +1 -0
- package/src/voice/speech_handle.ts +24 -4
- package/src/voice/testing/index.ts +50 -0
- package/src/voice/testing/run_result.ts +937 -0
- package/src/voice/testing/types.ts +118 -0
- package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs +0 -152
- package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs.map +0 -1
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.cts +0 -50
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts +0 -50
- package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts.map +0 -1
- package/dist/inference/interruption/AdaptiveInterruptionDetector.js +0 -125
- package/dist/inference/interruption/AdaptiveInterruptionDetector.js.map +0 -1
- package/dist/inference/interruption/InterruptionStream.cjs +0 -310
- package/dist/inference/interruption/InterruptionStream.cjs.map +0 -1
- package/dist/inference/interruption/InterruptionStream.d.cts +0 -57
- package/dist/inference/interruption/InterruptionStream.d.ts +0 -57
- package/dist/inference/interruption/InterruptionStream.d.ts.map +0 -1
- package/dist/inference/interruption/InterruptionStream.js +0 -288
- package/dist/inference/interruption/InterruptionStream.js.map +0 -1
- package/dist/inference/interruption/defaults.cjs +0 -76
- package/dist/inference/interruption/defaults.cjs.map +0 -1
- package/dist/inference/interruption/defaults.d.cts +0 -14
- package/dist/inference/interruption/defaults.d.ts +0 -14
- package/dist/inference/interruption/defaults.d.ts.map +0 -1
- package/dist/inference/interruption/defaults.js +0 -42
- package/dist/inference/interruption/defaults.js.map +0 -1
- package/dist/inference/interruption/errors.cjs +0 -2
- package/dist/inference/interruption/errors.cjs.map +0 -1
- package/dist/inference/interruption/errors.d.cts +0 -2
- package/dist/inference/interruption/errors.d.ts +0 -2
- package/dist/inference/interruption/errors.d.ts.map +0 -1
- package/dist/inference/interruption/errors.js +0 -1
- package/dist/inference/interruption/errors.js.map +0 -1
- package/dist/inference/interruption/http_transport.cjs +0 -57
- package/dist/inference/interruption/http_transport.cjs.map +0 -1
- package/dist/inference/interruption/http_transport.d.cts +0 -23
- package/dist/inference/interruption/http_transport.d.ts +0 -23
- package/dist/inference/interruption/http_transport.d.ts.map +0 -1
- package/dist/inference/interruption/http_transport.js +0 -33
- package/dist/inference/interruption/http_transport.js.map +0 -1
- package/dist/inference/interruption/index.cjs.map +0 -1
- package/dist/inference/interruption/index.d.cts +0 -5
- package/dist/inference/interruption/index.d.ts +0 -5
- package/dist/inference/interruption/index.d.ts.map +0 -1
- package/dist/inference/interruption/index.js +0 -7
- package/dist/inference/interruption/index.js.map +0 -1
- package/dist/inference/interruption/interruption.cjs +0 -85
- package/dist/inference/interruption/interruption.cjs.map +0 -1
- package/dist/inference/interruption/interruption.d.cts +0 -48
- package/dist/inference/interruption/interruption.d.ts +0 -48
- package/dist/inference/interruption/interruption.d.ts.map +0 -1
- package/dist/inference/interruption/interruption.js +0 -59
- package/dist/inference/interruption/interruption.js.map +0 -1
- package/dist/inference/utils.test.cjs +0 -20
- package/dist/inference/utils.test.cjs.map +0 -1
- package/dist/inference/utils.test.js +0 -19
- package/dist/inference/utils.test.js.map +0 -1
- package/dist/utils/ws_transport.cjs +0 -51
- package/dist/utils/ws_transport.cjs.map +0 -1
- package/dist/utils/ws_transport.d.cts +0 -9
- package/dist/utils/ws_transport.d.ts +0 -9
- package/dist/utils/ws_transport.d.ts.map +0 -1
- package/dist/utils/ws_transport.js +0 -17
- package/dist/utils/ws_transport.js.map +0 -1
- package/dist/utils/ws_transport.test.cjs +0 -212
- package/dist/utils/ws_transport.test.cjs.map +0 -1
- package/dist/utils/ws_transport.test.js +0 -211
- package/dist/utils/ws_transport.test.js.map +0 -1
- package/src/inference/interruption/AdaptiveInterruptionDetector.ts +0 -166
- package/src/inference/interruption/InterruptionStream.ts +0 -397
- package/src/inference/interruption/defaults.ts +0 -33
- package/src/inference/interruption/errors.ts +0 -0
- package/src/inference/interruption/http_transport.ts +0 -61
- package/src/inference/interruption/index.ts +0 -4
- package/src/inference/interruption/interruption.ts +0 -88
- package/src/inference/utils.test.ts +0 -31
- package/src/utils/ws_transport.test.ts +0 -282
- package/src/utils/ws_transport.ts +0 -22
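Read together, the file list shows the two themes of this release: the experimental adaptive interruption detector is deleted outright (everything under `inference/interruption/`, plus `utils/ws_transport` and the detector's call sites in `agent_activity`, `agent_session`, and `audio_recognition`), while a new testing surface is added under `voice/testing/` (`RunResult` and its event types, with `voice/index` updated to expose it). The source diffs below cover the three most heavily edited files.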
package/src/voice/agent_activity.ts
CHANGED

@@ -41,8 +41,6 @@ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
 import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
-import type { InterruptionEvent } from '../inference/interruption/interruption.js';
-import { InterruptionEventType } from '../inference/interruption/interruption.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
 import { StopResponse, asyncLocalStorage } from './agent.js';

@@ -114,24 +112,6 @@ export class AgentActivity implements RecognitionHooks {
   _mainTask?: Task<void>;
   _userTurnCompletedTask?: Promise<void>;

-  /**
-   * Notify that agent started speaking.
-   * This enables interruption detection in AudioRecognition.
-   * @internal
-   */
-  notifyAgentSpeechStarted(): void {
-    this.audioRecognition?.onStartOfAgentSpeech();
-  }
-
-  /**
-   * Notify that agent stopped speaking.
-   * This disables interruption detection in AudioRecognition.
-   * @internal
-   */
-  notifyAgentSpeechEnded(): void {
-    this.audioRecognition?.onEndOfAgentSpeech();
-  }
-
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;

@@ -312,7 +292,6 @@ export class AgentActivity implements RecognitionHooks {
       // Disable stt node if stt is not provided
       stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
       vad: this.vad,
-      interruptionDetector: this.agentSession.interruptionDetector,
       turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
       turnDetectionMode: this.turnDetectionMode,
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,

@@ -718,46 +697,6 @@ export class AgentActivity implements RecognitionHooks {
     }
   }

-  onInterruption(ev: InterruptionEvent): void {
-    if (ev.type !== InterruptionEventType.INTERRUPTION) {
-      // Only handle actual interruptions, not overlap_speech_ended events
-      return;
-    }
-
-    this.logger.info(
-      {
-        probability: ev.probability,
-        detectionDelay: ev.detectionDelay,
-        totalDuration: ev.totalDuration,
-      },
-      'adaptive interruption detected',
-    );
-
-    // Similar to onVADInferenceDone but triggered by the adaptive interruption detector
-    if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
-      return;
-    }
-
-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
-      return;
-    }
-
-    this.realtimeSession?.startUserActivity();
-
-    if (
-      this._currentSpeech &&
-      !this._currentSpeech.interrupted &&
-      this._currentSpeech.allowInterruptions
-    ) {
-      this.logger.info(
-        { 'speech id': this._currentSpeech.id },
-        'speech interrupted by adaptive interruption detector',
-      );
-      this.realtimeSession?.interrupt();
-      this._currentSpeech.interrupt();
-    }
-  }
-
   onInterimTranscript(ev: SpeechEvent): void {
     if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
       // skip stt transcription if userTranscription is enabled on the realtime model

@@ -1411,11 +1350,14 @@ export class AgentActivity implements RecognitionHooks {
     );
     tasks.push(llmTask);

-    const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
-
     let ttsTask: Task<void> | null = null;
     let ttsStream: ReadableStream<AudioFrame> | null = null;
+    let llmOutput: ReadableStream<string>;
+
     if (audioOutput) {
+      // Only tee the stream when we need TTS
+      const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
+      llmOutput = textOutput;
       [ttsTask, ttsStream] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,

@@ -1423,6 +1365,9 @@ export class AgentActivity implements RecognitionHooks {
         replyAbortController,
       );
       tasks.push(ttsTask);
+    } else {
+      // No TTS needed, use the stream directly
+      llmOutput = llmGenData.textStream;
     }

     await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
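The two hunks above restructure `pipelineReplyTask` so the LLM text stream is only split when a TTS consumer exists: `ReadableStream.tee()` produces two linked branches, and a branch that nobody reads buffers every chunk its sibling consumes. A minimal, self-contained sketch of the same pattern, with a hypothetical stream factory and consumers standing in for the package's internals:

```typescript
import { ReadableStream } from 'node:stream/web';

// Hypothetical stand-in for llmGenData.textStream.
function makeTextStream(chunks: string[]): ReadableStream<string> {
  return new ReadableStream<string>({
    start(controller) {
      for (const c of chunks) controller.enqueue(c);
      controller.close();
    },
  });
}

// Drain a branch, logging each chunk it sees.
async function drain(label: string, stream: ReadableStream<string>) {
  for await (const chunk of stream) console.log(label, chunk);
}

const audioOutput = false; // pretend this is a text-only session
const textStream = makeTextStream(['Hello', ', ', 'world']);

let llmOutput: ReadableStream<string>;
if (audioOutput) {
  // tee() yields two linked branches; both must be read, otherwise the
  // undrained branch buffers every chunk its sibling consumes.
  const [ttsInput, textOutput] = textStream.tee();
  void drain('tts', ttsInput);
  llmOutput = textOutput;
} else {
  // No TTS consumer: pass the original stream through, skipping the idle branch.
  llmOutput = textStream;
}
await drain('text', llmOutput);
```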
@@ -1482,12 +1427,16 @@ export class AgentActivity implements RecognitionHooks {
     //TODO(AJS-272): before executing tools, make sure we generated all the text
     // (this ensure everything is kept ordered)

-    const onToolExecutionStarted = (
-
+    const onToolExecutionStarted = (f: FunctionCall) => {
+      speechHandle._itemAdded([f]);
+      this.agent._chatCtx.items.push(f);
+      this.agentSession._toolItemsAdded([f]);
     };

-    const onToolExecutionCompleted = (
-
+    const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
+      if (out.toolCallOutput) {
+        speechHandle._itemAdded([out.toolCallOutput]);
+      }
     };

     const [executeToolsTask, toolOutput] = performToolExecutions({

@@ -1562,6 +1511,7 @@ export class AgentActivity implements RecognitionHooks {
       });
       chatCtx.insert(message);
       this.agent._chatCtx.insert(message);
+      speechHandle._itemAdded([message]);
       this.agentSession._conversationItemAdded(message);
     }

@@ -1589,6 +1539,7 @@ export class AgentActivity implements RecognitionHooks {
       });
       chatCtx.insert(message);
       this.agent._chatCtx.insert(message);
+      speechHandle._itemAdded([message]);
       this.agentSession._conversationItemAdded(message);
       this.logger.info(
         { speech_id: speechHandle.id, message: textOut.text },

@@ -1673,28 +1624,18 @@ export class AgentActivity implements RecognitionHooks {
     if (shouldGenerateToolReply) {
       chatCtx.insert(toolMessages);

-
-
-        stepIndex: speechHandle._stepIndex + 1,
-        parent: speechHandle,
-      });
-      this.agentSession.emit(
-        AgentSessionEventTypes.SpeechCreated,
-        createSpeechCreatedEvent({
-          userInitiated: false,
-          source: 'tool_response',
-          speechHandle: handle,
-        }),
-      );
+      // Increment step count on SAME handle (parity with Python agent_activity.py L2081)
+      speechHandle._numSteps += 1;

       // Avoid setting tool_choice to "required" or a specific function when
       // passing tool response back to the LLM
       const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';

+      // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
       const toolResponseTask = this.createSpeechTask({
         task: Task.from(() =>
           this.pipelineReplyTask(
-
+            speechHandle,
             chatCtx,
             toolCtx,
             { toolChoice: respondToolChoice },

@@ -1704,13 +1645,13 @@ export class AgentActivity implements RecognitionHooks {
             toolMessages,
           ),
         ),
-        ownedSpeechHandle:
+        ownedSpeechHandle: speechHandle,
         name: 'AgentActivity.pipelineReply',
       });

       toolResponseTask.finally(() => this.onPipelineReplyDone());

-      this.scheduleSpeech(
+      this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
     } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
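The net effect of the tool-reply hunks above: instead of creating a child `SpeechHandle` for each tool response (with its own `SpeechCreated` event), the same handle is reused, its `_numSteps` counter is incremented, and it is passed directly to `pipelineReplyTask` and `scheduleSpeech`, matching the Python `agent_activity.py` behavior the inline comments cite.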
package/src/voice/agent_session.ts
CHANGED

@@ -15,7 +15,6 @@ import {
   type STTModelString,
   type TTSModelString,
 } from '../inference/index.js';
-import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
 import { type JobContext, getJobContext } from '../job.js';
 import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
 import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';

@@ -62,6 +61,7 @@ import { RecorderIO } from './recorder_io/index.js';
 import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
+import { RunResult } from './testing/run_result.js';

 export interface VoiceOptions {
   allowInterruptions: boolean;

@@ -107,7 +107,6 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
   vad?: VAD;
   llm?: LLM | RealtimeModel | LLMModels;
   tts?: TTS | TTSModelString;
-  interruptionDetector?: AdaptiveInterruptionDetector;
   userData?: UserData;
   voiceOptions?: Partial<VoiceOptions>;
   connOptions?: SessionConnectOptions;

@@ -169,7 +168,8 @@ export class AgentSession<
   /** @internal - Timestamp when the session started (milliseconds) */
   _startedAt?: number;

-
+  /** @internal - Current run state for testing */
+  _globalRunState?: RunResult;

   constructor(opts: AgentSessionOptions<UserData>) {
     super();

@@ -180,7 +180,6 @@ export class AgentSession<
       llm,
       tts,
       turnDetection,
-      interruptionDetector,
       userData,
       voiceOptions = defaultVoiceOptions,
       connOptions,

@@ -217,7 +216,6 @@ export class AgentSession<
     }

     this.turnDetection = turnDetection;
-    this.interruptionDetector = interruptionDetector;
     this._userData = userData;

     // configurable IO

@@ -278,7 +276,7 @@ export class AgentSession<
     span,
   }: {
     agent: Agent;
-    room
+    room?: Room;
     inputOptions?: Partial<RoomInputOptions>;
     outputOptions?: Partial<RoomOutputOptions>;
     span: Span;

@@ -289,41 +287,45 @@ export class AgentSession<
     this._updateAgentState('initializing');

     const tasks: Promise<void>[] = [];
-    // Check for existing input/output configuration and warn if needed
-    if (this.input.audio && inputOptions?.audioEnabled !== false) {
-      this.logger.warn('RoomIO audio input is enabled but input.audio is already set, ignoring..');
-    }

-    if (
-
-
-
-
+    if (room && !this.roomIO) {
+      // Check for existing input/output configuration and warn if needed
+      if (this.input.audio && inputOptions?.audioEnabled !== false) {
+        this.logger.warn(
+          'RoomIO audio input is enabled but input.audio is already set, ignoring..',
+        );
+      }

-
-
-
-
-
+      if (this.output.audio && outputOptions?.audioEnabled !== false) {
+        this.logger.warn(
+          'RoomIO audio output is enabled but output.audio is already set, ignoring..',
+        );
+      }

-
-
-
-
-
-
-
+      if (this.output.transcription && outputOptions?.transcriptionEnabled !== false) {
+        this.logger.warn(
+          'RoomIO transcription output is enabled but output.transcription is already set, ignoring..',
+        );
+      }
+
+      this.roomIO = new RoomIO({
+        agentSession: this,
+        room,
+        inputOptions,
+        outputOptions,
+      });
+      this.roomIO.start();
+    }

     let ctx: JobContext | undefined = undefined;
     try {
       ctx = getJobContext();
-    } catch
+    } catch {
       // JobContext is not available in evals
-      this.logger.warn('JobContext is not available');
     }

     if (ctx) {
-      if (ctx.room === room && !room.isConnected) {
+      if (room && ctx.room === room && !room.isConnected) {
         this.logger.debug('Auto-connecting to room via job context');
         tasks.push(ctx.connect());
       }
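With `room` now optional, `RoomIO` is only constructed and started when a room is actually supplied, and a missing `JobContext` is silently tolerated instead of warned about, which is what allows a session to run headless in tests and evals. A sketch of the headless path, assuming the public `start()` options mirror the destructured parameters above and that `agent` and `llm` are configured elsewhere:

```typescript
import { AgentSession } from '@livekit/agents'; // top-level re-export assumed

// `agent` and `llm` are assumed to be set up elsewhere.
const session = new AgentSession({ llm });

// No `room` argument: the RoomIO branch above is skipped entirely,
// so no room I/O is wired up and no auto-connect is attempted.
await session.start({ agent });
```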
@@ -376,7 +378,7 @@ export class AgentSession<
     record,
   }: {
     agent: Agent;
-    room
+    room?: Room;
     inputOptions?: Partial<RoomInputOptions>;
     outputOptions?: Partial<RoomOutputOptions>;
     record?: boolean;

@@ -503,13 +505,50 @@ export class AgentSession<

     // attach to the session span if called outside of the AgentSession
     const activeSpan = trace.getActiveSpan();
+    let handle: SpeechHandle;
     if (!activeSpan && this.rootSpanContext) {
-
+      handle = otelContext.with(this.rootSpanContext, () =>
         doGenerateReply(this.activity!, this.nextActivity),
       );
+    } else {
+      handle = doGenerateReply(this.activity!, this.nextActivity);
     }

-
+    if (this._globalRunState) {
+      this._globalRunState._watchHandle(handle);
+    }
+
+    return handle;
+  }
+
+  /**
+   * Run a test with user input and return a result for assertions.
+   *
+   * This method is primarily used for testing agent behavior without
+   * requiring a real room connection.
+   *
+   * @example
+   * ```typescript
+   * const result = await session.run({ userInput: 'Hello' });
+   * result.expect.nextEvent().isMessage({ role: 'assistant' });
+   * result.expect.noMoreEvents();
+   * ```
+   *
+   * @param options - Run options including user input
+   * @returns A RunResult that resolves when the agent finishes responding
+   *
+   * TODO: Add outputType parameter for typed outputs (parity with Python)
+   */
+  run(options: { userInput: string }): RunResult {
+    if (this._globalRunState && !this._globalRunState.done()) {
+      throw new Error('nested runs are not supported');
+    }
+
+    const runState = new RunResult({ userInput: options.userInput });
+    this._globalRunState = runState;
+    this.generateReply({ userInput: options.userInput });
+
+    return runState;
   }

   private async updateActivity(agent: Agent): Promise<void> {
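`run()` is the entry point for the new `voice/testing` module: it rejects nested runs, wraps the reply in a `RunResult`, and `generateReply()` now reports every `SpeechHandle` it creates to the active run state via `_watchHandle`. Expanding the JSDoc's own example into a slightly fuller sketch (the `session` setup is assumed from the previous snippet; only the assertion calls shown in the JSDoc are used):

```typescript
// Drive one user turn through the agent and wait for the run to settle.
const result = await session.run({ userInput: 'Hello' });

// Assert on the recorded events, in order.
result.expect.nextEvent().isMessage({ role: 'assistant' });
result.expect.noMoreEvents();

// Starting another run while this one is still in flight throws
// 'nested runs are not supported'; await the previous result first.
```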
@@ -643,8 +682,6 @@ export class AgentSession<
       return;
     }

-    const oldState = this._agentState;
-
     if (state === 'speaking') {
       // Reset error counts when agent starts speaking
       this.llmErrorCounts = 0;

@@ -659,25 +696,13 @@ export class AgentSession<
       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
       // (Ref: Python agent_session.py line 1161-1164)
       }
-
-      // Notify AudioRecognition that agent started speaking (for interruption detection)
-      this.activity?.notifyAgentSpeechStarted();
-    } else if (oldState === 'speaking') {
-      // Agent stopped speaking
-      if (this.agentSpeakingSpan !== undefined) {
-        // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
-        this.agentSpeakingSpan.end();
-        this.agentSpeakingSpan = undefined;
-      }
-
-      // Notify AudioRecognition that agent stopped speaking (for interruption detection)
-      this.activity?.notifyAgentSpeechEnded();
     } else if (this.agentSpeakingSpan !== undefined) {
-      //
+      // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
       this.agentSpeakingSpan.end();
       this.agentSpeakingSpan = undefined;
     }

+    const oldState = this._agentState;
     this._agentState = state;

     // Handle user away timer based on state changes
package/src/voice/audio_recognition.ts
CHANGED

@@ -5,12 +5,6 @@ import { AudioFrame } from '@livekit/rtc-node';
 import type { Context, Span } from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
-import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
-import {
-  InterruptionStreamBase,
-  InterruptionStreamSentinel,
-} from '../inference/interruption/InterruptionStream.js';
-import type { InterruptionEvent } from '../inference/interruption/interruption.js';
 import { type ChatContext } from '../llm/chat_context.js';
 import { log } from '../log.js';
 import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';

@@ -45,7 +39,6 @@ export interface RecognitionHooks {
   onFinalTranscript: (ev: SpeechEvent) => void;
   onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
   onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
-  onInterruption: (ev: InterruptionEvent) => void;

   retrieveChatCtx: () => ChatContext;
 }

@@ -60,7 +53,6 @@ export interface AudioRecognitionOptions {
   recognitionHooks: RecognitionHooks;
   stt?: STTNode;
   vad?: VAD;
-  interruptionDetector?: AdaptiveInterruptionDetector;
   turnDetector?: _TurnDetector;
   turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   minEndpointingDelay: number;

@@ -96,7 +88,6 @@ export class AudioRecognition {

   private vadInputStream: ReadableStream<AudioFrame>;
   private sttInputStream: ReadableStream<AudioFrame>;
-  private interruptionInputStream: ReadableStream<AudioFrame>;
   private silenceAudioTransform = new IdentityTransform<AudioFrame>();
   private silenceAudioWriter: WritableStreamDefaultWriter<AudioFrame>;

@@ -105,19 +96,11 @@ export class AudioRecognition {
   private commitUserTurnTask?: Task<void>;
   private vadTask?: Task<void>;
   private sttTask?: Task<void>;
-  private interruptionTask?: Task<void>;
-
-  // interruption detection
-  private interruptionDetector?: AdaptiveInterruptionDetector;
-  private interruptionStream?: InterruptionStreamBase;
-  private interruptionEnabled = false;
-  private agentSpeaking = false;

   constructor(opts: AudioRecognitionOptions) {
     this.hooks = opts.recognitionHooks;
     this.stt = opts.stt;
     this.vad = opts.vad;
-    this.interruptionDetector = opts.interruptionDetector;
     this.turnDetector = opts.turnDetector;
     this.turnDetectionMode = opts.turnDetectionMode;
     this.minEndpointingDelay = opts.minEndpointingDelay;

@@ -125,15 +108,10 @@ export class AudioRecognition {
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;

-    // Interruption detection is only enabled if both detector and VAD are provided
-    this.interruptionEnabled = this.interruptionDetector !== undefined && this.vad !== undefined;
-
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
-    const [vadInputStream,
-    const [sttInputStream, interruptionInputStream] = rest.tee();
+    const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
     this.vadInputStream = vadInputStream;
     this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);
-    this.interruptionInputStream = interruptionInputStream;
     this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
   }

@@ -157,15 +135,6 @@ export class AudioRecognition {
     this.sttTask.result.catch((err) => {
       this.logger.error(`Error running STT task: ${err}`);
     });
-
-    if (this.interruptionEnabled && this.interruptionDetector) {
-      this.interruptionTask = Task.from(({ signal }) =>
-        this.createInterruptionTask(this.interruptionDetector!, signal),
-      );
-      this.interruptionTask.result.catch((err) => {
-        this.logger.error(`Error running interruption task: ${err}`);
-      });
-    }
   }

   private async onSTTEvent(ev: SpeechEvent) {

@@ -608,11 +577,6 @@ export class AudioRecognition {
           this.sampleRate = ev.frames[0].sampleRate;
         }

-        // If agent is speaking, user speech is overlap - trigger interruption detection
-        if (this.agentSpeaking && this.interruptionEnabled) {
-          this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan);
-        }
-
         this.bounceEOUTask?.cancel();
         break;
       case VADEventType.INFERENCE_DONE:

@@ -633,11 +597,6 @@ export class AudioRecognition {
         // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
         this.speaking = false;

-        // If we were in overlap speech (agent speaking + user speaking), end it
-        if (this.agentSpeaking && this.interruptionEnabled) {
-          this.onEndOfOverlapSpeech();
-        }
-
         if (
           this.vadBaseTurnDetection ||
           (this.turnDetectionMode === 'stt' && this.userTurnCommitted)

@@ -655,123 +614,6 @@ export class AudioRecognition {
     }
   }

-  private async createInterruptionTask(
-    interruptionDetector: AdaptiveInterruptionDetector,
-    signal: AbortSignal,
-  ) {
-    // Create the interruption stream from the detector
-    this.interruptionStream = interruptionDetector.createStream();
-
-    // Forward audio frames to the interruption stream
-    const reader = this.interruptionInputStream.getReader();
-
-    const forwardTask = (async () => {
-      try {
-        while (!signal.aborted) {
-          const { done, value: frame } = await reader.read();
-          if (done) break;
-          await this.interruptionStream?.pushFrame(frame);
-        }
-      } catch (e) {
-        if (!signal.aborted) {
-          this.logger.error(e, 'Error forwarding audio to interruption stream');
-        }
-      } finally {
-        reader.releaseLock();
-      }
-    })();
-
-    // Read interruption events from the stream
-    const eventStream = this.interruptionStream.stream;
-    const eventReader = eventStream.getReader();
-
-    const abortHandler = () => {
-      eventReader.releaseLock();
-      this.interruptionStream?.close();
-      signal.removeEventListener('abort', abortHandler);
-    };
-    signal.addEventListener('abort', abortHandler);
-
-    try {
-      while (!signal.aborted) {
-        const { done, value: ev } = await eventReader.read();
-        if (done) break;
-
-        this.logger.debug({ type: ev.type, probability: ev.probability }, 'Interruption event');
-        this.hooks.onInterruption(ev);
-      }
-    } catch (e) {
-      if (!signal.aborted) {
-        this.logger.error(e, 'Error in interruption task');
-      }
-    } finally {
-      this.logger.debug('Interruption task closed');
-      await forwardTask;
-    }
-  }
-
-  /**
-   * Called when the agent starts speaking.
-   * Enables interruption detection by sending the agent-speech-started sentinel.
-   */
-  onStartOfAgentSpeech(): void {
-    this.agentSpeaking = true;
-
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechStarted());
-  }
-
-  /**
-   * Called when the agent stops speaking.
-   * Disables interruption detection by sending the agent-speech-ended sentinel.
-   */
-  onEndOfAgentSpeech(): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      this.agentSpeaking = false;
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechEnded());
-
-    if (this.agentSpeaking) {
-      // No interruption was detected, end the overlap inference (idempotent)
-      this.onEndOfOverlapSpeech();
-    }
-
-    this.agentSpeaking = false;
-  }
-
-  /**
-   * Called when user starts speaking while agent is speaking (overlap speech).
-   * This triggers the interruption detection inference.
-   */
-  onStartOfOverlapSpeech(speechDuration: number, userSpeakingSpan?: Span): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    if (this.agentSpeaking && userSpeakingSpan) {
-      this.interruptionStream.pushFrame(
-        InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, userSpeakingSpan),
-      );
-    }
-  }
-
-  /**
-   * Called when user stops speaking during overlap.
-   * This ends the interruption detection inference for this overlap period.
-   */
-  onEndOfOverlapSpeech(): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded());
-  }
-
   setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }

@@ -844,8 +686,6 @@ export class AudioRecognition {
     await this.sttTask?.cancelAndWait();
     await this.vadTask?.cancelAndWait();
     await this.bounceEOUTask?.cancelAndWait();
-    await this.interruptionTask?.cancelAndWait();
-    await this.interruptionStream?.close();
   }

   private _endUserTurnSpan({
package/src/voice/index.ts
CHANGED