@livekit/agents 1.0.37 → 1.0.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/llm.cjs +7 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +5 -6
- package/dist/inference/llm.d.ts +5 -6
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +7 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +5 -4
- package/dist/inference/stt.d.ts +5 -4
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +10 -7
- package/dist/inference/tts.d.ts +10 -7
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stream_adapter.cjs +9 -1
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +9 -1
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +11 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +10 -0
- package/dist/utils.d.ts +10 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +6 -2
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +6 -2
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +72 -37
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +2 -1
- package/dist/voice/agent_activity.d.ts +2 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +73 -38
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/package.json +2 -2
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/llm.ts +20 -15
- package/src/inference/stt.ts +48 -29
- package/src/inference/tts.ts +36 -16
- package/src/stt/stream_adapter.ts +12 -1
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +21 -0
- package/src/voice/agent.ts +11 -2
- package/src/voice/agent_activity.ts +108 -41
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
package/src/voice/agent_activity.ts
CHANGED

@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
     if (
       !this.vad &&
       this.stt &&
+      !this.stt.capabilities.streaming &&
       this.llm instanceof LLM &&
       this.allowInterruptions &&
       this.turnDetectionMode === undefined
     ) {
       this.logger.warn(
-        'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
+        'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
           'for more responsive interruption handling.',
       );
     }
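The hunk above narrows the VAD warning to non-streaming STT, since a streaming STT can already surface speech activity through interim results. A minimal sketch of the setup the warning nudges toward, assuming the silero VAD plugin; the stt/llm/tts instances are placeholders, not part of this diff:

```ts
import { voice } from '@livekit/agents';
import * as silero from '@livekit/agents-plugin-silero';

// Placeholders for whichever providers you use (not part of this diff):
declare const myNonStreamingStt: import('@livekit/agents').stt.STT;
declare const myLLM: import('@livekit/agents').llm.LLM;
declare const myTTS: import('@livekit/agents').tts.TTS;

// With a non-streaming STT (capabilities.streaming === false), supplying a VAD
// keeps interruption handling responsive while transcripts arrive in batches.
const session = new voice.AgentSession({
  vad: await silero.VAD.load(),
  stt: myNonStreamingStt,
  llm: myLLM,
  tts: myTTS,
});
```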
@@ -637,9 +638,12 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   // recognition hooks
-  onStartOfSpeech(ev: VADEvent): void {
-    this.agentSession._updateUserState('speaking');
-  }
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
+  }
 
   onEndOfSpeech(ev: VADEvent): void {
@@ -656,12 +660,14 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
+  }
 
-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+  private interruptByAudioActivity(): void {
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      // skip speech handle interruption if server side turn detection is enabled
       return;
     }
@@ -691,7 +697,10 @@ export class AgentActivity implements RecognitionHooks {
       !this._currentSpeech.interrupted &&
       this._currentSpeech.allowInterruptions
     ) {
-      this.logger.info('speech interrupted by audio activity');
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by audio activity',
+      );
       this.realtimeSession?.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -712,6 +721,10 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    if (ev.alternatives![0].text) {
+      this.interruptByAudioActivity();
+    }
   }
 
   onFinalTranscript(ev: SpeechEvent): void {
@@ -729,6 +742,20 @@ export class AgentActivity implements RecognitionHooks {
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    // agent speech might not be interrupted if VAD failed and a final transcript is received
+    // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
+    if (
+      this.audioRecognition &&
+      this.turnDetection !== 'manual' &&
+      this.turnDetection !== 'realtime_llm'
+    ) {
+      this.interruptByAudioActivity();
+
+      // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
+    }
+
+    // TODO: resume false interruption - start interrupt paused speech task
   }
 
   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
@@ -1168,6 +1195,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);
 
     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
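The `speechHandle._agentTurnContext = otelContext.active()` lines added throughout this file all apply the same OpenTelemetry pattern: capture the active context when the turn's work begins, so spans created later (often after an await, where the active context is gone) can still be parented under the turn. A standalone sketch using only the stock @opentelemetry/api surface; the span names are illustrative:

```ts
import { context, trace, type Context } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

let agentTurnContext: Context; // the role speechHandle._agentTurnContext plays

tracer.startActiveSpan('agent_turn', (turnSpan) => {
  // capture the context while agent_turn is the active span
  agentTurnContext = context.active();
  turnSpan.end();
});

// later, possibly after an await where the active context has been lost:
const speaking = tracer.startSpan('agent_speaking', undefined, agentTurnContext);
speaking.end();
```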
@@ -1212,13 +1241,18 @@ export class AgentActivity implements RecognitionHooks {
       tasks.push(textForwardTask);
     }
 
-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };
 
     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.then(() => onFirstFrame());
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1283,9 @@ export class AgentActivity implements RecognitionHooks {
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.then(() => onFirstFrame());
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }
 
     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1303,6 +1339,8 @@ export class AgentActivity implements RecognitionHooks {
       toolsMessages?: ChatItem[];
       span: Span;
     }): Promise<void> => {
+      speechHandle._agentTurnContext = otelContext.active();
+
       span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
       if (instructions) {
         span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1440,11 @@ export class AgentActivity implements RecognitionHooks {
         textOut = _textOut;
       }
 
-      const onFirstFrame = () => {
-        this.agentSession._updateAgentState('speaking');
+      const onFirstFrame = (startedSpeakingAt?: number) => {
+        this.agentSession._updateAgentState('speaking', {
+          startTime: startedSpeakingAt,
+          otelContext: speechHandle._agentTurnContext,
+        });
       };
 
       let audioOut: _AudioOut | null = null;
@@ -1416,12 +1457,16 @@ export class AgentActivity implements RecognitionHooks {
           );
           audioOut = _audioOut;
           tasks.push(forwardTask);
-          audioOut.firstFrameFut.await.then(() => onFirstFrame());
+          audioOut.firstFrameFut.await
+            .then((ts) => onFirstFrame(ts))
+            .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
         } else {
           throw Error('ttsStream is null when audioOutput is enabled');
         }
       } else {
-        textOut?.firstTextFut.await.then(() => onFirstFrame());
+        textOut?.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
 
       //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1507,14 @@ export class AgentActivity implements RecognitionHooks {
           msg.createdAt = replyStartedAt;
         }
         this.agent._chatCtx.insert(toolsMessages);
-        //
-        this.agentSession._toolItemsAdded(toolsMessages);
+        // Only add FunctionCallOutput items to session history since FunctionCall items
+        // were already added by onToolExecutionStarted when the tool execution began
+        const toolCallOutputs = toolsMessages.filter(
+          (m): m is FunctionCallOutput => m.type === 'function_call_output',
+        );
+        if (toolCallOutputs.length > 0) {
+          this.agentSession._toolItemsAdded(toolCallOutputs);
+        }
       }
 
       if (speechHandle.interrupted) {
@@ -1487,10 +1538,10 @@ export class AgentActivity implements RecognitionHooks {
 
     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1707,18 @@ export class AgentActivity implements RecognitionHooks {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages);
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };
@@ -1725,6 +1786,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1825,11 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
 
-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };
 
     const readMessages = async (
@@ -1851,10 +1917,14 @@ export class AgentActivity implements RecognitionHooks {
         );
         forwardTasks.push(forwardTask);
         audioOut = _audioOut;
-        audioOut.firstFrameFut.await.then(() => onFirstFrame());
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       }
     } else if (textOut) {
-      textOut.firstTextFut.await.then(() => onFirstFrame());
+      textOut.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }
     outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
   }
@@ -1936,7 +2006,6 @@ export class AgentActivity implements RecognitionHooks {
 
     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState('listening');
     }
 
     if (speechHandle.interrupted) {
@@ -1955,11 +2024,11 @@ export class AgentActivity implements RecognitionHooks {
       if (audioOutput) {
         audioOutput.clearBuffer();
         const playbackEv = await audioOutput.waitForPlayout();
-        let playbackPosition = playbackEv.playbackPosition;
-        if (audioOut?.firstFrameFut.done) {
+        let playbackPositionInS = playbackEv.playbackPosition;
+        if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
          // playback EV is valid only if the first frame was already played
          this.logger.info(
-            { speech_id: speechHandle.id, playbackPosition },
+            { speech_id: speechHandle.id, playbackPositionInS },
            'playout interrupted',
          );
          if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2036,13 @@ export class AgentActivity implements RecognitionHooks {
           }
         } else {
           forwardedText = '';
-          playbackPosition = 0;
+          playbackPositionInS = 0;
         }
 
         // truncate server-side message
         this.realtimeSession.truncate({
           messageId: msgId,
-          audioEndMs: Math.floor(playbackPosition * 1000),
+          audioEndMs: Math.floor(playbackPositionInS * 1000),
           modalities: msgModalities,
           audioTranscript: forwardedText,
         });
@@ -2023,17 +2092,15 @@ export class AgentActivity implements RecognitionHooks {
     speechHandle._markGenerationDone();
     // TODO(brian): close tees
 
-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState('thinking');
-    });
-
     await executeToolsTask.result;
 
+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState('thinking');
+    } else if (this.agentSession.agentState === 'speaking') {
+      this.agentSession._updateAgentState('listening');
+    }
+
     if (toolOutput.output.length === 0) {
-      // return to listening state for thinking-only turns (no audio output, no tools)
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState('listening');
-      }
       return;
     }
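The block above changes the post-generation state machine: `thinking` is now entered only when the tool run actually produced output, and a turn that produced neither audio nor tool output drops straight back to `listening`. A sketch of observing these transitions from the outside; the event name follows the agents-js `AgentSessionEventTypes`, and the payload field names are assumptions if your version differs:

```ts
import { voice } from '@livekit/agents';

declare const session: voice.AgentSession;

session.on(voice.AgentSessionEventTypes.AgentStateChanged, (ev) => {
  // expect 'speaking' -> 'thinking' when tools produced output,
  // and 'speaking' -> 'listening' for turns with no tool output
  console.log(`agent state: ${ev.oldState} -> ${ev.newState}`);
});
```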
package/src/voice/agent_session.ts
CHANGED

@@ -677,7 +677,7 @@ export class AgentSession<
   }
 
   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });
 
       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@
   }
 
   /** @internal */
-  _updateUserState(state: UserState,
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });
 
       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }
 
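Both `user_speaking` and `agent_speaking` spans now accept an explicit start time because the state change is observed after the fact: VAD only confirms speech once `ev.speechDuration` of audio has already passed. Backdating is plain OpenTelemetry; a standalone sketch with the stock API (the livekit `tracer.startSpan({ ... })` wrapper above takes an options object instead):

```ts
import { trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

function onSpeechConfirmed(speechDurationMs: number): void {
  // the user started speaking speechDurationMs ago; backdate the span to the onset
  const span = tracer.startSpan('user_speaking', {
    startTime: Date.now() - speechDurationMs,
  });
  // ... end it (optionally with an explicit end timestamp) when speech stops
  span.end();
}
```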
package/src/voice/audio_recognition.ts
CHANGED

@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;
 
     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }
 
package/src/voice/avatar/datastream_io.ts
CHANGED

@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;
 
   #logger = log();
 
@@ -146,6 +147,11 @@
     await this.startTask.result;
     await super.captureFrame(frame);
 
+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }
 
   clearBuffer(): void {
package/src/voice/generation.ts
CHANGED

@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 
@@ -608,7 +608,8 @@ export function performTextForwarding(
 
 export interface _AudioOut {
   audio: Array<AudioFrame>;
-  firstFrameFut: Future<void>;
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }
 
 async function forwardAudio(
@@ -620,7 +621,16 @@
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;
 
+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
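forwardAudio now resolves `firstFrameFut` with the `createdAt` timestamp from the playbackStarted event and rejects it in the `finally` block if forwarding is torn down first, which is why every `firstFrameFut.await` and `firstTextFut.await` consumer in agent_activity.ts gained a `.catch()`. A minimal stand-in for the `Future` from package/src/utils.ts (which this release extends with a `rejected` flag) to show the contract:

```ts
// Stand-in sketch; the real Future lives in package/src/utils.ts.
class Future<T> {
  done = false;
  rejected = false;
  resolve!: (value: T) => void;
  reject!: (error: Error) => void;
  readonly await = new Promise<T>((res, rej) => {
    this.resolve = (v) => { this.done = true; res(v); };
    this.reject = (e) => { this.done = true; this.rejected = true; rej(e); };
  });
}

const firstFrameFut = new Future<number>();
firstFrameFut.await
  .then((ts) => console.log(`first frame captured at ${ts}`))
  .catch(() => console.log('cancelled before first frame')); // now required
firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
```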
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
package/src/voice/io.ts
CHANGED

@@ -30,12 +30,14 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;
 
 /**
- *
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
+  confidence?: number;
+  startTimeOffset?: number;
 }
 
 export interface AudioOutputCapabilities {
@@ -57,6 +59,7 @@ export abstract class AudioInput {
 }
 
 export abstract class AudioOutput extends EventEmitter {
+  static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
   static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';
 
   private playbackFinishedFuture: Future<void> = new Future();
@@ -77,7 +80,11 @@
   ) {
     super();
     this.capabilities = capabilities;
+
     if (this.nextInChain) {
+      this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
+        this.onPlaybackStarted(ev.createdAt),
+      );
       this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
         this.onPlaybackFinished(ev),
       );
@@ -117,6 +124,14 @@
     return this.lastPlaybackEvent;
   }
 
+  /**
+   * Called when playback actually starts (first frame is sent to output).
+   * Developers building audio sinks should call this when the first frame is captured.
+   */
+  onPlaybackStarted(createdAt: number): void {
+    this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
+  }
+
   /**
    * Developers building audio sinks must call this method when a playback/segment is finished.
    * Segments are segmented by calls to flush() or clearBuffer()
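`onPlaybackStarted` is the write side of the new event: a sink calls it once per segment when the first frame actually reaches the output, and `AudioOutput` re-emits it up the `nextInChain` chain (see the constructor hunk above). A sketch of a custom sink modeled on the `DataStreamAudioOutput` change earlier in this diff; constructor wiring and any other abstract members are elided, and the `super.flush()` call assumes a concrete base implementation:

```ts
import type { AudioFrame } from '@livekit/rtc-node';
import { AudioOutput } from './io.js';

export class MyAudioSink extends AudioOutput {
  private firstFrameEmitted = false;

  override async captureFrame(frame: AudioFrame): Promise<void> {
    await super.captureFrame(frame);
    if (!this.firstFrameEmitted) {
      this.firstFrameEmitted = true;
      this.onPlaybackStarted(Date.now()); // first frame of this segment
    }
    // ... hand the frame to the actual device or transport here
  }

  override flush(): void {
    super.flush();
    this.firstFrameEmitted = false; // reset per segment, as datastream_io does
  }

  override clearBuffer(): void {
    // drop any buffered audio on interruption
  }
}
```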
@@ -174,15 +189,22 @@
 }
 
 export interface PlaybackFinishedEvent {
-
+  /** How much of the audio was played back, in seconds */
   playbackPosition: number;
-
+  /** True if playback was interrupted (clearBuffer() was called) */
   interrupted: boolean;
-
-
+  /**
+   * Transcript synced with playback; may be partial if the audio was interrupted.
+   * When undefined, the transcript is not synchronized with the playback.
+   */
   synchronizedTranscript?: string;
 }
 
+export interface PlaybackStartedEvent {
+  /** The timestamp (Date.now()) when the playback started */
+  createdAt: number;
+}
+
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}
 
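On the read side, anything holding an `AudioOutput` can now observe when audio actually starts playing, complementing the existing playbackFinished event. For example, measuring time-to-first-audio for a reply; the reference timestamp here is illustrative:

```ts
import { AudioOutput, type PlaybackStartedEvent } from './io.js';

declare const audioOutput: AudioOutput;

const replyRequestedAt = Date.now(); // e.g. recorded when generation begins

audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) => {
  console.log(`time to first audio: ${ev.createdAt - replyRequestedAt} ms`);
});
```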