@livekit/agents 1.0.40 → 1.0.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +20 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +20 -18
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +5 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/stt.cjs +2 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +2 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +5 -1
- package/dist/llm/realtime.d.ts +5 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +15 -1
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -1
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +9 -1
- package/dist/tts/tts.d.ts +9 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +3 -0
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +4 -0
- package/dist/types.d.ts +4 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +11 -1
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +7 -3
- package/dist/voice/agent.d.ts +7 -3
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +11 -1
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +30 -14
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +30 -14
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +5 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -0
- package/dist/voice/agent_session.d.ts +2 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +5 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +1 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +1 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/background_audio.cjs +2 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/background_audio.d.cts +4 -2
- package/dist/voice/background_audio.d.ts +4 -2
- package/dist/voice/background_audio.d.ts.map +1 -1
- package/dist/voice/background_audio.js +2 -1
- package/dist/voice/background_audio.js.map +1 -1
- package/dist/voice/generation.cjs +58 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +17 -3
- package/dist/voice/generation.d.ts +17 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +63 -6
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +22 -2
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +21 -5
- package/dist/voice/io.d.ts +21 -5
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +18 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +3 -2
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +3 -3
- package/dist/voice/room_io/_output.d.ts +3 -3
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +4 -3
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/transcription/synchronizer.cjs +137 -13
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +141 -14
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -18
- package/src/index.ts +1 -0
- package/src/inference/stt.ts +9 -8
- package/src/llm/realtime.ts +5 -1
- package/src/tts/stream_adapter.ts +23 -1
- package/src/tts/tts.ts +10 -1
- package/src/types.ts +5 -0
- package/src/voice/agent.ts +19 -4
- package/src/voice/agent_activity.ts +38 -13
- package/src/voice/agent_session.ts +6 -0
- package/src/voice/audio_recognition.ts +2 -1
- package/src/voice/background_audio.ts +6 -3
- package/src/voice/generation.ts +115 -10
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +40 -5
- package/src/voice/room_io/_output.ts +6 -5
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +202 -17
package/src/voice/agent.ts
CHANGED

@@ -26,9 +26,11 @@ import { StreamAdapter as STTStreamAdapter } from '../stt/index.js';
 import { SentenceTokenizer as BasicSentenceTokenizer } from '../tokenize/basic/index.js';
 import type { TTS } from '../tts/index.js';
 import { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
+import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
 import type { VAD } from '../vad.js';
 import type { AgentActivity } from './agent_activity.js';
 import type { AgentSession, TurnDetectionMode } from './agent_session.js';
+import type { TimedString } from './io.js';
 
 export const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>();
 export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse');
@@ -70,6 +72,7 @@ export interface AgentOptions<UserData> {
   tts?: TTS | TTSModelString;
   allowInterruptions?: boolean;
   minConsecutiveSpeechDelay?: number;
+  useTtsAlignedTranscript?: boolean;
 }
 
 export class Agent<UserData = any> {
@@ -79,6 +82,7 @@ export class Agent<UserData = any> {
   private _vad?: VAD;
   private _llm?: LLM | RealtimeModel;
   private _tts?: TTS;
+  private _useTtsAlignedTranscript?: boolean;
 
   /** @internal */
   _agentActivity?: AgentActivity;
@@ -102,6 +106,7 @@ export class Agent<UserData = any> {
     vad,
     llm,
     tts,
+    useTtsAlignedTranscript,
   }: AgentOptions<UserData>) {
     if (id) {
       this._id = id;
@@ -147,6 +152,8 @@
       this._tts = tts;
     }
 
+    this._useTtsAlignedTranscript = useTtsAlignedTranscript;
+
     this._agentActivity = undefined;
   }
 
@@ -166,6 +173,10 @@
     return this._tts;
   }
 
+  get useTtsAlignedTranscript(): boolean | undefined {
+    return this._useTtsAlignedTranscript;
+  }
+
   get chatCtx(): ReadonlyChatContext {
     return new ReadonlyChatContext(this._chatCtx.items);
   }
@@ -191,9 +202,9 @@
   async onExit(): Promise<void> {}
 
   async transcriptionNode(
-    text: ReadableStream<string>,
+    text: ReadableStream<string | TimedString>,
     modelSettings: ModelSettings,
-  ): Promise<ReadableStream<string> | null> {
+  ): Promise<ReadableStream<string | TimedString> | null> {
     return Agent.default.transcriptionNode(this, text, modelSettings);
   }
 
@@ -395,6 +406,10 @@
           if (chunk === SynthesizeStream.END_OF_STREAM) {
            break;
          }
+          // Attach timed transcripts to frame.userdata
+          if (chunk.timedTranscripts && chunk.timedTranscripts.length > 0) {
+            chunk.frame.userdata[USERDATA_TIMED_TRANSCRIPT] = chunk.timedTranscripts;
+          }
           controller.enqueue(chunk.frame);
         }
         controller.close();
@@ -410,9 +425,9 @@
 
   async transcriptionNode(
     agent: Agent,
-    text: ReadableStream<string>,
+    text: ReadableStream<string | TimedString>,
     _modelSettings: ModelSettings,
-  ): Promise<ReadableStream<string> | null> {
+  ): Promise<ReadableStream<string | TimedString> | null> {
     return text;
   },
 
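The new `useTtsAlignedTranscript` option pairs with the widened `transcriptionNode` signature above. A minimal sketch (not part of this diff) of a subclass consuming the `string | TimedString` stream; the import path and the public availability of `isTimedString` are assumptions:

```ts
// Sketch only: log word-level timings flowing through transcriptionNode.
// Import surface assumed, not confirmed by the diff.
import { Agent, isTimedString, type ModelSettings, type TimedString } from '@livekit/agents';

class TimingAwareAgent extends Agent {
  async transcriptionNode(
    text: ReadableStream<string | TimedString>,
    _modelSettings: ModelSettings,
  ): Promise<ReadableStream<string | TimedString> | null> {
    // Pass chunks through unchanged; log timings when the TTS aligned them.
    return text.pipeThrough(
      new TransformStream<string | TimedString, string | TimedString>({
        transform(chunk, controller) {
          if (isTimedString(chunk)) {
            console.log(`"${chunk.text}" [${chunk.startTime ?? '?'}s - ${chunk.endTime ?? '?'}s]`);
          }
          controller.enqueue(chunk);
        },
      }),
    );
  }
}
```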
package/src/voice/agent_activity.ts
CHANGED

@@ -60,7 +60,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput } from './generation.js';
+import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -72,6 +72,7 @@
   removeInstructions,
   updateInstructions,
 } from './generation.js';
+import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
 
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
@@ -359,6 +360,11 @@ export class AgentActivity implements RecognitionHooks {
     return this.agentSession.options.allowInterruptions;
   }
 
+  get useTtsAlignedTranscript(): boolean {
+    // Agent setting takes precedence over session setting
+    return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
+  }
+
   get turnDetection(): TurnDetectionMode | undefined {
     // TODO(brian): prioritize using agent.turn_detection
     return this.agentSession.turnDetection;
@@ -1258,7 +1264,7 @@
     let audioOut: _AudioOut | null = null;
     if (!audio) {
       // generate audio using TTS
-      const [ttsTask,
+      const [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         audioSource,
         modelSettings,
@@ -1267,7 +1273,7 @@
       tasks.push(ttsTask);
 
       const [forwardTask, _audioOut] = performAudioForwarding(
-
+        ttsGenData.audioStream,
         audioOutput,
         replyAbortController,
       );
@@ -1389,14 +1395,14 @@
     tasks.push(llmTask);
 
     let ttsTask: Task<void> | null = null;
-    let
+    let ttsGenData: _TTSGenerationData | null = null;
     let llmOutput: ReadableStream<string>;
 
     if (audioOutput) {
       // Only tee the stream when we need TTS
       const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
       llmOutput = textOutput;
-      [ttsTask,
+      [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,
         modelSettings,
@@ -1428,7 +1434,26 @@
     speechHandle._clearAuthorization();
 
     const replyStartedAt = Date.now();
-
+
+    // Determine the transcription input source
+    let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
+
+    // Check if we should use TTS aligned transcripts
+    if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
+      // Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
+      const timedTextsStream = await Promise.race([
+        ttsGenData.timedTextsFut.await,
+        ttsTask?.result.catch(() =>
+          this.logger.warn('TTS task failed before resolving timedTextsFut'),
+        ) ?? Promise.resolve(),
+      ]);
+      if (timedTextsStream) {
+        this.logger.debug('Using TTS aligned transcripts for transcription node input');
+        transcriptionInput = timedTextsStream;
+      }
+    }
+
+    const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
     let textOut: _TextOut | null = null;
     if (trNodeResult) {
       const [textForwardTask, _textOut] = performTextForwarding(
@@ -1449,9 +1474,9 @@
 
     let audioOut: _AudioOut | null = null;
     if (audioOutput) {
-      if (
+      if (ttsGenData) {
         const [forwardTask, _audioOut] = performAudioForwarding(
-
+          ttsGenData.audioStream,
           audioOutput,
           replyAbortController,
         );
@@ -1461,7 +1486,7 @@
           .then((ts) => onFirstFrame(ts))
           .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
-        throw Error('
+        throw Error('ttsGenData is null when audioOutput is enabled');
       }
     } else {
       textOut?.firstTextFut.await
@@ -1851,8 +1876,8 @@
     }
 
     const msgModalities = msg.modalities ? await msg.modalities : undefined;
-    let ttsTextInput: ReadableStream<string> | null = null;
-    let trTextInput: ReadableStream<string>;
+    let ttsTextInput: ReadableStream<string | TimedString> | null = null;
+    let trTextInput: ReadableStream<string | TimedString>;
 
     if (msgModalities && !msgModalities.includes('audio') && this.tts) {
       if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
@@ -1884,14 +1909,14 @@
     let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
 
     if (ttsTextInput) {
-      const [ttsTask,
+      const [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,
         modelSettings,
         abortController,
       );
       tasks.push(ttsTask);
-      realtimeAudioResult =
+      realtimeAudioResult = ttsGenData.audioStream;
     } else if (msgModalities && msgModalities.includes('audio')) {
       realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
         msg.audioStream,
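The `Promise.race` guard in the hunk above is worth isolating: if the TTS task rejects before it resolves `timedTextsFut`, awaiting the future alone would hang forever. A standalone sketch of the same pattern (the helper name is illustrative, not from the package):

```ts
// Wait for `future`, but settle with undefined if the producing task finishes
// (or fails) first, mirroring the race in the hunk above.
async function awaitUnlessProducerSettles<T>(
  future: Promise<T>,
  producer?: Promise<unknown>,
): Promise<T | undefined> {
  const guard: Promise<undefined> = (producer ?? Promise.resolve()).then(
    () => undefined,
    () => undefined, // a rejected producer resolves the race instead of hanging it
  );
  return Promise.race([future, guard]);
}
```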
package/src/voice/agent_session.ts
CHANGED

@@ -73,6 +73,7 @@ export interface VoiceOptions {
   maxToolSteps: number;
   preemptiveGeneration: boolean;
   userAwayTimeout?: number | null;
+  useTtsAlignedTranscript: boolean;
 }
 
 const defaultVoiceOptions: VoiceOptions = {
@@ -85,6 +86,7 @@ const defaultVoiceOptions: VoiceOptions = {
   maxToolSteps: 3,
   preemptiveGeneration: false,
   userAwayTimeout: 15.0,
+  useTtsAlignedTranscript: true,
 } as const;
 
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -264,6 +266,10 @@ export class AgentSession<
     return this._connOptions;
   }
 
+  get useTtsAlignedTranscript(): boolean {
+    return this.options.useTtsAlignedTranscript;
+  }
+
   set userData(value: UserData) {
     this._userData = value;
   }
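Taken together with the `AgentActivity` getter earlier in this diff, the option resolves agent-first, then session, with a session default of `true`. A usage sketch (constructor shapes assumed from `AgentOptions` and `VoiceOptions`; other required options omitted):

```ts
// Disable aligned transcripts session-wide, re-enable for a single agent.
const session = new AgentSession({
  useTtsAlignedTranscript: false, // session default would otherwise be true
});

const agent = new Agent({
  instructions: 'You are a helpful voice assistant.',
  useTtsAlignedTranscript: true, // agent setting wins via `??` in AgentActivity
});
```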
package/src/voice/audio_recognition.ts
CHANGED

@@ -161,7 +161,6 @@ export class AudioRecognition {
 
     switch (ev.type) {
       case SpeechEventType.FINAL_TRANSCRIPT:
-        this.hooks.onFinalTranscript(ev);
         const transcript = ev.alternatives?.[0]?.text;
         const confidence = ev.alternatives?.[0]?.confidence ?? 0;
         this.lastLanguage = ev.alternatives?.[0]?.language;
@@ -171,6 +170,8 @@
           return;
         }
 
+        this.hooks.onFinalTranscript(ev);
+
         this.logger.debug(
           {
             user_transcript: transcript,
package/src/voice/background_audio.ts
CHANGED

@@ -63,8 +63,10 @@ export interface BackgroundAudioPlayerOptions {
   thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
 
   /**
-   * Stream timeout in milliseconds
-   *
+   * Stream timeout in milliseconds for the audio mixer.
+   * Controls how long the mixer waits for a stream to produce data before timing out.
+   * Higher values are more tolerant of network latency and processing delays.
+   * @defaultValue 2000
    */
   streamTimeoutMs?: number;
 }
@@ -78,6 +80,7 @@ export interface BackgroundAudioStartOptions {
 // Queue size for AudioSource buffer (400ms)
 // Kept small to avoid abrupt cutoffs when removing sounds
 const AUDIO_SOURCE_BUFFER_MS = 400;
+const STREAM_TIMEOUT_MS = 2000;
 
 export class PlayHandle {
   private doneFuture = new Future<void>();
@@ -155,7 +158,7 @@ export class BackgroundAudioPlayer {
   #logger = log();
 
   constructor(options?: BackgroundAudioPlayerOptions) {
-    const { ambientSound, thinkingSound, streamTimeoutMs =
+    const { ambientSound, thinkingSound, streamTimeoutMs = STREAM_TIMEOUT_MS } = options || {};
 
     this.ambientSound = ambientSound;
     this.thinkingSound = thinkingSound;
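The constructor now falls back to the named `STREAM_TIMEOUT_MS` constant instead of an inline default. A usage sketch based on the option shape shown above:

```ts
// Raise the mixer's stream timeout above the 2000 ms default for
// high-latency audio sources.
const player = new BackgroundAudioPlayer({
  streamTimeoutMs: 5000,
});
```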
package/src/voice/generation.ts
CHANGED

@@ -24,10 +24,19 @@ import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
 import { log } from '../log.js';
 import { IdentityTransform } from '../stream/identity_transform.js';
 import { traceTypes, tracer } from '../telemetry/index.js';
+import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import {
+import {
+  AudioOutput,
+  type LLMNode,
+  type TTSNode,
+  type TextOutput,
+  type TimedString,
+  createTimedString,
+  isTimedString,
+} from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 
@@ -46,6 +55,21 @@ export class _LLMGenerationData {
   }
 }
 
+/**
+ * TTS generation data containing audio stream and optional timed transcripts.
+ * @internal
+ */
+export interface _TTSGenerationData {
+  /** Audio frame stream from TTS */
+  audioStream: ReadableStream<AudioFrame>;
+  /**
+   * Future that resolves to a stream of timed transcripts, or null if TTS doesn't support it.
+   */
+  timedTextsFut: Future<ReadableStream<TimedString> | null>;
+  /** Time to first byte (set when first audio frame is received) */
+  ttfb?: number;
+}
+
 // TODO(brian): remove this class in favor of ToolOutput
 export class _ToolOutput {
   output: _JsOutput[];
@@ -494,35 +518,105 @@ export function performLLMInference(
 
 export function performTTSInference(
   node: TTSNode,
-  text: ReadableStream<string>,
+  text: ReadableStream<string | TimedString>,
   modelSettings: ModelSettings,
   controller: AbortController,
-): [Task<void>,
+): [Task<void>, _TTSGenerationData] {
   const audioStream = new IdentityTransform<AudioFrame>();
   const outputWriter = audioStream.writable.getWriter();
   const audioOutputStream = audioStream.readable;
 
+  const timedTextsFut = new Future<ReadableStream<TimedString> | null>();
+  const timedTextsStream = new IdentityTransform<TimedString>();
+  const timedTextsWriter = timedTextsStream.writable.getWriter();
+
+  // Transform stream to extract text from TimedString objects
+  const textOnlyStream = new IdentityTransform<string>();
+  const textOnlyWriter = textOnlyStream.writable.getWriter();
+  (async () => {
+    const reader = text.getReader();
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        const textValue = typeof value === 'string' ? value : value.text;
+        await textOnlyWriter.write(textValue);
+      }
+      await textOnlyWriter.close();
+    } catch (e) {
+      await textOnlyWriter.abort(e as Error);
+    } finally {
+      reader.releaseLock();
+    }
+  })();
+
   const _performTTSInferenceImpl = async (signal: AbortSignal) => {
     let ttsStreamReader: ReadableStreamDefaultReader<AudioFrame> | null = null;
     let ttsStream: ReadableStream<AudioFrame> | null = null;
+    let pushedDuration = 0;
 
     try {
-      ttsStream = await node(
+      ttsStream = await node(textOnlyStream.readable, modelSettings);
       if (ttsStream === null) {
+        timedTextsFut.resolve(null);
        await outputWriter.close();
+        await timedTextsWriter.close();
        return;
      }
 
+      // This is critical: the future must be resolved with the channel/stream before the loop
+      // so that agent_activity can start reading while we write
+      if (!timedTextsFut.done) {
+        timedTextsFut.resolve(timedTextsStream.readable);
+      }
+
      ttsStreamReader = ttsStream.getReader();
+
+      // In Python, perform_tts_inference has a while loop processing multiple input segments
+      // (separated by FlushSentinel), with pushed_duration accumulating across segments.
+      // JS currently only does single inference, so initialPushedDuration is always 0.
+      // TODO: Add FlushSentinel + multi-segment loop
+      const initialPushedDuration = pushedDuration;
+
      while (true) {
        if (signal.aborted) {
          break;
        }
-        const { done, value:
+        const { done, value: frame } = await ttsStreamReader.read();
        if (done) {
          break;
        }
-
+
+        // Write the audio frame to the output stream
+        await outputWriter.write(frame);
+
+        const timedTranscripts = frame.userdata[USERDATA_TIMED_TRANSCRIPT] as
+          | TimedString[]
+          | undefined;
+        if (timedTranscripts && timedTranscripts.length > 0) {
+          for (const timedText of timedTranscripts) {
+            // Uses the INITIAL value (from previous inferences), not the accumulated value
+            const adjustedTimedText = createTimedString({
+              text: timedText.text,
+              startTime:
+                timedText.startTime !== undefined
+                  ? timedText.startTime + initialPushedDuration
+                  : undefined,
+              endTime:
+                timedText.endTime !== undefined
+                  ? timedText.endTime + initialPushedDuration
+                  : undefined,
+              confidence: timedText.confidence,
+              startTimeOffset: timedText.startTimeOffset,
+            });
+            await timedTextsWriter.write(adjustedTimedText);
+          }
+        }
+
+        const frameDuration = frame.samplesPerChannel / frame.sampleRate;
+        pushedDuration += frameDuration;
      }
    } catch (error) {
      if (error instanceof DOMException && error.name === 'AbortError') {
@@ -534,6 +628,7 @@ export function performTTSInference(
       ttsStreamReader?.releaseLock();
       await ttsStream?.cancel();
       await outputWriter.close();
+      await timedTextsWriter.close();
     }
   };
 
@@ -546,9 +641,14 @@
     context: currentContext,
   });
 
+  const genData: _TTSGenerationData = {
+    audioStream: audioOutputStream,
+    timedTextsFut,
+  };
+
   return [
     Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'),
-
+    genData,
   ];
 }
 
@@ -558,7 +658,7 @@ export interface _TextOut {
 }
 
 async function forwardText(
-  source: ReadableStream<string>,
+  source: ReadableStream<string | TimedString>,
   out: _TextOut,
   signal: AbortSignal,
   textOutput: TextOutput | null,
@@ -571,8 +671,13 @@ async function forwardText(
     }
     const { done, value: delta } = await reader.read();
     if (done) break;
-
+
+    const deltaIsTimedString = isTimedString(delta);
+    const textDelta = deltaIsTimedString ? delta.text : delta;
+
+    out.text += textDelta;
     if (textOutput !== null) {
+      // Pass TimedString to textOutput for synchronized transcription
       await textOutput.captureText(delta);
     }
     if (!out.firstTextFut.done) {
@@ -588,7 +693,7 @@
 }
 
 export function performTextForwarding(
-  source: ReadableStream<string>,
+  source: ReadableStream<string | TimedString>,
   controller: AbortController,
   textOutput: TextOutput | null,
 ): [Task<void>, _TextOut] {
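The per-frame bookkeeping in `performTTSInference` is simple arithmetic: each frame contributes `samplesPerChannel / sampleRate` seconds, and aligned timestamps are shifted by the duration pushed before the current segment. As the in-code comment notes, the JS port currently runs a single segment, so `initialPushedDuration` is always 0 today; the worked numbers below (illustrative values, not from the package) show the intended effect once multi-segment inference lands:

```ts
const sampleRate = 24_000; // Hz
const samplesPerChannel = 480; // one 20 ms frame
const frameDuration = samplesPerChannel / sampleRate; // 0.02 s

// After 50 such frames, one second of audio has been pushed:
const pushedDuration = 50 * frameDuration; // 1.0 s

// A word aligned at 0.10–0.35 s inside the next segment would surface at:
const startTime = 0.1 + pushedDuration; // 1.10 s
const endTime = 0.35 + pushedDuration; // 1.35 s
console.log({ frameDuration, startTime, endTime });
```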
package/src/voice/index.ts
CHANGED

@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
-export { AgentSession, type AgentSessionOptions } from './agent_session.js';
+export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
 export * from './avatar/index.js';
 export * from './background_audio.js';
 export * from './events.js';
package/src/voice/io.ts
CHANGED

@@ -30,9 +30,15 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;
 
 /**
- *
+ * Symbol used to identify TimedString objects.
+ */
+export const TIMED_STRING_SYMBOL = Symbol.for('lk.TimedString');
+
+/**
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
+  readonly [TIMED_STRING_SYMBOL]: true;
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
@@ -40,6 +46,38 @@
   startTimeOffset?: number;
 }
 
+/**
+ * Factory function to create a TimedString object.
+ */
+export function createTimedString(opts: {
+  text: string;
+  startTime?: number;
+  endTime?: number;
+  confidence?: number;
+  startTimeOffset?: number;
+}): TimedString {
+  return {
+    [TIMED_STRING_SYMBOL]: true,
+    text: opts.text,
+    startTime: opts.startTime,
+    endTime: opts.endTime,
+    confidence: opts.confidence,
+    startTimeOffset: opts.startTimeOffset,
+  };
+}
+
+/**
+ * Type guard to check if a value is a TimedString.
+ */
+export function isTimedString(value: unknown): value is TimedString {
+  return (
+    typeof value === 'object' &&
+    value !== null &&
+    TIMED_STRING_SYMBOL in value &&
+    (value as TimedString)[TIMED_STRING_SYMBOL] === true
+  );
+}
+
 export interface AudioOutputCapabilities {
   /** Whether this output supports pause/resume functionality */
   pause: boolean;
@@ -208,10 +246,7 @@ export interface PlaybackStartedEvent {
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}
 
-
-   * Capture a text segment (Used by the output of LLM nodes)
-   */
-  abstract captureText(text: string): Promise<void>;
+  abstract captureText(text: string | TimedString): Promise<void>;
 
   /**
    * Mark the current text segment as complete (e.g LLM generation is complete)
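A short sketch of the new helpers in use (relative import path assumed):

```ts
import { createTimedString, isTimedString, type TimedString } from './io.js';

const word: TimedString = createTimedString({
  text: 'hello',
  startTime: 0.12,
  endTime: 0.48,
});

// The symbol brand keeps the guard strict: plain objects with a `text` field
// are not mistaken for TimedStrings.
function plainText(chunk: string | TimedString): string {
  return isTimedString(chunk) ? chunk.text : chunk;
}

plainText(word); // 'hello'
plainText('world'); // 'world'
```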
package/src/voice/room_io/_output.ts
CHANGED

@@ -23,7 +23,7 @@ import {
 } from '../../constants.js';
 import { log } from '../../log.js';
 import { Future, Task, shortuuid } from '../../utils.js';
-import { AudioOutput, TextOutput } from '../io.js';
+import { AudioOutput, TextOutput, type TimedString, isTimedString } from '../io.js';
 import { findMicrophoneTrackId } from '../transcription/index.js';
 
 abstract class BaseParticipantTranscriptionOutput extends TextOutput {
@@ -102,13 +102,14 @@ abstract class BaseParticipantTranscriptionOutput extends TextOutput {
     this.latestText = '';
   }
 
-  async captureText(text: string) {
+  async captureText(text: string | TimedString) {
     if (!this.participantIdentity) {
       return;
     }
 
-
-
+    const textStr = isTimedString(text) ? text.text : text;
+    this.latestText = textStr;
+    await this.handleCaptureText(textStr);
   }
 
   flush() {
@@ -298,7 +299,7 @@ export class ParalellTextOutput extends TextOutput {
     this._sinks = sinks;
   }
 
-  async captureText(text: string) {
+  async captureText(text: string | TimedString) {
     await Promise.all(this._sinks.map((sink) => sink.captureText(text)));
   }
 
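The widened `captureText(text: string | TimedString)` contract applies to every `TextOutput` sink. A sketch of a custom sink against that contract (only `captureText` is shown; any other abstract members of `TextOutput` are omitted here and would need implementing):

```ts
import { TextOutput, isTimedString, type TimedString } from '../io.js';

class ConsoleTextOutput extends TextOutput {
  async captureText(text: string | TimedString): Promise<void> {
    // Normalize to plain text the same way BaseParticipantTranscriptionOutput does.
    const value = isTimedString(text) ? text.text : text;
    console.log(value);
  }
}
```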