@livekit/agents 1.0.25 → 1.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/connection_pool.cjs +242 -0
- package/dist/connection_pool.cjs.map +1 -0
- package/dist/connection_pool.d.cts +123 -0
- package/dist/connection_pool.d.ts +123 -0
- package/dist/connection_pool.d.ts.map +1 -0
- package/dist/connection_pool.js +218 -0
- package/dist/connection_pool.js.map +1 -0
- package/dist/connection_pool.test.cjs +256 -0
- package/dist/connection_pool.test.cjs.map +1 -0
- package/dist/connection_pool.test.js +255 -0
- package/dist/connection_pool.test.js.map +1 -0
- package/dist/index.cjs +2 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/tts.cjs +172 -58
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +3 -1
- package/dist/inference/tts.d.ts +3 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +173 -59
- package/dist/inference/tts.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +6 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.cts +1 -1
- package/dist/tts/stream_adapter.d.ts +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +6 -3
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +26 -15
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +7 -4
- package/dist/tts/tts.d.ts +7 -4
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +26 -15
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +20 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +19 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +3 -1
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +3 -1
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +6 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +6 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +1 -1
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.js +1 -1
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs +77 -37
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/background_audio.d.cts +10 -3
- package/dist/voice/background_audio.d.ts +10 -3
- package/dist/voice/background_audio.d.ts.map +1 -1
- package/dist/voice/background_audio.js +78 -37
- package/dist/voice/background_audio.js.map +1 -1
- package/dist/voice/index.cjs +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +10 -1
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +18 -1
- package/dist/voice/io.d.ts +18 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +10 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +1 -1
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +1 -1
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.js +1 -1
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/transcription/synchronizer.cjs +1 -1
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.js +1 -1
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/worker.cjs +4 -6
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +4 -6
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/connection_pool.test.ts +346 -0
- package/src/connection_pool.ts +307 -0
- package/src/index.ts +1 -0
- package/src/inference/tts.ts +206 -65
- package/src/tts/stream_adapter.ts +10 -3
- package/src/tts/tts.ts +41 -18
- package/src/utils.ts +25 -0
- package/src/voice/agent_activity.ts +7 -1
- package/src/voice/agent_session.ts +6 -1
- package/src/voice/avatar/datastream_io.ts +1 -1
- package/src/voice/background_audio.ts +95 -55
- package/src/voice/index.ts +1 -0
- package/src/voice/io.ts +24 -0
- package/src/voice/recorder_io/recorder_io.ts +1 -1
- package/src/voice/room_io/_output.ts +1 -1
- package/src/voice/transcription/synchronizer.ts +1 -1
- package/src/worker.ts +4 -7
package/src/inference/tts.ts
CHANGED
|
@@ -5,13 +5,14 @@ import type { AudioFrame } from '@livekit/rtc-node';
|
|
|
5
5
|
import { WebSocket } from 'ws';
|
|
6
6
|
import { APIError, APIStatusError } from '../_exceptions.js';
|
|
7
7
|
import { AudioByteStream } from '../audio.js';
|
|
8
|
+
import { ConnectionPool } from '../connection_pool.js';
|
|
8
9
|
import { log } from '../log.js';
|
|
9
10
|
import { createStreamChannel } from '../stream/stream_channel.js';
|
|
10
11
|
import { basic as tokenizeBasic } from '../tokenize/index.js';
|
|
11
12
|
import type { ChunkedStream } from '../tts/index.js';
|
|
12
13
|
import { SynthesizeStream as BaseSynthesizeStream, TTS as BaseTTS } from '../tts/index.js';
|
|
13
14
|
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
14
|
-
import { shortuuid } from '../utils.js';
|
|
15
|
+
import { Event, Future, Task, cancelAndWait, combineSignals, shortuuid } from '../utils.js';
|
|
15
16
|
import {
|
|
16
17
|
type TtsClientEvent,
|
|
17
18
|
type TtsServerEvent,
|
|
@@ -95,6 +96,7 @@ export interface InferenceTTSOptions<TModel extends TTSModels> {
|
|
|
95
96
|
export class TTS<TModel extends TTSModels> extends BaseTTS {
|
|
96
97
|
private opts: InferenceTTSOptions<TModel>;
|
|
97
98
|
private streams: Set<SynthesizeStream<TModel>> = new Set();
|
|
99
|
+
pool: ConnectionPool<WebSocket>;
|
|
98
100
|
|
|
99
101
|
#logger = log();
|
|
100
102
|
|
|
@@ -165,6 +167,15 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
|
|
|
165
167
|
apiSecret: lkApiSecret,
|
|
166
168
|
modelOptions,
|
|
167
169
|
};
|
|
170
|
+
|
|
171
|
+
// Initialize connection pool
|
|
172
|
+
this.pool = new ConnectionPool<WebSocket>({
|
|
173
|
+
connectCb: (timeout) => this.connectWs(timeout),
|
|
174
|
+
closeCb: (ws) => this.closeWs(ws),
|
|
175
|
+
maxSessionDuration: 300_000,
|
|
176
|
+
markRefreshedOnGet: true,
|
|
177
|
+
connectTimeout: 10_000, // 10 seconds default
|
|
178
|
+
});
|
|
168
179
|
}
|
|
169
180
|
|
|
170
181
|
get label() {
|
|
@@ -218,6 +229,7 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
|
|
|
218
229
|
if (this.opts.model) params.model = this.opts.model;
|
|
219
230
|
if (this.opts.language) params.language = this.opts.language;
|
|
220
231
|
|
|
232
|
+
this.#logger.debug({ url }, 'inference.TTS creating new websocket connection (pool miss)');
|
|
221
233
|
const socket = await connectWs(url, headers, timeout);
|
|
222
234
|
socket.send(JSON.stringify(params));
|
|
223
235
|
return socket;
|
|
@@ -227,18 +239,22 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
|
|
|
227
239
|
await ws.close();
|
|
228
240
|
}
|
|
229
241
|
|
|
242
|
+
prewarm(): void {
|
|
243
|
+
this.pool.prewarm();
|
|
244
|
+
}
|
|
245
|
+
|
|
230
246
|
async close() {
|
|
231
247
|
for (const stream of this.streams) {
|
|
232
248
|
await stream.close();
|
|
233
249
|
}
|
|
234
250
|
this.streams.clear();
|
|
251
|
+
await this.pool.close();
|
|
235
252
|
}
|
|
236
253
|
}
|
|
237
254
|
|
|
238
255
|
export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
|
|
239
256
|
private opts: InferenceTTSOptions<TModel>;
|
|
240
257
|
private tts: TTS<TModel>;
|
|
241
|
-
private connOptions: APIConnectOptions;
|
|
242
258
|
|
|
243
259
|
#logger = log();
|
|
244
260
|
|
|
@@ -246,7 +262,6 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
246
262
|
super(tts, connOptions);
|
|
247
263
|
this.opts = opts;
|
|
248
264
|
this.tts = tts;
|
|
249
|
-
this.connOptions = connOptions;
|
|
250
265
|
}
|
|
251
266
|
|
|
252
267
|
get label() {
|
|
@@ -258,30 +273,31 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
258
273
|
}
|
|
259
274
|
|
|
260
275
|
protected async run(): Promise<void> {
|
|
261
|
-
let ws: WebSocket | null = null;
|
|
262
276
|
let closing = false;
|
|
263
|
-
let finalReceived = false;
|
|
264
277
|
let lastFrame: AudioFrame | undefined;
|
|
265
278
|
|
|
266
279
|
const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
|
|
267
280
|
const eventChannel = createStreamChannel<TtsServerEvent>();
|
|
268
281
|
const requestId = shortuuid('tts_request_');
|
|
282
|
+
const inputSentEvent = new Event();
|
|
283
|
+
|
|
284
|
+
// Signal for protocol-driven completion (when 'done' message is received)
|
|
285
|
+
const completionFuture = new Future<void>();
|
|
269
286
|
|
|
270
|
-
const resourceCleanup = () => {
|
|
287
|
+
const resourceCleanup = async () => {
|
|
271
288
|
if (closing) return;
|
|
272
289
|
closing = true;
|
|
273
290
|
sendTokenizerStream.close();
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
ws?.close();
|
|
291
|
+
// close() returns a promise; don't leak it
|
|
292
|
+
await eventChannel.close();
|
|
277
293
|
};
|
|
278
294
|
|
|
279
|
-
const sendClientEvent = async (event: TtsClientEvent) => {
|
|
295
|
+
const sendClientEvent = async (event: TtsClientEvent, ws: WebSocket, signal: AbortSignal) => {
|
|
280
296
|
// Don't send events to a closed WebSocket or aborted controller
|
|
281
|
-
if (
|
|
297
|
+
if (signal.aborted || closing) return;
|
|
282
298
|
|
|
283
299
|
const validatedEvent = await ttsClientEventSchema.parseAsync(event);
|
|
284
|
-
if (
|
|
300
|
+
if (ws.readyState !== WebSocket.OPEN) {
|
|
285
301
|
this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
|
|
286
302
|
return;
|
|
287
303
|
}
|
|
@@ -295,9 +311,9 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
295
311
|
}
|
|
296
312
|
};
|
|
297
313
|
|
|
298
|
-
const createInputTask = async () => {
|
|
314
|
+
const createInputTask = async (signal: AbortSignal) => {
|
|
299
315
|
for await (const data of this.input) {
|
|
300
|
-
if (
|
|
316
|
+
if (signal.aborted || closing) break;
|
|
301
317
|
if (data === SynthesizeStream.FLUSH_SENTINEL) {
|
|
302
318
|
sendTokenizerStream.flush();
|
|
303
319
|
continue;
|
|
@@ -310,55 +326,108 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
310
326
|
}
|
|
311
327
|
};
|
|
312
328
|
|
|
313
|
-
const createSentenceStreamTask = async () => {
|
|
329
|
+
const createSentenceStreamTask = async (ws: WebSocket, signal: AbortSignal) => {
|
|
314
330
|
for await (const ev of sendTokenizerStream) {
|
|
315
|
-
if (
|
|
316
|
-
|
|
317
|
-
sendClientEvent(
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
331
|
+
if (signal.aborted || closing) break;
|
|
332
|
+
|
|
333
|
+
await sendClientEvent(
|
|
334
|
+
{
|
|
335
|
+
type: 'input_transcript',
|
|
336
|
+
transcript: ev.token + ' ',
|
|
337
|
+
},
|
|
338
|
+
ws,
|
|
339
|
+
signal,
|
|
340
|
+
);
|
|
341
|
+
inputSentEvent.set();
|
|
321
342
|
}
|
|
322
343
|
|
|
323
|
-
sendClientEvent({ type: 'session.flush' });
|
|
344
|
+
await sendClientEvent({ type: 'session.flush' }, ws, signal);
|
|
345
|
+
// needed in case empty input is sent
|
|
346
|
+
inputSentEvent.set();
|
|
324
347
|
};
|
|
325
348
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
});
|
|
332
|
-
|
|
333
|
-
ws.on('message', async (data) => {
|
|
349
|
+
// Handles WebSocket message routing and error handling
|
|
350
|
+
// Completes based on protocol messages, NOT on ws.close()
|
|
351
|
+
const createWsListenerTask = async (ws: WebSocket, signal: AbortSignal) => {
|
|
352
|
+
const onMessage = (data: Buffer) => {
|
|
353
|
+
try {
|
|
334
354
|
const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
|
|
335
355
|
const validatedEvent = ttsServerEventSchema.parse(eventJson);
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
})
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
if (!closing) return this.#logger.error('WebSocket closed unexpectedly');
|
|
349
|
-
if (finalReceived) return resolve();
|
|
356
|
+
// writer.write returns a promise; avoid unhandled rejections if stream is closed
|
|
357
|
+
void eventChannel.write(validatedEvent).catch((error) => {
|
|
358
|
+
this.#logger.debug(
|
|
359
|
+
{ error },
|
|
360
|
+
'Failed writing TTS event to stream channel (likely closed)',
|
|
361
|
+
);
|
|
362
|
+
});
|
|
363
|
+
} catch (e) {
|
|
364
|
+
this.#logger.error({ error: e }, 'Error parsing WebSocket message');
|
|
365
|
+
}
|
|
366
|
+
};
|
|
350
367
|
|
|
351
|
-
|
|
368
|
+
const onError = (e: Error) => {
|
|
369
|
+
this.#logger.error({ error: e }, 'WebSocket error');
|
|
370
|
+
void resourceCleanup();
|
|
371
|
+
try {
|
|
372
|
+
// If the ws is misbehaving, hard-stop it immediately to avoid buffering.
|
|
373
|
+
ws.terminate?.();
|
|
374
|
+
} catch {
|
|
375
|
+
// ignore
|
|
376
|
+
}
|
|
377
|
+
// Ensure this ws is not reused
|
|
378
|
+
this.tts.pool.remove(ws);
|
|
379
|
+
completionFuture.reject(e);
|
|
380
|
+
};
|
|
381
|
+
|
|
382
|
+
const onClose = () => {
|
|
383
|
+
// WebSocket closed unexpectedly (not by us)
|
|
384
|
+
if (!closing) {
|
|
385
|
+
this.#logger.error('WebSocket closed unexpectedly');
|
|
386
|
+
void resourceCleanup();
|
|
387
|
+
// Ensure this ws is not reused
|
|
388
|
+
this.tts.pool.remove(ws);
|
|
389
|
+
completionFuture.reject(
|
|
352
390
|
new APIStatusError({
|
|
353
391
|
message: 'Gateway connection closed unexpectedly',
|
|
354
392
|
options: { requestId },
|
|
355
393
|
}),
|
|
356
394
|
);
|
|
357
|
-
}
|
|
358
|
-
}
|
|
395
|
+
}
|
|
396
|
+
};
|
|
397
|
+
|
|
398
|
+
const onAbort = () => {
|
|
399
|
+
void resourceCleanup();
|
|
400
|
+
try {
|
|
401
|
+
// On interruption/abort, close the websocket immediately so the server stops streaming
|
|
402
|
+
// and the ws library doesn't buffer unread frames in memory.
|
|
403
|
+
ws.terminate?.();
|
|
404
|
+
} catch {
|
|
405
|
+
// ignore
|
|
406
|
+
}
|
|
407
|
+
this.tts.pool.remove(ws);
|
|
408
|
+
inputSentEvent.set();
|
|
409
|
+
completionFuture.resolve();
|
|
410
|
+
};
|
|
411
|
+
|
|
412
|
+
// Attach listeners
|
|
413
|
+
ws.on('message', onMessage);
|
|
414
|
+
ws.on('error', onError);
|
|
415
|
+
ws.on('close', onClose);
|
|
416
|
+
signal.addEventListener('abort', onAbort);
|
|
417
|
+
|
|
418
|
+
try {
|
|
419
|
+
// Wait for protocol-driven completion or error
|
|
420
|
+
await completionFuture.await;
|
|
421
|
+
} finally {
|
|
422
|
+
// IMPORTANT: Remove listeners so connection can be reused
|
|
423
|
+
ws.off('message', onMessage);
|
|
424
|
+
ws.off('error', onError);
|
|
425
|
+
ws.off('close', onClose);
|
|
426
|
+
signal.removeEventListener('abort', onAbort);
|
|
427
|
+
}
|
|
359
428
|
};
|
|
360
429
|
|
|
361
|
-
const createRecvTask = async () => {
|
|
430
|
+
const createRecvTask = async (signal: AbortSignal) => {
|
|
362
431
|
let currentSessionId: string | null = null;
|
|
363
432
|
|
|
364
433
|
const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
|
|
@@ -366,9 +435,11 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
366
435
|
const reader = serverEventStream.getReader();
|
|
367
436
|
|
|
368
437
|
try {
|
|
369
|
-
|
|
438
|
+
await inputSentEvent.wait();
|
|
439
|
+
|
|
440
|
+
while (!this.closed && !signal.aborted) {
|
|
370
441
|
const result = await reader.read();
|
|
371
|
-
if (
|
|
442
|
+
if (signal.aborted) return;
|
|
372
443
|
if (result.done) return;
|
|
373
444
|
|
|
374
445
|
const serverEvent = result.value;
|
|
@@ -384,24 +455,29 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
384
455
|
}
|
|
385
456
|
break;
|
|
386
457
|
case 'done':
|
|
387
|
-
finalReceived = true;
|
|
388
458
|
for (const frame of bstream.flush()) {
|
|
389
459
|
sendLastFrame(currentSessionId!, false);
|
|
390
460
|
lastFrame = frame;
|
|
391
461
|
}
|
|
392
462
|
sendLastFrame(currentSessionId!, true);
|
|
393
463
|
this.queue.put(SynthesizeStream.END_OF_STREAM);
|
|
394
|
-
|
|
464
|
+
await resourceCleanup();
|
|
465
|
+
completionFuture.resolve();
|
|
466
|
+
return;
|
|
395
467
|
case 'session.closed':
|
|
396
|
-
resourceCleanup();
|
|
397
|
-
|
|
468
|
+
await resourceCleanup();
|
|
469
|
+
completionFuture.resolve();
|
|
470
|
+
return;
|
|
398
471
|
case 'error':
|
|
399
472
|
this.#logger.error(
|
|
400
473
|
{ serverEvent },
|
|
401
474
|
'Received error message from LiveKit TTS WebSocket',
|
|
402
475
|
);
|
|
403
|
-
resourceCleanup();
|
|
404
|
-
|
|
476
|
+
await resourceCleanup();
|
|
477
|
+
completionFuture.reject(
|
|
478
|
+
new APIError(`LiveKit TTS returned error: ${serverEvent.message}`),
|
|
479
|
+
);
|
|
480
|
+
return;
|
|
405
481
|
default:
|
|
406
482
|
this.#logger.warn('Unexpected message %s', serverEvent);
|
|
407
483
|
break;
|
|
@@ -418,16 +494,81 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
|
|
|
418
494
|
};
|
|
419
495
|
|
|
420
496
|
try {
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
497
|
+
await this.tts.pool.withConnection(
|
|
498
|
+
async (ws: WebSocket) => {
|
|
499
|
+
try {
|
|
500
|
+
// IMPORTANT: don't cancel the stream's controller on normal completion,
|
|
501
|
+
// otherwise the pool will remove+close the ws and every run becomes a pool miss.
|
|
502
|
+
const runController = new AbortController();
|
|
503
|
+
const onStreamAbort = () => runController.abort(this.abortController.signal.reason);
|
|
504
|
+
this.abortController.signal.addEventListener('abort', onStreamAbort, { once: true });
|
|
505
|
+
|
|
506
|
+
const tasks = [
|
|
507
|
+
Task.from(
|
|
508
|
+
async (controller) => {
|
|
509
|
+
const combined = combineSignals(runController.signal, controller.signal);
|
|
510
|
+
await createInputTask(combined);
|
|
511
|
+
},
|
|
512
|
+
undefined,
|
|
513
|
+
'inference-tts-input',
|
|
514
|
+
),
|
|
515
|
+
Task.from(
|
|
516
|
+
async (controller) => {
|
|
517
|
+
const combined = combineSignals(runController.signal, controller.signal);
|
|
518
|
+
await createSentenceStreamTask(ws, combined);
|
|
519
|
+
},
|
|
520
|
+
undefined,
|
|
521
|
+
'inference-tts-sentence',
|
|
522
|
+
),
|
|
523
|
+
Task.from(
|
|
524
|
+
async (controller) => {
|
|
525
|
+
const combined = combineSignals(runController.signal, controller.signal);
|
|
526
|
+
await createWsListenerTask(ws, combined);
|
|
527
|
+
},
|
|
528
|
+
undefined,
|
|
529
|
+
'inference-tts-ws-listener',
|
|
530
|
+
),
|
|
531
|
+
Task.from(
|
|
532
|
+
async (controller) => {
|
|
533
|
+
const combined = combineSignals(runController.signal, controller.signal);
|
|
534
|
+
await createRecvTask(combined);
|
|
535
|
+
},
|
|
536
|
+
undefined,
|
|
537
|
+
'inference-tts-recv',
|
|
538
|
+
),
|
|
539
|
+
];
|
|
540
|
+
|
|
541
|
+
try {
|
|
542
|
+
await Promise.all(tasks.map((t) => t.result));
|
|
543
|
+
} finally {
|
|
544
|
+
// Mirror python finally: unblock recv and cancel all tasks.
|
|
545
|
+
inputSentEvent.set();
|
|
546
|
+
await resourceCleanup();
|
|
547
|
+
await cancelAndWait(tasks, 5000);
|
|
548
|
+
this.abortController.signal.removeEventListener('abort', onStreamAbort);
|
|
549
|
+
}
|
|
550
|
+
} catch (e) {
|
|
551
|
+
// If aborted, don't throw - let cleanup handle it
|
|
552
|
+
if (e instanceof Error && e.name === 'AbortError') {
|
|
553
|
+
return;
|
|
554
|
+
}
|
|
555
|
+
throw e;
|
|
556
|
+
}
|
|
557
|
+
},
|
|
558
|
+
{
|
|
559
|
+
timeout: this.connOptions.timeoutMs,
|
|
560
|
+
},
|
|
561
|
+
);
|
|
562
|
+
} catch (e) {
|
|
563
|
+
// Handle connection errors
|
|
564
|
+
if (e instanceof Error && e.name === 'AbortError') {
|
|
565
|
+
// Abort is expected during normal shutdown
|
|
566
|
+
return;
|
|
567
|
+
}
|
|
568
|
+
throw e;
|
|
429
569
|
} finally {
|
|
430
|
-
|
|
570
|
+
// Ensure cleanup always runs (and don't leak the promise)
|
|
571
|
+
await resourceCleanup();
|
|
431
572
|
}
|
|
432
573
|
}
|
|
433
574
|
}
|
|
@@ -22,10 +22,17 @@ export class StreamAdapter extends TTS {
|
|
|
22
22
|
this.#tts.on('metrics_collected', (metrics) => {
|
|
23
23
|
this.emit('metrics_collected', metrics);
|
|
24
24
|
});
|
|
25
|
+
this.#tts.on('error', (error) => {
|
|
26
|
+
this.emit('error', error);
|
|
27
|
+
});
|
|
25
28
|
}
|
|
26
29
|
|
|
27
|
-
synthesize(
|
|
28
|
-
|
|
30
|
+
synthesize(
|
|
31
|
+
text: string,
|
|
32
|
+
connOptions?: APIConnectOptions,
|
|
33
|
+
abortSignal?: AbortSignal,
|
|
34
|
+
): ChunkedStream {
|
|
35
|
+
return this.#tts.synthesize(text, connOptions, abortSignal);
|
|
29
36
|
}
|
|
30
37
|
|
|
31
38
|
stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
|
|
@@ -85,7 +92,7 @@ export class StreamAdapterWrapper extends SynthesizeStream {
|
|
|
85
92
|
prevTask: Task<void> | undefined,
|
|
86
93
|
controller: AbortController,
|
|
87
94
|
) => {
|
|
88
|
-
const audioStream = this.#tts.synthesize(token);
|
|
95
|
+
const audioStream = this.#tts.synthesize(token, this.connOptions, this.abortSignal);
|
|
89
96
|
|
|
90
97
|
// wait for previous audio transcription to complete before starting
|
|
91
98
|
// to queue audio frames of the current token
|
package/src/tts/tts.ts
CHANGED
|
@@ -90,7 +90,11 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCal
|
|
|
90
90
|
/**
|
|
91
91
|
* Receives text and returns synthesis in the form of a {@link ChunkedStream}
|
|
92
92
|
*/
|
|
93
|
-
abstract synthesize(
|
|
93
|
+
abstract synthesize(
|
|
94
|
+
text: string,
|
|
95
|
+
connOptions?: APIConnectOptions,
|
|
96
|
+
abortSignal?: AbortSignal,
|
|
97
|
+
): ChunkedStream;
|
|
94
98
|
|
|
95
99
|
/**
|
|
96
100
|
* Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
|
|
@@ -131,30 +135,33 @@ export abstract class SynthesizeStream
|
|
|
131
135
|
SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
|
|
132
136
|
>();
|
|
133
137
|
protected closed = false;
|
|
134
|
-
|
|
135
|
-
#tts: TTS;
|
|
136
|
-
#metricsPendingTexts: string[] = [];
|
|
137
|
-
#metricsText = '';
|
|
138
|
-
#monitorMetricsTask?: Promise<void>;
|
|
139
|
-
private _connOptions: APIConnectOptions;
|
|
138
|
+
protected connOptions: APIConnectOptions;
|
|
140
139
|
protected abortController = new AbortController();
|
|
141
|
-
#ttsRequestSpan?: Span;
|
|
142
140
|
|
|
143
141
|
private deferredInputStream: DeferredReadableStream<
|
|
144
142
|
string | typeof SynthesizeStream.FLUSH_SENTINEL
|
|
145
143
|
>;
|
|
146
144
|
private logger = log();
|
|
147
145
|
|
|
146
|
+
abstract label: string;
|
|
147
|
+
|
|
148
|
+
#tts: TTS;
|
|
149
|
+
#metricsPendingTexts: string[] = [];
|
|
150
|
+
#metricsText = '';
|
|
151
|
+
#monitorMetricsTask?: Promise<void>;
|
|
152
|
+
#ttsRequestSpan?: Span;
|
|
153
|
+
|
|
148
154
|
constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) {
|
|
149
155
|
this.#tts = tts;
|
|
150
|
-
this.
|
|
156
|
+
this.connOptions = connOptions;
|
|
151
157
|
this.deferredInputStream = new DeferredReadableStream();
|
|
152
158
|
this.pumpInput();
|
|
159
|
+
|
|
153
160
|
this.abortController.signal.addEventListener('abort', () => {
|
|
154
161
|
this.deferredInputStream.detachSource();
|
|
155
162
|
// TODO (AJS-36) clean this up when we refactor with streams
|
|
156
|
-
this.input.close();
|
|
157
|
-
this.output.close();
|
|
163
|
+
if (!this.input.closed) this.input.close();
|
|
164
|
+
if (!this.output.closed) this.output.close();
|
|
158
165
|
this.closed = true;
|
|
159
166
|
});
|
|
160
167
|
|
|
@@ -172,7 +179,7 @@ export abstract class SynthesizeStream
|
|
|
172
179
|
[traceTypes.ATTR_TTS_LABEL]: this.#tts.label,
|
|
173
180
|
});
|
|
174
181
|
|
|
175
|
-
for (let i = 0; i < this.
|
|
182
|
+
for (let i = 0; i < this.connOptions.maxRetry + 1; i++) {
|
|
176
183
|
try {
|
|
177
184
|
return await tracer.startActiveSpan(
|
|
178
185
|
async (attemptSpan) => {
|
|
@@ -188,15 +195,15 @@ export abstract class SynthesizeStream
|
|
|
188
195
|
);
|
|
189
196
|
} catch (error) {
|
|
190
197
|
if (error instanceof APIError) {
|
|
191
|
-
const retryInterval = intervalForRetry(this.
|
|
198
|
+
const retryInterval = intervalForRetry(this.connOptions, i);
|
|
192
199
|
|
|
193
|
-
if (this.
|
|
200
|
+
if (this.connOptions.maxRetry === 0 || !error.retryable) {
|
|
194
201
|
this.emitError({ error, recoverable: false });
|
|
195
202
|
throw error;
|
|
196
|
-
} else if (i === this.
|
|
203
|
+
} else if (i === this.connOptions.maxRetry) {
|
|
197
204
|
this.emitError({ error, recoverable: false });
|
|
198
205
|
throw new APIConnectionError({
|
|
199
|
-
message: `failed to generate TTS completion after ${this.
|
|
206
|
+
message: `failed to generate TTS completion after ${this.connOptions.maxRetry + 1} attempts`,
|
|
200
207
|
options: { retryable: false },
|
|
201
208
|
});
|
|
202
209
|
} else {
|
|
@@ -380,6 +387,10 @@ export abstract class SynthesizeStream
|
|
|
380
387
|
return this.output.next();
|
|
381
388
|
}
|
|
382
389
|
|
|
390
|
+
get abortSignal(): AbortSignal {
|
|
391
|
+
return this.abortController.signal;
|
|
392
|
+
}
|
|
393
|
+
|
|
383
394
|
/** Close both the input and output of the TTS stream */
|
|
384
395
|
close() {
|
|
385
396
|
this.abortController.abort();
|
|
@@ -415,15 +426,22 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
415
426
|
private _connOptions: APIConnectOptions;
|
|
416
427
|
private logger = log();
|
|
417
428
|
|
|
429
|
+
protected abortController = new AbortController();
|
|
430
|
+
|
|
418
431
|
constructor(
|
|
419
432
|
text: string,
|
|
420
433
|
tts: TTS,
|
|
421
434
|
connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
|
|
435
|
+
abortSignal?: AbortSignal,
|
|
422
436
|
) {
|
|
423
437
|
this.#text = text;
|
|
424
438
|
this.#tts = tts;
|
|
425
439
|
this._connOptions = connOptions;
|
|
426
440
|
|
|
441
|
+
if (abortSignal) {
|
|
442
|
+
abortSignal.addEventListener('abort', () => this.abortController.abort(), { once: true });
|
|
443
|
+
}
|
|
444
|
+
|
|
427
445
|
this.monitorMetrics();
|
|
428
446
|
|
|
429
447
|
// this is a hack to imitate asyncio.create_task so that mainTask
|
|
@@ -510,6 +528,10 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
510
528
|
return this.#text;
|
|
511
529
|
}
|
|
512
530
|
|
|
531
|
+
get abortSignal(): AbortSignal {
|
|
532
|
+
return this.abortController.signal;
|
|
533
|
+
}
|
|
534
|
+
|
|
513
535
|
protected async monitorMetrics() {
|
|
514
536
|
const startTime = process.hrtime.bigint();
|
|
515
537
|
let audioDurationMs = 0;
|
|
@@ -564,8 +586,9 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
564
586
|
|
|
565
587
|
/** Close both the input and output of the TTS stream */
|
|
566
588
|
close() {
|
|
567
|
-
this.queue.close();
|
|
568
|
-
this.output.close();
|
|
589
|
+
if (!this.queue.closed) this.queue.close();
|
|
590
|
+
if (!this.output.closed) this.output.close();
|
|
591
|
+
if (!this.abortController.signal.aborted) this.abortController.abort();
|
|
569
592
|
this.closed = true;
|
|
570
593
|
}
|
|
571
594
|
|
package/src/utils.ts
CHANGED
|
@@ -840,6 +840,31 @@ export async function waitForAbort(signal: AbortSignal) {
|
|
|
840
840
|
return await abortFuture.await;
|
|
841
841
|
}
|
|
842
842
|
|
|
843
|
+
/**
|
|
844
|
+
* Combines two abort signals into a single abort signal.
|
|
845
|
+
* @param a - The first abort signal.
|
|
846
|
+
* @param b - The second abort signal.
|
|
847
|
+
* @returns A new abort signal that is aborted when either of the input signals is aborted.
|
|
848
|
+
*/
|
|
849
|
+
export const combineSignals = (a: AbortSignal, b: AbortSignal): AbortSignal => {
|
|
850
|
+
const c = new AbortController();
|
|
851
|
+
const abortFrom = (s: AbortSignal) => {
|
|
852
|
+
if (c.signal.aborted) return;
|
|
853
|
+
c.abort((s as any).reason);
|
|
854
|
+
};
|
|
855
|
+
if (a.aborted) {
|
|
856
|
+
abortFrom(a);
|
|
857
|
+
} else {
|
|
858
|
+
a.addEventListener('abort', () => abortFrom(a), { once: true });
|
|
859
|
+
}
|
|
860
|
+
if (b.aborted) {
|
|
861
|
+
abortFrom(b);
|
|
862
|
+
} else {
|
|
863
|
+
b.addEventListener('abort', () => abortFrom(b), { once: true });
|
|
864
|
+
}
|
|
865
|
+
return c.signal;
|
|
866
|
+
};
|
|
867
|
+
|
|
843
868
|
export const isCloud = (url: URL) => {
|
|
844
869
|
const hostname = url.hostname;
|
|
845
870
|
return hostname.endsWith('.livekit.cloud') || hostname.endsWith('.livekit.run');
|
|
@@ -1449,6 +1449,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1449
1449
|
{ speech_id: speechHandle.id },
|
|
1450
1450
|
'Aborting all pipeline reply tasks due to interruption',
|
|
1451
1451
|
);
|
|
1452
|
+
|
|
1453
|
+
// Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
|
|
1454
|
+
// will correctly (but undesirably) commit a long transcript even though the user said "stop".
|
|
1455
|
+
if (audioOutput) {
|
|
1456
|
+
audioOutput.clearBuffer();
|
|
1457
|
+
}
|
|
1458
|
+
|
|
1452
1459
|
replyAbortController.abort();
|
|
1453
1460
|
await Promise.allSettled(
|
|
1454
1461
|
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
|
|
@@ -1457,7 +1464,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1457
1464
|
let forwardedText = textOut?.text || '';
|
|
1458
1465
|
|
|
1459
1466
|
if (audioOutput) {
|
|
1460
|
-
audioOutput.clearBuffer();
|
|
1461
1467
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1462
1468
|
if (audioOut?.firstFrameFut.done) {
|
|
1463
1469
|
// playback EV is valid only if the first frame was already played
|
|
@@ -313,6 +313,7 @@ export class AgentSession<
|
|
|
313
313
|
ctx = getJobContext();
|
|
314
314
|
} catch (error) {
|
|
315
315
|
// JobContext is not available in evals
|
|
316
|
+
this.logger.warn('JobContext is not available');
|
|
316
317
|
}
|
|
317
318
|
|
|
318
319
|
if (ctx) {
|
|
@@ -393,6 +394,7 @@ export class AgentSession<
|
|
|
393
394
|
}
|
|
394
395
|
} catch (error) {
|
|
395
396
|
// JobContext is not available in evals
|
|
397
|
+
this.logger.warn('JobContext is not available');
|
|
396
398
|
}
|
|
397
399
|
|
|
398
400
|
this.sessionSpan = tracer.startSpan({
|
|
@@ -525,7 +527,10 @@ export class AgentSession<
|
|
|
525
527
|
newAgentId: agent.id,
|
|
526
528
|
}),
|
|
527
529
|
);
|
|
528
|
-
this.logger.debug(
|
|
530
|
+
this.logger.debug(
|
|
531
|
+
{ previousAgentId: previousActivity?.agent.id, newAgentId: agent.id },
|
|
532
|
+
'Agent handoff inserted into chat context',
|
|
533
|
+
);
|
|
529
534
|
|
|
530
535
|
await this.activity.start();
|
|
531
536
|
|
|
@@ -51,7 +51,7 @@ export class DataStreamAudioOutput extends AudioOutput {
|
|
|
51
51
|
#logger = log();
|
|
52
52
|
|
|
53
53
|
constructor(opts: DataStreamAudioOutputOptions) {
|
|
54
|
-
super(opts.sampleRate, undefined);
|
|
54
|
+
super(opts.sampleRate, undefined, { pause: false });
|
|
55
55
|
|
|
56
56
|
const { room, destinationIdentity, sampleRate, waitRemoteTrack } = opts;
|
|
57
57
|
this.room = room;
|