@livekit/agents 1.0.25 → 1.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/dist/connection_pool.cjs +242 -0
  2. package/dist/connection_pool.cjs.map +1 -0
  3. package/dist/connection_pool.d.cts +123 -0
  4. package/dist/connection_pool.d.ts +123 -0
  5. package/dist/connection_pool.d.ts.map +1 -0
  6. package/dist/connection_pool.js +218 -0
  7. package/dist/connection_pool.js.map +1 -0
  8. package/dist/connection_pool.test.cjs +256 -0
  9. package/dist/connection_pool.test.cjs.map +1 -0
  10. package/dist/connection_pool.test.js +255 -0
  11. package/dist/connection_pool.test.js.map +1 -0
  12. package/dist/index.cjs +2 -0
  13. package/dist/index.cjs.map +1 -1
  14. package/dist/index.d.cts +1 -0
  15. package/dist/index.d.ts +1 -0
  16. package/dist/index.d.ts.map +1 -1
  17. package/dist/index.js +1 -0
  18. package/dist/index.js.map +1 -1
  19. package/dist/inference/tts.cjs +172 -58
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +3 -1
  22. package/dist/inference/tts.d.ts +3 -1
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +173 -59
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/tts/stream_adapter.cjs +6 -3
  27. package/dist/tts/stream_adapter.cjs.map +1 -1
  28. package/dist/tts/stream_adapter.d.cts +1 -1
  29. package/dist/tts/stream_adapter.d.ts +1 -1
  30. package/dist/tts/stream_adapter.d.ts.map +1 -1
  31. package/dist/tts/stream_adapter.js +6 -3
  32. package/dist/tts/stream_adapter.js.map +1 -1
  33. package/dist/tts/tts.cjs +26 -15
  34. package/dist/tts/tts.cjs.map +1 -1
  35. package/dist/tts/tts.d.cts +7 -4
  36. package/dist/tts/tts.d.ts +7 -4
  37. package/dist/tts/tts.d.ts.map +1 -1
  38. package/dist/tts/tts.js +26 -15
  39. package/dist/tts/tts.js.map +1 -1
  40. package/dist/utils.cjs +20 -0
  41. package/dist/utils.cjs.map +1 -1
  42. package/dist/utils.d.cts +7 -0
  43. package/dist/utils.d.ts +7 -0
  44. package/dist/utils.d.ts.map +1 -1
  45. package/dist/utils.js +19 -0
  46. package/dist/utils.js.map +1 -1
  47. package/dist/voice/agent_activity.cjs +3 -1
  48. package/dist/voice/agent_activity.cjs.map +1 -1
  49. package/dist/voice/agent_activity.d.ts.map +1 -1
  50. package/dist/voice/agent_activity.js +3 -1
  51. package/dist/voice/agent_activity.js.map +1 -1
  52. package/dist/voice/agent_session.cjs +6 -1
  53. package/dist/voice/agent_session.cjs.map +1 -1
  54. package/dist/voice/agent_session.d.ts.map +1 -1
  55. package/dist/voice/agent_session.js +6 -1
  56. package/dist/voice/agent_session.js.map +1 -1
  57. package/dist/voice/avatar/datastream_io.cjs +1 -1
  58. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  59. package/dist/voice/avatar/datastream_io.js +1 -1
  60. package/dist/voice/avatar/datastream_io.js.map +1 -1
  61. package/dist/voice/background_audio.cjs +77 -37
  62. package/dist/voice/background_audio.cjs.map +1 -1
  63. package/dist/voice/background_audio.d.cts +10 -3
  64. package/dist/voice/background_audio.d.ts +10 -3
  65. package/dist/voice/background_audio.d.ts.map +1 -1
  66. package/dist/voice/background_audio.js +78 -37
  67. package/dist/voice/background_audio.js.map +1 -1
  68. package/dist/voice/index.cjs +1 -0
  69. package/dist/voice/index.cjs.map +1 -1
  70. package/dist/voice/index.d.cts +1 -0
  71. package/dist/voice/index.d.ts +1 -0
  72. package/dist/voice/index.d.ts.map +1 -1
  73. package/dist/voice/index.js +1 -0
  74. package/dist/voice/index.js.map +1 -1
  75. package/dist/voice/io.cjs +10 -1
  76. package/dist/voice/io.cjs.map +1 -1
  77. package/dist/voice/io.d.cts +18 -1
  78. package/dist/voice/io.d.ts +18 -1
  79. package/dist/voice/io.d.ts.map +1 -1
  80. package/dist/voice/io.js +10 -1
  81. package/dist/voice/io.js.map +1 -1
  82. package/dist/voice/recorder_io/recorder_io.cjs +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  84. package/dist/voice/recorder_io/recorder_io.js +1 -1
  85. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  86. package/dist/voice/room_io/_output.cjs +1 -1
  87. package/dist/voice/room_io/_output.cjs.map +1 -1
  88. package/dist/voice/room_io/_output.js +1 -1
  89. package/dist/voice/room_io/_output.js.map +1 -1
  90. package/dist/voice/transcription/synchronizer.cjs +1 -1
  91. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  92. package/dist/voice/transcription/synchronizer.js +1 -1
  93. package/dist/voice/transcription/synchronizer.js.map +1 -1
  94. package/dist/worker.cjs +4 -6
  95. package/dist/worker.cjs.map +1 -1
  96. package/dist/worker.d.ts.map +1 -1
  97. package/dist/worker.js +4 -6
  98. package/dist/worker.js.map +1 -1
  99. package/package.json +3 -3
  100. package/src/connection_pool.test.ts +346 -0
  101. package/src/connection_pool.ts +307 -0
  102. package/src/index.ts +1 -0
  103. package/src/inference/tts.ts +206 -65
  104. package/src/tts/stream_adapter.ts +10 -3
  105. package/src/tts/tts.ts +41 -18
  106. package/src/utils.ts +25 -0
  107. package/src/voice/agent_activity.ts +7 -1
  108. package/src/voice/agent_session.ts +6 -1
  109. package/src/voice/avatar/datastream_io.ts +1 -1
  110. package/src/voice/background_audio.ts +95 -55
  111. package/src/voice/index.ts +1 -0
  112. package/src/voice/io.ts +24 -0
  113. package/src/voice/recorder_io/recorder_io.ts +1 -1
  114. package/src/voice/room_io/_output.ts +1 -1
  115. package/src/voice/transcription/synchronizer.ts +1 -1
  116. package/src/worker.ts +4 -7
package/src/inference/tts.ts CHANGED
@@ -5,13 +5,14 @@ import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { WebSocket } from 'ws';
6
6
  import { APIError, APIStatusError } from '../_exceptions.js';
7
7
  import { AudioByteStream } from '../audio.js';
8
+ import { ConnectionPool } from '../connection_pool.js';
8
9
  import { log } from '../log.js';
9
10
  import { createStreamChannel } from '../stream/stream_channel.js';
10
11
  import { basic as tokenizeBasic } from '../tokenize/index.js';
11
12
  import type { ChunkedStream } from '../tts/index.js';
12
13
  import { SynthesizeStream as BaseSynthesizeStream, TTS as BaseTTS } from '../tts/index.js';
13
14
  import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
14
- import { shortuuid } from '../utils.js';
15
+ import { Event, Future, Task, cancelAndWait, combineSignals, shortuuid } from '../utils.js';
15
16
  import {
16
17
  type TtsClientEvent,
17
18
  type TtsServerEvent,
@@ -95,6 +96,7 @@ export interface InferenceTTSOptions<TModel extends TTSModels> {
95
96
  export class TTS<TModel extends TTSModels> extends BaseTTS {
96
97
  private opts: InferenceTTSOptions<TModel>;
97
98
  private streams: Set<SynthesizeStream<TModel>> = new Set();
99
+ pool: ConnectionPool<WebSocket>;
98
100
 
99
101
  #logger = log();
100
102
 
@@ -165,6 +167,15 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
165
167
  apiSecret: lkApiSecret,
166
168
  modelOptions,
167
169
  };
170
+
171
+ // Initialize connection pool
172
+ this.pool = new ConnectionPool<WebSocket>({
173
+ connectCb: (timeout) => this.connectWs(timeout),
174
+ closeCb: (ws) => this.closeWs(ws),
175
+ maxSessionDuration: 300_000,
176
+ markRefreshedOnGet: true,
177
+ connectTimeout: 10_000, // 10 seconds default
178
+ });
168
179
  }
169
180
 
170
181
  get label() {
@@ -218,6 +229,7 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
218
229
  if (this.opts.model) params.model = this.opts.model;
219
230
  if (this.opts.language) params.language = this.opts.language;
220
231
 
232
+ this.#logger.debug({ url }, 'inference.TTS creating new websocket connection (pool miss)');
221
233
  const socket = await connectWs(url, headers, timeout);
222
234
  socket.send(JSON.stringify(params));
223
235
  return socket;
@@ -227,18 +239,22 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
227
239
  await ws.close();
228
240
  }
229
241
 
242
+ prewarm(): void {
243
+ this.pool.prewarm();
244
+ }
245
+
230
246
  async close() {
231
247
  for (const stream of this.streams) {
232
248
  await stream.close();
233
249
  }
234
250
  this.streams.clear();
251
+ await this.pool.close();
235
252
  }
236
253
  }
237
254
 
238
255
  export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
239
256
  private opts: InferenceTTSOptions<TModel>;
240
257
  private tts: TTS<TModel>;
241
- private connOptions: APIConnectOptions;
242
258
 
243
259
  #logger = log();
244
260
 
@@ -246,7 +262,6 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
246
262
  super(tts, connOptions);
247
263
  this.opts = opts;
248
264
  this.tts = tts;
249
- this.connOptions = connOptions;
250
265
  }
251
266
 
252
267
  get label() {
@@ -258,30 +273,31 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
258
273
  }
259
274
 
260
275
  protected async run(): Promise<void> {
261
- let ws: WebSocket | null = null;
262
276
  let closing = false;
263
- let finalReceived = false;
264
277
  let lastFrame: AudioFrame | undefined;
265
278
 
266
279
  const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
267
280
  const eventChannel = createStreamChannel<TtsServerEvent>();
268
281
  const requestId = shortuuid('tts_request_');
282
+ const inputSentEvent = new Event();
283
+
284
+ // Signal for protocol-driven completion (when 'done' message is received)
285
+ const completionFuture = new Future<void>();
269
286
 
270
- const resourceCleanup = () => {
287
+ const resourceCleanup = async () => {
271
288
  if (closing) return;
272
289
  closing = true;
273
290
  sendTokenizerStream.close();
274
- eventChannel.close();
275
- ws?.removeAllListeners();
276
- ws?.close();
291
+ // close() returns a promise; don't leak it
292
+ await eventChannel.close();
277
293
  };
278
294
 
279
- const sendClientEvent = async (event: TtsClientEvent) => {
295
+ const sendClientEvent = async (event: TtsClientEvent, ws: WebSocket, signal: AbortSignal) => {
280
296
  // Don't send events to a closed WebSocket or aborted controller
281
- if (this.abortController.signal.aborted || closing) return;
297
+ if (signal.aborted || closing) return;
282
298
 
283
299
  const validatedEvent = await ttsClientEventSchema.parseAsync(event);
284
- if (!ws || ws.readyState !== WebSocket.OPEN) {
300
+ if (ws.readyState !== WebSocket.OPEN) {
285
301
  this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
286
302
  return;
287
303
  }
@@ -295,9 +311,9 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
295
311
  }
296
312
  };
297
313
 
298
- const createInputTask = async () => {
314
+ const createInputTask = async (signal: AbortSignal) => {
299
315
  for await (const data of this.input) {
300
- if (this.abortController.signal.aborted || closing) break;
316
+ if (signal.aborted || closing) break;
301
317
  if (data === SynthesizeStream.FLUSH_SENTINEL) {
302
318
  sendTokenizerStream.flush();
303
319
  continue;
@@ -310,55 +326,108 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
310
326
  }
311
327
  };
312
328
 
313
- const createSentenceStreamTask = async () => {
329
+ const createSentenceStreamTask = async (ws: WebSocket, signal: AbortSignal) => {
314
330
  for await (const ev of sendTokenizerStream) {
315
- if (this.abortController.signal.aborted) break;
316
-
317
- sendClientEvent({
318
- type: 'input_transcript',
319
- transcript: ev.token + ' ',
320
- });
331
+ if (signal.aborted || closing) break;
332
+
333
+ await sendClientEvent(
334
+ {
335
+ type: 'input_transcript',
336
+ transcript: ev.token + ' ',
337
+ },
338
+ ws,
339
+ signal,
340
+ );
341
+ inputSentEvent.set();
321
342
  }
322
343
 
323
- sendClientEvent({ type: 'session.flush' });
344
+ await sendClientEvent({ type: 'session.flush' }, ws, signal);
345
+ // needed in case empty input is sent
346
+ inputSentEvent.set();
324
347
  };
325
348
 
326
- const createWsListenerTask = async (ws: WebSocket) => {
327
- return new Promise<void>((resolve, reject) => {
328
- this.abortController.signal.addEventListener('abort', () => {
329
- resourceCleanup();
330
- resolve(); // Abort is triggered by close(), which is a normal shutdown, not an error
331
- });
332
-
333
- ws.on('message', async (data) => {
349
+ // Handles WebSocket message routing and error handling
350
+ // Completes based on protocol messages, NOT on ws.close()
351
+ const createWsListenerTask = async (ws: WebSocket, signal: AbortSignal) => {
352
+ const onMessage = (data: Buffer) => {
353
+ try {
334
354
  const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
335
355
  const validatedEvent = ttsServerEventSchema.parse(eventJson);
336
- eventChannel.write(validatedEvent);
337
- });
338
-
339
- ws.on('error', (e) => {
340
- this.#logger.error({ error: e }, 'WebSocket error');
341
- resourceCleanup();
342
- reject(e);
343
- });
344
-
345
- ws.on('close', () => {
346
- resourceCleanup();
347
-
348
- if (!closing) return this.#logger.error('WebSocket closed unexpectedly');
349
- if (finalReceived) return resolve();
356
+ // writer.write returns a promise; avoid unhandled rejections if stream is closed
357
+ void eventChannel.write(validatedEvent).catch((error) => {
358
+ this.#logger.debug(
359
+ { error },
360
+ 'Failed writing TTS event to stream channel (likely closed)',
361
+ );
362
+ });
363
+ } catch (e) {
364
+ this.#logger.error({ error: e }, 'Error parsing WebSocket message');
365
+ }
366
+ };
350
367
 
351
- reject(
368
+ const onError = (e: Error) => {
369
+ this.#logger.error({ error: e }, 'WebSocket error');
370
+ void resourceCleanup();
371
+ try {
372
+ // If the ws is misbehaving, hard-stop it immediately to avoid buffering.
373
+ ws.terminate?.();
374
+ } catch {
375
+ // ignore
376
+ }
377
+ // Ensure this ws is not reused
378
+ this.tts.pool.remove(ws);
379
+ completionFuture.reject(e);
380
+ };
381
+
382
+ const onClose = () => {
383
+ // WebSocket closed unexpectedly (not by us)
384
+ if (!closing) {
385
+ this.#logger.error('WebSocket closed unexpectedly');
386
+ void resourceCleanup();
387
+ // Ensure this ws is not reused
388
+ this.tts.pool.remove(ws);
389
+ completionFuture.reject(
352
390
  new APIStatusError({
353
391
  message: 'Gateway connection closed unexpectedly',
354
392
  options: { requestId },
355
393
  }),
356
394
  );
357
- });
358
- });
395
+ }
396
+ };
397
+
398
+ const onAbort = () => {
399
+ void resourceCleanup();
400
+ try {
401
+ // On interruption/abort, close the websocket immediately so the server stops streaming
402
+ // and the ws library doesn't buffer unread frames in memory.
403
+ ws.terminate?.();
404
+ } catch {
405
+ // ignore
406
+ }
407
+ this.tts.pool.remove(ws);
408
+ inputSentEvent.set();
409
+ completionFuture.resolve();
410
+ };
411
+
412
+ // Attach listeners
413
+ ws.on('message', onMessage);
414
+ ws.on('error', onError);
415
+ ws.on('close', onClose);
416
+ signal.addEventListener('abort', onAbort);
417
+
418
+ try {
419
+ // Wait for protocol-driven completion or error
420
+ await completionFuture.await;
421
+ } finally {
422
+ // IMPORTANT: Remove listeners so connection can be reused
423
+ ws.off('message', onMessage);
424
+ ws.off('error', onError);
425
+ ws.off('close', onClose);
426
+ signal.removeEventListener('abort', onAbort);
427
+ }
359
428
  };
360
429
 
361
- const createRecvTask = async () => {
430
+ const createRecvTask = async (signal: AbortSignal) => {
362
431
  let currentSessionId: string | null = null;
363
432
 
364
433
  const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
@@ -366,9 +435,11 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
366
435
  const reader = serverEventStream.getReader();
367
436
 
368
437
  try {
369
- while (!this.closed && !this.abortController.signal.aborted) {
438
+ await inputSentEvent.wait();
439
+
440
+ while (!this.closed && !signal.aborted) {
370
441
  const result = await reader.read();
371
- if (this.abortController.signal.aborted) return;
442
+ if (signal.aborted) return;
372
443
  if (result.done) return;
373
444
 
374
445
  const serverEvent = result.value;
@@ -384,24 +455,29 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
384
455
  }
385
456
  break;
386
457
  case 'done':
387
- finalReceived = true;
388
458
  for (const frame of bstream.flush()) {
389
459
  sendLastFrame(currentSessionId!, false);
390
460
  lastFrame = frame;
391
461
  }
392
462
  sendLastFrame(currentSessionId!, true);
393
463
  this.queue.put(SynthesizeStream.END_OF_STREAM);
394
- break;
464
+ await resourceCleanup();
465
+ completionFuture.resolve();
466
+ return;
395
467
  case 'session.closed':
396
- resourceCleanup();
397
- break;
468
+ await resourceCleanup();
469
+ completionFuture.resolve();
470
+ return;
398
471
  case 'error':
399
472
  this.#logger.error(
400
473
  { serverEvent },
401
474
  'Received error message from LiveKit TTS WebSocket',
402
475
  );
403
- resourceCleanup();
404
- throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
476
+ await resourceCleanup();
477
+ completionFuture.reject(
478
+ new APIError(`LiveKit TTS returned error: ${serverEvent.message}`),
479
+ );
480
+ return;
405
481
  default:
406
482
  this.#logger.warn('Unexpected message %s', serverEvent);
407
483
  break;
@@ -418,16 +494,81 @@ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeSt
418
494
  };
419
495
 
420
496
  try {
421
- ws = await this.tts.connectWs(this.connOptions.timeoutMs);
422
-
423
- await Promise.all([
424
- createInputTask(),
425
- createSentenceStreamTask(),
426
- createWsListenerTask(ws),
427
- createRecvTask(),
428
- ]);
497
+ await this.tts.pool.withConnection(
498
+ async (ws: WebSocket) => {
499
+ try {
500
+ // IMPORTANT: don't cancel the stream's controller on normal completion,
501
+ // otherwise the pool will remove+close the ws and every run becomes a pool miss.
502
+ const runController = new AbortController();
503
+ const onStreamAbort = () => runController.abort(this.abortController.signal.reason);
504
+ this.abortController.signal.addEventListener('abort', onStreamAbort, { once: true });
505
+
506
+ const tasks = [
507
+ Task.from(
508
+ async (controller) => {
509
+ const combined = combineSignals(runController.signal, controller.signal);
510
+ await createInputTask(combined);
511
+ },
512
+ undefined,
513
+ 'inference-tts-input',
514
+ ),
515
+ Task.from(
516
+ async (controller) => {
517
+ const combined = combineSignals(runController.signal, controller.signal);
518
+ await createSentenceStreamTask(ws, combined);
519
+ },
520
+ undefined,
521
+ 'inference-tts-sentence',
522
+ ),
523
+ Task.from(
524
+ async (controller) => {
525
+ const combined = combineSignals(runController.signal, controller.signal);
526
+ await createWsListenerTask(ws, combined);
527
+ },
528
+ undefined,
529
+ 'inference-tts-ws-listener',
530
+ ),
531
+ Task.from(
532
+ async (controller) => {
533
+ const combined = combineSignals(runController.signal, controller.signal);
534
+ await createRecvTask(combined);
535
+ },
536
+ undefined,
537
+ 'inference-tts-recv',
538
+ ),
539
+ ];
540
+
541
+ try {
542
+ await Promise.all(tasks.map((t) => t.result));
543
+ } finally {
544
+ // Mirror python finally: unblock recv and cancel all tasks.
545
+ inputSentEvent.set();
546
+ await resourceCleanup();
547
+ await cancelAndWait(tasks, 5000);
548
+ this.abortController.signal.removeEventListener('abort', onStreamAbort);
549
+ }
550
+ } catch (e) {
551
+ // If aborted, don't throw - let cleanup handle it
552
+ if (e instanceof Error && e.name === 'AbortError') {
553
+ return;
554
+ }
555
+ throw e;
556
+ }
557
+ },
558
+ {
559
+ timeout: this.connOptions.timeoutMs,
560
+ },
561
+ );
562
+ } catch (e) {
563
+ // Handle connection errors
564
+ if (e instanceof Error && e.name === 'AbortError') {
565
+ // Abort is expected during normal shutdown
566
+ return;
567
+ }
568
+ throw e;
429
569
  } finally {
430
- resourceCleanup();
570
+ // Ensure cleanup always runs (and don't leak the promise)
571
+ await resourceCleanup();
431
572
  }
432
573
  }
433
574
  }
package/src/tts/stream_adapter.ts CHANGED
@@ -22,10 +22,17 @@ export class StreamAdapter extends TTS {
22
22
  this.#tts.on('metrics_collected', (metrics) => {
23
23
  this.emit('metrics_collected', metrics);
24
24
  });
25
+ this.#tts.on('error', (error) => {
26
+ this.emit('error', error);
27
+ });
25
28
  }
26
29
 
27
- synthesize(text: string): ChunkedStream {
28
- return this.#tts.synthesize(text);
30
+ synthesize(
31
+ text: string,
32
+ connOptions?: APIConnectOptions,
33
+ abortSignal?: AbortSignal,
34
+ ): ChunkedStream {
35
+ return this.#tts.synthesize(text, connOptions, abortSignal);
29
36
  }
30
37
 
31
38
  stream(options?: { connOptions?: APIConnectOptions }): StreamAdapterWrapper {
@@ -85,7 +92,7 @@ export class StreamAdapterWrapper extends SynthesizeStream {
85
92
  prevTask: Task<void> | undefined,
86
93
  controller: AbortController,
87
94
  ) => {
88
- const audioStream = this.#tts.synthesize(token);
95
+ const audioStream = this.#tts.synthesize(token, this.connOptions, this.abortSignal);
89
96
 
90
97
  // wait for previous audio transcription to complete before starting
91
98
  // to queuing audio frames of the current token
package/src/tts/tts.ts CHANGED
@@ -90,7 +90,11 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCal
90
90
  /**
91
91
  * Receives text and returns synthesis in the form of a {@link ChunkedStream}
92
92
  */
93
- abstract synthesize(text: string): ChunkedStream;
93
+ abstract synthesize(
94
+ text: string,
95
+ connOptions?: APIConnectOptions,
96
+ abortSignal?: AbortSignal,
97
+ ): ChunkedStream;
94
98
 
95
99
  /**
96
100
  * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data
@@ -131,30 +135,33 @@ export abstract class SynthesizeStream
131
135
  SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
132
136
  >();
133
137
  protected closed = false;
134
- abstract label: string;
135
- #tts: TTS;
136
- #metricsPendingTexts: string[] = [];
137
- #metricsText = '';
138
- #monitorMetricsTask?: Promise<void>;
139
- private _connOptions: APIConnectOptions;
138
+ protected connOptions: APIConnectOptions;
140
139
  protected abortController = new AbortController();
141
- #ttsRequestSpan?: Span;
142
140
 
143
141
  private deferredInputStream: DeferredReadableStream<
144
142
  string | typeof SynthesizeStream.FLUSH_SENTINEL
145
143
  >;
146
144
  private logger = log();
147
145
 
146
+ abstract label: string;
147
+
148
+ #tts: TTS;
149
+ #metricsPendingTexts: string[] = [];
150
+ #metricsText = '';
151
+ #monitorMetricsTask?: Promise<void>;
152
+ #ttsRequestSpan?: Span;
153
+
148
154
  constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) {
149
155
  this.#tts = tts;
150
- this._connOptions = connOptions;
156
+ this.connOptions = connOptions;
151
157
  this.deferredInputStream = new DeferredReadableStream();
152
158
  this.pumpInput();
159
+
153
160
  this.abortController.signal.addEventListener('abort', () => {
154
161
  this.deferredInputStream.detachSource();
155
162
  // TODO (AJS-36) clean this up when we refactor with streams
156
- this.input.close();
157
- this.output.close();
163
+ if (!this.input.closed) this.input.close();
164
+ if (!this.output.closed) this.output.close();
158
165
  this.closed = true;
159
166
  });
160
167
 
@@ -172,7 +179,7 @@ export abstract class SynthesizeStream
172
179
  [traceTypes.ATTR_TTS_LABEL]: this.#tts.label,
173
180
  });
174
181
 
175
- for (let i = 0; i < this._connOptions.maxRetry + 1; i++) {
182
+ for (let i = 0; i < this.connOptions.maxRetry + 1; i++) {
176
183
  try {
177
184
  return await tracer.startActiveSpan(
178
185
  async (attemptSpan) => {
@@ -188,15 +195,15 @@ export abstract class SynthesizeStream
188
195
  );
189
196
  } catch (error) {
190
197
  if (error instanceof APIError) {
191
- const retryInterval = intervalForRetry(this._connOptions, i);
198
+ const retryInterval = intervalForRetry(this.connOptions, i);
192
199
 
193
- if (this._connOptions.maxRetry === 0 || !error.retryable) {
200
+ if (this.connOptions.maxRetry === 0 || !error.retryable) {
194
201
  this.emitError({ error, recoverable: false });
195
202
  throw error;
196
- } else if (i === this._connOptions.maxRetry) {
203
+ } else if (i === this.connOptions.maxRetry) {
197
204
  this.emitError({ error, recoverable: false });
198
205
  throw new APIConnectionError({
199
- message: `failed to generate TTS completion after ${this._connOptions.maxRetry + 1} attempts`,
206
+ message: `failed to generate TTS completion after ${this.connOptions.maxRetry + 1} attempts`,
200
207
  options: { retryable: false },
201
208
  });
202
209
  } else {
@@ -380,6 +387,10 @@ export abstract class SynthesizeStream
380
387
  return this.output.next();
381
388
  }
382
389
 
390
+ get abortSignal(): AbortSignal {
391
+ return this.abortController.signal;
392
+ }
393
+
383
394
  /** Close both the input and output of the TTS stream */
384
395
  close() {
385
396
  this.abortController.abort();
@@ -415,15 +426,22 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
415
426
  private _connOptions: APIConnectOptions;
416
427
  private logger = log();
417
428
 
429
+ protected abortController = new AbortController();
430
+
418
431
  constructor(
419
432
  text: string,
420
433
  tts: TTS,
421
434
  connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
435
+ abortSignal?: AbortSignal,
422
436
  ) {
423
437
  this.#text = text;
424
438
  this.#tts = tts;
425
439
  this._connOptions = connOptions;
426
440
 
441
+ if (abortSignal) {
442
+ abortSignal.addEventListener('abort', () => this.abortController.abort(), { once: true });
443
+ }
444
+
427
445
  this.monitorMetrics();
428
446
 
429
447
  // this is a hack to immitate asyncio.create_task so that mainTask
@@ -510,6 +528,10 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
510
528
  return this.#text;
511
529
  }
512
530
 
531
+ get abortSignal(): AbortSignal {
532
+ return this.abortController.signal;
533
+ }
534
+
513
535
  protected async monitorMetrics() {
514
536
  const startTime = process.hrtime.bigint();
515
537
  let audioDurationMs = 0;
@@ -564,8 +586,9 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
564
586
 
565
587
  /** Close both the input and output of the TTS stream */
566
588
  close() {
567
- this.queue.close();
568
- this.output.close();
589
+ if (!this.queue.closed) this.queue.close();
590
+ if (!this.output.closed) this.output.close();
591
+ if (!this.abortController.signal.aborted) this.abortController.abort();
569
592
  this.closed = true;
570
593
  }
571
594
 
package/src/utils.ts CHANGED
@@ -840,6 +840,31 @@ export async function waitForAbort(signal: AbortSignal) {
840
840
  return await abortFuture.await;
841
841
  }
842
842
 
843
+ /**
844
+ * Combines two abort signals into a single abort signal.
845
+ * @param a - The first abort signal.
846
+ * @param b - The second abort signal.
847
+ * @returns A new abort signal that is aborted when either of the input signals is aborted.
848
+ */
849
+ export const combineSignals = (a: AbortSignal, b: AbortSignal): AbortSignal => {
850
+ const c = new AbortController();
851
+ const abortFrom = (s: AbortSignal) => {
852
+ if (c.signal.aborted) return;
853
+ c.abort((s as any).reason);
854
+ };
855
+ if (a.aborted) {
856
+ abortFrom(a);
857
+ } else {
858
+ a.addEventListener('abort', () => abortFrom(a), { once: true });
859
+ }
860
+ if (b.aborted) {
861
+ abortFrom(b);
862
+ } else {
863
+ b.addEventListener('abort', () => abortFrom(b), { once: true });
864
+ }
865
+ return c.signal;
866
+ };
867
+
843
868
  export const isCloud = (url: URL) => {
844
869
  const hostname = url.hostname;
845
870
  return hostname.endsWith('.livekit.cloud') || hostname.endsWith('.livekit.run');
package/src/voice/agent_activity.ts CHANGED
@@ -1449,6 +1449,13 @@ export class AgentActivity implements RecognitionHooks {
1449
1449
  { speech_id: speechHandle.id },
1450
1450
  'Aborting all pipeline reply tasks due to interruption',
1451
1451
  );
1452
+
1453
+ // Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
1454
+ // will correctly (but undesirably) commit a long transcript even though the user said "stop".
1455
+ if (audioOutput) {
1456
+ audioOutput.clearBuffer();
1457
+ }
1458
+
1452
1459
  replyAbortController.abort();
1453
1460
  await Promise.allSettled(
1454
1461
  tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
@@ -1457,7 +1464,6 @@ export class AgentActivity implements RecognitionHooks {
1457
1464
  let forwardedText = textOut?.text || '';
1458
1465
 
1459
1466
  if (audioOutput) {
1460
- audioOutput.clearBuffer();
1461
1467
  const playbackEv = await audioOutput.waitForPlayout();
1462
1468
  if (audioOut?.firstFrameFut.done) {
1463
1469
  // playback EV is valid only if the first frame was already played
package/src/voice/agent_session.ts CHANGED
@@ -313,6 +313,7 @@ export class AgentSession<
313
313
  ctx = getJobContext();
314
314
  } catch (error) {
315
315
  // JobContext is not available in evals
316
+ this.logger.warn('JobContext is not available');
316
317
  }
317
318
 
318
319
  if (ctx) {
@@ -393,6 +394,7 @@ export class AgentSession<
393
394
  }
394
395
  } catch (error) {
395
396
  // JobContext is not available in evals
397
+ this.logger.warn('JobContext is not available');
396
398
  }
397
399
 
398
400
  this.sessionSpan = tracer.startSpan({
@@ -525,7 +527,10 @@ export class AgentSession<
525
527
  newAgentId: agent.id,
526
528
  }),
527
529
  );
528
- this.logger.debug({ previousActivity, agent }, 'Agent handoff inserted into chat context');
530
+ this.logger.debug(
531
+ { previousAgentId: previousActivity?.agent.id, newAgentId: agent.id },
532
+ 'Agent handoff inserted into chat context',
533
+ );
529
534
 
530
535
  await this.activity.start();
531
536
 
package/src/voice/avatar/datastream_io.ts CHANGED
@@ -51,7 +51,7 @@ export class DataStreamAudioOutput extends AudioOutput {
51
51
  #logger = log();
52
52
 
53
53
  constructor(opts: DataStreamAudioOutputOptions) {
54
- super(opts.sampleRate, undefined);
54
+ super(opts.sampleRate, undefined, { pause: false });
55
55
 
56
56
  const { room, destinationIdentity, sampleRate, waitRemoteTrack } = opts;
57
57
  this.room = room;