@livekit/agents 0.6.4 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/dist/cli.cjs +8 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +8 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/index.cjs +6 -1
  7. package/dist/index.cjs.map +1 -1
  8. package/dist/index.d.ts +3 -1
  9. package/dist/index.d.ts.map +1 -1
  10. package/dist/index.js +3 -0
  11. package/dist/index.js.map +1 -1
  12. package/dist/inference_runner.cjs +38 -0
  13. package/dist/inference_runner.cjs.map +1 -0
  14. package/dist/inference_runner.d.ts +11 -0
  15. package/dist/inference_runner.d.ts.map +1 -0
  16. package/dist/inference_runner.js +14 -0
  17. package/dist/inference_runner.js.map +1 -0
  18. package/dist/ipc/index.cjs +23 -0
  19. package/dist/ipc/index.cjs.map +1 -0
  20. package/dist/ipc/index.d.ts +2 -0
  21. package/dist/ipc/index.d.ts.map +1 -0
  22. package/dist/ipc/index.js +2 -0
  23. package/dist/ipc/index.js.map +1 -0
  24. package/dist/ipc/inference_executor.cjs +17 -0
  25. package/dist/ipc/inference_executor.cjs.map +1 -0
  26. package/dist/ipc/inference_executor.d.ts +4 -0
  27. package/dist/ipc/inference_executor.d.ts.map +1 -0
  28. package/dist/ipc/inference_executor.js +1 -0
  29. package/dist/ipc/inference_executor.js.map +1 -0
  30. package/dist/ipc/inference_proc_executor.cjs +97 -0
  31. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  32. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  33. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  34. package/dist/ipc/inference_proc_executor.js +72 -0
  35. package/dist/ipc/inference_proc_executor.js.map +1 -0
  36. package/dist/ipc/inference_proc_lazy_main.cjs +92 -0
  37. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  38. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  39. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  40. package/dist/ipc/inference_proc_lazy_main.js +69 -0
  41. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  42. package/dist/ipc/job_executor.cjs +8 -7
  43. package/dist/ipc/job_executor.cjs.map +1 -1
  44. package/dist/ipc/job_executor.d.ts +14 -15
  45. package/dist/ipc/job_executor.d.ts.map +1 -1
  46. package/dist/ipc/job_executor.js +7 -6
  47. package/dist/ipc/job_executor.js.map +1 -1
  48. package/dist/ipc/job_proc_executor.cjs +108 -0
  49. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  50. package/dist/ipc/job_proc_executor.d.ts +19 -0
  51. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  52. package/dist/ipc/job_proc_executor.js +83 -0
  53. package/dist/ipc/job_proc_executor.js.map +1 -0
  54. package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +46 -36
  55. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  56. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  57. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  58. package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +46 -11
  59. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  60. package/dist/ipc/message.cjs.map +1 -1
  61. package/dist/ipc/message.d.ts +17 -0
  62. package/dist/ipc/message.d.ts.map +1 -1
  63. package/dist/ipc/proc_pool.cjs +30 -4
  64. package/dist/ipc/proc_pool.cjs.map +1 -1
  65. package/dist/ipc/proc_pool.d.ts +5 -1
  66. package/dist/ipc/proc_pool.d.ts.map +1 -1
  67. package/dist/ipc/proc_pool.js +30 -4
  68. package/dist/ipc/proc_pool.js.map +1 -1
  69. package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +57 -45
  70. package/dist/ipc/supervised_proc.cjs.map +1 -0
  71. package/dist/ipc/supervised_proc.d.ts +30 -0
  72. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  73. package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +53 -31
  74. package/dist/ipc/supervised_proc.js.map +1 -0
  75. package/dist/job.cjs +18 -1
  76. package/dist/job.cjs.map +1 -1
  77. package/dist/job.d.ts +9 -1
  78. package/dist/job.d.ts.map +1 -1
  79. package/dist/job.js +17 -1
  80. package/dist/job.js.map +1 -1
  81. package/dist/multimodal/agent_playout.cjs +18 -16
  82. package/dist/multimodal/agent_playout.cjs.map +1 -1
  83. package/dist/multimodal/agent_playout.d.ts +4 -4
  84. package/dist/multimodal/agent_playout.d.ts.map +1 -1
  85. package/dist/multimodal/agent_playout.js +18 -16
  86. package/dist/multimodal/agent_playout.js.map +1 -1
  87. package/dist/multimodal/multimodal_agent.cjs +12 -8
  88. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  89. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  90. package/dist/multimodal/multimodal_agent.js +13 -9
  91. package/dist/multimodal/multimodal_agent.js.map +1 -1
  92. package/dist/pipeline/agent_output.cjs +22 -4
  93. package/dist/pipeline/agent_output.cjs.map +1 -1
  94. package/dist/pipeline/agent_output.d.ts +4 -2
  95. package/dist/pipeline/agent_output.d.ts.map +1 -1
  96. package/dist/pipeline/agent_output.js +22 -4
  97. package/dist/pipeline/agent_output.js.map +1 -1
  98. package/dist/pipeline/agent_playout.cjs +9 -3
  99. package/dist/pipeline/agent_playout.cjs.map +1 -1
  100. package/dist/pipeline/agent_playout.d.ts +4 -2
  101. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  102. package/dist/pipeline/agent_playout.js +9 -3
  103. package/dist/pipeline/agent_playout.js.map +1 -1
  104. package/dist/pipeline/human_input.cjs +6 -0
  105. package/dist/pipeline/human_input.cjs.map +1 -1
  106. package/dist/pipeline/human_input.d.ts +3 -1
  107. package/dist/pipeline/human_input.d.ts.map +1 -1
  108. package/dist/pipeline/human_input.js +6 -0
  109. package/dist/pipeline/human_input.js.map +1 -1
  110. package/dist/pipeline/pipeline_agent.cjs +79 -12
  111. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  112. package/dist/pipeline/pipeline_agent.d.ts +8 -0
  113. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  114. package/dist/pipeline/pipeline_agent.js +79 -12
  115. package/dist/pipeline/pipeline_agent.js.map +1 -1
  116. package/dist/stt/stream_adapter.cjs +16 -4
  117. package/dist/stt/stream_adapter.cjs.map +1 -1
  118. package/dist/stt/stream_adapter.d.ts.map +1 -1
  119. package/dist/stt/stream_adapter.js +16 -4
  120. package/dist/stt/stream_adapter.js.map +1 -1
  121. package/dist/tokenize/basic/basic.cjs +2 -0
  122. package/dist/tokenize/basic/basic.cjs.map +1 -1
  123. package/dist/tokenize/basic/basic.d.ts +2 -0
  124. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  125. package/dist/tokenize/basic/basic.js +1 -0
  126. package/dist/tokenize/basic/basic.js.map +1 -1
  127. package/dist/tokenize/basic/index.cjs +2 -0
  128. package/dist/tokenize/basic/index.cjs.map +1 -1
  129. package/dist/tokenize/basic/index.d.ts +1 -1
  130. package/dist/tokenize/basic/index.d.ts.map +1 -1
  131. package/dist/tokenize/basic/index.js +8 -1
  132. package/dist/tokenize/basic/index.js.map +1 -1
  133. package/dist/tokenize/token_stream.cjs +5 -3
  134. package/dist/tokenize/token_stream.cjs.map +1 -1
  135. package/dist/tokenize/token_stream.d.ts.map +1 -1
  136. package/dist/tokenize/token_stream.js +5 -3
  137. package/dist/tokenize/token_stream.js.map +1 -1
  138. package/dist/transcription.cjs +203 -86
  139. package/dist/transcription.cjs.map +1 -1
  140. package/dist/transcription.d.ts +24 -17
  141. package/dist/transcription.d.ts.map +1 -1
  142. package/dist/transcription.js +201 -85
  143. package/dist/transcription.js.map +1 -1
  144. package/dist/worker.cjs +42 -9
  145. package/dist/worker.cjs.map +1 -1
  146. package/dist/worker.d.ts +5 -1
  147. package/dist/worker.d.ts.map +1 -1
  148. package/dist/worker.js +42 -9
  149. package/dist/worker.js.map +1 -1
  150. package/package.json +3 -3
  151. package/src/cli.ts +9 -0
  152. package/src/index.ts +3 -1
  153. package/src/inference_runner.ts +19 -0
  154. package/src/ipc/index.ts +5 -0
  155. package/src/ipc/inference_executor.ts +7 -0
  156. package/src/ipc/inference_proc_executor.ts +93 -0
  157. package/src/ipc/inference_proc_lazy_main.ts +90 -0
  158. package/src/ipc/job_executor.ts +15 -17
  159. package/src/ipc/job_proc_executor.ts +112 -0
  160. package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +52 -14
  161. package/src/ipc/message.ts +14 -1
  162. package/src/ipc/proc_pool.ts +33 -3
  163. package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +77 -29
  164. package/src/job.ts +21 -0
  165. package/src/multimodal/agent_playout.ts +19 -18
  166. package/src/multimodal/multimodal_agent.ts +13 -9
  167. package/src/pipeline/agent_output.ts +36 -5
  168. package/src/pipeline/agent_playout.ts +10 -1
  169. package/src/pipeline/human_input.ts +8 -0
  170. package/src/pipeline/pipeline_agent.ts +96 -11
  171. package/src/stt/stream_adapter.ts +17 -5
  172. package/src/tokenize/basic/basic.ts +2 -0
  173. package/src/tokenize/basic/index.ts +7 -1
  174. package/src/tokenize/token_stream.ts +6 -3
  175. package/src/transcription.ts +270 -96
  176. package/src/worker.ts +42 -5
  177. package/dist/ipc/job_main.cjs.map +0 -1
  178. package/dist/ipc/job_main.d.ts +0 -8
  179. package/dist/ipc/job_main.d.ts.map +0 -1
  180. package/dist/ipc/job_main.js.map +0 -1
  181. package/dist/ipc/proc_job_executor.cjs.map +0 -1
  182. package/dist/ipc/proc_job_executor.d.ts +0 -15
  183. package/dist/ipc/proc_job_executor.d.ts.map +0 -1
  184. package/dist/ipc/proc_job_executor.js.map +0 -1
@@ -5,7 +5,7 @@ import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { type AudioSource } from '@livekit/rtc-node';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import { AudioByteStream } from '../audio.js';
8
- import type { TranscriptionForwarder } from '../transcription.js';
8
+ import type { TextAudioSynchronizer } from '../transcription.js';
9
9
  import { type AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
10
10
 
11
11
  export const proto = {};
@@ -16,7 +16,7 @@ export class PlayoutHandle extends EventEmitter {
16
16
  #itemId: string;
17
17
  #contentIndex: number;
18
18
  /** @internal */
19
- transcriptionFwd: TranscriptionForwarder;
19
+ synchronizer: TextAudioSynchronizer;
20
20
  /** @internal */
21
21
  doneFut: Future;
22
22
  /** @internal */
@@ -33,14 +33,14 @@ export class PlayoutHandle extends EventEmitter {
33
33
  sampleRate: number,
34
34
  itemId: string,
35
35
  contentIndex: number,
36
- transcriptionFwd: TranscriptionForwarder,
36
+ synchronizer: TextAudioSynchronizer,
37
37
  ) {
38
38
  super();
39
39
  this.#audioSource = audioSource;
40
40
  this.#sampleRate = sampleRate;
41
41
  this.#itemId = itemId;
42
42
  this.#contentIndex = contentIndex;
43
- this.transcriptionFwd = transcriptionFwd;
43
+ this.synchronizer = synchronizer;
44
44
  this.doneFut = new Future();
45
45
  this.intFut = new Future();
46
46
  this.#interrupted = false;
@@ -57,13 +57,16 @@ export class PlayoutHandle extends EventEmitter {
57
57
  return Math.floor(this.totalPlayedTime * this.#sampleRate);
58
58
  }
59
59
 
60
- return Math.floor(
61
- (this.pushedDuration - this.#audioSource.queuedDuration) * (this.#sampleRate / 1000),
60
+ return Math.max(
61
+ 0,
62
+ Math.floor(
63
+ (this.pushedDuration - this.#audioSource.queuedDuration) * (this.#sampleRate / 1000),
64
+ ),
62
65
  );
63
66
  }
64
67
 
65
68
  get textChars(): number {
66
- return this.transcriptionFwd.currentCharacterIndex;
69
+ return this.synchronizer.playedText.length;
67
70
  }
68
71
 
69
72
  get contentIndex(): number {
@@ -111,7 +114,7 @@ export class AgentPlayout extends EventEmitter {
111
114
  play(
112
115
  itemId: string,
113
116
  contentIndex: number,
114
- transcriptionFwd: TranscriptionForwarder,
117
+ synchronizer: TextAudioSynchronizer,
115
118
  textStream: AsyncIterableQueue<string>,
116
119
  audioStream: AsyncIterableQueue<AudioFrame>,
117
120
  ): PlayoutHandle {
@@ -120,7 +123,7 @@ export class AgentPlayout extends EventEmitter {
120
123
  this.#sampleRate,
121
124
  itemId,
122
125
  contentIndex,
123
- transcriptionFwd,
126
+ synchronizer,
124
127
  );
125
128
  this.#playoutTask = this.#makePlayoutTask(this.#playoutTask, handle, textStream, audioStream);
126
129
  return handle;
@@ -159,8 +162,9 @@ export class AgentPlayout extends EventEmitter {
159
162
  if (cancelledText || cancelled) {
160
163
  break;
161
164
  }
162
- handle.transcriptionFwd.pushText(text);
165
+ handle.synchronizer.pushText(text);
163
166
  }
167
+ handle.synchronizer.markTextSegmentEnd();
164
168
  resolveText();
165
169
  } catch (error) {
166
170
  rejectText(error);
@@ -189,12 +193,12 @@ export class AgentPlayout extends EventEmitter {
189
193
  break;
190
194
  }
191
195
  if (firstFrame) {
192
- handle.transcriptionFwd.start();
196
+ handle.synchronizer.segmentPlayoutStarted();
193
197
  this.emit('playout_started');
194
198
  firstFrame = false;
195
199
  }
196
200
 
197
- handle.transcriptionFwd.pushAudio(frame);
201
+ handle.synchronizer.pushAudio(frame);
198
202
 
199
203
  for (const f of bstream.write(frame.data.buffer)) {
200
204
  handle.pushedDuration += (f.samplesPerChannel / f.sampleRate) * 1000;
@@ -208,7 +212,7 @@ export class AgentPlayout extends EventEmitter {
208
212
  await this.#audioSource.captureFrame(f);
209
213
  }
210
214
 
211
- handle.transcriptionFwd.markAudioComplete();
215
+ handle.synchronizer.markAudioSegmentEnd();
212
216
 
213
217
  await this.#audioSource.waitForPlayout();
214
218
  }
@@ -233,6 +237,7 @@ export class AgentPlayout extends EventEmitter {
233
237
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
234
238
 
235
239
  if (handle.interrupted || captureTask.error) {
240
+ await handle.synchronizer.close(true);
236
241
  this.#audioSource.clearQueue(); // make sure to remove any queued frames
237
242
  }
238
243
 
@@ -241,15 +246,11 @@ export class AgentPlayout extends EventEmitter {
241
246
  }
242
247
 
243
248
  if (!firstFrame) {
244
- if (!handle.interrupted) {
245
- handle.transcriptionFwd.markTextComplete();
246
- }
247
-
248
249
  this.emit('playout_stopped', handle.interrupted);
249
250
  }
250
251
 
251
252
  handle.doneFut.resolve();
252
- await handle.transcriptionFwd.close(handle.interrupted);
253
+ await handle.synchronizer.close(false);
253
254
  }
254
255
 
255
256
  resolve();
@@ -22,7 +22,7 @@ import { AudioByteStream } from '../audio.js';
22
22
  import * as llm from '../llm/index.js';
23
23
  import { log } from '../log.js';
24
24
  import type { MultimodalLLMMetrics } from '../metrics/base.js';
25
- import { BasicTranscriptionForwarder } from '../transcription.js';
25
+ import { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';
26
26
  import { findMicroTrackId } from '../utils.js';
27
27
  import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
28
28
 
@@ -190,7 +190,7 @@ export class MultimodalAgent extends EventEmitter {
190
190
  this.emit('agent_stopped_speaking');
191
191
  this.#speaking = false;
192
192
  if (this.#playingHandle) {
193
- let text = this.#playingHandle.transcriptionFwd.text;
193
+ let text = this.#playingHandle.synchronizer.playedText;
194
194
  if (interrupted) {
195
195
  text += '…';
196
196
  }
@@ -245,17 +245,21 @@ export class MultimodalAgent extends EventEmitter {
245
245
  // openai.realtime.RealtimeContent
246
246
  if (message.contentType === 'text') return;
247
247
 
248
- const trFwd = new BasicTranscriptionForwarder(
249
- this.room!,
250
- this.room!.localParticipant!.identity!,
251
- this.#getLocalTrackSid()!,
252
- message.responseId,
253
- );
248
+ const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
249
+ synchronizer.on('textUpdated', (text) => {
250
+ this.#publishTranscription(
251
+ this.room!.localParticipant!.identity!,
252
+ this.#getLocalTrackSid()!,
253
+ text.text,
254
+ text.final,
255
+ text.id,
256
+ );
257
+ });
254
258
 
255
259
  const handle = this.#agentPlayout?.play(
256
260
  message.itemId,
257
261
  message.contentIndex,
258
- trFwd,
262
+ synchronizer,
259
263
  message.textStream,
260
264
  message.audioStream,
261
265
  );
@@ -3,6 +3,7 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { log } from '../log.js';
6
+ import type { TextAudioSynchronizer } from '../transcription.js';
6
7
  import { SynthesizeStream, type TTS } from '../tts/index.js';
7
8
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
8
9
  import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
@@ -21,12 +22,20 @@ export class SynthesisHandle {
21
22
  #playHandle?: PlayoutHandle;
22
23
  intFut = new Future();
23
24
  #logger = log();
24
-
25
- constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {
25
+ synchronizer: TextAudioSynchronizer;
26
+
27
+ constructor(
28
+ speechId: string,
29
+ ttsSource: SpeechSource,
30
+ agentPlayout: AgentPlayout,
31
+ tts: TTS,
32
+ synchronizer: TextAudioSynchronizer,
33
+ ) {
26
34
  this.#speechId = speechId;
27
35
  this.ttsSource = ttsSource;
28
36
  this.#agentPlayout = agentPlayout;
29
37
  this.tts = tts;
38
+ this.synchronizer = synchronizer;
30
39
  }
31
40
 
32
41
  get speechId(): string {
@@ -51,7 +60,7 @@ export class SynthesisHandle {
51
60
  throw new Error('synthesis was interrupted');
52
61
  }
53
62
 
54
- this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);
63
+ this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);
55
64
  return this.#playHandle;
56
65
  }
57
66
 
@@ -86,8 +95,18 @@ export class AgentOutput {
86
95
  await Promise.all(this.#tasks);
87
96
  }
88
97
 
89
- synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {
90
- const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
98
+ synthesize(
99
+ speechId: string,
100
+ ttsSource: SpeechSource,
101
+ synchronizer: TextAudioSynchronizer,
102
+ ): SynthesisHandle {
103
+ const handle = new SynthesisHandle(
104
+ speechId,
105
+ ttsSource,
106
+ this.#agentPlayout,
107
+ this.#tts,
108
+ synchronizer,
109
+ );
91
110
  const task = this.#synthesize(handle);
92
111
  this.#tasks.push(task);
93
112
  task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));
@@ -136,6 +155,8 @@ const stringSynthesisTask = (text: string, handle: SynthesisHandle): Cancellable
136
155
 
137
156
  const ttsStream = handle.tts.stream();
138
157
  ttsStream.pushText(text);
158
+ handle.synchronizer.pushText(text);
159
+ handle.synchronizer.markTextSegmentEnd();
139
160
  ttsStream.flush();
140
161
  ttsStream.endInput();
141
162
  for await (const audio of ttsStream) {
@@ -178,8 +199,18 @@ const streamSynthesisTask = (
178
199
  for await (const text of stream) {
179
200
  fullText += text;
180
201
  if (cancelled) break;
202
+ handle.synchronizer.pushText(text);
181
203
  ttsStream.pushText(text);
182
204
  }
205
+ if (!cancelled) {
206
+ handle.synchronizer.markTextSegmentEnd();
207
+ }
208
+
209
+ // end the audio queue early if there is no actual text to turn into speech
210
+ if (!fullText || fullText.trim().length === 0) {
211
+ cancelled = true;
212
+ handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
213
+ }
183
214
  ttsStream.flush();
184
215
  ttsStream.endInput();
185
216
 
@@ -5,6 +5,7 @@ import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import EventEmitter from 'node:events';
7
7
  import { log } from '../log.js';
8
+ import type { TextAudioSynchronizer } from '../transcription.js';
8
9
  import { CancellablePromise, Future, gracefullyCancel } from '../utils.js';
9
10
  import { SynthesisHandle } from './agent_output.js';
10
11
 
@@ -23,6 +24,7 @@ export class PlayoutHandle {
23
24
  #audioSource: AudioSource;
24
25
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
25
26
  totalPlayedTime?: number;
27
+ synchronizer: TextAudioSynchronizer;
26
28
  #interrupted = false;
27
29
  pushedDuration = 0;
28
30
  intFut = new Future();
@@ -32,10 +34,12 @@ export class PlayoutHandle {
32
34
  speechId: string,
33
35
  audioSource: AudioSource,
34
36
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
37
+ synchronizer: TextAudioSynchronizer,
35
38
  ) {
36
39
  this.#speechId = speechId;
37
40
  this.#audioSource = audioSource;
38
41
  this.playoutSource = playoutSource;
42
+ this.synchronizer = synchronizer;
39
43
  }
40
44
 
41
45
  get speechId(): string {
@@ -91,12 +95,13 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
91
95
  play(
92
96
  speechId: string,
93
97
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
98
+ synchronizer: TextAudioSynchronizer,
94
99
  ): PlayoutHandle {
95
100
  if (this.#closed) {
96
101
  throw new Error('source closed');
97
102
  }
98
103
 
99
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
104
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
100
105
 
101
106
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
102
107
  return handle;
@@ -109,6 +114,7 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
109
114
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
110
115
 
111
116
  if (handle.interrupted || captureTask.error) {
117
+ handle.synchronizer.close(true);
112
118
  this.#audioSource.clearQueue(); // make sure to remove any queued frames
113
119
  }
114
120
 
@@ -156,9 +162,11 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
156
162
  .child({ speechId: handle.speechId })
157
163
  .debug('started playing the first time');
158
164
  this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);
165
+ handle.synchronizer.segmentPlayoutStarted();
159
166
  firstFrame = false;
160
167
  }
161
168
  handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;
169
+ handle.synchronizer.pushAudio(frame);
162
170
  await this.#audioSource.captureFrame(frame);
163
171
  await this.#audioSource.waitForPlayout();
164
172
  }
@@ -170,6 +178,7 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
170
178
  // await this.#audioSource.waitForPlayout();
171
179
  // }
172
180
 
181
+ handle.synchronizer.close(false);
173
182
  resolve();
174
183
  });
175
184
 
@@ -57,6 +57,14 @@ export class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInp
57
57
  this.#subscribeToMicrophone();
58
58
  }
59
59
 
60
+ get participant(): RemoteParticipant {
61
+ return this.#participant;
62
+ }
63
+
64
+ get subscribedTrack(): RemoteAudioTrack | undefined {
65
+ return this.#subscribedTrack;
66
+ }
67
+
60
68
  #subscribeToMicrophone(): void {
61
69
  if (!this.#participant) {
62
70
  this.#logger.error('Participant is not set');
@@ -10,6 +10,7 @@ import {
10
10
  TrackSource,
11
11
  } from '@livekit/rtc-node';
12
12
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
13
+ import { randomUUID } from 'node:crypto';
13
14
  import EventEmitter from 'node:events';
14
15
  import type {
15
16
  CallableFunctionResult,
@@ -28,6 +29,7 @@ import {
28
29
  hyphenateWord,
29
30
  } from '../tokenize/basic/index.js';
30
31
  import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
32
+ import { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';
31
33
  import type { TTS } from '../tts/index.js';
32
34
  import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
33
35
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
@@ -78,6 +80,12 @@ export type VPACallbacks = {
78
80
  [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
79
81
  };
80
82
 
83
+ interface TurnDetector {
84
+ unlikelyThreshold: number;
85
+ supportsLanguage: (language?: string) => boolean;
86
+ predictEndOfTurn: (chatCtx: ChatContext) => Promise<number>;
87
+ }
88
+
81
89
  export class AgentCallContext {
82
90
  #agent: VoicePipelineAgent;
83
91
  #llmStream: LLMStream;
@@ -206,6 +214,8 @@ export interface VPAOptions {
206
214
  beforeTTSCallback: BeforeTTSCallback;
207
215
  /** Options for assistant transcription. */
208
216
  transcription: AgentTranscriptionOptions;
217
+ /** Turn detection model to use. */
218
+ turnDetector?: TurnDetector;
209
219
  }
210
220
 
211
221
  const defaultVPAOptions: VPAOptions = {
@@ -238,7 +248,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
238
248
  #pendingAgentReply?: SpeechHandle;
239
249
  #agentReplyTask?: CancellablePromise<void>;
240
250
  #playingSpeech?: SpeechHandle;
241
- #transcribedText = '';
251
+ transcribedText = '';
242
252
  #transcribedInterimText = '';
243
253
  #speechQueueOpen = new Future();
244
254
  #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
@@ -251,6 +261,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
251
261
  #agentPublication?: LocalTrackPublication;
252
262
  #lastFinalTranscriptTime?: number;
253
263
  #lastSpeechTime?: number;
264
+ #transcriptionId?: string;
265
+ #agentTranscribedText = '';
254
266
 
255
267
  constructor(
256
268
  /** Voice Activity Detection instance. */
@@ -284,6 +296,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
284
296
  this.#deferredValidation = new DeferredReplyValidation(
285
297
  this.#validateReplyIfPossible.bind(this),
286
298
  this.#opts.minEndpointingDelay,
299
+ this,
300
+ this.#opts.turnDetector,
287
301
  );
288
302
  }
289
303
 
@@ -492,14 +506,52 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
492
506
  this.#deferredValidation.onHumanEndOfSpeech(event);
493
507
  });
494
508
  this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
509
+ if (!this.#transcriptionId) {
510
+ this.#transcriptionId = randomUUID();
511
+ }
495
512
  this.#transcribedInterimText = event.alternatives![0].text;
513
+
514
+ this.#room!.localParticipant!.publishTranscription({
515
+ participantIdentity: this.#humanInput!.participant.identity,
516
+ trackSid: this.#humanInput!.subscribedTrack!.sid!,
517
+ segments: [
518
+ {
519
+ text: this.#transcribedInterimText,
520
+ id: this.#transcriptionId,
521
+ final: true,
522
+ startTime: BigInt(0),
523
+ endTime: BigInt(0),
524
+ language: '',
525
+ },
526
+ ],
527
+ });
496
528
  });
497
529
  this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
498
530
  const newTranscript = event.alternatives![0].text;
499
531
  if (!newTranscript) return;
500
532
 
533
+ if (!this.#transcriptionId) {
534
+ this.#transcriptionId = randomUUID();
535
+ }
536
+
501
537
  this.#lastFinalTranscriptTime = Date.now();
502
- this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
538
+ this.transcribedText += (this.transcribedText ? ' ' : '') + newTranscript;
539
+
540
+ this.#room!.localParticipant!.publishTranscription({
541
+ participantIdentity: this.#humanInput!.participant.identity,
542
+ trackSid: this.#humanInput!.subscribedTrack!.sid!,
543
+ segments: [
544
+ {
545
+ text: this.transcribedText,
546
+ id: this.#transcriptionId,
547
+ final: true,
548
+ startTime: BigInt(0),
549
+ endTime: BigInt(0),
550
+ language: '',
551
+ },
552
+ ],
553
+ });
554
+ this.#transcriptionId = undefined;
503
555
 
504
556
  if (
505
557
  this.#opts.preemptiveSynthesis &&
@@ -564,7 +616,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
564
616
  this.#pendingAgentReply = SpeechHandle.createAssistantReply(
565
617
  this.#opts.allowInterruptions,
566
618
  true,
567
- this.#transcribedText,
619
+ this.transcribedText,
568
620
  );
569
621
  const newHandle = this.#pendingAgentReply;
570
622
  this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
@@ -674,7 +726,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
674
726
  this.chatCtx.messages.push(userMsg);
675
727
  this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);
676
728
 
677
- this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
729
+ this.transcribedText = this.transcribedText.slice(userQuestion.length);
678
730
  handle.markUserCommitted();
679
731
  };
680
732
 
@@ -692,7 +744,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
692
744
  }
693
745
  commitUserQuestionIfNeeded();
694
746
 
695
- const collectedText = handle.synthesisHandle.text;
747
+ let collectedText = this.#agentTranscribedText;
696
748
  const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
697
749
  const interrupted = handle.interrupted;
698
750
 
@@ -701,7 +753,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
701
753
  this.chatCtx.messages.push(...handle.extraToolsMessages);
702
754
  }
703
755
  if (interrupted) {
704
- collectedText + '…';
756
+ collectedText += '…';
705
757
  }
706
758
 
707
759
  const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
@@ -798,6 +850,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
798
850
  chatCtx,
799
851
  fncCtx: this.fncCtx,
800
852
  });
853
+
801
854
  const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
802
855
  newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
803
856
  handle.addNestedSpeech(newSpeechHandle);
@@ -832,6 +885,16 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
832
885
  speechId: string,
833
886
  source: string | LLMStream | AsyncIterable<string>,
834
887
  ): SynthesisHandle {
888
+ const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
889
+ synchronizer.on('textUpdated', (text) => {
890
+ this.#agentTranscribedText = text.text;
891
+ this.#room!.localParticipant!.publishTranscription({
892
+ participantIdentity: this.#room!.localParticipant!.identity,
893
+ trackSid: this.#agentPublication!.sid!,
894
+ segments: [text],
895
+ });
896
+ });
897
+
835
898
  if (!this.#agentOutput) {
836
899
  throw new Error('agent output should be initialized when ready');
837
900
  }
@@ -850,7 +913,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
850
913
  throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');
851
914
  }
852
915
 
853
- return this.#agentOutput.synthesize(speechId, ttsSource);
916
+ return this.#agentOutput.synthesize(speechId, ttsSource, synchronizer);
854
917
  }
855
918
 
856
919
  async #validateReplyIfPossible() {
@@ -862,7 +925,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
862
925
  }
863
926
 
864
927
  if (!this.#pendingAgentReply) {
865
- if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
928
+ if (this.#opts.preemptiveSynthesis || !this.transcribedText) {
866
929
  return;
867
930
  }
868
931
  this.#synthesizeAgentReply();
@@ -969,6 +1032,7 @@ class DeferredReplyValidation {
969
1032
  readonly PUNCTUATION = '.!?';
970
1033
  readonly PUNCTUATION_REDUCE_FACTOR = 0.75;
971
1034
  readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech
1035
+ readonly UNLIKELY_ENDPOINT_DELAY = 6000;
972
1036
 
973
1037
  #validateFunc: () => Promise<void>;
974
1038
  #validatingPromise?: Promise<void>;
@@ -978,12 +1042,21 @@ class DeferredReplyValidation {
978
1042
  #speaking = false;
979
1043
  #endOfSpeechDelay: number;
980
1044
  #finalTranscriptDelay: number;
1045
+ #turnDetector?: TurnDetector;
1046
+ #agent: VoicePipelineAgent;
981
1047
  #abort?: AbortController;
982
1048
 
983
- constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {
1049
+ constructor(
1050
+ validateFunc: () => Promise<void>,
1051
+ minEndpointingDelay: number,
1052
+ agent: VoicePipelineAgent,
1053
+ turnDetector?: TurnDetector,
1054
+ ) {
984
1055
  this.#validateFunc = validateFunc;
985
1056
  this.#endOfSpeechDelay = minEndpointingDelay;
986
1057
  this.#finalTranscriptDelay = minEndpointingDelay;
1058
+ this.#agent = agent;
1059
+ this.#turnDetector = turnDetector;
987
1060
  }
988
1061
 
989
1062
  get validating(): boolean {
@@ -1038,7 +1111,17 @@ class DeferredReplyValidation {
1038
1111
  }
1039
1112
 
1040
1113
  #run(delay: number) {
1041
- const runTask = async (delay: number, signal: AbortSignal) => {
1114
+ const runTask = async (delay: number, chatCtx: ChatContext, signal: AbortSignal) => {
1115
+ if (this.#lastFinalTranscript && !this.#speaking && this.#turnDetector) {
1116
+ const startTime = Date.now();
1117
+ const eotProb = await this.#turnDetector.predictEndOfTurn(chatCtx);
1118
+ const unlikelyThreshold = this.#turnDetector.unlikelyThreshold;
1119
+ const elapsed = Date.now() - startTime;
1120
+ if (eotProb < unlikelyThreshold) {
1121
+ delay = this.UNLIKELY_ENDPOINT_DELAY;
1122
+ }
1123
+ delay = Math.max(0, delay - elapsed);
1124
+ }
1042
1125
  const timeout = setTimeout(() => {
1043
1126
  this.#resetStates();
1044
1127
  this.#validateFunc();
@@ -1051,6 +1134,8 @@ class DeferredReplyValidation {
1051
1134
  this.#abort?.abort();
1052
1135
  this.#abort = new AbortController();
1053
1136
  this.#validatingFuture = new Future();
1054
- this.#validatingPromise = runTask(delay, this.#abort.signal);
1137
+ const detectCtx = this.#agent.chatCtx.copy();
1138
+ detectCtx.append({ text: this.#agent.transcribedText, role: ChatRole.USER });
1139
+ this.#validatingPromise = runTask(delay, detectCtx, this.#abort.signal);
1055
1140
  }
1056
1141
  }
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { log } from '../log.js';
5
6
  import type { VAD, VADStream } from '../vad.js';
6
7
  import { VADEventType } from '../vad.js';
7
8
  import type { SpeechEvent } from './stt.js';
@@ -71,13 +72,24 @@ export class StreamAdapterWrapper extends SpeechStream {
71
72
  case VADEventType.END_OF_SPEECH:
72
73
  this.output.put({ type: SpeechEventType.END_OF_SPEECH });
73
74
 
74
- const event = await this.#stt.recognize(ev.frames);
75
- if (!event.alternatives![0].text) {
75
+ try {
76
+ const event = await this.#stt.recognize(ev.frames);
77
+ if (!event.alternatives![0].text) {
78
+ continue;
79
+ }
80
+
81
+ this.output.put(event);
82
+ break;
83
+ } catch (error) {
84
+ let logger = log();
85
+ if (error instanceof Error) {
86
+ logger = logger.child({ error: error.message });
87
+ } else {
88
+ logger = logger.child({ error });
89
+ }
90
+ logger.error(`${this.label}: provider recognize task failed`);
76
91
  continue;
77
92
  }
78
-
79
- this.output.put(event);
80
- break;
81
93
  }
82
94
  }
83
95
  };
@@ -68,6 +68,8 @@ export const hyphenateWord = (word: string): string[] => {
68
68
  return hyphenator.hyphenateWord(word);
69
69
  };
70
70
 
71
+ export { splitWords };
72
+
71
73
  export const tokenizeParagraphs = (text: string): string[] => {
72
74
  return splitParagraphs(text).map((tok) => tok[0]);
73
75
  };
@@ -2,4 +2,10 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
 
5
- export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
5
+ export {
6
+ SentenceTokenizer,
7
+ WordTokenizer,
8
+ tokenizeParagraphs,
9
+ hyphenateWord,
10
+ splitWords,
11
+ } from './basic.js';