@livekit/agents 0.6.4 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/dist/index.cjs +6 -1
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +3 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +3 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/inference_runner.cjs +38 -0
  8. package/dist/inference_runner.cjs.map +1 -0
  9. package/dist/inference_runner.d.ts +11 -0
  10. package/dist/inference_runner.d.ts.map +1 -0
  11. package/dist/inference_runner.js +14 -0
  12. package/dist/inference_runner.js.map +1 -0
  13. package/dist/ipc/index.cjs +23 -0
  14. package/dist/ipc/index.cjs.map +1 -0
  15. package/dist/ipc/index.d.ts +2 -0
  16. package/dist/ipc/index.d.ts.map +1 -0
  17. package/dist/ipc/index.js +2 -0
  18. package/dist/ipc/index.js.map +1 -0
  19. package/dist/ipc/inference_executor.cjs +17 -0
  20. package/dist/ipc/inference_executor.cjs.map +1 -0
  21. package/dist/ipc/inference_executor.d.ts +4 -0
  22. package/dist/ipc/inference_executor.d.ts.map +1 -0
  23. package/dist/ipc/inference_executor.js +1 -0
  24. package/dist/ipc/inference_executor.js.map +1 -0
  25. package/dist/ipc/inference_proc_executor.cjs +97 -0
  26. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  27. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  28. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  29. package/dist/ipc/inference_proc_executor.js +72 -0
  30. package/dist/ipc/inference_proc_executor.js.map +1 -0
  31. package/dist/ipc/inference_proc_lazy_main.cjs +90 -0
  32. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  33. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  34. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  35. package/dist/ipc/inference_proc_lazy_main.js +67 -0
  36. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  37. package/dist/ipc/job_executor.cjs +8 -7
  38. package/dist/ipc/job_executor.cjs.map +1 -1
  39. package/dist/ipc/job_executor.d.ts +14 -15
  40. package/dist/ipc/job_executor.d.ts.map +1 -1
  41. package/dist/ipc/job_executor.js +7 -6
  42. package/dist/ipc/job_executor.js.map +1 -1
  43. package/dist/ipc/job_proc_executor.cjs +108 -0
  44. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  45. package/dist/ipc/job_proc_executor.d.ts +19 -0
  46. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  47. package/dist/ipc/job_proc_executor.js +83 -0
  48. package/dist/ipc/job_proc_executor.js.map +1 -0
  49. package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +41 -36
  50. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  51. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  52. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  53. package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +41 -11
  54. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  55. package/dist/ipc/message.cjs.map +1 -1
  56. package/dist/ipc/message.d.ts +17 -0
  57. package/dist/ipc/message.d.ts.map +1 -1
  58. package/dist/ipc/proc_pool.cjs +30 -4
  59. package/dist/ipc/proc_pool.cjs.map +1 -1
  60. package/dist/ipc/proc_pool.d.ts +5 -1
  61. package/dist/ipc/proc_pool.d.ts.map +1 -1
  62. package/dist/ipc/proc_pool.js +30 -4
  63. package/dist/ipc/proc_pool.js.map +1 -1
  64. package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +57 -45
  65. package/dist/ipc/supervised_proc.cjs.map +1 -0
  66. package/dist/ipc/supervised_proc.d.ts +30 -0
  67. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  68. package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +53 -31
  69. package/dist/ipc/supervised_proc.js.map +1 -0
  70. package/dist/job.cjs +18 -1
  71. package/dist/job.cjs.map +1 -1
  72. package/dist/job.d.ts +9 -1
  73. package/dist/job.d.ts.map +1 -1
  74. package/dist/job.js +17 -1
  75. package/dist/job.js.map +1 -1
  76. package/dist/multimodal/agent_playout.cjs +13 -14
  77. package/dist/multimodal/agent_playout.cjs.map +1 -1
  78. package/dist/multimodal/agent_playout.d.ts +4 -4
  79. package/dist/multimodal/agent_playout.d.ts.map +1 -1
  80. package/dist/multimodal/agent_playout.js +13 -14
  81. package/dist/multimodal/agent_playout.js.map +1 -1
  82. package/dist/multimodal/multimodal_agent.cjs +12 -8
  83. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  84. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  85. package/dist/multimodal/multimodal_agent.js +13 -9
  86. package/dist/multimodal/multimodal_agent.js.map +1 -1
  87. package/dist/pipeline/agent_output.cjs +20 -4
  88. package/dist/pipeline/agent_output.cjs.map +1 -1
  89. package/dist/pipeline/agent_output.d.ts +4 -2
  90. package/dist/pipeline/agent_output.d.ts.map +1 -1
  91. package/dist/pipeline/agent_output.js +20 -4
  92. package/dist/pipeline/agent_output.js.map +1 -1
  93. package/dist/pipeline/agent_playout.cjs +9 -3
  94. package/dist/pipeline/agent_playout.cjs.map +1 -1
  95. package/dist/pipeline/agent_playout.d.ts +4 -2
  96. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  97. package/dist/pipeline/agent_playout.js +9 -3
  98. package/dist/pipeline/agent_playout.js.map +1 -1
  99. package/dist/pipeline/human_input.cjs +6 -0
  100. package/dist/pipeline/human_input.cjs.map +1 -1
  101. package/dist/pipeline/human_input.d.ts +3 -1
  102. package/dist/pipeline/human_input.d.ts.map +1 -1
  103. package/dist/pipeline/human_input.js +6 -0
  104. package/dist/pipeline/human_input.js.map +1 -1
  105. package/dist/pipeline/pipeline_agent.cjs +79 -12
  106. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  107. package/dist/pipeline/pipeline_agent.d.ts +8 -0
  108. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  109. package/dist/pipeline/pipeline_agent.js +79 -12
  110. package/dist/pipeline/pipeline_agent.js.map +1 -1
  111. package/dist/stt/stream_adapter.cjs +16 -4
  112. package/dist/stt/stream_adapter.cjs.map +1 -1
  113. package/dist/stt/stream_adapter.d.ts.map +1 -1
  114. package/dist/stt/stream_adapter.js +16 -4
  115. package/dist/stt/stream_adapter.js.map +1 -1
  116. package/dist/tokenize/basic/basic.cjs +2 -0
  117. package/dist/tokenize/basic/basic.cjs.map +1 -1
  118. package/dist/tokenize/basic/basic.d.ts +2 -0
  119. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  120. package/dist/tokenize/basic/basic.js +1 -0
  121. package/dist/tokenize/basic/basic.js.map +1 -1
  122. package/dist/tokenize/basic/index.cjs +2 -0
  123. package/dist/tokenize/basic/index.cjs.map +1 -1
  124. package/dist/tokenize/basic/index.d.ts +1 -1
  125. package/dist/tokenize/basic/index.d.ts.map +1 -1
  126. package/dist/tokenize/basic/index.js +8 -1
  127. package/dist/tokenize/basic/index.js.map +1 -1
  128. package/dist/tokenize/token_stream.cjs +5 -3
  129. package/dist/tokenize/token_stream.cjs.map +1 -1
  130. package/dist/tokenize/token_stream.d.ts.map +1 -1
  131. package/dist/tokenize/token_stream.js +5 -3
  132. package/dist/tokenize/token_stream.js.map +1 -1
  133. package/dist/transcription.cjs +203 -86
  134. package/dist/transcription.cjs.map +1 -1
  135. package/dist/transcription.d.ts +24 -17
  136. package/dist/transcription.d.ts.map +1 -1
  137. package/dist/transcription.js +201 -85
  138. package/dist/transcription.js.map +1 -1
  139. package/dist/worker.cjs +42 -9
  140. package/dist/worker.cjs.map +1 -1
  141. package/dist/worker.d.ts +5 -1
  142. package/dist/worker.d.ts.map +1 -1
  143. package/dist/worker.js +42 -9
  144. package/dist/worker.js.map +1 -1
  145. package/package.json +3 -3
  146. package/src/index.ts +3 -1
  147. package/src/inference_runner.ts +19 -0
  148. package/src/ipc/index.ts +5 -0
  149. package/src/ipc/inference_executor.ts +7 -0
  150. package/src/ipc/inference_proc_executor.ts +93 -0
  151. package/src/ipc/inference_proc_lazy_main.ts +86 -0
  152. package/src/ipc/job_executor.ts +15 -17
  153. package/src/ipc/job_proc_executor.ts +112 -0
  154. package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +44 -14
  155. package/src/ipc/message.ts +14 -1
  156. package/src/ipc/proc_pool.ts +33 -3
  157. package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +77 -29
  158. package/src/job.ts +21 -0
  159. package/src/multimodal/agent_playout.ts +14 -16
  160. package/src/multimodal/multimodal_agent.ts +13 -9
  161. package/src/pipeline/agent_output.ts +34 -5
  162. package/src/pipeline/agent_playout.ts +10 -1
  163. package/src/pipeline/human_input.ts +8 -0
  164. package/src/pipeline/pipeline_agent.ts +96 -11
  165. package/src/stt/stream_adapter.ts +17 -5
  166. package/src/tokenize/basic/basic.ts +2 -0
  167. package/src/tokenize/basic/index.ts +7 -1
  168. package/src/tokenize/token_stream.ts +6 -3
  169. package/src/transcription.ts +270 -96
  170. package/src/worker.ts +42 -5
  171. package/dist/ipc/job_main.cjs.map +0 -1
  172. package/dist/ipc/job_main.d.ts +0 -8
  173. package/dist/ipc/job_main.d.ts.map +0 -1
  174. package/dist/ipc/job_main.js.map +0 -1
  175. package/dist/ipc/proc_job_executor.cjs.map +0 -1
  176. package/dist/ipc/proc_job_executor.d.ts +0 -15
  177. package/dist/ipc/proc_job_executor.d.ts.map +0 -1
  178. package/dist/ipc/proc_job_executor.js.map +0 -1
@@ -6,31 +6,54 @@ import { once } from 'node:events';
6
6
  import type { RunningJobInfo } from '../job.js';
7
7
  import { log, loggerOptions } from '../log.js';
8
8
  import { Future } from '../utils.js';
9
- import type { ProcOpts } from './job_executor.js';
10
- import { JobExecutor } from './job_executor.js';
11
9
  import type { IPCMessage } from './message.js';
12
10
 
13
- export class ProcJobExecutor extends JobExecutor {
11
+ export interface ProcOpts {
12
+ initializeTimeout: number;
13
+ closeTimeout: number;
14
+ memoryWarnMB: number;
15
+ memoryLimitMB: number;
16
+ pingInterval: number;
17
+ pingTimeout: number;
18
+ highPingThreshold: number;
19
+ }
20
+
21
+ export abstract class SupervisedProc {
14
22
  #opts: ProcOpts;
15
23
  #started = false;
16
24
  #closing = false;
17
25
  #runningJob?: RunningJobInfo = undefined;
18
- #proc?: ChildProcess;
26
+ proc?: ChildProcess;
19
27
  #pingInterval?: ReturnType<typeof setInterval>;
28
+ #memoryWatch?: ReturnType<typeof setInterval>;
20
29
  #pongTimeout?: ReturnType<typeof setTimeout>;
21
- #init = new Future();
30
+ protected init = new Future();
22
31
  #join = new Future();
23
32
  #logger = log().child({ runningJob: this.#runningJob });
24
33
 
25
- constructor(agent: string, initializeTimeout: number, closeTimeout: number) {
26
- super();
34
+ constructor(
35
+ initializeTimeout: number,
36
+ closeTimeout: number,
37
+ memoryWarnMB: number,
38
+ memoryLimitMB: number,
39
+ pingInterval: number,
40
+ pingTimeout: number,
41
+ highPingThreshold: number,
42
+ ) {
27
43
  this.#opts = {
28
- agent,
29
44
  initializeTimeout,
30
45
  closeTimeout,
46
+ memoryWarnMB,
47
+ memoryLimitMB,
48
+ pingInterval,
49
+ pingTimeout,
50
+ highPingThreshold,
31
51
  };
32
52
  }
33
53
 
54
+ abstract createProcess(): ChildProcess;
55
+ abstract mainTask(child: ChildProcess): Promise<void>;
56
+
34
57
  get started(): boolean {
35
58
  return this.#started;
36
59
  }
@@ -46,36 +69,50 @@ export class ProcJobExecutor extends JobExecutor {
46
69
  throw new Error('runner is closed');
47
70
  }
48
71
 
49
- this.#proc = await import('./job_main.js').then((m) =>
50
- m.runProcess({
51
- agentFile: this.#opts.agent,
52
- }),
53
- );
72
+ this.proc = this.createProcess();
54
73
 
55
74
  this.#started = true;
56
75
  this.run();
57
76
  }
58
77
 
59
78
  async run() {
60
- await this.#init.await;
79
+ await this.init.await;
61
80
 
62
81
  this.#pingInterval = setInterval(() => {
63
- this.#proc!.send({ case: 'pingRequest', value: { timestamp: Date.now() } });
64
- }, this.PING_INTERVAL);
82
+ this.proc!.send({ case: 'pingRequest', value: { timestamp: Date.now() } });
83
+ }, this.#opts.pingInterval);
65
84
 
66
85
  this.#pongTimeout = setTimeout(() => {
67
86
  this.#logger.warn('job is unresponsive');
68
87
  clearTimeout(this.#pongTimeout);
69
88
  clearInterval(this.#pingInterval);
70
- this.#proc!.kill();
89
+ this.proc!.kill();
71
90
  this.#join.resolve();
72
- }, this.PING_TIMEOUT);
91
+ }, this.#opts.pingTimeout);
92
+
93
+ this.#memoryWatch = setInterval(() => {
94
+ const memoryMB = process.memoryUsage().heapUsed / (1024 * 1024);
95
+ if (this.#opts.memoryLimitMB > 0 && memoryMB > this.#opts.memoryLimitMB) {
96
+ this.#logger
97
+ .child({ memoryUsageMB: memoryMB, memoryLimitMB: this.#opts.memoryLimitMB })
98
+ .error('process exceeded memory limit, killing process');
99
+ this.close();
100
+ } else if (this.#opts.memoryWarnMB > 0 && memoryMB > this.#opts.memoryWarnMB) {
101
+ this.#logger
102
+ .child({
103
+ memoryUsageMB: memoryMB,
104
+ memoryWarnMB: this.#opts.memoryWarnMB,
105
+ memoryLimitMB: this.#opts.memoryLimitMB,
106
+ })
107
+ .error('process memory usage is high');
108
+ }
109
+ });
73
110
 
74
111
  const listener = (msg: IPCMessage) => {
75
112
  switch (msg.case) {
76
113
  case 'pongResponse': {
77
114
  const delay = Date.now() - msg.value.timestamp;
78
- if (delay > this.HIGH_PING_THRESHOLD) {
115
+ if (delay > this.#opts.highPingThreshold) {
79
116
  this.#logger.child({ delay }).warn('job executor is unresponsive');
80
117
  }
81
118
  this.#pongTimeout?.refresh();
@@ -87,23 +124,26 @@ export class ProcJobExecutor extends JobExecutor {
87
124
  }
88
125
  case 'done': {
89
126
  this.#closing = true;
90
- this.#proc!.off('message', listener);
127
+ this.proc!.off('message', listener);
91
128
  this.#join.resolve();
92
129
  break;
93
130
  }
94
131
  }
95
132
  };
96
- this.#proc!.on('message', listener);
97
- this.#proc!.on('error', (err) => {
133
+ this.proc!.on('message', listener);
134
+ this.proc!.on('error', (err) => {
98
135
  if (this.#closing) return;
99
136
  this.#logger
100
137
  .child({ err })
101
138
  .warn('job process exited unexpectedly; this likely means the error above caused a crash');
102
139
  clearTimeout(this.#pongTimeout);
103
140
  clearInterval(this.#pingInterval);
141
+ clearInterval(this.#memoryWatch);
104
142
  this.#join.resolve();
105
143
  });
106
144
 
145
+ this.mainTask(this.proc!);
146
+
107
147
  await this.#join.await;
108
148
  }
109
149
 
@@ -118,17 +158,25 @@ export class ProcJobExecutor extends JobExecutor {
118
158
  async initialize() {
119
159
  const timer = setTimeout(() => {
120
160
  const err = new Error('runner initialization timed out');
121
- this.#init.reject(err);
161
+ this.init.reject(err);
122
162
  throw err;
123
163
  }, this.#opts.initializeTimeout);
124
- this.#proc!.send({ case: 'initializeRequest', value: { loggerOptions } });
125
- await once(this.#proc!, 'message').then(([msg]: IPCMessage[]) => {
164
+ this.proc!.send({
165
+ case: 'initializeRequest',
166
+ value: {
167
+ loggerOptions,
168
+ pingInterval: this.#opts.pingInterval,
169
+ pingTimeout: this.#opts.pingTimeout,
170
+ highPingThreshold: this.#opts.highPingThreshold,
171
+ },
172
+ });
173
+ await once(this.proc!, 'message').then(([msg]: IPCMessage[]) => {
126
174
  clearTimeout(timer);
127
175
  if (msg!.case !== 'initializeResponse') {
128
176
  throw new Error('first message must be InitializeResponse');
129
177
  }
130
178
  });
131
- this.#init.resolve();
179
+ this.init.resolve();
132
180
  }
133
181
 
134
182
  async close() {
@@ -138,11 +186,11 @@ export class ProcJobExecutor extends JobExecutor {
138
186
  this.#closing = true;
139
187
 
140
188
  if (!this.#runningJob) {
141
- this.#proc!.kill();
189
+ this.proc!.kill();
142
190
  this.#join.resolve();
143
191
  }
144
192
 
145
- this.#proc!.send({ case: 'shutdownRequest' });
193
+ this.proc!.send({ case: 'shutdownRequest' });
146
194
 
147
195
  const timer = setTimeout(() => {
148
196
  this.#logger.error('job shutdown is taking too much time');
@@ -159,6 +207,6 @@ export class ProcJobExecutor extends JobExecutor {
159
207
  throw new Error('executor already has a running job');
160
208
  }
161
209
  this.#runningJob = info;
162
- this.#proc!.send({ case: 'startJobRequest', value: { runningJob: info } });
210
+ this.proc!.send({ case: 'startJobRequest', value: { runningJob: info } });
163
211
  }
164
212
  }
package/src/job.ts CHANGED
@@ -11,8 +11,21 @@ import type {
11
11
  } from '@livekit/rtc-node';
12
12
  import { ParticipantKind, RoomEvent, TrackKind } from '@livekit/rtc-node';
13
13
  import type { Logger } from 'pino';
14
+ import type { InferenceExecutor } from './ipc/inference_executor.js';
14
15
  import { log } from './log.js';
15
16
 
17
+ export class CurrentJobContext {
18
+ static #current: JobContext;
19
+
20
+ constructor(proc: JobContext) {
21
+ CurrentJobContext.#current = proc;
22
+ }
23
+
24
+ static getCurrent(): JobContext {
25
+ return CurrentJobContext.#current;
26
+ }
27
+ }
28
+
16
29
  /** Which tracks, if any, should the agent automatically subscribe to? */
17
30
  export enum AutoSubscribe {
18
31
  SUBSCRIBE_ALL,
@@ -60,6 +73,7 @@ export class JobContext {
60
73
  };
61
74
  } = {};
62
75
  #logger: Logger;
76
+ #inferenceExecutor: InferenceExecutor;
63
77
 
64
78
  constructor(
65
79
  proc: JobProcess,
@@ -67,6 +81,7 @@ export class JobContext {
67
81
  room: Room,
68
82
  onConnect: () => void,
69
83
  onShutdown: (s: string) => void,
84
+ inferenceExecutor: InferenceExecutor,
70
85
  ) {
71
86
  this.#proc = proc;
72
87
  this.#info = info;
@@ -76,6 +91,7 @@ export class JobContext {
76
91
  this.onParticipantConnected = this.onParticipantConnected.bind(this);
77
92
  this.#room.on(RoomEvent.ParticipantConnected, this.onParticipantConnected);
78
93
  this.#logger = log().child({ info: this.#info });
94
+ this.#inferenceExecutor = inferenceExecutor;
79
95
  }
80
96
 
81
97
  get proc(): JobProcess {
@@ -96,6 +112,11 @@ export class JobContext {
96
112
  return this.#room.localParticipant;
97
113
  }
98
114
 
115
+ /** @returns The global inference executor */
116
+ get inferenceExecutor(): InferenceExecutor {
117
+ return this.#inferenceExecutor;
118
+ }
119
+
99
120
  /** Adds a promise to be awaited when {@link JobContext.shutdown | shutdown} is called. */
100
121
  addShutdownCallback(callback: () => Promise<void>) {
101
122
  this.shutdownCallbacks.push(callback);
@@ -5,7 +5,7 @@ import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { type AudioSource } from '@livekit/rtc-node';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import { AudioByteStream } from '../audio.js';
8
- import type { TranscriptionForwarder } from '../transcription.js';
8
+ import type { TextAudioSynchronizer } from '../transcription.js';
9
9
  import { type AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
10
10
 
11
11
  export const proto = {};
@@ -16,7 +16,7 @@ export class PlayoutHandle extends EventEmitter {
16
16
  #itemId: string;
17
17
  #contentIndex: number;
18
18
  /** @internal */
19
- transcriptionFwd: TranscriptionForwarder;
19
+ synchronizer: TextAudioSynchronizer;
20
20
  /** @internal */
21
21
  doneFut: Future;
22
22
  /** @internal */
@@ -33,14 +33,14 @@ export class PlayoutHandle extends EventEmitter {
33
33
  sampleRate: number,
34
34
  itemId: string,
35
35
  contentIndex: number,
36
- transcriptionFwd: TranscriptionForwarder,
36
+ synchronizer: TextAudioSynchronizer,
37
37
  ) {
38
38
  super();
39
39
  this.#audioSource = audioSource;
40
40
  this.#sampleRate = sampleRate;
41
41
  this.#itemId = itemId;
42
42
  this.#contentIndex = contentIndex;
43
- this.transcriptionFwd = transcriptionFwd;
43
+ this.synchronizer = synchronizer;
44
44
  this.doneFut = new Future();
45
45
  this.intFut = new Future();
46
46
  this.#interrupted = false;
@@ -63,7 +63,7 @@ export class PlayoutHandle extends EventEmitter {
63
63
  }
64
64
 
65
65
  get textChars(): number {
66
- return this.transcriptionFwd.currentCharacterIndex;
66
+ return this.synchronizer.playedText.length;
67
67
  }
68
68
 
69
69
  get contentIndex(): number {
@@ -111,7 +111,7 @@ export class AgentPlayout extends EventEmitter {
111
111
  play(
112
112
  itemId: string,
113
113
  contentIndex: number,
114
- transcriptionFwd: TranscriptionForwarder,
114
+ synchronizer: TextAudioSynchronizer,
115
115
  textStream: AsyncIterableQueue<string>,
116
116
  audioStream: AsyncIterableQueue<AudioFrame>,
117
117
  ): PlayoutHandle {
@@ -120,7 +120,7 @@ export class AgentPlayout extends EventEmitter {
120
120
  this.#sampleRate,
121
121
  itemId,
122
122
  contentIndex,
123
- transcriptionFwd,
123
+ synchronizer,
124
124
  );
125
125
  this.#playoutTask = this.#makePlayoutTask(this.#playoutTask, handle, textStream, audioStream);
126
126
  return handle;
@@ -159,8 +159,9 @@ export class AgentPlayout extends EventEmitter {
159
159
  if (cancelledText || cancelled) {
160
160
  break;
161
161
  }
162
- handle.transcriptionFwd.pushText(text);
162
+ handle.synchronizer.pushText(text);
163
163
  }
164
+ handle.synchronizer.markTextSegmentEnd();
164
165
  resolveText();
165
166
  } catch (error) {
166
167
  rejectText(error);
@@ -189,12 +190,12 @@ export class AgentPlayout extends EventEmitter {
189
190
  break;
190
191
  }
191
192
  if (firstFrame) {
192
- handle.transcriptionFwd.start();
193
+ handle.synchronizer.segmentPlayoutStarted();
193
194
  this.emit('playout_started');
194
195
  firstFrame = false;
195
196
  }
196
197
 
197
- handle.transcriptionFwd.pushAudio(frame);
198
+ handle.synchronizer.pushAudio(frame);
198
199
 
199
200
  for (const f of bstream.write(frame.data.buffer)) {
200
201
  handle.pushedDuration += (f.samplesPerChannel / f.sampleRate) * 1000;
@@ -208,7 +209,7 @@ export class AgentPlayout extends EventEmitter {
208
209
  await this.#audioSource.captureFrame(f);
209
210
  }
210
211
 
211
- handle.transcriptionFwd.markAudioComplete();
212
+ handle.synchronizer.markAudioSegmentEnd();
212
213
 
213
214
  await this.#audioSource.waitForPlayout();
214
215
  }
@@ -233,6 +234,7 @@ export class AgentPlayout extends EventEmitter {
233
234
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
234
235
 
235
236
  if (handle.interrupted || captureTask.error) {
237
+ await handle.synchronizer.close(true);
236
238
  this.#audioSource.clearQueue(); // make sure to remove any queued frames
237
239
  }
238
240
 
@@ -241,15 +243,11 @@ export class AgentPlayout extends EventEmitter {
241
243
  }
242
244
 
243
245
  if (!firstFrame) {
244
- if (!handle.interrupted) {
245
- handle.transcriptionFwd.markTextComplete();
246
- }
247
-
248
246
  this.emit('playout_stopped', handle.interrupted);
249
247
  }
250
248
 
251
249
  handle.doneFut.resolve();
252
- await handle.transcriptionFwd.close(handle.interrupted);
250
+ await handle.synchronizer.close(false);
253
251
  }
254
252
 
255
253
  resolve();
@@ -22,7 +22,7 @@ import { AudioByteStream } from '../audio.js';
22
22
  import * as llm from '../llm/index.js';
23
23
  import { log } from '../log.js';
24
24
  import type { MultimodalLLMMetrics } from '../metrics/base.js';
25
- import { BasicTranscriptionForwarder } from '../transcription.js';
25
+ import { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';
26
26
  import { findMicroTrackId } from '../utils.js';
27
27
  import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
28
28
 
@@ -190,7 +190,7 @@ export class MultimodalAgent extends EventEmitter {
190
190
  this.emit('agent_stopped_speaking');
191
191
  this.#speaking = false;
192
192
  if (this.#playingHandle) {
193
- let text = this.#playingHandle.transcriptionFwd.text;
193
+ let text = this.#playingHandle.synchronizer.playedText;
194
194
  if (interrupted) {
195
195
  text += '…';
196
196
  }
@@ -245,17 +245,21 @@ export class MultimodalAgent extends EventEmitter {
245
245
  // openai.realtime.RealtimeContent
246
246
  if (message.contentType === 'text') return;
247
247
 
248
- const trFwd = new BasicTranscriptionForwarder(
249
- this.room!,
250
- this.room!.localParticipant!.identity!,
251
- this.#getLocalTrackSid()!,
252
- message.responseId,
253
- );
248
+ const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
249
+ synchronizer.on('textUpdated', (text) => {
250
+ this.#publishTranscription(
251
+ this.room!.localParticipant!.identity!,
252
+ this.#getLocalTrackSid()!,
253
+ text.text,
254
+ text.final,
255
+ text.id,
256
+ );
257
+ });
254
258
 
255
259
  const handle = this.#agentPlayout?.play(
256
260
  message.itemId,
257
261
  message.contentIndex,
258
- trFwd,
262
+ synchronizer,
259
263
  message.textStream,
260
264
  message.audioStream,
261
265
  );
@@ -3,6 +3,7 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { log } from '../log.js';
6
+ import type { TextAudioSynchronizer } from '../transcription.js';
6
7
  import { SynthesizeStream, type TTS } from '../tts/index.js';
7
8
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
8
9
  import type { AgentPlayout, PlayoutHandle } from './agent_playout.js';
@@ -21,12 +22,20 @@ export class SynthesisHandle {
21
22
  #playHandle?: PlayoutHandle;
22
23
  intFut = new Future();
23
24
  #logger = log();
24
-
25
- constructor(speechId: string, ttsSource: SpeechSource, agentPlayout: AgentPlayout, tts: TTS) {
25
+ synchronizer: TextAudioSynchronizer;
26
+
27
+ constructor(
28
+ speechId: string,
29
+ ttsSource: SpeechSource,
30
+ agentPlayout: AgentPlayout,
31
+ tts: TTS,
32
+ synchronizer: TextAudioSynchronizer,
33
+ ) {
26
34
  this.#speechId = speechId;
27
35
  this.ttsSource = ttsSource;
28
36
  this.#agentPlayout = agentPlayout;
29
37
  this.tts = tts;
38
+ this.synchronizer = synchronizer;
30
39
  }
31
40
 
32
41
  get speechId(): string {
@@ -51,7 +60,7 @@ export class SynthesisHandle {
51
60
  throw new Error('synthesis was interrupted');
52
61
  }
53
62
 
54
- this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue);
63
+ this.#playHandle = this.#agentPlayout.play(this.#speechId, this.queue, this.synchronizer);
55
64
  return this.#playHandle;
56
65
  }
57
66
 
@@ -86,8 +95,18 @@ export class AgentOutput {
86
95
  await Promise.all(this.#tasks);
87
96
  }
88
97
 
89
- synthesize(speechId: string, ttsSource: SpeechSource): SynthesisHandle {
90
- const handle = new SynthesisHandle(speechId, ttsSource, this.#agentPlayout, this.#tts);
98
+ synthesize(
99
+ speechId: string,
100
+ ttsSource: SpeechSource,
101
+ synchronizer: TextAudioSynchronizer,
102
+ ): SynthesisHandle {
103
+ const handle = new SynthesisHandle(
104
+ speechId,
105
+ ttsSource,
106
+ this.#agentPlayout,
107
+ this.#tts,
108
+ synchronizer,
109
+ );
91
110
  const task = this.#synthesize(handle);
92
111
  this.#tasks.push(task);
93
112
  task.finally(() => this.#tasks.splice(this.#tasks.indexOf(task)));
@@ -136,6 +155,8 @@ const stringSynthesisTask = (text: string, handle: SynthesisHandle): Cancellable
136
155
 
137
156
  const ttsStream = handle.tts.stream();
138
157
  ttsStream.pushText(text);
158
+ handle.synchronizer.pushText(text);
159
+ handle.synchronizer.markTextSegmentEnd();
139
160
  ttsStream.flush();
140
161
  ttsStream.endInput();
141
162
  for await (const audio of ttsStream) {
@@ -178,8 +199,16 @@ const streamSynthesisTask = (
178
199
  for await (const text of stream) {
179
200
  fullText += text;
180
201
  if (cancelled) break;
202
+ handle.synchronizer.pushText(text);
181
203
  ttsStream.pushText(text);
182
204
  }
205
+ handle.synchronizer.markTextSegmentEnd();
206
+
207
+ // end the audio queue early if there is no actual text to turn into speech
208
+ if (!fullText || fullText.trim().length === 0) {
209
+ cancelled = true;
210
+ handle.queue.put(SynthesisHandle.FLUSH_SENTINEL);
211
+ }
183
212
  ttsStream.flush();
184
213
  ttsStream.endInput();
185
214
 
@@ -5,6 +5,7 @@ import type { AudioFrame, AudioSource } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import EventEmitter from 'node:events';
7
7
  import { log } from '../log.js';
8
+ import type { TextAudioSynchronizer } from '../transcription.js';
8
9
  import { CancellablePromise, Future, gracefullyCancel } from '../utils.js';
9
10
  import { SynthesisHandle } from './agent_output.js';
10
11
 
@@ -23,6 +24,7 @@ export class PlayoutHandle {
23
24
  #audioSource: AudioSource;
24
25
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>;
25
26
  totalPlayedTime?: number;
27
+ synchronizer: TextAudioSynchronizer;
26
28
  #interrupted = false;
27
29
  pushedDuration = 0;
28
30
  intFut = new Future();
@@ -32,10 +34,12 @@ export class PlayoutHandle {
32
34
  speechId: string,
33
35
  audioSource: AudioSource,
34
36
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
37
+ synchronizer: TextAudioSynchronizer,
35
38
  ) {
36
39
  this.#speechId = speechId;
37
40
  this.#audioSource = audioSource;
38
41
  this.playoutSource = playoutSource;
42
+ this.synchronizer = synchronizer;
39
43
  }
40
44
 
41
45
  get speechId(): string {
@@ -91,12 +95,13 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
91
95
  play(
92
96
  speechId: string,
93
97
  playoutSource: AsyncIterable<AudioFrame | typeof SynthesisHandle.FLUSH_SENTINEL>,
98
+ synchronizer: TextAudioSynchronizer,
94
99
  ): PlayoutHandle {
95
100
  if (this.#closed) {
96
101
  throw new Error('source closed');
97
102
  }
98
103
 
99
- const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource);
104
+ const handle = new PlayoutHandle(speechId, this.#audioSource, playoutSource, synchronizer);
100
105
 
101
106
  this.#playoutTask = this.#playout(handle, this.#playoutTask);
102
107
  return handle;
@@ -109,6 +114,7 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
109
114
  handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;
110
115
 
111
116
  if (handle.interrupted || captureTask.error) {
117
+ handle.synchronizer.close(true);
112
118
  this.#audioSource.clearQueue(); // make sure to remove any queued frames
113
119
  }
114
120
 
@@ -156,9 +162,11 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
156
162
  .child({ speechId: handle.speechId })
157
163
  .debug('started playing the first time');
158
164
  this.emit(AgentPlayoutEvent.PLAYOUT_STARTED);
165
+ handle.synchronizer.segmentPlayoutStarted();
159
166
  firstFrame = false;
160
167
  }
161
168
  handle.pushedDuration += (frame.samplesPerChannel / frame.sampleRate) * 1000;
169
+ handle.synchronizer.pushAudio(frame);
162
170
  await this.#audioSource.captureFrame(frame);
163
171
  await this.#audioSource.waitForPlayout();
164
172
  }
@@ -170,6 +178,7 @@ export class AgentPlayout extends (EventEmitter as new () => TypedEmitter<AgentP
170
178
  // await this.#audioSource.waitForPlayout();
171
179
  // }
172
180
 
181
+ handle.synchronizer.close(false);
173
182
  resolve();
174
183
  });
175
184
 
@@ -57,6 +57,14 @@ export class HumanInput extends (EventEmitter as new () => TypedEmitter<HumanInp
57
57
  this.#subscribeToMicrophone();
58
58
  }
59
59
 
60
+ get participant(): RemoteParticipant {
61
+ return this.#participant;
62
+ }
63
+
64
+ get subscribedTrack(): RemoteAudioTrack | undefined {
65
+ return this.#subscribedTrack;
66
+ }
67
+
60
68
  #subscribeToMicrophone(): void {
61
69
  if (!this.#participant) {
62
70
  this.#logger.error('Participant is not set');