@livekit/agents 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +40 -0
  3. package/dist/audio.d.ts +1 -4
  4. package/dist/audio.d.ts.map +1 -1
  5. package/dist/audio.js +30 -12
  6. package/dist/audio.js.map +1 -1
  7. package/dist/cli.d.ts +1 -1
  8. package/dist/cli.d.ts.map +1 -1
  9. package/dist/cli.js +41 -17
  10. package/dist/cli.js.map +1 -1
  11. package/dist/generator.d.ts +5 -0
  12. package/dist/generator.d.ts.map +1 -1
  13. package/dist/generator.js +11 -0
  14. package/dist/generator.js.map +1 -1
  15. package/dist/http_server.d.ts +1 -0
  16. package/dist/http_server.d.ts.map +1 -1
  17. package/dist/http_server.js +13 -0
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.d.ts +3 -1
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +3 -1
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_main.js +9 -1
  24. package/dist/ipc/job_main.js.map +1 -1
  25. package/dist/ipc/proc_pool.d.ts.map +1 -1
  26. package/dist/ipc/proc_pool.js +1 -0
  27. package/dist/ipc/proc_pool.js.map +1 -1
  28. package/dist/job.d.ts +1 -0
  29. package/dist/job.d.ts.map +1 -1
  30. package/dist/job.js +30 -1
  31. package/dist/job.js.map +1 -1
  32. package/dist/multimodal/agent_playout.d.ts +34 -0
  33. package/dist/multimodal/agent_playout.d.ts.map +1 -0
  34. package/dist/multimodal/agent_playout.js +221 -0
  35. package/dist/multimodal/agent_playout.js.map +1 -0
  36. package/dist/multimodal/index.d.ts +3 -0
  37. package/dist/multimodal/index.d.ts.map +1 -0
  38. package/dist/multimodal/index.js +6 -0
  39. package/dist/multimodal/index.js.map +1 -0
  40. package/dist/multimodal/multimodal_agent.d.ts +47 -0
  41. package/dist/multimodal/multimodal_agent.d.ts.map +1 -0
  42. package/dist/multimodal/multimodal_agent.js +329 -0
  43. package/dist/multimodal/multimodal_agent.js.map +1 -0
  44. package/dist/transcription.d.ts +22 -0
  45. package/dist/transcription.d.ts.map +1 -0
  46. package/dist/transcription.js +112 -0
  47. package/dist/transcription.js.map +1 -0
  48. package/dist/utils.d.ts +29 -1
  49. package/dist/utils.d.ts.map +1 -1
  50. package/dist/utils.js +117 -15
  51. package/dist/utils.js.map +1 -1
  52. package/dist/worker.d.ts +3 -1
  53. package/dist/worker.d.ts.map +1 -1
  54. package/dist/worker.js +49 -9
  55. package/dist/worker.js.map +1 -1
  56. package/package.json +6 -4
  57. package/src/audio.ts +21 -20
  58. package/src/cli.ts +42 -17
  59. package/src/generator.ts +14 -0
  60. package/src/http_server.ts +6 -0
  61. package/src/index.ts +3 -1
  62. package/src/ipc/job_main.ts +9 -2
  63. package/src/ipc/proc_pool.ts +1 -0
  64. package/src/job.ts +37 -1
  65. package/src/multimodal/agent_playout.ts +254 -0
  66. package/src/multimodal/index.ts +5 -0
  67. package/src/multimodal/multimodal_agent.ts +426 -0
  68. package/src/transcription.ts +129 -0
  69. package/src/utils.ts +151 -12
  70. package/src/worker.ts +60 -14
  71. package/tsconfig.json +1 -1
package/src/cli.ts CHANGED
@@ -5,7 +5,7 @@ import { Command, Option } from 'commander';
5
5
  import type { EventEmitter } from 'events';
6
6
  import { initializeLogger, log } from './log.js';
7
7
  import { version } from './version.js';
8
- import { Worker, type WorkerOptions } from './worker.js';
8
+ import { Worker, WorkerOptions } from './worker.js';
9
9
 
10
10
  type CliArgs = {
11
11
  opts: WorkerOptions;
@@ -18,11 +18,15 @@ type CliArgs = {
18
18
 
19
19
  const runWorker = async (args: CliArgs) => {
20
20
  initializeLogger({ pretty: !args.production, level: args.opts.logLevel });
21
- const worker = new Worker(args.opts);
21
+ const logger = log();
22
+
23
+ // though `production` is defined in WorkerOptions, it will always be overridden by CLI.
24
+ const { production: _, ...opts } = args.opts; // eslint-disable-line @typescript-eslint/no-unused-vars
25
+ const worker = new Worker(new WorkerOptions({ production: args.production, ...opts }));
22
26
 
23
27
  if (args.room) {
24
28
  worker.event.once('worker_registered', () => {
25
- log().info(`connecting to room ${args.room}`);
29
+ logger.info(`connecting to room ${args.room}`);
26
30
  worker.simulateJob(args.room!, args.participantIdentity);
27
31
  });
28
32
  }
@@ -30,21 +34,21 @@ const runWorker = async (args: CliArgs) => {
30
34
  process.once('SIGINT', async () => {
31
35
  // allow C-c C-c for force interrupt
32
36
  process.once('SIGINT', () => {
33
- log().info('worker closed forcefully');
37
+ logger.info('worker closed forcefully');
34
38
  process.exit(130); // SIGINT exit code
35
39
  });
36
40
  if (args.production) {
37
41
  await worker.drain();
38
42
  }
39
43
  await worker.close();
40
- log().info('worker closed');
44
+ logger.info('worker closed');
41
45
  process.exit(130); // SIGINT exit code
42
46
  });
43
47
 
44
48
  try {
45
49
  await worker.run();
46
50
  } catch {
47
- log().fatal('worker failed');
51
+ logger.fatal('worker failed');
48
52
  process.exit(1);
49
53
  }
50
54
  };
@@ -72,20 +76,29 @@ export const runApp = (opts: WorkerOptions) => {
72
76
  .env('LOG_LEVEL'),
73
77
  )
74
78
  .addOption(
75
- new Option('--url <string>', 'LiveKit server or Cloud project websocket URL')
76
- .makeOptionMandatory(true)
77
- .env('LIVEKIT_URL'),
79
+ new Option('--url <string>', 'LiveKit server or Cloud project websocket URL').env(
80
+ 'LIVEKIT_URL',
81
+ ),
78
82
  )
79
83
  .addOption(
80
- new Option('--api-key <string>', "LiveKit server or Cloud project's API key")
81
- .makeOptionMandatory(true)
82
- .env('LIVEKIT_API_KEY'),
84
+ new Option('--api-key <string>', "LiveKit server or Cloud project's API key").env(
85
+ 'LIVEKIT_API_KEY',
86
+ ),
83
87
  )
84
88
  .addOption(
85
- new Option('--api-secret <string>', "LiveKit server or Cloud project's API secret")
86
- .makeOptionMandatory(true)
87
- .env('LIVEKIT_API_SECRET'),
88
- );
89
+ new Option('--api-secret <string>', "LiveKit server or Cloud project's API secret").env(
90
+ 'LIVEKIT_API_SECRET',
91
+ ),
92
+ )
93
+ .action(() => {
94
+ if (
95
+ // do not run CLI if origin file is agents/ipc/job_main.js
96
+ process.argv[1] !== new URL('ipc/job_main.js', import.meta.url).pathname ||
97
+ process.argv.length < 3
98
+ ) {
99
+ program.help();
100
+ }
101
+ });
89
102
 
90
103
  program
91
104
  .command('start')
@@ -106,6 +119,12 @@ export const runApp = (opts: WorkerOptions) => {
106
119
  program
107
120
  .command('dev')
108
121
  .description('Start the worker in development mode')
122
+ .addOption(
123
+ new Option('--log-level <level>', 'Set the logging level')
124
+ .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal'])
125
+ .default('debug')
126
+ .env('LOG_LEVEL'),
127
+ )
109
128
  .action(() => {
110
129
  const options = program.optsWithGlobals();
111
130
  opts.wsURL = options.url || opts.wsURL;
@@ -123,7 +142,13 @@ export const runApp = (opts: WorkerOptions) => {
123
142
  .command('connect')
124
143
  .description('Connect to a specific room')
125
144
  .requiredOption('--room <string>', 'Room name to connect to')
126
- .option('--participant-identity <string>', 'Participant identitiy to connect as')
145
+ .option('--participant-identity <string>', 'Identity of user to listen to')
146
+ .addOption(
147
+ new Option('--log-level <level>', 'Set the logging level')
148
+ .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal'])
149
+ .default('debug')
150
+ .env('LOG_LEVEL'),
151
+ )
127
152
  .action((...[, command]) => {
128
153
  const options = command.optsWithGlobals();
129
154
  opts.wsURL = options.url || opts.wsURL;
package/src/generator.ts CHANGED
@@ -9,6 +9,20 @@ export interface Agent {
9
9
  prewarm?: (proc: JobProcess) => unknown;
10
10
  }
11
11
 
12
/**
 * Helper to check if an object is an agent before running it.
 *
 * A value qualifies when it is a non-null object whose `entry` is a function and whose
 * `prewarm` is either a function or not set. Because `prewarm` is optional on
 * {@link Agent}, a key that is present but holds `undefined` is treated the same as a
 * missing key instead of being rejected.
 *
 * @param obj - value to test, e.g. the default export of an agent module
 * @returns `true` when `obj` satisfies the {@link Agent} shape
 *
 * @internal
 */
export function isAgent(obj: unknown): obj is Agent {
  if (typeof obj !== 'object' || obj === null) {
    return false;
  }

  const { entry, prewarm } = obj as { entry?: unknown; prewarm?: unknown };
  return typeof entry === 'function' && (prewarm === undefined || typeof prewarm === 'function');
}
25
+
12
26
  /**
13
27
  * Helper to define an agent according to the required interface.
14
28
  * @example A basic agent with entry and prewarm functions
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { type IncomingMessage, type Server, type ServerResponse, createServer } from 'http';
5
+ import { log } from './log.js';
5
6
 
6
7
  const healthCheck = async (res: ServerResponse) => {
7
8
  res.writeHead(200);
@@ -12,6 +13,7 @@ export class HTTPServer {
12
13
  host: string;
13
14
  port: number;
14
15
  app: Server;
16
+ #logger = log();
15
17
 
16
18
  constructor(host: string, port: number) {
17
19
  this.host = host;
@@ -31,6 +33,10 @@ export class HTTPServer {
31
33
  return new Promise((resolve, reject) => {
32
34
  this.app.listen(this.port, this.host, (err?: Error) => {
33
35
  if (err) reject(err);
36
+ const address = this.app.address();
37
+ if (typeof address! !== 'string') {
38
+ this.#logger.info(`Server is listening on port ${address!.port}`);
39
+ }
34
40
  resolve();
35
41
  });
36
42
  });
package/src/index.ts CHANGED
@@ -11,6 +11,7 @@
11
11
  */
12
12
  import * as cli from './cli.js';
13
13
  import * as llm from './llm/index.js';
14
+ import * as multimodal from './multimodal/index.js';
14
15
  import * as stt from './stt/index.js';
15
16
  import * as tts from './tts/index.js';
16
17
 
@@ -24,5 +25,6 @@ export * from './log.js';
24
25
  export * from './generator.js';
25
26
  export * from './tokenize.js';
26
27
  export * from './audio.js';
28
+ export * from './transcription.js';
27
29
 
28
- export { cli, stt, tts, llm };
30
+ export { cli, stt, tts, llm, multimodal };
@@ -7,7 +7,7 @@ import { fork } from 'child_process';
7
7
  import { EventEmitter, once } from 'events';
8
8
  import type { Logger } from 'pino';
9
9
  import { fileURLToPath } from 'url';
10
- import type { Agent } from '../generator.js';
10
+ import { type Agent, isAgent } from '../generator.js';
11
11
  import type { RunningJobInfo } from '../job.js';
12
12
  import { JobContext } from '../job.js';
13
13
  import { JobProcess } from '../job.js';
@@ -93,7 +93,14 @@ if (process.send) {
93
93
  // [0] `node'
94
94
  // [1] import.meta.filename
95
95
  // [2] import.meta.filename of function containing entry file
96
- const agent: Agent = await import(process.argv[2]).then((agent) => agent.default);
96
+ const moduleFile = process.argv[2];
97
+ const agent: Agent = await import(moduleFile).then((module) => {
98
+ const agent = module.default;
99
+ if (agent === undefined || !isAgent(agent)) {
100
+ throw new Error(`Unable to load agent: Missing or invalid default export in ${moduleFile}`);
101
+ }
102
+ return agent;
103
+ });
97
104
  if (!agent.prewarm) {
98
105
  agent.prewarm = defaultInitializeProcessFunc;
99
106
  }
@@ -102,6 +102,7 @@ export class ProcPool {
102
102
  }
103
103
  this.closed = true;
104
104
  this.controller.abort();
105
+ this.warmedProcQueue.items.forEach((e) => e.close());
105
106
  this.executors.forEach((e) => e.close());
106
107
  await Promise.allSettled(this.tasks);
107
108
  }
package/src/job.ts CHANGED
@@ -9,7 +9,7 @@ import type {
9
9
  Room,
10
10
  RtcConfiguration,
11
11
  } from '@livekit/rtc-node';
12
- import { RoomEvent, TrackKind } from '@livekit/rtc-node';
12
+ import { ParticipantKind, RoomEvent, TrackKind } from '@livekit/rtc-node';
13
13
  import type { Logger } from 'pino';
14
14
  import { log } from './log.js';
15
15
 
@@ -100,6 +100,42 @@ export class JobContext {
100
100
  this.shutdownCallbacks.push(callback);
101
101
  }
102
102
 
103
/**
 * Returns the first remote participant that is not an agent and, when `identity` is
 * given, has exactly that identity.
 *
 * Participants already present in the room are checked first. If none matches, the
 * returned promise settles on the next matching `ParticipantConnected` event, or rejects
 * if the room emits `Disconnected` before that happens.
 *
 * @param identity - identity to wait for; any non-agent participant matches when omitted
 * @returns the matching remote participant
 * @throws Error if the room is not connected at the time of the call
 */
async waitForParticipant(identity?: string): Promise<RemoteParticipant> {
  if (!this.#room.isConnected) {
    throw new Error('room is not connected');
  }

  // single source of truth for "is this the participant we are waiting for"
  const isWanted = (candidate: RemoteParticipant): boolean =>
    (!identity || candidate.identity === identity) &&
    candidate.info.kind !== ParticipantKind.AGENT;

  for (const existing of this.#room.remoteParticipants.values()) {
    if (isWanted(existing)) {
      return existing;
    }
  }

  return new Promise((resolve, reject) => {
    const onParticipantConnected = (participant: RemoteParticipant) => {
      if (isWanted(participant)) {
        clearHandlers();
        resolve(participant);
      }
    };
    const onDisconnected = () => {
      clearHandlers();
      reject(new Error('Room disconnected while waiting for participant'));
    };

    // both listeners are always removed together so neither outlives the promise
    const clearHandlers = () => {
      this.#room.off(RoomEvent.ParticipantConnected, onParticipantConnected);
      this.#room.off(RoomEvent.Disconnected, onDisconnected);
    };

    this.#room.on(RoomEvent.ParticipantConnected, onParticipantConnected);
    this.#room.on(RoomEvent.Disconnected, onDisconnected);
  });
}
138
+
103
139
  /**
104
140
  * Connects the agent to the room.
105
141
  *
@@ -0,0 +1,254 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { type AudioSource } from '@livekit/rtc-node';
6
+ import { EventEmitter } from 'events';
7
+ import { AudioByteStream } from '../audio.js';
8
+ import type { TranscriptionForwarder } from '../transcription.js';
9
+ import { type AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
10
+
11
+ export const proto = {};
12
+
13
/**
 * Handle describing one playout: which item/content it belongs to, how much audio and
 * text has been emitted so far, and whether it was interrupted or has finished.
 */
export class PlayoutHandle extends EventEmitter {
  #audioSource: AudioSource;
  #sampleRate: number;
  #itemId: string;
  #contentIndex: number;
  /** @internal */
  transcriptionFwd: TranscriptionForwarder;
  /** @internal */
  doneFut: Future;
  /** @internal */
  intFut: Future;
  #interrupted: boolean;
  /** @internal */
  pushedDuration: number;
  /** @internal */
  totalPlayedTime: number | undefined; // Set when playout is done

  /**
   * @param audioSource - source whose `queuedDuration` is read to estimate played audio
   * @param sampleRate - factor used to convert played time into a sample count
   * @param itemId - identifier exposed through {@link PlayoutHandle.itemId}
   * @param contentIndex - index exposed through {@link PlayoutHandle.contentIndex}
   * @param transcriptionFwd - forwarder whose character index backs {@link PlayoutHandle.textChars}
   */
  constructor(
    audioSource: AudioSource,
    sampleRate: number,
    itemId: string,
    contentIndex: number,
    transcriptionFwd: TranscriptionForwarder,
  ) {
    super();
    this.#audioSource = audioSource;
    this.#sampleRate = sampleRate;
    this.#itemId = itemId;
    this.#contentIndex = contentIndex;
    this.transcriptionFwd = transcriptionFwd;
    this.doneFut = new Future();
    this.intFut = new Future();
    this.#interrupted = false;
    this.pushedDuration = 0;
    this.totalPlayedTime = undefined;
  }

  /** Identifier passed to the constructor. */
  get itemId(): string {
    return this.#itemId;
  }

  /**
   * Number of samples played so far: played time multiplied by the sample rate, rounded
   * down. Once `totalPlayedTime` is set it is used as the played time; before that the
   * played time is estimated as pushed duration minus what is still queued in the source.
   */
  get audioSamples(): number {
    if (this.totalPlayedTime !== undefined) {
      return Math.floor(this.totalPlayedTime * this.#sampleRate);
    }

    // The subtraction must happen before scaling; without the parentheses only the
    // queued duration was multiplied by the sample rate.
    // NOTE(review): assumes queuedDuration uses the same time unit as pushedDuration — confirm.
    return Math.floor(
      (this.pushedDuration - this.#audioSource.queuedDuration) * this.#sampleRate,
    );
  }

  /** Current character index reported by the transcription forwarder. */
  get textChars(): number {
    return this.transcriptionFwd.currentCharacterIndex;
  }

  /** Content index passed to the constructor. */
  get contentIndex(): number {
    return this.#contentIndex;
  }

  /** Whether {@link PlayoutHandle.interrupt} took effect on this handle. */
  get interrupted(): boolean {
    return this.#interrupted;
  }

  /** True once the done future has completed or the handle has been interrupted. */
  get done(): boolean {
    return this.doneFut.done || this.#interrupted;
  }

  /** Resolves the interrupt future and marks the handle interrupted; no-op once done. */
  interrupt() {
    if (this.doneFut.done) return;
    this.intFut.resolve();
    this.#interrupted = true;
  }
}
85
+
86
/**
 * Plays text and audio streams out through an audio source, one playout at a time.
 *
 * Every call to {@link AgentPlayout.play} creates a new playout task that first calls
 * `gracefullyCancel` on the previously created task before it starts forwarding.
 */
export class AgentPlayout {
  #audioSource: AudioSource;
  #playoutTask: CancellablePromise<void> | null;
  #sampleRate: number;
  #numChannels: number;
  #inFrameSize: number;
  #outFrameSize: number;

  /**
   * @param audioSource - source that receives the captured frames
   * @param sampleRate - sample rate handed to the byte stream and to each handle
   * @param numChannels - channel count handed to the byte stream
   * @param inFrameSize - stored only; not read anywhere in this class
   * @param outFrameSize - samples per channel of the frames produced by the byte stream
   */
  constructor(
    audioSource: AudioSource,
    sampleRate: number,
    numChannels: number,
    inFrameSize: number,
    outFrameSize: number,
  ) {
    this.#audioSource = audioSource;
    this.#playoutTask = null;
    this.#sampleRate = sampleRate;
    this.#numChannels = numChannels;
    this.#inFrameSize = inFrameSize;
    this.#outFrameSize = outFrameSize;
  }

  /**
   * Starts a new playout and returns the handle that tracks it.
   *
   * @param itemId - identifier stored on the handle
   * @param contentIndex - content index stored on the handle
   * @param transcriptionFwd - forwarder that receives the text and audio of this playout
   * @param textStream - text chunks pushed to the forwarder
   * @param audioStream - audio frames pushed to the forwarder and captured by the source
   */
  play(
    itemId: string,
    contentIndex: number,
    transcriptionFwd: TranscriptionForwarder,
    textStream: AsyncIterableQueue<string>,
    audioStream: AsyncIterableQueue<AudioFrame>,
  ): PlayoutHandle {
    const handle = new PlayoutHandle(
      this.#audioSource,
      this.#sampleRate,
      itemId,
      contentIndex,
      transcriptionFwd,
    );
    const previousTask = this.#playoutTask;
    this.#playoutTask = this.#makePlayoutTask(previousTask, handle, textStream, audioStream);
    return handle;
  }

  /**
   * Builds the task that drives one playout: cancel the previous task, forward text and
   * audio concurrently, then finalize the handle whether the audio finished, failed, or
   * the handle was interrupted.
   */
  #makePlayoutTask(
    oldTask: CancellablePromise<void> | null,
    handle: PlayoutHandle,
    textStream: AsyncIterableQueue<string>,
    audioStream: AsyncIterableQueue<AudioFrame>,
  ): CancellablePromise<void> {
    return new CancellablePromise<void>((resolve, reject, onCancel) => {
      let playoutCancelled = false;
      onCancel(() => {
        playoutCancelled = true;
      });

      const runPlayout = async (): Promise<void> => {
        try {
          if (oldTask) {
            await gracefullyCancel(oldTask);
          }

          // flips to false once the first audio frame has been seen
          let awaitingFirstFrame = true;

          // Text forwarding is started first, then audio capture.
          const readTextTask = new CancellablePromise<void>(
            (textDone, textFailed, onTextCancel) => {
              let textCancelled = false;
              onTextCancel(() => {
                textCancelled = true;
              });

              const forwardText = async (): Promise<void> => {
                try {
                  for await (const chunk of textStream) {
                    if (textCancelled || playoutCancelled) {
                      break;
                    }
                    handle.transcriptionFwd.pushText(chunk);
                  }
                  textDone();
                } catch (err) {
                  textFailed(err);
                }
              };
              void forwardText();
            },
          );

          const captureTask = new CancellablePromise<void>(
            (captureDone, captureFailed, onCaptureCancel) => {
              let captureCancelled = false;
              onCaptureCancel(() => {
                captureCancelled = true;
              });

              const forwardAudio = async (): Promise<void> => {
                try {
                  const bstream = new AudioByteStream(
                    this.#sampleRate,
                    this.#numChannels,
                    this.#outFrameSize,
                  );

                  for await (const frame of audioStream) {
                    if (captureCancelled || playoutCancelled) {
                      break;
                    }
                    if (awaitingFirstFrame) {
                      handle.transcriptionFwd.start();
                      awaitingFirstFrame = false;
                    }

                    handle.transcriptionFwd.pushAudio(frame);

                    for (const out of bstream.write(frame.data.buffer)) {
                      handle.pushedDuration += out.samplesPerChannel / out.sampleRate;
                      await this.#audioSource.captureFrame(out);
                    }
                  }

                  const stoppedEarly = captureCancelled || playoutCancelled;
                  if (!stoppedEarly) {
                    // emit whatever the byte stream still holds before finishing
                    for (const out of bstream.flush()) {
                      handle.pushedDuration += out.samplesPerChannel / out.sampleRate;
                      await this.#audioSource.captureFrame(out);
                    }

                    handle.transcriptionFwd.markAudioComplete();

                    await this.#audioSource.waitForPlayout();
                  }

                  captureDone();
                } catch (err) {
                  captureFailed(err);
                }
              };
              void forwardAudio();
            },
          );

          try {
            // whichever comes first: audio fully captured, or the handle interrupted
            await Promise.race([captureTask, handle.intFut.await]);
          } finally {
            if (!captureTask.isCancelled) {
              await gracefullyCancel(captureTask);
            }

            handle.totalPlayedTime = handle.pushedDuration - this.#audioSource.queuedDuration;

            const mustDropQueue = handle.interrupted || captureTask.error;
            if (mustDropQueue) {
              this.#audioSource.clearQueue(); // make sure to remove any queued frames
            }

            if (!readTextTask.isCancelled) {
              await gracefullyCancel(readTextTask);
            }

            const audioStarted = !awaitingFirstFrame;
            if (audioStarted && !handle.interrupted) {
              handle.transcriptionFwd.markTextComplete();
            }

            handle.doneFut.resolve();
            await handle.transcriptionFwd.close(handle.interrupted);
          }

          resolve();
        } catch (err) {
          reject(err);
        }
      };
      void runPlayout();
    });
  }
}
@@ -0,0 +1,5 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ export * from './multimodal_agent.js';
5
+ export * from './agent_playout.js';