@livekit/agents 0.6.0 → 0.6.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ export abstract class RealtimeSession extends EventEmitter {
36
36
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
37
37
  abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer
38
38
  abstract fncCtx: llm.FunctionContext | undefined;
39
+ abstract recoverFromTextResponse(itemId: string): void;
39
40
  }
40
41
 
41
42
  /**
@@ -63,19 +64,25 @@ export class MultimodalAgent extends EventEmitter {
63
64
  subscribedTrack: RemoteAudioTrack | null = null;
64
65
  readMicroTask: Promise<void> | null = null;
65
66
 
67
+ #textResponseRetries = 0;
68
+ #maxTextResponseRetries: number;
69
+
66
70
  constructor({
67
71
  model,
68
72
  chatCtx,
69
73
  fncCtx,
74
+ maxTextResponseRetries = 5,
70
75
  }: {
71
76
  model: RealtimeModel;
72
77
  chatCtx?: llm.ChatContext;
73
78
  fncCtx?: llm.FunctionContext;
79
+ maxTextResponseRetries?: number;
74
80
  }) {
75
81
  super();
76
82
  this.model = model;
77
83
  this.#chatCtx = chatCtx;
78
84
  this.#fncCtx = fncCtx;
85
+ this.#maxTextResponseRetries = maxTextResponseRetries;
79
86
  }
80
87
 
81
88
  #participant: RemoteParticipant | string | null = null;
@@ -146,7 +153,7 @@ export class MultimodalAgent extends EventEmitter {
146
153
  if (this.linkedParticipant) {
147
154
  return;
148
155
  }
149
- this.#linkParticipant(participant.identity);
156
+ this.#linkParticipant(participant.identity!);
150
157
  });
151
158
  room.on(
152
159
  RoomEvent.TrackPublished,
@@ -220,12 +227,12 @@ export class MultimodalAgent extends EventEmitter {
220
227
  if (typeof participant === 'string') {
221
228
  this.#linkParticipant(participant);
222
229
  } else {
223
- this.#linkParticipant(participant.identity);
230
+ this.#linkParticipant(participant.identity!);
224
231
  }
225
232
  } else {
226
233
  // No participant specified, try to find the first participant in the room
227
234
  for (const participant of room.remoteParticipants.values()) {
228
- this.#linkParticipant(participant.identity);
235
+ this.#linkParticipant(participant.identity!);
229
236
  break;
230
237
  }
231
238
  }
@@ -236,9 +243,11 @@ export class MultimodalAgent extends EventEmitter {
236
243
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
237
244
  this.#session.on('response_content_added', (message: any) => {
238
245
  // openai.realtime.RealtimeContent
246
+ if (message.contentType === 'text') return;
247
+
239
248
  const trFwd = new BasicTranscriptionForwarder(
240
249
  this.room!,
241
- this.room!.localParticipant!.identity,
250
+ this.room!.localParticipant!.identity!,
242
251
  this.#getLocalTrackSid()!,
243
252
  message.responseId,
244
253
  );
@@ -253,6 +262,36 @@ export class MultimodalAgent extends EventEmitter {
253
262
  this.#playingHandle = handle;
254
263
  });
255
264
 
265
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
266
+ this.#session.on('response_content_done', (message: any) => {
267
+ // openai.realtime.RealtimeContent
268
+ if (message.contentType === 'text') {
269
+ if (this.#textResponseRetries >= this.#maxTextResponseRetries) {
270
+ throw new Error(
271
+ 'The OpenAI Realtime API returned a text response ' +
272
+ `after ${this.#maxTextResponseRetries} retries. ` +
273
+ 'Please try to reduce the number of text system or ' +
274
+ 'assistant messages in the chat context.',
275
+ );
276
+ }
277
+
278
+ this.#textResponseRetries++;
279
+ this.#logger
280
+ .child({
281
+ itemId: message.itemId,
282
+ text: message.text,
283
+ retries: this.#textResponseRetries,
284
+ })
285
+ .warn(
286
+ 'The OpenAI Realtime API returned a text response instead of audio. ' +
287
+ 'Attempting to recover to audio mode...',
288
+ );
289
+ this.#session!.recoverFromTextResponse(message.itemId);
290
+ } else {
291
+ this.#textResponseRetries = 0;
292
+ }
293
+ });
294
+
256
295
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
257
296
  this.#session.on('input_speech_committed', (ev: any) => {
258
297
  // openai.realtime.InputSpeechCommittedEvent
@@ -419,7 +458,7 @@ export class MultimodalAgent extends EventEmitter {
419
458
 
420
459
  #getLocalTrackSid(): string | null {
421
460
  if (!this.#localTrackSid && this.room && this.room.localParticipant) {
422
- this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
461
+ this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);
423
462
  }
424
463
  return this.#localTrackSid;
425
464
  }
@@ -470,7 +509,7 @@ export class MultimodalAgent extends EventEmitter {
470
509
 
471
510
  #setState(state: AgentState) {
472
511
  if (this.room?.isConnected && this.room.localParticipant) {
473
- const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
512
+ const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];
474
513
  if (currentState !== state) {
475
514
  this.room.localParticipant.setAttributes({
476
515
  [AGENT_STATE_ATTRIBUTE]: state,
@@ -355,7 +355,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
355
355
  if (this.#participant) {
356
356
  return;
357
357
  }
358
- this.#linkParticipant.call(this, participant.identity);
358
+ this.#linkParticipant.call(this, participant.identity!);
359
359
  });
360
360
 
361
361
  this.#room = room;
@@ -365,7 +365,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
365
365
  if (typeof participant === 'string') {
366
366
  this.#linkParticipant(participant);
367
367
  } else {
368
- this.#linkParticipant(participant.identity);
368
+ this.#linkParticipant(participant.identity!);
369
369
  }
370
370
  }
371
371
 
package/src/utils.ts CHANGED
@@ -91,10 +91,18 @@ export class Queue<T> {
91
91
  }
92
92
 
93
93
  async get(): Promise<T> {
94
- if (this.items.length === 0) {
95
- await once(this.#events, 'put');
96
- }
97
- const item = this.items.shift()!;
94
+ const _get = async (): Promise<T> => {
95
+ if (this.items.length === 0) {
96
+ await once(this.#events, 'put');
97
+ }
98
+ let item = this.items.shift();
99
+ if (!item) {
100
+ item = await _get();
101
+ }
102
+ return item;
103
+ };
104
+
105
+ const item = _get();
98
106
  this.#events.emit('get');
99
107
  return item;
100
108
  }
package/src/worker.ts CHANGED
@@ -322,15 +322,19 @@ export class Worker {
322
322
  try {
323
323
  await new Promise((resolve, reject) => {
324
324
  this.#session!.on('open', resolve);
325
- this.#session!.on('error', (error) => reject(error));
326
- this.#session!.on('close', (code) => reject(new Error(`WebSocket returned ${code}`)));
325
+ this.#session!.on('error', (error) => reject(error.message));
326
+ this.#session!.on('close', (code) => reject(`WebSocket returned ${code}`));
327
327
  });
328
328
 
329
329
  retries = 0;
330
330
  this.#logger.debug('connected to LiveKit server');
331
331
  this.#runWS(this.#session);
332
332
  return;
333
- } catch (e) {
333
+ } catch (e: unknown) {
334
+ if (e instanceof Error || e instanceof ErrorEvent) {
335
+ e = e.message;
336
+ }
337
+
334
338
  if (this.#closed) return;
335
339
  if (retries >= this.#opts.maxRetry) {
336
340
  throw new WorkerError(