@livekit/agents 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/job.cjs.map +1 -1
- package/dist/job.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +27 -3
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +3 -1
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +27 -3
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +19 -7
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.js +19 -7
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/utils.cjs +11 -4
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +11 -4
- package/dist/utils.js.map +1 -1
- package/dist/worker.cjs +5 -2
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +5 -2
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/job.ts +3 -3
- package/src/multimodal/multimodal_agent.ts +45 -6
- package/src/pipeline/pipeline_agent.ts +19 -13
- package/src/utils.ts +12 -4
- package/src/worker.ts +7 -3
package/src/multimodal/multimodal_agent.ts
CHANGED
|
@@ -36,6 +36,7 @@ export abstract class RealtimeSession extends EventEmitter {
|
|
|
36
36
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
37
37
|
abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer
|
|
38
38
|
abstract fncCtx: llm.FunctionContext | undefined;
|
|
39
|
+
abstract recoverFromTextResponse(itemId: string): void;
|
|
39
40
|
}
|
|
40
41
|
|
|
41
42
|
/**
|
|
@@ -63,19 +64,25 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
63
64
|
subscribedTrack: RemoteAudioTrack | null = null;
|
|
64
65
|
readMicroTask: Promise<void> | null = null;
|
|
65
66
|
|
|
67
|
+
#textResponseRetries = 0;
|
|
68
|
+
#maxTextResponseRetries: number;
|
|
69
|
+
|
|
66
70
|
constructor({
|
|
67
71
|
model,
|
|
68
72
|
chatCtx,
|
|
69
73
|
fncCtx,
|
|
74
|
+
maxTextResponseRetries = 5,
|
|
70
75
|
}: {
|
|
71
76
|
model: RealtimeModel;
|
|
72
77
|
chatCtx?: llm.ChatContext;
|
|
73
78
|
fncCtx?: llm.FunctionContext;
|
|
79
|
+
maxTextResponseRetries?: number;
|
|
74
80
|
}) {
|
|
75
81
|
super();
|
|
76
82
|
this.model = model;
|
|
77
83
|
this.#chatCtx = chatCtx;
|
|
78
84
|
this.#fncCtx = fncCtx;
|
|
85
|
+
this.#maxTextResponseRetries = maxTextResponseRetries;
|
|
79
86
|
}
|
|
80
87
|
|
|
81
88
|
#participant: RemoteParticipant | string | null = null;
|
|
@@ -146,7 +153,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
146
153
|
if (this.linkedParticipant) {
|
|
147
154
|
return;
|
|
148
155
|
}
|
|
149
|
-
this.#linkParticipant(participant.identity);
|
|
156
|
+
this.#linkParticipant(participant.identity!);
|
|
150
157
|
});
|
|
151
158
|
room.on(
|
|
152
159
|
RoomEvent.TrackPublished,
|
|
@@ -220,12 +227,12 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
220
227
|
if (typeof participant === 'string') {
|
|
221
228
|
this.#linkParticipant(participant);
|
|
222
229
|
} else {
|
|
223
|
-
this.#linkParticipant(participant.identity);
|
|
230
|
+
this.#linkParticipant(participant.identity!);
|
|
224
231
|
}
|
|
225
232
|
} else {
|
|
226
233
|
// No participant specified, try to find the first participant in the room
|
|
227
234
|
for (const participant of room.remoteParticipants.values()) {
|
|
228
|
-
this.#linkParticipant(participant.identity);
|
|
235
|
+
this.#linkParticipant(participant.identity!);
|
|
229
236
|
break;
|
|
230
237
|
}
|
|
231
238
|
}
|
|
@@ -236,9 +243,11 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
236
243
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
237
244
|
this.#session.on('response_content_added', (message: any) => {
|
|
238
245
|
// openai.realtime.RealtimeContent
|
|
246
|
+
if (message.contentType === 'text') return;
|
|
247
|
+
|
|
239
248
|
const trFwd = new BasicTranscriptionForwarder(
|
|
240
249
|
this.room!,
|
|
241
|
-
this.room!.localParticipant!.identity
|
|
250
|
+
this.room!.localParticipant!.identity!,
|
|
242
251
|
this.#getLocalTrackSid()!,
|
|
243
252
|
message.responseId,
|
|
244
253
|
);
|
|
@@ -253,6 +262,36 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
253
262
|
this.#playingHandle = handle;
|
|
254
263
|
});
|
|
255
264
|
|
|
265
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
266
|
+
this.#session.on('response_content_done', (message: any) => {
|
|
267
|
+
// openai.realtime.RealtimeContent
|
|
268
|
+
if (message.contentType === 'text') {
|
|
269
|
+
if (this.#textResponseRetries >= this.#maxTextResponseRetries) {
|
|
270
|
+
throw new Error(
|
|
271
|
+
'The OpenAI Realtime API returned a text response ' +
|
|
272
|
+
`after ${this.#maxTextResponseRetries} retries. ` +
|
|
273
|
+
'Please try to reduce the number of text system or ' +
|
|
274
|
+
'assistant messages in the chat context.',
|
|
275
|
+
);
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
this.#textResponseRetries++;
|
|
279
|
+
this.#logger
|
|
280
|
+
.child({
|
|
281
|
+
itemId: message.itemId,
|
|
282
|
+
text: message.text,
|
|
283
|
+
retries: this.#textResponseRetries,
|
|
284
|
+
})
|
|
285
|
+
.warn(
|
|
286
|
+
'The OpenAI Realtime API returned a text response instead of audio. ' +
|
|
287
|
+
'Attempting to recover to audio mode...',
|
|
288
|
+
);
|
|
289
|
+
this.#session!.recoverFromTextResponse(message.itemId);
|
|
290
|
+
} else {
|
|
291
|
+
this.#textResponseRetries = 0;
|
|
292
|
+
}
|
|
293
|
+
});
|
|
294
|
+
|
|
256
295
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
257
296
|
this.#session.on('input_speech_committed', (ev: any) => {
|
|
258
297
|
// openai.realtime.InputSpeechCommittedEvent
|
|
@@ -419,7 +458,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
419
458
|
|
|
420
459
|
#getLocalTrackSid(): string | null {
|
|
421
460
|
if (!this.#localTrackSid && this.room && this.room.localParticipant) {
|
|
422
|
-
this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant
|
|
461
|
+
this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant!.identity!);
|
|
423
462
|
}
|
|
424
463
|
return this.#localTrackSid;
|
|
425
464
|
}
|
|
@@ -470,7 +509,7 @@ export class MultimodalAgent extends EventEmitter {
|
|
|
470
509
|
|
|
471
510
|
#setState(state: AgentState) {
|
|
472
511
|
if (this.room?.isConnected && this.room.localParticipant) {
|
|
473
|
-
const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
|
|
512
|
+
const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE];
|
|
474
513
|
if (currentState !== state) {
|
|
475
514
|
this.room.localParticipant.setAttributes({
|
|
476
515
|
[AGENT_STATE_ATTRIBUTE]: state,
|
package/src/pipeline/pipeline_agent.ts
CHANGED
|
@@ -355,7 +355,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
355
355
|
if (this.#participant) {
|
|
356
356
|
return;
|
|
357
357
|
}
|
|
358
|
-
this.#linkParticipant.call(this, participant.identity);
|
|
358
|
+
this.#linkParticipant.call(this, participant.identity!);
|
|
359
359
|
});
|
|
360
360
|
|
|
361
361
|
this.#room = room;
|
|
@@ -365,7 +365,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
365
365
|
if (typeof participant === 'string') {
|
|
366
366
|
this.#linkParticipant(participant);
|
|
367
367
|
} else {
|
|
368
|
-
this.#linkParticipant(participant.identity);
|
|
368
|
+
this.#linkParticipant(participant.identity!);
|
|
369
369
|
}
|
|
370
370
|
}
|
|
371
371
|
|
|
@@ -488,7 +488,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
488
488
|
}
|
|
489
489
|
});
|
|
490
490
|
this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
|
|
491
|
-
this.emit(VPAEvent.
|
|
491
|
+
this.emit(VPAEvent.USER_STOPPED_SPEAKING);
|
|
492
492
|
this.#deferredValidation.onHumanEndOfSpeech(event);
|
|
493
493
|
});
|
|
494
494
|
this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
|
|
@@ -971,6 +971,7 @@ class DeferredReplyValidation {
|
|
|
971
971
|
#speaking = false;
|
|
972
972
|
#endOfSpeechDelay: number;
|
|
973
973
|
#finalTranscriptDelay: number;
|
|
974
|
+
#abort?: AbortController;
|
|
974
975
|
|
|
975
976
|
constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {
|
|
976
977
|
this.#validateFunc = validateFunc;
|
|
@@ -997,10 +998,9 @@ class DeferredReplyValidation {
|
|
|
997
998
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
998
999
|
onHumanStartOfSpeech(_: VADEvent) {
|
|
999
1000
|
this.#speaking = true;
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
// }
|
|
1001
|
+
if (this.validating) {
|
|
1002
|
+
this.#abort?.abort();
|
|
1003
|
+
}
|
|
1004
1004
|
}
|
|
1005
1005
|
|
|
1006
1006
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
@@ -1011,7 +1011,7 @@ class DeferredReplyValidation {
|
|
|
1011
1011
|
if (this.#lastFinalTranscript) {
|
|
1012
1012
|
const delay = this.#endWithPunctuation()
|
|
1013
1013
|
? this.#endOfSpeechDelay * this.PUNCTUATION_REDUCE_FACTOR
|
|
1014
|
-
:
|
|
1014
|
+
: 1_000;
|
|
1015
1015
|
this.#run(delay);
|
|
1016
1016
|
}
|
|
1017
1017
|
}
|
|
@@ -1031,13 +1031,19 @@ class DeferredReplyValidation {
|
|
|
1031
1031
|
}
|
|
1032
1032
|
|
|
1033
1033
|
#run(delay: number) {
|
|
1034
|
-
const runTask = async (delay: number) => {
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1034
|
+
const runTask = async (delay: number, signal: AbortSignal) => {
|
|
1035
|
+
const timeout = setTimeout(() => {
|
|
1036
|
+
this.#resetStates();
|
|
1037
|
+
this.#validateFunc();
|
|
1038
|
+
}, delay);
|
|
1039
|
+
signal.addEventListener('abort', () => {
|
|
1040
|
+
clearTimeout(timeout);
|
|
1041
|
+
});
|
|
1038
1042
|
};
|
|
1039
1043
|
|
|
1044
|
+
this.#abort?.abort();
|
|
1045
|
+
this.#abort = new AbortController();
|
|
1040
1046
|
this.#validatingFuture = new Future();
|
|
1041
|
-
this.#validatingPromise = runTask(delay);
|
|
1047
|
+
this.#validatingPromise = runTask(delay, this.#abort.signal);
|
|
1042
1048
|
}
|
|
1043
1049
|
}
|
package/src/utils.ts
CHANGED
|
@@ -91,10 +91,18 @@ export class Queue<T> {
|
|
|
91
91
|
}
|
|
92
92
|
|
|
93
93
|
async get(): Promise<T> {
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
94
|
+
const _get = async (): Promise<T> => {
|
|
95
|
+
if (this.items.length === 0) {
|
|
96
|
+
await once(this.#events, 'put');
|
|
97
|
+
}
|
|
98
|
+
let item = this.items.shift();
|
|
99
|
+
if (!item) {
|
|
100
|
+
item = await _get();
|
|
101
|
+
}
|
|
102
|
+
return item;
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
const item = _get();
|
|
98
106
|
this.#events.emit('get');
|
|
99
107
|
return item;
|
|
100
108
|
}
|
package/src/worker.ts
CHANGED
|
@@ -322,15 +322,19 @@ export class Worker {
|
|
|
322
322
|
try {
|
|
323
323
|
await new Promise((resolve, reject) => {
|
|
324
324
|
this.#session!.on('open', resolve);
|
|
325
|
-
this.#session!.on('error', (error) => reject(error));
|
|
326
|
-
this.#session!.on('close', (code) => reject(
|
|
325
|
+
this.#session!.on('error', (error) => reject(error.message));
|
|
326
|
+
this.#session!.on('close', (code) => reject(`WebSocket returned ${code}`));
|
|
327
327
|
});
|
|
328
328
|
|
|
329
329
|
retries = 0;
|
|
330
330
|
this.#logger.debug('connected to LiveKit server');
|
|
331
331
|
this.#runWS(this.#session);
|
|
332
332
|
return;
|
|
333
|
-
} catch (e) {
|
|
333
|
+
} catch (e: unknown) {
|
|
334
|
+
if (e instanceof Error || e instanceof ErrorEvent) {
|
|
335
|
+
e = e.message;
|
|
336
|
+
}
|
|
337
|
+
|
|
334
338
|
if (this.#closed) return;
|
|
335
339
|
if (retries >= this.#opts.maxRetry) {
|
|
336
340
|
throw new WorkerError(
|