@livekit/agents 1.0.15 → 1.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +12 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.cts +3 -3
- package/dist/cli.d.ts +3 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +13 -13
- package/dist/cli.js.map +1 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +1 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +2 -1
- package/dist/inference/tts.d.ts +2 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +1 -5
- package/dist/inference/tts.js.map +1 -1
- package/dist/llm/chat_context.cjs +78 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +16 -0
- package/dist/llm/chat_context.d.ts +16 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +78 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/chat_context.test.cjs +531 -0
- package/dist/llm/chat_context.test.cjs.map +1 -1
- package/dist/llm/chat_context.test.js +531 -0
- package/dist/llm/chat_context.test.js.map +1 -1
- package/dist/llm/tool_context.cjs +40 -0
- package/dist/llm/tool_context.cjs.map +1 -1
- package/dist/llm/tool_context.d.cts +2 -0
- package/dist/llm/tool_context.d.ts +2 -0
- package/dist/llm/tool_context.d.ts.map +1 -1
- package/dist/llm/tool_context.js +38 -0
- package/dist/llm/tool_context.js.map +1 -1
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.cts +7 -0
- package/dist/metrics/base.d.ts +7 -0
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/stt/stt.cjs +1 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -1
- package/dist/stt/stt.d.ts +7 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +1 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/tts.cjs +2 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +3 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/voice/agent_activity.cjs +83 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +6 -2
- package/dist/voice/agent_activity.d.ts +6 -2
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +83 -8
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +3 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -1
- package/dist/voice/agent_session.d.ts +2 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +3 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +138 -16
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +11 -0
- package/dist/voice/audio_recognition.d.ts +11 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +138 -16
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +0 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs +17 -11
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.cts +16 -9
- package/dist/worker.d.ts +16 -9
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +16 -12
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +17 -17
- package/src/inference/stt.ts +2 -1
- package/src/inference/tts.ts +2 -5
- package/src/llm/chat_context.test.ts +607 -0
- package/src/llm/chat_context.ts +106 -0
- package/src/llm/tool_context.ts +44 -0
- package/src/metrics/base.ts +7 -0
- package/src/stt/stt.ts +8 -1
- package/src/tts/tts.ts +7 -5
- package/src/voice/agent_activity.ts +119 -9
- package/src/voice/agent_session.ts +3 -1
- package/src/voice/audio_recognition.ts +235 -57
- package/src/voice/room_io/_input.ts +1 -1
- package/src/worker.ts +29 -18
|
@@ -17,8 +17,16 @@ import type { STTNode } from './io.js';
|
|
|
17
17
|
|
|
18
18
|
export interface EndOfTurnInfo {
|
|
19
19
|
newTranscript: string;
|
|
20
|
+
transcriptConfidence: number;
|
|
20
21
|
transcriptionDelay: number;
|
|
21
22
|
endOfUtteranceDelay: number;
|
|
23
|
+
startedSpeakingAt: number | undefined;
|
|
24
|
+
stoppedSpeakingAt: number | undefined;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface PreemptiveGenerationInfo {
|
|
28
|
+
newTranscript: string;
|
|
29
|
+
transcriptConfidence: number;
|
|
22
30
|
}
|
|
23
31
|
|
|
24
32
|
export interface RecognitionHooks {
|
|
@@ -28,6 +36,7 @@ export interface RecognitionHooks {
|
|
|
28
36
|
onInterimTranscript: (ev: SpeechEvent) => void;
|
|
29
37
|
onFinalTranscript: (ev: SpeechEvent) => void;
|
|
30
38
|
onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
|
|
39
|
+
onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
|
|
31
40
|
|
|
32
41
|
retrieveChatCtx: () => ChatContext;
|
|
33
42
|
}
|
|
@@ -63,7 +72,10 @@ export class AudioRecognition {
|
|
|
63
72
|
private lastFinalTranscriptTime = 0;
|
|
64
73
|
private audioTranscript = '';
|
|
65
74
|
private audioInterimTranscript = '';
|
|
66
|
-
private
|
|
75
|
+
private audioPreflightTranscript = '';
|
|
76
|
+
private finalTranscriptConfidence: number[] = [];
|
|
77
|
+
private lastSpeakingTime: number | undefined;
|
|
78
|
+
private speechStartTime: number | undefined;
|
|
67
79
|
private userTurnCommitted = false;
|
|
68
80
|
private speaking = false;
|
|
69
81
|
private sampleRate?: number;
|
|
@@ -144,6 +156,7 @@ export class AudioRecognition {
|
|
|
144
156
|
case SpeechEventType.FINAL_TRANSCRIPT:
|
|
145
157
|
this.hooks.onFinalTranscript(ev);
|
|
146
158
|
const transcript = ev.alternatives?.[0]?.text;
|
|
159
|
+
const confidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
147
160
|
this.lastLanguage = ev.alternatives?.[0]?.language;
|
|
148
161
|
|
|
149
162
|
if (!transcript) {
|
|
@@ -162,34 +175,144 @@ export class AudioRecognition {
|
|
|
162
175
|
this.lastFinalTranscriptTime = Date.now();
|
|
163
176
|
this.audioTranscript += ` ${transcript}`;
|
|
164
177
|
this.audioTranscript = this.audioTranscript.trimStart();
|
|
178
|
+
this.finalTranscriptConfidence.push(confidence);
|
|
179
|
+
const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
|
|
165
180
|
this.audioInterimTranscript = '';
|
|
181
|
+
this.audioPreflightTranscript = '';
|
|
182
|
+
|
|
183
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
184
|
+
// vad disabled, use stt timestamp
|
|
185
|
+
// TODO: this would screw up transcription latency metrics
|
|
186
|
+
// but we'll live with it for now.
|
|
187
|
+
// the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
|
|
188
|
+
// and using that timestamp for lastSpeakingTime
|
|
189
|
+
this.lastSpeakingTime = Date.now();
|
|
190
|
+
}
|
|
166
191
|
|
|
167
|
-
if (
|
|
168
|
-
if (
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
192
|
+
if (this.vadBaseTurnDetection || this.userTurnCommitted) {
|
|
193
|
+
if (transcriptChanged) {
|
|
194
|
+
this.logger.debug(
|
|
195
|
+
{ transcript: this.audioTranscript },
|
|
196
|
+
'triggering preemptive generation (FINAL_TRANSCRIPT)',
|
|
197
|
+
);
|
|
198
|
+
this.hooks.onPreemptiveGeneration({
|
|
199
|
+
newTranscript: this.audioTranscript,
|
|
200
|
+
transcriptConfidence:
|
|
201
|
+
this.finalTranscriptConfidence.length > 0
|
|
202
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
203
|
+
this.finalTranscriptConfidence.length
|
|
204
|
+
: 0,
|
|
205
|
+
});
|
|
176
206
|
}
|
|
177
207
|
|
|
178
|
-
if (this.
|
|
208
|
+
if (!this.speaking) {
|
|
179
209
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
180
210
|
this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
|
|
181
211
|
this.runEOUDetection(chatCtx);
|
|
182
212
|
}
|
|
183
213
|
}
|
|
184
214
|
break;
|
|
215
|
+
case SpeechEventType.PREFLIGHT_TRANSCRIPT:
|
|
216
|
+
this.hooks.onInterimTranscript(ev);
|
|
217
|
+
const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
218
|
+
const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
|
|
219
|
+
const preflightLanguage = ev.alternatives?.[0]?.language;
|
|
220
|
+
|
|
221
|
+
const MIN_LANGUAGE_DETECTION_LENGTH = 5;
|
|
222
|
+
if (
|
|
223
|
+
!this.lastLanguage ||
|
|
224
|
+
(preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
|
|
225
|
+
) {
|
|
226
|
+
this.lastLanguage = preflightLanguage;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
if (!preflightTranscript) {
|
|
230
|
+
return;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
this.logger.debug(
|
|
234
|
+
{
|
|
235
|
+
user_transcript: preflightTranscript,
|
|
236
|
+
language: this.lastLanguage,
|
|
237
|
+
},
|
|
238
|
+
'received user preflight transcript',
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
// still need to increment it as it's used for turn detection,
|
|
242
|
+
this.lastFinalTranscriptTime = Date.now();
|
|
243
|
+
// preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
|
|
244
|
+
this.audioPreflightTranscript =
|
|
245
|
+
`${this.audioTranscript} ${preflightTranscript}`.trimStart();
|
|
246
|
+
this.audioInterimTranscript = preflightTranscript;
|
|
247
|
+
|
|
248
|
+
if (!this.vad || this.lastSpeakingTime === undefined) {
|
|
249
|
+
// vad disabled, use stt timestamp
|
|
250
|
+
this.lastSpeakingTime = Date.now();
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
|
|
254
|
+
const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
|
|
255
|
+
this.logger.debug(
|
|
256
|
+
{
|
|
257
|
+
transcript:
|
|
258
|
+
this.audioPreflightTranscript.length > 100
|
|
259
|
+
? this.audioPreflightTranscript.slice(0, 100) + '...'
|
|
260
|
+
: this.audioPreflightTranscript,
|
|
261
|
+
},
|
|
262
|
+
'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
|
|
263
|
+
);
|
|
264
|
+
this.hooks.onPreemptiveGeneration({
|
|
265
|
+
newTranscript: this.audioPreflightTranscript,
|
|
266
|
+
transcriptConfidence:
|
|
267
|
+
confidenceVals.length > 0
|
|
268
|
+
? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
|
|
269
|
+
: 0,
|
|
270
|
+
});
|
|
271
|
+
}
|
|
272
|
+
break;
|
|
185
273
|
case SpeechEventType.INTERIM_TRANSCRIPT:
|
|
186
274
|
this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
|
|
187
275
|
this.hooks.onInterimTranscript(ev);
|
|
188
276
|
this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
|
|
189
277
|
break;
|
|
278
|
+
case SpeechEventType.START_OF_SPEECH:
|
|
279
|
+
if (this.turnDetectionMode !== 'stt') break;
|
|
280
|
+
this.hooks.onStartOfSpeech({
|
|
281
|
+
type: VADEventType.START_OF_SPEECH,
|
|
282
|
+
samplesIndex: 0,
|
|
283
|
+
timestamp: Date.now(),
|
|
284
|
+
speechDuration: 0,
|
|
285
|
+
silenceDuration: 0,
|
|
286
|
+
frames: [],
|
|
287
|
+
probability: 0,
|
|
288
|
+
inferenceDuration: 0,
|
|
289
|
+
speaking: true,
|
|
290
|
+
rawAccumulatedSilence: 0,
|
|
291
|
+
rawAccumulatedSpeech: 0,
|
|
292
|
+
});
|
|
293
|
+
this.speaking = true;
|
|
294
|
+
this.lastSpeakingTime = Date.now();
|
|
295
|
+
|
|
296
|
+
this.bounceEOUTask?.cancel();
|
|
297
|
+
break;
|
|
190
298
|
case SpeechEventType.END_OF_SPEECH:
|
|
191
299
|
if (this.turnDetectionMode !== 'stt') break;
|
|
300
|
+
this.hooks.onEndOfSpeech({
|
|
301
|
+
type: VADEventType.END_OF_SPEECH,
|
|
302
|
+
samplesIndex: 0,
|
|
303
|
+
timestamp: Date.now(),
|
|
304
|
+
speechDuration: 0,
|
|
305
|
+
silenceDuration: 0,
|
|
306
|
+
frames: [],
|
|
307
|
+
probability: 0,
|
|
308
|
+
inferenceDuration: 0,
|
|
309
|
+
speaking: false,
|
|
310
|
+
rawAccumulatedSilence: 0,
|
|
311
|
+
rawAccumulatedSpeech: 0,
|
|
312
|
+
});
|
|
313
|
+
this.speaking = false;
|
|
192
314
|
this.userTurnCommitted = true;
|
|
315
|
+
this.lastSpeakingTime = Date.now();
|
|
193
316
|
|
|
194
317
|
if (!this.speaking) {
|
|
195
318
|
const chatCtx = this.hooks.retrieveChatCtx();
|
|
@@ -222,61 +345,106 @@ export class AudioRecognition {
|
|
|
222
345
|
// disable EOU model if manual turn detection enabled
|
|
223
346
|
this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
|
|
224
347
|
|
|
225
|
-
const bounceEOUTask =
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
} else {
|
|
234
|
-
const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
235
|
-
this.logger.debug(
|
|
236
|
-
{ endOfTurnProbability, language: this.lastLanguage },
|
|
237
|
-
'end of turn probability',
|
|
238
|
-
);
|
|
239
|
-
|
|
240
|
-
const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
241
|
-
this.logger.debug(
|
|
242
|
-
{
|
|
243
|
-
unlikelyThreshold,
|
|
244
|
-
endOfTurnProbability,
|
|
245
|
-
language: this.lastLanguage,
|
|
246
|
-
transcript: this.audioTranscript,
|
|
247
|
-
},
|
|
248
|
-
'EOU Detection',
|
|
249
|
-
);
|
|
348
|
+
const bounceEOUTask =
|
|
349
|
+
(
|
|
350
|
+
lastSpeakingTime: number | undefined,
|
|
351
|
+
lastFinalTranscriptTime: number,
|
|
352
|
+
speechStartTime: number | undefined,
|
|
353
|
+
) =>
|
|
354
|
+
async (controller: AbortController) => {
|
|
355
|
+
let endpointingDelay = this.minEndpointingDelay;
|
|
250
356
|
|
|
251
|
-
|
|
252
|
-
|
|
357
|
+
if (turnDetector) {
|
|
358
|
+
this.logger.debug('Running turn detector model');
|
|
359
|
+
if (!turnDetector.supportsLanguage(this.lastLanguage)) {
|
|
360
|
+
this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
|
|
361
|
+
} else {
|
|
362
|
+
const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
|
|
363
|
+
this.logger.debug(
|
|
364
|
+
{ endOfTurnProbability, language: this.lastLanguage },
|
|
365
|
+
'end of turn probability',
|
|
366
|
+
);
|
|
367
|
+
|
|
368
|
+
const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
|
|
369
|
+
this.logger.debug(
|
|
370
|
+
{
|
|
371
|
+
unlikelyThreshold,
|
|
372
|
+
endOfTurnProbability,
|
|
373
|
+
language: this.lastLanguage,
|
|
374
|
+
transcript: this.audioTranscript,
|
|
375
|
+
},
|
|
376
|
+
'EOU Detection',
|
|
377
|
+
);
|
|
378
|
+
|
|
379
|
+
if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
|
|
380
|
+
endpointingDelay = this.maxEndpointingDelay;
|
|
381
|
+
}
|
|
253
382
|
}
|
|
254
383
|
}
|
|
255
|
-
}
|
|
256
384
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
385
|
+
let extraSleep = endpointingDelay;
|
|
386
|
+
if (lastSpeakingTime !== undefined) {
|
|
387
|
+
extraSleep += lastSpeakingTime - Date.now();
|
|
388
|
+
}
|
|
260
389
|
|
|
261
|
-
|
|
390
|
+
if (extraSleep > 0) {
|
|
391
|
+
// add delay to see if there's a potential upcoming EOU task that cancels this one
|
|
392
|
+
await delay(Math.max(extraSleep, 0), { signal: controller.signal });
|
|
393
|
+
}
|
|
262
394
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
395
|
+
this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
|
|
396
|
+
|
|
397
|
+
const confidenceAvg =
|
|
398
|
+
this.finalTranscriptConfidence.length > 0
|
|
399
|
+
? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
|
|
400
|
+
this.finalTranscriptConfidence.length
|
|
401
|
+
: 0;
|
|
402
|
+
|
|
403
|
+
let startedSpeakingAt: number | undefined;
|
|
404
|
+
let stoppedSpeakingAt: number | undefined;
|
|
405
|
+
let transcriptionDelay: number | undefined;
|
|
406
|
+
let endOfUtteranceDelay: number | undefined;
|
|
407
|
+
|
|
408
|
+
// sometimes, we can't calculate the metrics because VAD was unreliable.
|
|
409
|
+
// in this case, we just ignore the calculation, it's better than providing likely wrong values
|
|
410
|
+
if (
|
|
411
|
+
lastFinalTranscriptTime !== 0 &&
|
|
412
|
+
lastSpeakingTime !== undefined &&
|
|
413
|
+
speechStartTime !== undefined
|
|
414
|
+
) {
|
|
415
|
+
startedSpeakingAt = speechStartTime;
|
|
416
|
+
stoppedSpeakingAt = lastSpeakingTime;
|
|
417
|
+
transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
|
|
418
|
+
endOfUtteranceDelay = Date.now() - lastSpeakingTime;
|
|
419
|
+
}
|
|
268
420
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
421
|
+
const committed = await this.hooks.onEndOfTurn({
|
|
422
|
+
newTranscript: this.audioTranscript,
|
|
423
|
+
transcriptConfidence: confidenceAvg,
|
|
424
|
+
transcriptionDelay: transcriptionDelay ?? 0,
|
|
425
|
+
endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
|
|
426
|
+
startedSpeakingAt,
|
|
427
|
+
stoppedSpeakingAt,
|
|
428
|
+
});
|
|
429
|
+
|
|
430
|
+
if (committed) {
|
|
431
|
+
// clear the transcript if the user turn was committed
|
|
432
|
+
this.audioTranscript = '';
|
|
433
|
+
this.finalTranscriptConfidence = [];
|
|
434
|
+
this.lastSpeakingTime = undefined;
|
|
435
|
+
this.lastFinalTranscriptTime = 0;
|
|
436
|
+
this.speechStartTime = undefined;
|
|
437
|
+
}
|
|
273
438
|
|
|
274
|
-
|
|
275
|
-
|
|
439
|
+
this.userTurnCommitted = false;
|
|
440
|
+
};
|
|
276
441
|
|
|
277
442
|
// cancel any existing EOU task
|
|
278
443
|
this.bounceEOUTask?.cancel();
|
|
279
|
-
|
|
444
|
+
// copy the values before awaiting (the values can change)
|
|
445
|
+
this.bounceEOUTask = Task.from(
|
|
446
|
+
bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
|
|
447
|
+
);
|
|
280
448
|
|
|
281
449
|
this.bounceEOUTask.result
|
|
282
450
|
.then(() => {
|
|
@@ -376,13 +544,21 @@ export class AudioRecognition {
|
|
|
376
544
|
break;
|
|
377
545
|
case VADEventType.INFERENCE_DONE:
|
|
378
546
|
this.hooks.onVADInferenceDone(ev);
|
|
547
|
+
// for metrics, get the "earliest" signal of speech as possible
|
|
548
|
+
if (ev.rawAccumulatedSpeech > 0.0) {
|
|
549
|
+
this.lastSpeakingTime = Date.now();
|
|
550
|
+
|
|
551
|
+
if (this.speechStartTime === undefined) {
|
|
552
|
+
this.speechStartTime = Date.now();
|
|
553
|
+
}
|
|
554
|
+
}
|
|
379
555
|
break;
|
|
380
556
|
case VADEventType.END_OF_SPEECH:
|
|
381
557
|
this.logger.debug('VAD task: END_OF_SPEECH');
|
|
382
558
|
this.hooks.onEndOfSpeech(ev);
|
|
383
|
-
|
|
559
|
+
|
|
384
560
|
// when VAD fires END_OF_SPEECH, it already waited for the silence_duration
|
|
385
|
-
this.
|
|
561
|
+
this.speaking = false;
|
|
386
562
|
|
|
387
563
|
if (
|
|
388
564
|
this.vadBaseTurnDetection ||
|
|
@@ -412,6 +588,8 @@ export class AudioRecognition {
|
|
|
412
588
|
clearUserTurn() {
|
|
413
589
|
this.audioTranscript = '';
|
|
414
590
|
this.audioInterimTranscript = '';
|
|
591
|
+
this.audioPreflightTranscript = '';
|
|
592
|
+
this.finalTranscriptConfidence = [];
|
|
415
593
|
this.userTurnCommitted = false;
|
|
416
594
|
|
|
417
595
|
this.sttTask?.cancelAndWait().finally(() => {
|
package/src/worker.ts
CHANGED
|
@@ -10,7 +10,8 @@ import {
|
|
|
10
10
|
WorkerMessage,
|
|
11
11
|
WorkerStatus,
|
|
12
12
|
} from '@livekit/protocol';
|
|
13
|
-
import {
|
|
13
|
+
import type { ParticipantInfo } from 'livekit-server-sdk';
|
|
14
|
+
import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
|
|
14
15
|
import { EventEmitter } from 'node:events';
|
|
15
16
|
import os from 'node:os';
|
|
16
17
|
import { WebSocket } from 'ws';
|
|
@@ -79,7 +80,7 @@ const defaultRequestFunc = async (ctx: JobRequest) => {
|
|
|
79
80
|
};
|
|
80
81
|
|
|
81
82
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
82
|
-
const defaultCpuLoad = async (worker:
|
|
83
|
+
const defaultCpuLoad = async (worker: AgentServer): Promise<number> => {
|
|
83
84
|
return new Promise((resolve) => {
|
|
84
85
|
const cpus1 = os.cpus();
|
|
85
86
|
|
|
@@ -141,17 +142,17 @@ export class WorkerPermissions {
|
|
|
141
142
|
*
|
|
142
143
|
* This class is mostly useful in conjunction with {@link cli.runApp}.
|
|
143
144
|
*/
|
|
144
|
-
export class
|
|
145
|
+
export class ServerOptions {
|
|
145
146
|
agent: string;
|
|
146
147
|
requestFunc: (job: JobRequest) => Promise<void>;
|
|
147
|
-
loadFunc: (worker:
|
|
148
|
+
loadFunc: (worker: AgentServer) => Promise<number>;
|
|
148
149
|
loadThreshold: number;
|
|
149
150
|
numIdleProcesses: number;
|
|
150
151
|
shutdownProcessTimeout: number;
|
|
151
152
|
initializeProcessTimeout: number;
|
|
152
153
|
permissions: WorkerPermissions;
|
|
153
154
|
agentName: string;
|
|
154
|
-
|
|
155
|
+
serverType: JobType;
|
|
155
156
|
maxRetry: number;
|
|
156
157
|
wsURL: string;
|
|
157
158
|
apiKey?: string;
|
|
@@ -175,7 +176,7 @@ export class WorkerOptions {
|
|
|
175
176
|
initializeProcessTimeout = 10 * 1000,
|
|
176
177
|
permissions = new WorkerPermissions(),
|
|
177
178
|
agentName = '',
|
|
178
|
-
|
|
179
|
+
serverType = JobType.JT_ROOM,
|
|
179
180
|
maxRetry = MAX_RECONNECT_ATTEMPTS,
|
|
180
181
|
wsURL = 'ws://localhost:7880',
|
|
181
182
|
apiKey = undefined,
|
|
@@ -195,7 +196,7 @@ export class WorkerOptions {
|
|
|
195
196
|
agent: string;
|
|
196
197
|
requestFunc?: (job: JobRequest) => Promise<void>;
|
|
197
198
|
/** Called to determine the current load of the worker. Should return a value between 0 and 1. */
|
|
198
|
-
loadFunc?: (worker:
|
|
199
|
+
loadFunc?: (worker: AgentServer) => Promise<number>;
|
|
199
200
|
/** When the load exceeds this threshold, the worker will be marked as unavailable. */
|
|
200
201
|
loadThreshold?: number;
|
|
201
202
|
numIdleProcesses?: number;
|
|
@@ -203,7 +204,7 @@ export class WorkerOptions {
|
|
|
203
204
|
initializeProcessTimeout?: number;
|
|
204
205
|
permissions?: WorkerPermissions;
|
|
205
206
|
agentName?: string;
|
|
206
|
-
|
|
207
|
+
serverType?: JobType;
|
|
207
208
|
maxRetry?: number;
|
|
208
209
|
wsURL?: string;
|
|
209
210
|
apiKey?: string;
|
|
@@ -228,7 +229,7 @@ export class WorkerOptions {
|
|
|
228
229
|
this.initializeProcessTimeout = initializeProcessTimeout;
|
|
229
230
|
this.permissions = permissions;
|
|
230
231
|
this.agentName = agentName;
|
|
231
|
-
this.
|
|
232
|
+
this.serverType = serverType;
|
|
232
233
|
this.maxRetry = maxRetry;
|
|
233
234
|
this.wsURL = wsURL;
|
|
234
235
|
this.apiKey = apiKey;
|
|
@@ -261,8 +262,8 @@ class PendingAssignment {
|
|
|
261
262
|
* you don't have access to a command line, such as a headless program, or one that uses Agents
|
|
262
263
|
* behind a wrapper.
|
|
263
264
|
*/
|
|
264
|
-
export class
|
|
265
|
-
#opts:
|
|
265
|
+
export class AgentServer {
|
|
266
|
+
#opts: ServerOptions;
|
|
266
267
|
#procPool: ProcPool;
|
|
267
268
|
|
|
268
269
|
#id = 'unregistered';
|
|
@@ -279,23 +280,23 @@ export class Worker {
|
|
|
279
280
|
#logger = log().child({ version });
|
|
280
281
|
#inferenceExecutor?: InferenceProcExecutor;
|
|
281
282
|
|
|
282
|
-
|
|
283
|
-
constructor(opts:
|
|
283
|
+
/* @throws {@link MissingCredentialsError} if URL, API key or API secret are missing */
|
|
284
|
+
constructor(opts: ServerOptions) {
|
|
284
285
|
opts.wsURL = opts.wsURL || process.env.LIVEKIT_URL || '';
|
|
285
286
|
opts.apiKey = opts.apiKey || process.env.LIVEKIT_API_KEY || '';
|
|
286
287
|
opts.apiSecret = opts.apiSecret || process.env.LIVEKIT_API_SECRET || '';
|
|
287
288
|
|
|
288
289
|
if (opts.wsURL === '')
|
|
289
290
|
throw new MissingCredentialsError(
|
|
290
|
-
'URL is required: Set LIVEKIT_URL, run with --url, or pass wsURL in
|
|
291
|
+
'URL is required: Set LIVEKIT_URL, run with --url, or pass wsURL in ServerOptions',
|
|
291
292
|
);
|
|
292
293
|
if (opts.apiKey === '')
|
|
293
294
|
throw new MissingCredentialsError(
|
|
294
|
-
'API Key is required: Set LIVEKIT_API_KEY, run with --api-key, or pass apiKey in
|
|
295
|
+
'API Key is required: Set LIVEKIT_API_KEY, run with --api-key, or pass apiKey in ServerOptions',
|
|
295
296
|
);
|
|
296
297
|
if (opts.apiSecret === '')
|
|
297
298
|
throw new MissingCredentialsError(
|
|
298
|
-
'API Secret is required: Set LIVEKIT_API_SECRET, run with --api-secret, or pass apiSecret in
|
|
299
|
+
'API Secret is required: Set LIVEKIT_API_SECRET, run with --api-secret, or pass apiSecret in ServerOptions',
|
|
299
300
|
);
|
|
300
301
|
|
|
301
302
|
if (opts.workerToken) {
|
|
@@ -340,7 +341,7 @@ export class Worker {
|
|
|
340
341
|
this.#opts = opts;
|
|
341
342
|
this.#httpServer = new HTTPServer(opts.host, opts.port, () => ({
|
|
342
343
|
agent_name: opts.agentName,
|
|
343
|
-
worker_type: JobType[opts.
|
|
344
|
+
worker_type: JobType[opts.serverType],
|
|
344
345
|
active_jobs: this.activeJobs.length,
|
|
345
346
|
sdk_version: version,
|
|
346
347
|
project_type: PROJECT_TYPE,
|
|
@@ -610,7 +611,7 @@ export class Worker {
|
|
|
610
611
|
message: {
|
|
611
612
|
case: 'register',
|
|
612
613
|
value: {
|
|
613
|
-
type: this.#opts.
|
|
614
|
+
type: this.#opts.serverType,
|
|
614
615
|
agentName: this.#opts.agentName,
|
|
615
616
|
allowedPermissions: new ParticipantPermission({
|
|
616
617
|
canPublish: this.#opts.permissions.canPublish,
|
|
@@ -788,3 +789,13 @@ export class Worker {
|
|
|
788
789
|
await this.#close.await;
|
|
789
790
|
}
|
|
790
791
|
}
|
|
792
|
+
|
|
793
|
+
/**
|
|
794
|
+
* @deprecated Use {@link AgentServer} instead. This alias is provided for backward compatibility.
|
|
795
|
+
*/
|
|
796
|
+
export const Worker = AgentServer;
|
|
797
|
+
|
|
798
|
+
/**
|
|
799
|
+
* @deprecated Use {@link ServerOptions} instead. This alias is provided for backward compatibility.
|
|
800
|
+
*/
|
|
801
|
+
export const WorkerOptions = ServerOptions;
|