@livekit/agents 0.6.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +6 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference_runner.cjs +38 -0
- package/dist/inference_runner.cjs.map +1 -0
- package/dist/inference_runner.d.ts +11 -0
- package/dist/inference_runner.d.ts.map +1 -0
- package/dist/inference_runner.js +14 -0
- package/dist/inference_runner.js.map +1 -0
- package/dist/ipc/index.cjs +23 -0
- package/dist/ipc/index.cjs.map +1 -0
- package/dist/ipc/index.d.ts +2 -0
- package/dist/ipc/index.d.ts.map +1 -0
- package/dist/ipc/index.js +2 -0
- package/dist/ipc/index.js.map +1 -0
- package/dist/ipc/inference_executor.cjs +17 -0
- package/dist/ipc/inference_executor.cjs.map +1 -0
- package/dist/ipc/inference_executor.d.ts +4 -0
- package/dist/ipc/inference_executor.d.ts.map +1 -0
- package/dist/ipc/inference_executor.js +1 -0
- package/dist/ipc/inference_executor.js.map +1 -0
- package/dist/ipc/inference_proc_executor.cjs +97 -0
- package/dist/ipc/inference_proc_executor.cjs.map +1 -0
- package/dist/ipc/inference_proc_executor.d.ts +23 -0
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
- package/dist/ipc/inference_proc_executor.js +72 -0
- package/dist/ipc/inference_proc_executor.js.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.cjs +90 -0
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
- package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.js +67 -0
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
- package/dist/ipc/job_executor.cjs +8 -7
- package/dist/ipc/job_executor.cjs.map +1 -1
- package/dist/ipc/job_executor.d.ts +14 -15
- package/dist/ipc/job_executor.d.ts.map +1 -1
- package/dist/ipc/job_executor.js +7 -6
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +108 -0
- package/dist/ipc/job_proc_executor.cjs.map +1 -0
- package/dist/ipc/job_proc_executor.d.ts +19 -0
- package/dist/ipc/job_proc_executor.d.ts.map +1 -0
- package/dist/ipc/job_proc_executor.js +83 -0
- package/dist/ipc/job_proc_executor.js.map +1 -0
- package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +41 -36
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
- package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
- package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
- package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +41 -11
- package/dist/ipc/job_proc_lazy_main.js.map +1 -0
- package/dist/ipc/message.cjs.map +1 -1
- package/dist/ipc/message.d.ts +17 -0
- package/dist/ipc/message.d.ts.map +1 -1
- package/dist/ipc/proc_pool.cjs +30 -4
- package/dist/ipc/proc_pool.cjs.map +1 -1
- package/dist/ipc/proc_pool.d.ts +5 -1
- package/dist/ipc/proc_pool.d.ts.map +1 -1
- package/dist/ipc/proc_pool.js +30 -4
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +57 -45
- package/dist/ipc/supervised_proc.cjs.map +1 -0
- package/dist/ipc/supervised_proc.d.ts +30 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -0
- package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +53 -31
- package/dist/ipc/supervised_proc.js.map +1 -0
- package/dist/job.cjs +18 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.ts +9 -1
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +17 -1
- package/dist/job.js.map +1 -1
- package/dist/multimodal/agent_playout.cjs +13 -14
- package/dist/multimodal/agent_playout.cjs.map +1 -1
- package/dist/multimodal/agent_playout.d.ts +4 -4
- package/dist/multimodal/agent_playout.d.ts.map +1 -1
- package/dist/multimodal/agent_playout.js +13 -14
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +12 -8
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +13 -9
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +20 -4
- package/dist/pipeline/agent_output.cjs.map +1 -1
- package/dist/pipeline/agent_output.d.ts +4 -2
- package/dist/pipeline/agent_output.d.ts.map +1 -1
- package/dist/pipeline/agent_output.js +20 -4
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.cjs +9 -3
- package/dist/pipeline/agent_playout.cjs.map +1 -1
- package/dist/pipeline/agent_playout.d.ts +4 -2
- package/dist/pipeline/agent_playout.d.ts.map +1 -1
- package/dist/pipeline/agent_playout.js +9 -3
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/human_input.cjs +6 -0
- package/dist/pipeline/human_input.cjs.map +1 -1
- package/dist/pipeline/human_input.d.ts +3 -1
- package/dist/pipeline/human_input.d.ts.map +1 -1
- package/dist/pipeline/human_input.js +6 -0
- package/dist/pipeline/human_input.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +79 -12
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +8 -0
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +79 -12
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +16 -4
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +16 -4
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +2 -0
- package/dist/tokenize/basic/basic.cjs.map +1 -1
- package/dist/tokenize/basic/basic.d.ts +2 -0
- package/dist/tokenize/basic/basic.d.ts.map +1 -1
- package/dist/tokenize/basic/basic.js +1 -0
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/index.cjs +2 -0
- package/dist/tokenize/basic/index.cjs.map +1 -1
- package/dist/tokenize/basic/index.d.ts +1 -1
- package/dist/tokenize/basic/index.d.ts.map +1 -1
- package/dist/tokenize/basic/index.js +8 -1
- package/dist/tokenize/basic/index.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +5 -3
- package/dist/tokenize/token_stream.cjs.map +1 -1
- package/dist/tokenize/token_stream.d.ts.map +1 -1
- package/dist/tokenize/token_stream.js +5 -3
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/transcription.cjs +203 -86
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.ts +24 -17
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js +201 -85
- package/dist/transcription.js.map +1 -1
- package/dist/worker.cjs +42 -9
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts +5 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +42 -9
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/index.ts +3 -1
- package/src/inference_runner.ts +19 -0
- package/src/ipc/index.ts +5 -0
- package/src/ipc/inference_executor.ts +7 -0
- package/src/ipc/inference_proc_executor.ts +93 -0
- package/src/ipc/inference_proc_lazy_main.ts +86 -0
- package/src/ipc/job_executor.ts +15 -17
- package/src/ipc/job_proc_executor.ts +112 -0
- package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +44 -14
- package/src/ipc/message.ts +14 -1
- package/src/ipc/proc_pool.ts +33 -3
- package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +77 -29
- package/src/job.ts +21 -0
- package/src/multimodal/agent_playout.ts +14 -16
- package/src/multimodal/multimodal_agent.ts +13 -9
- package/src/pipeline/agent_output.ts +34 -5
- package/src/pipeline/agent_playout.ts +10 -1
- package/src/pipeline/human_input.ts +8 -0
- package/src/pipeline/pipeline_agent.ts +96 -11
- package/src/stt/stream_adapter.ts +17 -5
- package/src/tokenize/basic/basic.ts +2 -0
- package/src/tokenize/basic/index.ts +7 -1
- package/src/tokenize/token_stream.ts +6 -3
- package/src/transcription.ts +270 -96
- package/src/worker.ts +42 -5
- package/dist/ipc/job_main.cjs.map +0 -1
- package/dist/ipc/job_main.d.ts +0 -8
- package/dist/ipc/job_main.d.ts.map +0 -1
- package/dist/ipc/job_main.js.map +0 -1
- package/dist/ipc/proc_job_executor.cjs.map +0 -1
- package/dist/ipc/proc_job_executor.d.ts +0 -15
- package/dist/ipc/proc_job_executor.d.ts.map +0 -1
- package/dist/ipc/proc_job_executor.js.map +0 -1
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
TrackSource,
|
|
11
11
|
} from '@livekit/rtc-node';
|
|
12
12
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
13
|
+
import { randomUUID } from 'node:crypto';
|
|
13
14
|
import EventEmitter from 'node:events';
|
|
14
15
|
import type {
|
|
15
16
|
CallableFunctionResult,
|
|
@@ -28,6 +29,7 @@ import {
|
|
|
28
29
|
hyphenateWord,
|
|
29
30
|
} from '../tokenize/basic/index.js';
|
|
30
31
|
import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
|
|
32
|
+
import { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';
|
|
31
33
|
import type { TTS } from '../tts/index.js';
|
|
32
34
|
import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
|
|
33
35
|
import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
|
|
@@ -78,6 +80,12 @@ export type VPACallbacks = {
|
|
|
78
80
|
[VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
|
|
79
81
|
};
|
|
80
82
|
|
|
83
|
+
interface TurnDetector {
|
|
84
|
+
unlikelyThreshold: number;
|
|
85
|
+
supportsLanguage: (language?: string) => boolean;
|
|
86
|
+
predictEndOfTurn: (chatCtx: ChatContext) => Promise<number>;
|
|
87
|
+
}
|
|
88
|
+
|
|
81
89
|
export class AgentCallContext {
|
|
82
90
|
#agent: VoicePipelineAgent;
|
|
83
91
|
#llmStream: LLMStream;
|
|
@@ -206,6 +214,8 @@ export interface VPAOptions {
|
|
|
206
214
|
beforeTTSCallback: BeforeTTSCallback;
|
|
207
215
|
/** Options for assistant transcription. */
|
|
208
216
|
transcription: AgentTranscriptionOptions;
|
|
217
|
+
/** Turn detection model to use. */
|
|
218
|
+
turnDetector?: TurnDetector;
|
|
209
219
|
}
|
|
210
220
|
|
|
211
221
|
const defaultVPAOptions: VPAOptions = {
|
|
@@ -238,7 +248,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
238
248
|
#pendingAgentReply?: SpeechHandle;
|
|
239
249
|
#agentReplyTask?: CancellablePromise<void>;
|
|
240
250
|
#playingSpeech?: SpeechHandle;
|
|
241
|
-
|
|
251
|
+
transcribedText = '';
|
|
242
252
|
#transcribedInterimText = '';
|
|
243
253
|
#speechQueueOpen = new Future();
|
|
244
254
|
#speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
|
|
@@ -251,6 +261,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
251
261
|
#agentPublication?: LocalTrackPublication;
|
|
252
262
|
#lastFinalTranscriptTime?: number;
|
|
253
263
|
#lastSpeechTime?: number;
|
|
264
|
+
#transcriptionId?: string;
|
|
265
|
+
#agentTranscribedText = '';
|
|
254
266
|
|
|
255
267
|
constructor(
|
|
256
268
|
/** Voice Activity Detection instance. */
|
|
@@ -284,6 +296,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
284
296
|
this.#deferredValidation = new DeferredReplyValidation(
|
|
285
297
|
this.#validateReplyIfPossible.bind(this),
|
|
286
298
|
this.#opts.minEndpointingDelay,
|
|
299
|
+
this,
|
|
300
|
+
this.#opts.turnDetector,
|
|
287
301
|
);
|
|
288
302
|
}
|
|
289
303
|
|
|
@@ -492,14 +506,52 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
492
506
|
this.#deferredValidation.onHumanEndOfSpeech(event);
|
|
493
507
|
});
|
|
494
508
|
this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
|
|
509
|
+
if (!this.#transcriptionId) {
|
|
510
|
+
this.#transcriptionId = randomUUID();
|
|
511
|
+
}
|
|
495
512
|
this.#transcribedInterimText = event.alternatives![0].text;
|
|
513
|
+
|
|
514
|
+
this.#room!.localParticipant!.publishTranscription({
|
|
515
|
+
participantIdentity: this.#humanInput!.participant.identity,
|
|
516
|
+
trackSid: this.#humanInput!.subscribedTrack!.sid!,
|
|
517
|
+
segments: [
|
|
518
|
+
{
|
|
519
|
+
text: this.#transcribedInterimText,
|
|
520
|
+
id: this.#transcriptionId,
|
|
521
|
+
final: true,
|
|
522
|
+
startTime: BigInt(0),
|
|
523
|
+
endTime: BigInt(0),
|
|
524
|
+
language: '',
|
|
525
|
+
},
|
|
526
|
+
],
|
|
527
|
+
});
|
|
496
528
|
});
|
|
497
529
|
this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
|
|
498
530
|
const newTranscript = event.alternatives![0].text;
|
|
499
531
|
if (!newTranscript) return;
|
|
500
532
|
|
|
533
|
+
if (!this.#transcriptionId) {
|
|
534
|
+
this.#transcriptionId = randomUUID();
|
|
535
|
+
}
|
|
536
|
+
|
|
501
537
|
this.#lastFinalTranscriptTime = Date.now();
|
|
502
|
-
this
|
|
538
|
+
this.transcribedText += (this.transcribedText ? ' ' : '') + newTranscript;
|
|
539
|
+
|
|
540
|
+
this.#room!.localParticipant!.publishTranscription({
|
|
541
|
+
participantIdentity: this.#humanInput!.participant.identity,
|
|
542
|
+
trackSid: this.#humanInput!.subscribedTrack!.sid!,
|
|
543
|
+
segments: [
|
|
544
|
+
{
|
|
545
|
+
text: this.transcribedText,
|
|
546
|
+
id: this.#transcriptionId,
|
|
547
|
+
final: true,
|
|
548
|
+
startTime: BigInt(0),
|
|
549
|
+
endTime: BigInt(0),
|
|
550
|
+
language: '',
|
|
551
|
+
},
|
|
552
|
+
],
|
|
553
|
+
});
|
|
554
|
+
this.#transcriptionId = undefined;
|
|
503
555
|
|
|
504
556
|
if (
|
|
505
557
|
this.#opts.preemptiveSynthesis &&
|
|
@@ -564,7 +616,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
564
616
|
this.#pendingAgentReply = SpeechHandle.createAssistantReply(
|
|
565
617
|
this.#opts.allowInterruptions,
|
|
566
618
|
true,
|
|
567
|
-
this
|
|
619
|
+
this.transcribedText,
|
|
568
620
|
);
|
|
569
621
|
const newHandle = this.#pendingAgentReply;
|
|
570
622
|
this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
|
|
@@ -674,7 +726,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
674
726
|
this.chatCtx.messages.push(userMsg);
|
|
675
727
|
this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);
|
|
676
728
|
|
|
677
|
-
this
|
|
729
|
+
this.transcribedText = this.transcribedText.slice(userQuestion.length);
|
|
678
730
|
handle.markUserCommitted();
|
|
679
731
|
};
|
|
680
732
|
|
|
@@ -692,7 +744,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
692
744
|
}
|
|
693
745
|
commitUserQuestionIfNeeded();
|
|
694
746
|
|
|
695
|
-
|
|
747
|
+
let collectedText = this.#agentTranscribedText;
|
|
696
748
|
const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
|
|
697
749
|
const interrupted = handle.interrupted;
|
|
698
750
|
|
|
@@ -701,7 +753,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
701
753
|
this.chatCtx.messages.push(...handle.extraToolsMessages);
|
|
702
754
|
}
|
|
703
755
|
if (interrupted) {
|
|
704
|
-
collectedText
|
|
756
|
+
collectedText += '…';
|
|
705
757
|
}
|
|
706
758
|
|
|
707
759
|
const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
|
|
@@ -798,6 +850,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
798
850
|
chatCtx,
|
|
799
851
|
fncCtx: this.fncCtx,
|
|
800
852
|
});
|
|
853
|
+
|
|
801
854
|
const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
|
|
802
855
|
newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
|
|
803
856
|
handle.addNestedSpeech(newSpeechHandle);
|
|
@@ -832,6 +885,16 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
832
885
|
speechId: string,
|
|
833
886
|
source: string | LLMStream | AsyncIterable<string>,
|
|
834
887
|
): SynthesisHandle {
|
|
888
|
+
const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
|
|
889
|
+
synchronizer.on('textUpdated', (text) => {
|
|
890
|
+
this.#agentTranscribedText = text.text;
|
|
891
|
+
this.#room!.localParticipant!.publishTranscription({
|
|
892
|
+
participantIdentity: this.#room!.localParticipant!.identity,
|
|
893
|
+
trackSid: this.#agentPublication!.sid!,
|
|
894
|
+
segments: [text],
|
|
895
|
+
});
|
|
896
|
+
});
|
|
897
|
+
|
|
835
898
|
if (!this.#agentOutput) {
|
|
836
899
|
throw new Error('agent output should be initialized when ready');
|
|
837
900
|
}
|
|
@@ -850,7 +913,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
850
913
|
throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');
|
|
851
914
|
}
|
|
852
915
|
|
|
853
|
-
return this.#agentOutput.synthesize(speechId, ttsSource);
|
|
916
|
+
return this.#agentOutput.synthesize(speechId, ttsSource, synchronizer);
|
|
854
917
|
}
|
|
855
918
|
|
|
856
919
|
async #validateReplyIfPossible() {
|
|
@@ -862,7 +925,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
862
925
|
}
|
|
863
926
|
|
|
864
927
|
if (!this.#pendingAgentReply) {
|
|
865
|
-
if (this.#opts.preemptiveSynthesis || !this
|
|
928
|
+
if (this.#opts.preemptiveSynthesis || !this.transcribedText) {
|
|
866
929
|
return;
|
|
867
930
|
}
|
|
868
931
|
this.#synthesizeAgentReply();
|
|
@@ -969,6 +1032,7 @@ class DeferredReplyValidation {
|
|
|
969
1032
|
readonly PUNCTUATION = '.!?';
|
|
970
1033
|
readonly PUNCTUATION_REDUCE_FACTOR = 0.75;
|
|
971
1034
|
readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech
|
|
1035
|
+
readonly UNLIKELY_ENDPOINT_DELAY = 6000;
|
|
972
1036
|
|
|
973
1037
|
#validateFunc: () => Promise<void>;
|
|
974
1038
|
#validatingPromise?: Promise<void>;
|
|
@@ -978,12 +1042,21 @@ class DeferredReplyValidation {
|
|
|
978
1042
|
#speaking = false;
|
|
979
1043
|
#endOfSpeechDelay: number;
|
|
980
1044
|
#finalTranscriptDelay: number;
|
|
1045
|
+
#turnDetector?: TurnDetector;
|
|
1046
|
+
#agent: VoicePipelineAgent;
|
|
981
1047
|
#abort?: AbortController;
|
|
982
1048
|
|
|
983
|
-
constructor(
|
|
1049
|
+
constructor(
|
|
1050
|
+
validateFunc: () => Promise<void>,
|
|
1051
|
+
minEndpointingDelay: number,
|
|
1052
|
+
agent: VoicePipelineAgent,
|
|
1053
|
+
turnDetector?: TurnDetector,
|
|
1054
|
+
) {
|
|
984
1055
|
this.#validateFunc = validateFunc;
|
|
985
1056
|
this.#endOfSpeechDelay = minEndpointingDelay;
|
|
986
1057
|
this.#finalTranscriptDelay = minEndpointingDelay;
|
|
1058
|
+
this.#agent = agent;
|
|
1059
|
+
this.#turnDetector = turnDetector;
|
|
987
1060
|
}
|
|
988
1061
|
|
|
989
1062
|
get validating(): boolean {
|
|
@@ -1038,7 +1111,17 @@ class DeferredReplyValidation {
|
|
|
1038
1111
|
}
|
|
1039
1112
|
|
|
1040
1113
|
#run(delay: number) {
|
|
1041
|
-
const runTask = async (delay: number, signal: AbortSignal) => {
|
|
1114
|
+
const runTask = async (delay: number, chatCtx: ChatContext, signal: AbortSignal) => {
|
|
1115
|
+
if (this.#lastFinalTranscript && !this.#speaking && this.#turnDetector) {
|
|
1116
|
+
const startTime = Date.now();
|
|
1117
|
+
const eotProb = await this.#turnDetector.predictEndOfTurn(chatCtx);
|
|
1118
|
+
const unlikelyThreshold = this.#turnDetector.unlikelyThreshold;
|
|
1119
|
+
const elapsed = Date.now() - startTime;
|
|
1120
|
+
if (eotProb < unlikelyThreshold) {
|
|
1121
|
+
delay = this.UNLIKELY_ENDPOINT_DELAY;
|
|
1122
|
+
}
|
|
1123
|
+
delay = Math.max(0, delay - elapsed);
|
|
1124
|
+
}
|
|
1042
1125
|
const timeout = setTimeout(() => {
|
|
1043
1126
|
this.#resetStates();
|
|
1044
1127
|
this.#validateFunc();
|
|
@@ -1051,6 +1134,8 @@ class DeferredReplyValidation {
|
|
|
1051
1134
|
this.#abort?.abort();
|
|
1052
1135
|
this.#abort = new AbortController();
|
|
1053
1136
|
this.#validatingFuture = new Future();
|
|
1054
|
-
|
|
1137
|
+
const detectCtx = this.#agent.chatCtx.copy();
|
|
1138
|
+
detectCtx.append({ text: this.#agent.transcribedText, role: ChatRole.USER });
|
|
1139
|
+
this.#validatingPromise = runTask(delay, detectCtx, this.#abort.signal);
|
|
1055
1140
|
}
|
|
1056
1141
|
}
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import { log } from '../log.js';
|
|
5
6
|
import type { VAD, VADStream } from '../vad.js';
|
|
6
7
|
import { VADEventType } from '../vad.js';
|
|
7
8
|
import type { SpeechEvent } from './stt.js';
|
|
@@ -71,13 +72,24 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
71
72
|
case VADEventType.END_OF_SPEECH:
|
|
72
73
|
this.output.put({ type: SpeechEventType.END_OF_SPEECH });
|
|
73
74
|
|
|
74
|
-
|
|
75
|
-
|
|
75
|
+
try {
|
|
76
|
+
const event = await this.#stt.recognize(ev.frames);
|
|
77
|
+
if (!event.alternatives![0].text) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
this.output.put(event);
|
|
82
|
+
break;
|
|
83
|
+
} catch (error) {
|
|
84
|
+
let logger = log();
|
|
85
|
+
if (error instanceof Error) {
|
|
86
|
+
logger = logger.child({ error: error.message });
|
|
87
|
+
} else {
|
|
88
|
+
logger = logger.child({ error });
|
|
89
|
+
}
|
|
90
|
+
logger.error(`${this.label}: provider recognize task failed`);
|
|
76
91
|
continue;
|
|
77
92
|
}
|
|
78
|
-
|
|
79
|
-
this.output.put(event);
|
|
80
|
-
break;
|
|
81
93
|
}
|
|
82
94
|
}
|
|
83
95
|
};
|
|
@@ -68,6 +68,8 @@ export const hyphenateWord = (word: string): string[] => {
|
|
|
68
68
|
return hyphenator.hyphenateWord(word);
|
|
69
69
|
};
|
|
70
70
|
|
|
71
|
+
export { splitWords };
|
|
72
|
+
|
|
71
73
|
export const tokenizeParagraphs = (text: string): string[] => {
|
|
72
74
|
return splitParagraphs(text).map((tok) => tok[0]);
|
|
73
75
|
};
|
|
@@ -2,4 +2,10 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
export {
|
|
5
|
+
export {
|
|
6
|
+
SentenceTokenizer,
|
|
7
|
+
WordTokenizer,
|
|
8
|
+
tokenizeParagraphs,
|
|
9
|
+
hyphenateWord,
|
|
10
|
+
splitWords,
|
|
11
|
+
} from './basic.js';
|
|
@@ -44,12 +44,15 @@ export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
|
|
|
44
44
|
if (this.#outBuf) this.#outBuf += ' ';
|
|
45
45
|
|
|
46
46
|
const tok = tokens.shift()!;
|
|
47
|
-
let tokText
|
|
48
|
-
if (
|
|
47
|
+
let tokText: string;
|
|
48
|
+
if (Array.isArray(tok)) {
|
|
49
49
|
tokText = tok[0];
|
|
50
|
+
} else {
|
|
51
|
+
tokText = tok;
|
|
50
52
|
}
|
|
51
53
|
|
|
52
54
|
this.#outBuf += tokText;
|
|
55
|
+
|
|
53
56
|
if (this.#outBuf.length >= this.#minTokenLength) {
|
|
54
57
|
this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
|
|
55
58
|
this.#outBuf = '';
|
|
@@ -76,7 +79,7 @@ export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
|
|
|
76
79
|
if (tokens) {
|
|
77
80
|
if (this.#outBuf) this.#outBuf += ' ';
|
|
78
81
|
|
|
79
|
-
if (
|
|
82
|
+
if (Array.isArray(tokens[0])) {
|
|
80
83
|
this.#outBuf += tokens.map((tok) => tok[0]).join(' ');
|
|
81
84
|
} else {
|
|
82
85
|
this.#outBuf += tokens.join(' ');
|