@livekit/agents 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/dist/index.cjs +6 -1
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +3 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +3 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/inference_runner.cjs +38 -0
  8. package/dist/inference_runner.cjs.map +1 -0
  9. package/dist/inference_runner.d.ts +11 -0
  10. package/dist/inference_runner.d.ts.map +1 -0
  11. package/dist/inference_runner.js +14 -0
  12. package/dist/inference_runner.js.map +1 -0
  13. package/dist/ipc/index.cjs +23 -0
  14. package/dist/ipc/index.cjs.map +1 -0
  15. package/dist/ipc/index.d.ts +2 -0
  16. package/dist/ipc/index.d.ts.map +1 -0
  17. package/dist/ipc/index.js +2 -0
  18. package/dist/ipc/index.js.map +1 -0
  19. package/dist/ipc/inference_executor.cjs +17 -0
  20. package/dist/ipc/inference_executor.cjs.map +1 -0
  21. package/dist/ipc/inference_executor.d.ts +4 -0
  22. package/dist/ipc/inference_executor.d.ts.map +1 -0
  23. package/dist/ipc/inference_executor.js +1 -0
  24. package/dist/ipc/inference_executor.js.map +1 -0
  25. package/dist/ipc/inference_proc_executor.cjs +97 -0
  26. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  27. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  28. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  29. package/dist/ipc/inference_proc_executor.js +72 -0
  30. package/dist/ipc/inference_proc_executor.js.map +1 -0
  31. package/dist/ipc/inference_proc_lazy_main.cjs +90 -0
  32. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  33. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  34. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  35. package/dist/ipc/inference_proc_lazy_main.js +67 -0
  36. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  37. package/dist/ipc/job_executor.cjs +8 -7
  38. package/dist/ipc/job_executor.cjs.map +1 -1
  39. package/dist/ipc/job_executor.d.ts +14 -15
  40. package/dist/ipc/job_executor.d.ts.map +1 -1
  41. package/dist/ipc/job_executor.js +7 -6
  42. package/dist/ipc/job_executor.js.map +1 -1
  43. package/dist/ipc/job_proc_executor.cjs +108 -0
  44. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  45. package/dist/ipc/job_proc_executor.d.ts +19 -0
  46. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  47. package/dist/ipc/job_proc_executor.js +83 -0
  48. package/dist/ipc/job_proc_executor.js.map +1 -0
  49. package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +41 -36
  50. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  51. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  52. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  53. package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +41 -11
  54. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  55. package/dist/ipc/message.cjs.map +1 -1
  56. package/dist/ipc/message.d.ts +17 -0
  57. package/dist/ipc/message.d.ts.map +1 -1
  58. package/dist/ipc/proc_pool.cjs +30 -4
  59. package/dist/ipc/proc_pool.cjs.map +1 -1
  60. package/dist/ipc/proc_pool.d.ts +5 -1
  61. package/dist/ipc/proc_pool.d.ts.map +1 -1
  62. package/dist/ipc/proc_pool.js +30 -4
  63. package/dist/ipc/proc_pool.js.map +1 -1
  64. package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +58 -46
  65. package/dist/ipc/supervised_proc.cjs.map +1 -0
  66. package/dist/ipc/supervised_proc.d.ts +30 -0
  67. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  68. package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +54 -32
  69. package/dist/ipc/supervised_proc.js.map +1 -0
  70. package/dist/job.cjs +18 -1
  71. package/dist/job.cjs.map +1 -1
  72. package/dist/job.d.ts +9 -1
  73. package/dist/job.d.ts.map +1 -1
  74. package/dist/job.js +17 -1
  75. package/dist/job.js.map +1 -1
  76. package/dist/metrics/base.cjs +2 -2
  77. package/dist/metrics/base.cjs.map +1 -1
  78. package/dist/metrics/base.d.ts +1 -1
  79. package/dist/metrics/base.d.ts.map +1 -1
  80. package/dist/metrics/base.js +2 -2
  81. package/dist/metrics/base.js.map +1 -1
  82. package/dist/multimodal/agent_playout.cjs +13 -14
  83. package/dist/multimodal/agent_playout.cjs.map +1 -1
  84. package/dist/multimodal/agent_playout.d.ts +4 -4
  85. package/dist/multimodal/agent_playout.d.ts.map +1 -1
  86. package/dist/multimodal/agent_playout.js +13 -14
  87. package/dist/multimodal/agent_playout.js.map +1 -1
  88. package/dist/multimodal/multimodal_agent.cjs +12 -8
  89. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  90. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  91. package/dist/multimodal/multimodal_agent.js +13 -9
  92. package/dist/multimodal/multimodal_agent.js.map +1 -1
  93. package/dist/pipeline/agent_output.cjs +20 -4
  94. package/dist/pipeline/agent_output.cjs.map +1 -1
  95. package/dist/pipeline/agent_output.d.ts +4 -2
  96. package/dist/pipeline/agent_output.d.ts.map +1 -1
  97. package/dist/pipeline/agent_output.js +20 -4
  98. package/dist/pipeline/agent_output.js.map +1 -1
  99. package/dist/pipeline/agent_playout.cjs +9 -3
  100. package/dist/pipeline/agent_playout.cjs.map +1 -1
  101. package/dist/pipeline/agent_playout.d.ts +4 -2
  102. package/dist/pipeline/agent_playout.d.ts.map +1 -1
  103. package/dist/pipeline/agent_playout.js +9 -3
  104. package/dist/pipeline/agent_playout.js.map +1 -1
  105. package/dist/pipeline/human_input.cjs +6 -0
  106. package/dist/pipeline/human_input.cjs.map +1 -1
  107. package/dist/pipeline/human_input.d.ts +3 -1
  108. package/dist/pipeline/human_input.d.ts.map +1 -1
  109. package/dist/pipeline/human_input.js +6 -0
  110. package/dist/pipeline/human_input.js.map +1 -1
  111. package/dist/pipeline/pipeline_agent.cjs +79 -12
  112. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  113. package/dist/pipeline/pipeline_agent.d.ts +8 -0
  114. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  115. package/dist/pipeline/pipeline_agent.js +79 -12
  116. package/dist/pipeline/pipeline_agent.js.map +1 -1
  117. package/dist/stt/stream_adapter.cjs +16 -4
  118. package/dist/stt/stream_adapter.cjs.map +1 -1
  119. package/dist/stt/stream_adapter.d.ts.map +1 -1
  120. package/dist/stt/stream_adapter.js +16 -4
  121. package/dist/stt/stream_adapter.js.map +1 -1
  122. package/dist/tokenize/basic/basic.cjs +2 -0
  123. package/dist/tokenize/basic/basic.cjs.map +1 -1
  124. package/dist/tokenize/basic/basic.d.ts +2 -0
  125. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  126. package/dist/tokenize/basic/basic.js +1 -0
  127. package/dist/tokenize/basic/basic.js.map +1 -1
  128. package/dist/tokenize/basic/index.cjs +2 -0
  129. package/dist/tokenize/basic/index.cjs.map +1 -1
  130. package/dist/tokenize/basic/index.d.ts +1 -1
  131. package/dist/tokenize/basic/index.d.ts.map +1 -1
  132. package/dist/tokenize/basic/index.js +8 -1
  133. package/dist/tokenize/basic/index.js.map +1 -1
  134. package/dist/tokenize/token_stream.cjs +5 -3
  135. package/dist/tokenize/token_stream.cjs.map +1 -1
  136. package/dist/tokenize/token_stream.d.ts.map +1 -1
  137. package/dist/tokenize/token_stream.js +5 -3
  138. package/dist/tokenize/token_stream.js.map +1 -1
  139. package/dist/transcription.cjs +203 -86
  140. package/dist/transcription.cjs.map +1 -1
  141. package/dist/transcription.d.ts +24 -17
  142. package/dist/transcription.d.ts.map +1 -1
  143. package/dist/transcription.js +201 -85
  144. package/dist/transcription.js.map +1 -1
  145. package/dist/worker.cjs +42 -9
  146. package/dist/worker.cjs.map +1 -1
  147. package/dist/worker.d.ts +5 -1
  148. package/dist/worker.d.ts.map +1 -1
  149. package/dist/worker.js +42 -9
  150. package/dist/worker.js.map +1 -1
  151. package/package.json +3 -3
  152. package/src/index.ts +3 -1
  153. package/src/inference_runner.ts +19 -0
  154. package/src/ipc/index.ts +5 -0
  155. package/src/ipc/inference_executor.ts +7 -0
  156. package/src/ipc/inference_proc_executor.ts +93 -0
  157. package/src/ipc/inference_proc_lazy_main.ts +86 -0
  158. package/src/ipc/job_executor.ts +15 -17
  159. package/src/ipc/job_proc_executor.ts +112 -0
  160. package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +44 -14
  161. package/src/ipc/message.ts +14 -1
  162. package/src/ipc/proc_pool.ts +33 -3
  163. package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +80 -30
  164. package/src/job.ts +21 -0
  165. package/src/metrics/base.ts +7 -10
  166. package/src/multimodal/agent_playout.ts +14 -16
  167. package/src/multimodal/multimodal_agent.ts +13 -9
  168. package/src/pipeline/agent_output.ts +34 -5
  169. package/src/pipeline/agent_playout.ts +10 -1
  170. package/src/pipeline/human_input.ts +8 -0
  171. package/src/pipeline/pipeline_agent.ts +96 -11
  172. package/src/stt/stream_adapter.ts +17 -5
  173. package/src/tokenize/basic/basic.ts +2 -0
  174. package/src/tokenize/basic/index.ts +7 -1
  175. package/src/tokenize/token_stream.ts +6 -3
  176. package/src/transcription.ts +270 -96
  177. package/src/worker.ts +42 -5
  178. package/dist/ipc/job_main.cjs.map +0 -1
  179. package/dist/ipc/job_main.d.ts +0 -8
  180. package/dist/ipc/job_main.d.ts.map +0 -1
  181. package/dist/ipc/job_main.js.map +0 -1
  182. package/dist/ipc/proc_job_executor.cjs.map +0 -1
  183. package/dist/ipc/proc_job_executor.d.ts +0 -15
  184. package/dist/ipc/proc_job_executor.d.ts.map +0 -1
  185. package/dist/ipc/proc_job_executor.js.map +0 -1
@@ -10,6 +10,7 @@ import {
10
10
  TrackSource,
11
11
  } from '@livekit/rtc-node';
12
12
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
13
+ import { randomUUID } from 'node:crypto';
13
14
  import EventEmitter from 'node:events';
14
15
  import type {
15
16
  CallableFunctionResult,
@@ -28,6 +29,7 @@ import {
28
29
  hyphenateWord,
29
30
  } from '../tokenize/basic/index.js';
30
31
  import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
32
+ import { TextAudioSynchronizer, defaultTextSyncOptions } from '../transcription.js';
31
33
  import type { TTS } from '../tts/index.js';
32
34
  import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
33
35
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
@@ -78,6 +80,12 @@ export type VPACallbacks = {
78
80
  [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
79
81
  };
80
82
 
83
+ interface TurnDetector {
84
+ unlikelyThreshold: number;
85
+ supportsLanguage: (language?: string) => boolean;
86
+ predictEndOfTurn: (chatCtx: ChatContext) => Promise<number>;
87
+ }
88
+
81
89
  export class AgentCallContext {
82
90
  #agent: VoicePipelineAgent;
83
91
  #llmStream: LLMStream;
@@ -206,6 +214,8 @@ export interface VPAOptions {
206
214
  beforeTTSCallback: BeforeTTSCallback;
207
215
  /** Options for assistant transcription. */
208
216
  transcription: AgentTranscriptionOptions;
217
+ /** Turn detection model to use. */
218
+ turnDetector?: TurnDetector;
209
219
  }
210
220
 
211
221
  const defaultVPAOptions: VPAOptions = {
@@ -238,7 +248,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
238
248
  #pendingAgentReply?: SpeechHandle;
239
249
  #agentReplyTask?: CancellablePromise<void>;
240
250
  #playingSpeech?: SpeechHandle;
241
- #transcribedText = '';
251
+ transcribedText = '';
242
252
  #transcribedInterimText = '';
243
253
  #speechQueueOpen = new Future();
244
254
  #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
@@ -251,6 +261,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
251
261
  #agentPublication?: LocalTrackPublication;
252
262
  #lastFinalTranscriptTime?: number;
253
263
  #lastSpeechTime?: number;
264
+ #transcriptionId?: string;
265
+ #agentTranscribedText = '';
254
266
 
255
267
  constructor(
256
268
  /** Voice Activity Detection instance. */
@@ -284,6 +296,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
284
296
  this.#deferredValidation = new DeferredReplyValidation(
285
297
  this.#validateReplyIfPossible.bind(this),
286
298
  this.#opts.minEndpointingDelay,
299
+ this,
300
+ this.#opts.turnDetector,
287
301
  );
288
302
  }
289
303
 
@@ -492,14 +506,52 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
492
506
  this.#deferredValidation.onHumanEndOfSpeech(event);
493
507
  });
494
508
  this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
509
+ if (!this.#transcriptionId) {
510
+ this.#transcriptionId = randomUUID();
511
+ }
495
512
  this.#transcribedInterimText = event.alternatives![0].text;
513
+
514
+ this.#room!.localParticipant!.publishTranscription({
515
+ participantIdentity: this.#humanInput!.participant.identity,
516
+ trackSid: this.#humanInput!.subscribedTrack!.sid!,
517
+ segments: [
518
+ {
519
+ text: this.#transcribedInterimText,
520
+ id: this.#transcriptionId,
521
+ final: true,
522
+ startTime: BigInt(0),
523
+ endTime: BigInt(0),
524
+ language: '',
525
+ },
526
+ ],
527
+ });
496
528
  });
497
529
  this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
498
530
  const newTranscript = event.alternatives![0].text;
499
531
  if (!newTranscript) return;
500
532
 
533
+ if (!this.#transcriptionId) {
534
+ this.#transcriptionId = randomUUID();
535
+ }
536
+
501
537
  this.#lastFinalTranscriptTime = Date.now();
502
- this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
538
+ this.transcribedText += (this.transcribedText ? ' ' : '') + newTranscript;
539
+
540
+ this.#room!.localParticipant!.publishTranscription({
541
+ participantIdentity: this.#humanInput!.participant.identity,
542
+ trackSid: this.#humanInput!.subscribedTrack!.sid!,
543
+ segments: [
544
+ {
545
+ text: this.transcribedText,
546
+ id: this.#transcriptionId,
547
+ final: true,
548
+ startTime: BigInt(0),
549
+ endTime: BigInt(0),
550
+ language: '',
551
+ },
552
+ ],
553
+ });
554
+ this.#transcriptionId = undefined;
503
555
 
504
556
  if (
505
557
  this.#opts.preemptiveSynthesis &&
@@ -564,7 +616,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
564
616
  this.#pendingAgentReply = SpeechHandle.createAssistantReply(
565
617
  this.#opts.allowInterruptions,
566
618
  true,
567
- this.#transcribedText,
619
+ this.transcribedText,
568
620
  );
569
621
  const newHandle = this.#pendingAgentReply;
570
622
  this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
@@ -674,7 +726,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
674
726
  this.chatCtx.messages.push(userMsg);
675
727
  this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);
676
728
 
677
- this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
729
+ this.transcribedText = this.transcribedText.slice(userQuestion.length);
678
730
  handle.markUserCommitted();
679
731
  };
680
732
 
@@ -692,7 +744,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
692
744
  }
693
745
  commitUserQuestionIfNeeded();
694
746
 
695
- const collectedText = handle.synthesisHandle.text;
747
+ let collectedText = this.#agentTranscribedText;
696
748
  const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
697
749
  const interrupted = handle.interrupted;
698
750
 
@@ -701,7 +753,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
701
753
  this.chatCtx.messages.push(...handle.extraToolsMessages);
702
754
  }
703
755
  if (interrupted) {
704
- collectedText + '…';
756
+ collectedText += '…';
705
757
  }
706
758
 
707
759
  const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
@@ -798,6 +850,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
798
850
  chatCtx,
799
851
  fncCtx: this.fncCtx,
800
852
  });
853
+
801
854
  const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
802
855
  newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
803
856
  handle.addNestedSpeech(newSpeechHandle);
@@ -832,6 +885,16 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
832
885
  speechId: string,
833
886
  source: string | LLMStream | AsyncIterable<string>,
834
887
  ): SynthesisHandle {
888
+ const synchronizer = new TextAudioSynchronizer(defaultTextSyncOptions);
889
+ synchronizer.on('textUpdated', (text) => {
890
+ this.#agentTranscribedText = text.text;
891
+ this.#room!.localParticipant!.publishTranscription({
892
+ participantIdentity: this.#room!.localParticipant!.identity,
893
+ trackSid: this.#agentPublication!.sid!,
894
+ segments: [text],
895
+ });
896
+ });
897
+
835
898
  if (!this.#agentOutput) {
836
899
  throw new Error('agent output should be initialized when ready');
837
900
  }
@@ -850,7 +913,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
850
913
  throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');
851
914
  }
852
915
 
853
- return this.#agentOutput.synthesize(speechId, ttsSource);
916
+ return this.#agentOutput.synthesize(speechId, ttsSource, synchronizer);
854
917
  }
855
918
 
856
919
  async #validateReplyIfPossible() {
@@ -862,7 +925,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
862
925
  }
863
926
 
864
927
  if (!this.#pendingAgentReply) {
865
- if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
928
+ if (this.#opts.preemptiveSynthesis || !this.transcribedText) {
866
929
  return;
867
930
  }
868
931
  this.#synthesizeAgentReply();
@@ -969,6 +1032,7 @@ class DeferredReplyValidation {
969
1032
  readonly PUNCTUATION = '.!?';
970
1033
  readonly PUNCTUATION_REDUCE_FACTOR = 0.75;
971
1034
  readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech
1035
+ readonly UNLIKELY_ENDPOINT_DELAY = 6000;
972
1036
 
973
1037
  #validateFunc: () => Promise<void>;
974
1038
  #validatingPromise?: Promise<void>;
@@ -978,12 +1042,21 @@ class DeferredReplyValidation {
978
1042
  #speaking = false;
979
1043
  #endOfSpeechDelay: number;
980
1044
  #finalTranscriptDelay: number;
1045
+ #turnDetector?: TurnDetector;
1046
+ #agent: VoicePipelineAgent;
981
1047
  #abort?: AbortController;
982
1048
 
983
- constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {
1049
+ constructor(
1050
+ validateFunc: () => Promise<void>,
1051
+ minEndpointingDelay: number,
1052
+ agent: VoicePipelineAgent,
1053
+ turnDetector?: TurnDetector,
1054
+ ) {
984
1055
  this.#validateFunc = validateFunc;
985
1056
  this.#endOfSpeechDelay = minEndpointingDelay;
986
1057
  this.#finalTranscriptDelay = minEndpointingDelay;
1058
+ this.#agent = agent;
1059
+ this.#turnDetector = turnDetector;
987
1060
  }
988
1061
 
989
1062
  get validating(): boolean {
@@ -1038,7 +1111,17 @@ class DeferredReplyValidation {
1038
1111
  }
1039
1112
 
1040
1113
  #run(delay: number) {
1041
- const runTask = async (delay: number, signal: AbortSignal) => {
1114
+ const runTask = async (delay: number, chatCtx: ChatContext, signal: AbortSignal) => {
1115
+ if (this.#lastFinalTranscript && !this.#speaking && this.#turnDetector) {
1116
+ const startTime = Date.now();
1117
+ const eotProb = await this.#turnDetector.predictEndOfTurn(chatCtx);
1118
+ const unlikelyThreshold = this.#turnDetector.unlikelyThreshold;
1119
+ const elapsed = Date.now() - startTime;
1120
+ if (eotProb < unlikelyThreshold) {
1121
+ delay = this.UNLIKELY_ENDPOINT_DELAY;
1122
+ }
1123
+ delay = Math.max(0, delay - elapsed);
1124
+ }
1042
1125
  const timeout = setTimeout(() => {
1043
1126
  this.#resetStates();
1044
1127
  this.#validateFunc();
@@ -1051,6 +1134,8 @@ class DeferredReplyValidation {
1051
1134
  this.#abort?.abort();
1052
1135
  this.#abort = new AbortController();
1053
1136
  this.#validatingFuture = new Future();
1054
- this.#validatingPromise = runTask(delay, this.#abort.signal);
1137
+ const detectCtx = this.#agent.chatCtx.copy();
1138
+ detectCtx.append({ text: this.#agent.transcribedText, role: ChatRole.USER });
1139
+ this.#validatingPromise = runTask(delay, detectCtx, this.#abort.signal);
1055
1140
  }
1056
1141
  }
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { log } from '../log.js';
5
6
  import type { VAD, VADStream } from '../vad.js';
6
7
  import { VADEventType } from '../vad.js';
7
8
  import type { SpeechEvent } from './stt.js';
@@ -71,13 +72,24 @@ export class StreamAdapterWrapper extends SpeechStream {
71
72
  case VADEventType.END_OF_SPEECH:
72
73
  this.output.put({ type: SpeechEventType.END_OF_SPEECH });
73
74
 
74
- const event = await this.#stt.recognize(ev.frames);
75
- if (!event.alternatives![0].text) {
75
+ try {
76
+ const event = await this.#stt.recognize(ev.frames);
77
+ if (!event.alternatives![0].text) {
78
+ continue;
79
+ }
80
+
81
+ this.output.put(event);
82
+ break;
83
+ } catch (error) {
84
+ let logger = log();
85
+ if (error instanceof Error) {
86
+ logger = logger.child({ error: error.message });
87
+ } else {
88
+ logger = logger.child({ error });
89
+ }
90
+ logger.error(`${this.label}: provider recognize task failed`);
76
91
  continue;
77
92
  }
78
-
79
- this.output.put(event);
80
- break;
81
93
  }
82
94
  }
83
95
  };
@@ -68,6 +68,8 @@ export const hyphenateWord = (word: string): string[] => {
68
68
  return hyphenator.hyphenateWord(word);
69
69
  };
70
70
 
71
+ export { splitWords };
72
+
71
73
  export const tokenizeParagraphs = (text: string): string[] => {
72
74
  return splitParagraphs(text).map((tok) => tok[0]);
73
75
  };
@@ -2,4 +2,10 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
 
5
- export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
5
+ export {
6
+ SentenceTokenizer,
7
+ WordTokenizer,
8
+ tokenizeParagraphs,
9
+ hyphenateWord,
10
+ splitWords,
11
+ } from './basic.js';
@@ -44,12 +44,15 @@ export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
44
44
  if (this.#outBuf) this.#outBuf += ' ';
45
45
 
46
46
  const tok = tokens.shift()!;
47
- let tokText = tok as string;
48
- if (tok.length > 1 && typeof tok[1] === 'number') {
47
+ let tokText: string;
48
+ if (Array.isArray(tok)) {
49
49
  tokText = tok[0];
50
+ } else {
51
+ tokText = tok;
50
52
  }
51
53
 
52
54
  this.#outBuf += tokText;
55
+
53
56
  if (this.#outBuf.length >= this.#minTokenLength) {
54
57
  this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
55
58
  this.#outBuf = '';
@@ -76,7 +79,7 @@ export class BufferedTokenStream implements AsyncIterableIterator<TokenData> {
76
79
  if (tokens) {
77
80
  if (this.#outBuf) this.#outBuf += ' ';
78
81
 
79
- if (typeof tokens[0] !== 'string') {
82
+ if (Array.isArray(tokens[0])) {
80
83
  this.#outBuf += tokens.map((tok) => tok[0]).join(' ');
81
84
  } else {
82
85
  this.#outBuf += tokens.join(' ');