@livekit/agents 1.0.38 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80):
  1. package/dist/http_server.cjs +9 -6
  2. package/dist/http_server.cjs.map +1 -1
  3. package/dist/http_server.d.cts +5 -1
  4. package/dist/http_server.d.ts +5 -1
  5. package/dist/http_server.d.ts.map +1 -1
  6. package/dist/http_server.js +9 -6
  7. package/dist/http_server.js.map +1 -1
  8. package/dist/inference/llm.cjs +7 -3
  9. package/dist/inference/llm.cjs.map +1 -1
  10. package/dist/inference/llm.d.cts +5 -6
  11. package/dist/inference/llm.d.ts +5 -6
  12. package/dist/inference/llm.d.ts.map +1 -1
  13. package/dist/inference/llm.js +7 -3
  14. package/dist/inference/llm.js.map +1 -1
  15. package/dist/inference/stt.cjs.map +1 -1
  16. package/dist/inference/stt.d.cts +5 -4
  17. package/dist/inference/stt.d.ts +5 -4
  18. package/dist/inference/stt.d.ts.map +1 -1
  19. package/dist/inference/stt.js.map +1 -1
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +10 -7
  22. package/dist/inference/tts.d.ts +10 -7
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js.map +1 -1
  25. package/dist/ipc/supervised_proc.cjs +4 -0
  26. package/dist/ipc/supervised_proc.cjs.map +1 -1
  27. package/dist/ipc/supervised_proc.d.cts +1 -0
  28. package/dist/ipc/supervised_proc.d.ts +1 -0
  29. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  30. package/dist/ipc/supervised_proc.js +4 -0
  31. package/dist/ipc/supervised_proc.js.map +1 -1
  32. package/dist/stt/stream_adapter.cjs +9 -1
  33. package/dist/stt/stream_adapter.cjs.map +1 -1
  34. package/dist/stt/stream_adapter.d.ts.map +1 -1
  35. package/dist/stt/stream_adapter.js +9 -1
  36. package/dist/stt/stream_adapter.js.map +1 -1
  37. package/dist/tokenize/basic/sentence.cjs +3 -3
  38. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  39. package/dist/tokenize/basic/sentence.js +3 -3
  40. package/dist/tokenize/basic/sentence.js.map +1 -1
  41. package/dist/tokenize/tokenizer.test.cjs +3 -1
  42. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  43. package/dist/tokenize/tokenizer.test.js +3 -1
  44. package/dist/tokenize/tokenizer.test.js.map +1 -1
  45. package/dist/utils.cjs +5 -0
  46. package/dist/utils.cjs.map +1 -1
  47. package/dist/utils.d.cts +8 -0
  48. package/dist/utils.d.ts +8 -0
  49. package/dist/utils.d.ts.map +1 -1
  50. package/dist/utils.js +4 -0
  51. package/dist/utils.js.map +1 -1
  52. package/dist/voice/agent.cjs +1 -2
  53. package/dist/voice/agent.cjs.map +1 -1
  54. package/dist/voice/agent.js +1 -2
  55. package/dist/voice/agent.js.map +1 -1
  56. package/dist/voice/agent_activity.cjs +23 -14
  57. package/dist/voice/agent_activity.cjs.map +1 -1
  58. package/dist/voice/agent_activity.d.cts +1 -0
  59. package/dist/voice/agent_activity.d.ts +1 -0
  60. package/dist/voice/agent_activity.d.ts.map +1 -1
  61. package/dist/voice/agent_activity.js +23 -14
  62. package/dist/voice/agent_activity.js.map +1 -1
  63. package/dist/worker.cjs +12 -2
  64. package/dist/worker.cjs.map +1 -1
  65. package/dist/worker.d.ts.map +1 -1
  66. package/dist/worker.js +12 -2
  67. package/dist/worker.js.map +1 -1
  68. package/package.json +2 -2
  69. package/src/http_server.ts +18 -6
  70. package/src/inference/llm.ts +20 -15
  71. package/src/inference/stt.ts +9 -7
  72. package/src/inference/tts.ts +36 -16
  73. package/src/ipc/supervised_proc.ts +4 -0
  74. package/src/stt/stream_adapter.ts +12 -1
  75. package/src/tokenize/basic/sentence.ts +3 -3
  76. package/src/tokenize/tokenizer.test.ts +4 -0
  77. package/src/utils.ts +14 -0
  78. package/src/voice/agent.ts +2 -2
  79. package/src/voice/agent_activity.ts +36 -15
  80. package/src/worker.ts +24 -2
@@ -23,22 +23,27 @@ import {
23
23
  import { type AnyString, connectWs, createAccessToken } from './utils.js';
24
24
 
25
25
  export type CartesiaModels =
26
- | 'cartesia'
27
- | 'cartesia/sonic'
26
+ | 'cartesia/sonic-3'
28
27
  | 'cartesia/sonic-2'
29
- | 'cartesia/sonic-turbo';
28
+ | 'cartesia/sonic-turbo'
29
+ | 'cartesia/sonic';
30
+
31
+ export type DeepgramTTSModels = 'deepgram/aura' | 'deepgram/aura-2';
30
32
 
31
33
  export type ElevenlabsModels =
32
- | 'elevenlabs'
33
34
  | 'elevenlabs/eleven_flash_v2'
34
35
  | 'elevenlabs/eleven_flash_v2_5'
35
36
  | 'elevenlabs/eleven_turbo_v2'
36
37
  | 'elevenlabs/eleven_turbo_v2_5'
37
38
  | 'elevenlabs/eleven_multilingual_v2';
38
39
 
39
- export type RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';
40
+ export type InworldModels =
41
+ | 'inworld/inworld-tts-1.5-max'
42
+ | 'inworld/inworld-tts-1.5-mini'
43
+ | 'inworld/inworld-tts-1-max'
44
+ | 'inworld/inworld-tts-1';
40
45
 
41
- export type InworldModels = 'inworld' | 'inworld/inworld-tts-1';
46
+ export type RimeModels = 'rime/arcana' | 'rime/mistv2';
42
47
 
43
48
  export interface CartesiaOptions {
44
49
  duration?: number; // max duration of audio in seconds
@@ -50,25 +55,40 @@ export interface ElevenlabsOptions {
50
55
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
51
56
  }
52
57
 
58
+ export interface DeepgramTTSOptions {}
59
+
53
60
  export interface RimeOptions {}
54
61
 
55
62
  export interface InworldOptions {}
56
63
 
57
- type _TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels;
58
-
59
- export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyString;
64
+ type _TTSModels =
65
+ | CartesiaModels
66
+ | DeepgramTTSModels
67
+ | ElevenlabsModels
68
+ | RimeModels
69
+ | InworldModels;
70
+
71
+ export type TTSModels =
72
+ | CartesiaModels
73
+ | DeepgramTTSModels
74
+ | ElevenlabsModels
75
+ | RimeModels
76
+ | InworldModels
77
+ | AnyString;
60
78
 
61
79
  export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
62
80
 
63
81
  export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
64
82
  ? CartesiaOptions
65
- : TModel extends ElevenlabsModels
66
- ? ElevenlabsOptions
67
- : TModel extends RimeOptions
68
- ? RimeOptions
69
- : TModel extends InworldOptions
70
- ? InworldOptions
71
- : Record<string, unknown>;
83
+ : TModel extends DeepgramTTSModels
84
+ ? DeepgramTTSOptions
85
+ : TModel extends ElevenlabsModels
86
+ ? ElevenlabsOptions
87
+ : TModel extends RimeModels
88
+ ? RimeOptions
89
+ : TModel extends InworldModels
90
+ ? InworldOptions
91
+ : Record<string, unknown>;
72
92
 
73
93
  type TTSEncoding = 'pcm_s16le';
74
94
 
@@ -59,6 +59,10 @@ export abstract class SupervisedProc {
59
59
  return this.#started;
60
60
  }
61
61
 
62
+ get isAlive(): boolean {
63
+ return this.#started && !this.#closing && !!this.proc?.connected;
64
+ }
65
+
62
66
  get runningJob(): RunningJobInfo | undefined {
63
67
  return this.#runningJob;
64
68
  }
@@ -4,6 +4,7 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { log } from '../log.js';
6
6
  import type { APIConnectOptions } from '../types.js';
7
+ import { isStreamClosedError } from '../utils.js';
7
8
  import type { VAD, VADStream } from '../vad.js';
8
9
  import { VADEventType } from '../vad.js';
9
10
  import type { SpeechEvent } from './stt.js';
@@ -68,7 +69,17 @@ export class StreamAdapterWrapper extends SpeechStream {
68
69
  this.#vadStream.pushFrame(input);
69
70
  }
70
71
  }
71
- this.#vadStream.endInput();
72
+
73
+ // Guard against calling endInput() on already-closed stream
74
+ // This happens during handover when close() is called while forwardInput is running
75
+ try {
76
+ this.#vadStream.endInput();
77
+ } catch (e) {
78
+ if (isStreamClosedError(e)) {
79
+ return;
80
+ }
81
+ throw e;
82
+ }
72
83
  };
73
84
 
74
85
  const recognize = async () => {
@@ -16,7 +16,7 @@ export const splitSentences = (
16
16
  const starters =
17
17
  /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
18
18
  const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
19
- const websites = /[.](com|net|org|io|gov|edu|me)/g;
19
+ const websites = /(\w+\.)+(com|net|org|io|gov|edu|me)/g;
20
20
  const digits = /([0-9])/g;
21
21
  const dots = /\.{2,}/g;
22
22
 
@@ -27,7 +27,7 @@ export const splitSentences = (
27
27
  }
28
28
 
29
29
  text = text.replaceAll(prefixes, '$1<prd>');
30
- text = text.replaceAll(websites, '<prd>$2');
30
+ text = text.replace(websites, (match) => match.replaceAll('.', '<prd>'));
31
31
  text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');
32
32
  text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));
33
33
  text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
@@ -51,7 +51,7 @@ export const splitSentences = (
51
51
  text = text.replaceAll('."', '".');
52
52
  text = text.replaceAll('!"', '"!');
53
53
  text = text.replaceAll('?"', '"?');
54
- text = text.replaceAll('.', '.<stop>');
54
+ text = text.replace(/\.(?=\s|$)/g, '.<stop>');
55
55
  text = text.replaceAll('?', '?<stop>');
56
56
  text = text.replaceAll('!', '!<stop>');
57
57
  text = text.replaceAll('<prd>', '.');
@@ -13,6 +13,8 @@ const TEXT =
13
13
  'This is a test. Another test. ' +
14
14
  'A short sentence. ' +
15
15
  'A longer sentence that is longer than the previous sentence. ' +
16
+ 'Find additional resources on livekit.com. ' +
17
+ 'Find additional resources on docs.livekit.com. ' +
16
18
  'f(x) = x * 2.54 + 42. ' +
17
19
  'Hey! Hi! Hello! ';
18
20
 
@@ -22,6 +24,8 @@ const EXPECTED_MIN_20 = [
22
24
  'Mr. Theo is testing the sentence tokenizer.',
23
25
  'This is a test. Another test.',
24
26
  'A short sentence. A longer sentence that is longer than the previous sentence.',
27
+ 'Find additional resources on livekit.com.',
28
+ 'Find additional resources on docs.livekit.com.',
25
29
  'f(x) = x * 2.54 + 42.',
26
30
  'Hey! Hi! Hello!',
27
31
  ];
package/src/utils.ts CHANGED
@@ -675,6 +675,20 @@ export class InvalidErrorType extends Error {
675
675
  }
676
676
  }
677
677
 
678
+ /**
679
+ * Check if an error is a stream closed error that can be safely ignored during cleanup.
680
+ * This happens during handover/cleanup when close() is called while operations are still running.
681
+ *
682
+ * @param error - The error to check.
683
+ * @returns True if the error is a stream closed error.
684
+ */
685
+ export function isStreamClosedError(error: unknown): boolean {
686
+ return (
687
+ error instanceof Error &&
688
+ (error.message === 'Stream is closed' || error.message === 'Input is closed')
689
+ );
690
+ }
691
+
678
692
  /**
679
693
  * In JS an error can be any arbitrary value.
680
694
  * This function converts an unknown error to an Error and stores the original value in the error object.
@@ -325,16 +325,16 @@ export class Agent<UserData = any> {
325
325
  );
326
326
  }
327
327
 
328
- // TODO(brian): make parallelToolCalls configurable
329
328
  const { toolChoice } = modelSettings;
330
329
  const connOptions = activity.agentSession.connOptions.llmConnOptions;
331
330
 
331
+ // parallelToolCalls is not passed here - it will use the value from LLM's modelOptions
332
+ // This allows users to configure it via: new inference.LLM({ modelOptions: { parallel_tool_calls: false } })
332
333
  const stream = activity.llm.chat({
333
334
  chatCtx,
334
335
  toolCtx,
335
336
  toolChoice,
336
337
  connOptions,
337
- parallelToolCalls: true,
338
338
  });
339
339
 
340
340
  let cleaned = false;
@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
194
194
  if (
195
195
  !this.vad &&
196
196
  this.stt &&
197
+ !this.stt.capabilities.streaming &&
197
198
  this.llm instanceof LLM &&
198
199
  this.allowInterruptions &&
199
200
  this.turnDetectionMode === undefined
200
201
  ) {
201
202
  this.logger.warn(
202
- 'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
203
+ 'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
203
204
  'for more responsive interruption handling.',
204
205
  );
205
206
  }
@@ -659,12 +660,14 @@ export class AgentActivity implements RecognitionHooks {
659
660
  return;
660
661
  }
661
662
 
662
- if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
663
- // skip speech handle interruption if server side turn detection is enabled
664
- return;
663
+ if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
664
+ this.interruptByAudioActivity();
665
665
  }
666
+ }
666
667
 
667
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
668
+ private interruptByAudioActivity(): void {
669
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
670
+ // skip speech handle interruption if server side turn detection is enabled
668
671
  return;
669
672
  }
670
673
 
@@ -694,7 +697,10 @@ export class AgentActivity implements RecognitionHooks {
694
697
  !this._currentSpeech.interrupted &&
695
698
  this._currentSpeech.allowInterruptions
696
699
  ) {
697
- this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
700
+ this.logger.info(
701
+ { 'speech id': this._currentSpeech.id },
702
+ 'speech interrupted by audio activity',
703
+ );
698
704
  this.realtimeSession?.interrupt();
699
705
  this._currentSpeech.interrupt();
700
706
  }
@@ -715,6 +721,10 @@ export class AgentActivity implements RecognitionHooks {
715
721
  // TODO(AJS-106): add multi participant support
716
722
  }),
717
723
  );
724
+
725
+ if (ev.alternatives![0].text) {
726
+ this.interruptByAudioActivity();
727
+ }
718
728
  }
719
729
 
720
730
  onFinalTranscript(ev: SpeechEvent): void {
@@ -732,6 +742,20 @@ export class AgentActivity implements RecognitionHooks {
732
742
  // TODO(AJS-106): add multi participant support
733
743
  }),
734
744
  );
745
+
746
+ // agent speech might not be interrupted if VAD failed and a final transcript is received
747
+ // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
748
+ if (
749
+ this.audioRecognition &&
750
+ this.turnDetection !== 'manual' &&
751
+ this.turnDetection !== 'realtime_llm'
752
+ ) {
753
+ this.interruptByAudioActivity();
754
+
755
+ // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
756
+ }
757
+
758
+ // TODO: resume false interruption - start interrupt paused speech task
735
759
  }
736
760
 
737
761
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
@@ -1982,7 +2006,6 @@ export class AgentActivity implements RecognitionHooks {
1982
2006
 
1983
2007
  if (audioOutput) {
1984
2008
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1985
- this.agentSession._updateAgentState('listening');
1986
2009
  }
1987
2010
 
1988
2011
  if (speechHandle.interrupted) {
@@ -2069,17 +2092,15 @@ export class AgentActivity implements RecognitionHooks {
2069
2092
  speechHandle._markGenerationDone();
2070
2093
  // TODO(brian): close tees
2071
2094
 
2072
- toolOutput.firstToolStartedFuture.await.finally(() => {
2073
- this.agentSession._updateAgentState('thinking');
2074
- });
2075
-
2076
2095
  await executeToolsTask.result;
2077
2096
 
2097
+ if (toolOutput.output.length > 0) {
2098
+ this.agentSession._updateAgentState('thinking');
2099
+ } else if (this.agentSession.agentState === 'speaking') {
2100
+ this.agentSession._updateAgentState('listening');
2101
+ }
2102
+
2078
2103
  if (toolOutput.output.length === 0) {
2079
- // return to listening state for thinking-only turns (no audio output, no tools)
2080
- if (!speechHandle.interrupted) {
2081
- this.agentSession._updateAgentState('listening');
2082
- }
2083
2104
  return;
2084
2105
  }
2085
2106
 
package/src/worker.ts CHANGED
@@ -339,13 +339,35 @@ export class AgentServer {
339
339
  );
340
340
 
341
341
  this.#opts = opts;
342
- this.#httpServer = new HTTPServer(opts.host, opts.port, () => ({
342
+
343
+ const healthCheck = () => {
344
+ // Check if inference executor exists and is not alive
345
+ if (this.#inferenceExecutor && !this.#inferenceExecutor.isAlive) {
346
+ return { healthy: false, message: 'inference process not running' };
347
+ }
348
+
349
+ // Only healthy when fully connected with an active WebSocket
350
+ if (
351
+ this.#closed ||
352
+ this.#connecting ||
353
+ !this.#session ||
354
+ this.#session.readyState !== WebSocket.OPEN
355
+ ) {
356
+ return { healthy: false, message: 'not connected to livekit' };
357
+ }
358
+
359
+ return { healthy: true, message: 'OK' };
360
+ };
361
+
362
+ const getWorkerInfo = () => ({
343
363
  agent_name: opts.agentName,
344
364
  worker_type: JobType[opts.serverType],
345
365
  active_jobs: this.activeJobs.length,
346
366
  sdk_version: version,
347
367
  project_type: PROJECT_TYPE,
348
- }));
368
+ });
369
+
370
+ this.#httpServer = new HTTPServer(opts.host, opts.port, healthCheck, getWorkerInfo);
349
371
  }
350
372
 
351
373
  /** @throws {@link WorkerError} if worker failed to connect or already running */