@livekit/agents 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +40 -0
  3. package/dist/audio.js +17 -30
  4. package/dist/audio.js.map +1 -1
  5. package/dist/cli.js +3 -14
  6. package/dist/cli.js.map +1 -1
  7. package/dist/http_server.d.ts +1 -1
  8. package/dist/http_server.js +5 -9
  9. package/dist/http_server.js.map +1 -1
  10. package/dist/index.d.ts +3 -2
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +14 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/ipc/job_executor.js +3 -5
  15. package/dist/ipc/job_executor.js.map +1 -1
  16. package/dist/ipc/job_main.d.ts +1 -1
  17. package/dist/ipc/proc_job_executor.js +66 -80
  18. package/dist/ipc/proc_job_executor.js.map +1 -1
  19. package/dist/ipc/proc_pool.d.ts +3 -3
  20. package/dist/ipc/proc_pool.d.ts.map +1 -1
  21. package/dist/ipc/proc_pool.js +16 -11
  22. package/dist/ipc/proc_pool.js.map +1 -1
  23. package/dist/job.js +56 -73
  24. package/dist/job.js.map +1 -1
  25. package/dist/llm/chat_context.d.ts +66 -0
  26. package/dist/llm/chat_context.d.ts.map +1 -0
  27. package/dist/llm/chat_context.js +93 -0
  28. package/dist/llm/chat_context.js.map +1 -0
  29. package/dist/llm/function_context.d.ts +19 -1
  30. package/dist/llm/function_context.d.ts.map +1 -1
  31. package/dist/llm/function_context.js +54 -18
  32. package/dist/llm/function_context.js.map +1 -1
  33. package/dist/llm/function_context.test.d.ts +2 -0
  34. package/dist/llm/function_context.test.d.ts.map +1 -0
  35. package/dist/llm/function_context.test.js +218 -0
  36. package/dist/llm/function_context.test.js.map +1 -0
  37. package/dist/llm/index.d.ts +3 -2
  38. package/dist/llm/index.d.ts.map +1 -1
  39. package/dist/llm/index.js +3 -2
  40. package/dist/llm/index.js.map +1 -1
  41. package/dist/llm/llm.d.ts +53 -0
  42. package/dist/llm/llm.d.ts.map +1 -0
  43. package/dist/llm/llm.js +45 -0
  44. package/dist/llm/llm.js.map +1 -0
  45. package/dist/multimodal/agent_playout.d.ts +1 -1
  46. package/dist/multimodal/agent_playout.js +116 -153
  47. package/dist/multimodal/agent_playout.js.map +1 -1
  48. package/dist/multimodal/multimodal_agent.d.ts +4 -3
  49. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  50. package/dist/multimodal/multimodal_agent.js +214 -237
  51. package/dist/multimodal/multimodal_agent.js.map +1 -1
  52. package/dist/pipeline/agent_output.d.ts +30 -0
  53. package/dist/pipeline/agent_output.d.ts.map +1 -0
  54. package/dist/pipeline/agent_output.js +155 -0
  55. package/dist/pipeline/agent_output.js.map +1 -0
  56. package/dist/pipeline/agent_playout.d.ts +38 -0
  57. package/dist/pipeline/agent_playout.d.ts.map +1 -0
  58. package/dist/pipeline/agent_playout.js +142 -0
  59. package/dist/pipeline/agent_playout.js.map +1 -0
  60. package/dist/pipeline/human_input.d.ts +28 -0
  61. package/dist/pipeline/human_input.d.ts.map +1 -0
  62. package/dist/pipeline/human_input.js +134 -0
  63. package/dist/pipeline/human_input.js.map +1 -0
  64. package/dist/pipeline/index.d.ts +2 -0
  65. package/dist/pipeline/index.d.ts.map +1 -0
  66. package/dist/pipeline/index.js +5 -0
  67. package/dist/pipeline/index.js.map +1 -0
  68. package/dist/pipeline/pipeline_agent.d.ts +134 -0
  69. package/dist/pipeline/pipeline_agent.d.ts.map +1 -0
  70. package/dist/pipeline/pipeline_agent.js +661 -0
  71. package/dist/pipeline/pipeline_agent.js.map +1 -0
  72. package/dist/pipeline/speech_handle.d.ts +27 -0
  73. package/dist/pipeline/speech_handle.d.ts.map +1 -0
  74. package/dist/pipeline/speech_handle.js +102 -0
  75. package/dist/pipeline/speech_handle.js.map +1 -0
  76. package/dist/plugin.js +7 -20
  77. package/dist/plugin.js.map +1 -1
  78. package/dist/stt/index.d.ts +1 -2
  79. package/dist/stt/index.d.ts.map +1 -1
  80. package/dist/stt/index.js +1 -2
  81. package/dist/stt/index.js.map +1 -1
  82. package/dist/stt/stt.d.ts +62 -24
  83. package/dist/stt/stt.d.ts.map +1 -1
  84. package/dist/stt/stt.js +77 -27
  85. package/dist/stt/stt.js.map +1 -1
  86. package/dist/tokenize/basic/basic.d.ts +16 -0
  87. package/dist/tokenize/basic/basic.d.ts.map +1 -0
  88. package/dist/tokenize/basic/basic.js +50 -0
  89. package/dist/tokenize/basic/basic.js.map +1 -0
  90. package/dist/tokenize/basic/hyphenator.d.ts +17 -0
  91. package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
  92. package/dist/tokenize/basic/hyphenator.js +420 -0
  93. package/dist/tokenize/basic/hyphenator.js.map +1 -0
  94. package/dist/tokenize/basic/index.d.ts +2 -0
  95. package/dist/tokenize/basic/index.d.ts.map +1 -0
  96. package/dist/tokenize/basic/index.js +5 -0
  97. package/dist/tokenize/basic/index.js.map +1 -0
  98. package/dist/tokenize/basic/paragraph.d.ts +5 -0
  99. package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
  100. package/dist/tokenize/basic/paragraph.js +38 -0
  101. package/dist/tokenize/basic/paragraph.js.map +1 -0
  102. package/dist/tokenize/basic/sentence.d.ts +5 -0
  103. package/dist/tokenize/basic/sentence.d.ts.map +1 -0
  104. package/dist/tokenize/basic/sentence.js +60 -0
  105. package/dist/tokenize/basic/sentence.js.map +1 -0
  106. package/dist/tokenize/basic/word.d.ts +5 -0
  107. package/dist/tokenize/basic/word.d.ts.map +1 -0
  108. package/dist/tokenize/basic/word.js +23 -0
  109. package/dist/tokenize/basic/word.js.map +1 -0
  110. package/dist/tokenize/index.d.ts +5 -0
  111. package/dist/tokenize/index.d.ts.map +1 -0
  112. package/dist/tokenize/index.js +8 -0
  113. package/dist/tokenize/index.js.map +1 -0
  114. package/dist/tokenize/token_stream.d.ts +36 -0
  115. package/dist/tokenize/token_stream.d.ts.map +1 -0
  116. package/dist/tokenize/token_stream.js +136 -0
  117. package/dist/tokenize/token_stream.js.map +1 -0
  118. package/dist/tokenize/tokenizer.d.ts +55 -0
  119. package/dist/tokenize/tokenizer.d.ts.map +1 -0
  120. package/dist/tokenize/tokenizer.js +117 -0
  121. package/dist/tokenize/tokenizer.js.map +1 -0
  122. package/dist/transcription.js +78 -89
  123. package/dist/transcription.js.map +1 -1
  124. package/dist/tts/index.d.ts +1 -3
  125. package/dist/tts/index.d.ts.map +1 -1
  126. package/dist/tts/index.js +1 -3
  127. package/dist/tts/index.js.map +1 -1
  128. package/dist/tts/tts.d.ts +66 -37
  129. package/dist/tts/tts.d.ts.map +1 -1
  130. package/dist/tts/tts.js +79 -74
  131. package/dist/tts/tts.js.map +1 -1
  132. package/dist/utils.d.ts +21 -6
  133. package/dist/utils.d.ts.map +1 -1
  134. package/dist/utils.js +120 -76
  135. package/dist/utils.js.map +1 -1
  136. package/dist/vad.d.ts +43 -39
  137. package/dist/vad.d.ts.map +1 -1
  138. package/dist/vad.js +51 -4
  139. package/dist/vad.js.map +1 -1
  140. package/dist/worker.d.ts +1 -1
  141. package/dist/worker.js +257 -247
  142. package/dist/worker.js.map +1 -1
  143. package/package.json +4 -3
  144. package/src/index.ts +16 -2
  145. package/src/ipc/proc_pool.ts +4 -4
  146. package/src/llm/chat_context.ts +147 -0
  147. package/src/llm/function_context.test.ts +248 -0
  148. package/src/llm/function_context.ts +77 -18
  149. package/src/llm/index.ts +21 -2
  150. package/src/llm/llm.ts +102 -0
  151. package/src/multimodal/multimodal_agent.ts +19 -6
  152. package/src/pipeline/agent_output.ts +185 -0
  153. package/src/pipeline/agent_playout.ts +187 -0
  154. package/src/pipeline/human_input.ts +166 -0
  155. package/src/pipeline/index.ts +15 -0
  156. package/src/pipeline/pipeline_agent.ts +917 -0
  157. package/src/pipeline/speech_handle.ts +136 -0
  158. package/src/stt/index.ts +8 -2
  159. package/src/stt/stt.ts +98 -31
  160. package/src/tokenize/basic/basic.ts +73 -0
  161. package/src/tokenize/basic/hyphenator.ts +436 -0
  162. package/src/tokenize/basic/index.ts +5 -0
  163. package/src/tokenize/basic/paragraph.ts +43 -0
  164. package/src/tokenize/basic/sentence.ts +69 -0
  165. package/src/tokenize/basic/word.ts +27 -0
  166. package/src/tokenize/index.ts +16 -0
  167. package/src/tokenize/token_stream.ts +163 -0
  168. package/src/tokenize/tokenizer.ts +152 -0
  169. package/src/tts/index.ts +1 -20
  170. package/src/tts/tts.ts +110 -57
  171. package/src/utils.ts +95 -25
  172. package/src/vad.ts +86 -45
  173. package/tsconfig.tsbuildinfo +1 -1
  174. package/dist/stt/stream_adapter.d.ts +0 -19
  175. package/dist/stt/stream_adapter.d.ts.map +0 -1
  176. package/dist/stt/stream_adapter.js +0 -96
  177. package/dist/stt/stream_adapter.js.map +0 -1
  178. package/dist/tokenize.d.ts +0 -15
  179. package/dist/tokenize.d.ts.map +0 -1
  180. package/dist/tokenize.js +0 -12
  181. package/dist/tokenize.js.map +0 -1
  182. package/dist/tts/stream_adapter.d.ts +0 -19
  183. package/dist/tts/stream_adapter.d.ts.map +0 -1
  184. package/dist/tts/stream_adapter.js +0 -111
  185. package/dist/tts/stream_adapter.js.map +0 -1
  186. package/src/stt/stream_adapter.ts +0 -104
  187. package/src/tokenize.ts +0 -22
  188. package/src/tts/stream_adapter.ts +0 -93
package/src/vad.ts CHANGED
@@ -2,70 +2,111 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { AsyncIterableQueue } from './utils.js';
5
6
 
6
7
  export enum VADEventType {
7
- START_OF_SPEECH = 1,
8
- SPEAKING = 2,
9
- END_OF_SPEECH = 3,
8
+ START_OF_SPEECH,
9
+ INFERENCE_DONE,
10
+ END_OF_SPEECH,
10
11
  }
11
12
 
12
13
  export interface VADEvent {
14
+ /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */
13
15
  type: VADEventType;
14
16
  /**
15
- * Index of the samples of the event (when the event was fired)
17
+ * Index of the audio sample where the event occurred, relative to the inference sample rate.
16
18
  */
17
19
  samplesIndex: number;
20
+ /** Timestamp when the event was fired. */
21
+ timestamp: number;
22
+ /** Duration of the detected speech segment in seconds. */
23
+ speechDuration: number;
24
+ /** Duration of the silence segment preceding or following the speech, in seconds. */
25
+ silenceDuration: number;
18
26
  /**
19
- * Duration of speech, in seconds
27
+ * List of audio frames associated with the speech.
28
+ *
29
+ * @remarks
30
+ * - For `START_OF_SPEECH` events, this contains the audio chunks that triggered the detection.
31
+ * - For `INFERENCE_DONE` events, this contains the audio chunks that were processed.
32
+ * - For `END_OF_SPEECH` events, this contains the complete user speech.
20
33
  */
21
- duration: number;
22
- speech: AudioFrame[];
34
+ frames: AudioFrame[];
35
+ /** Probability that speech is present (only for `INFERENCE_DONE` events). */
36
+ probability: number;
37
+ /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */
38
+ inferenceDuration: number;
39
+ /** Indicates whether speech was detected in the frames. */
40
+ speaking: boolean;
41
+ }
42
+
43
+ export interface VADCapabilities {
44
+ updateInterval: number;
23
45
  }
24
46
 
25
47
  export abstract class VAD {
48
+ #capabilities: VADCapabilities;
49
+ constructor(capabilities: VADCapabilities) {
50
+ this.#capabilities = capabilities;
51
+ }
52
+
53
+ get capabilities(): VADCapabilities {
54
+ return this.#capabilities;
55
+ }
56
+
26
57
  /**
27
58
  * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.
28
- *
29
- * @param options
30
59
  */
31
- abstract stream({
32
- minSpeakingDuration,
33
- minSilenceDuration,
34
- paddingDuration,
35
- sampleRate,
36
- maxBufferedSpeech,
37
- }: {
38
- /**
39
- * Minimum duration of speech required to trigger a {@link VADEventType.START_OF_SPEECH} event
40
- */
41
- minSpeakingDuration: number;
42
- /**
43
- * Milliseconds to wait before separating speech chunk.
44
- * Not always precise, generally rounded to the nearest 40ms depending on VAD implementation
45
- */
46
- minSilenceDuration: number;
47
- /**
48
- * Number of frames to pad the start and end of speech with
49
- */
50
- paddingDuration: number;
51
- /**
52
- * Sample rate of inference/processing
53
- */
54
- sampleRate: number;
55
- /**
56
- * Number of seconds the buffer may keep until {@link VADEventType.END_OF_SPEECH} is triggered.
57
- * It is recommended to set this to a positive value, as zero may OOM if the user doesn't stop
58
- * speaking.
59
- */
60
- maxBufferedSpeech: number;
61
- }): VADStream;
60
+ abstract stream(): VADStream;
62
61
  }
63
62
 
64
- export abstract class VADStream implements IterableIterator<VADEvent> {
65
- abstract pushFrame(frame: AudioFrame): void;
66
- abstract close(wait: boolean): Promise<void>;
67
- abstract next(): IteratorResult<VADEvent>;
68
- [Symbol.iterator](): VADStream {
63
+ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
64
+ protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
65
+ protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();
66
+ protected queue = new AsyncIterableQueue<VADEvent>();
67
+ protected closed = false;
68
+
69
+ pushFrame(frame: AudioFrame) {
70
+ if (this.input.closed) {
71
+ throw new Error('Input is closed');
72
+ }
73
+ if (this.closed) {
74
+ throw new Error('Stream is closed');
75
+ }
76
+ this.input.put(frame);
77
+ }
78
+
79
+ flush() {
80
+ if (this.input.closed) {
81
+ throw new Error('Input is closed');
82
+ }
83
+ if (this.closed) {
84
+ throw new Error('Stream is closed');
85
+ }
86
+ this.input.put(VADStream.FLUSH_SENTINEL);
87
+ }
88
+
89
+ endInput() {
90
+ if (this.input.closed) {
91
+ throw new Error('Input is closed');
92
+ }
93
+ if (this.closed) {
94
+ throw new Error('Stream is closed');
95
+ }
96
+ this.input.close();
97
+ }
98
+
99
+ next(): Promise<IteratorResult<VADEvent>> {
100
+ return this.queue.next();
101
+ }
102
+
103
+ close() {
104
+ this.input.close();
105
+ this.queue.close();
106
+ this.closed = true;
107
+ }
108
+
109
+ [Symbol.asyncIterator](): VADStream {
69
110
  return this;
70
111
  }
71
112
  }