@livekit/agents 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/README.md +17 -0
  2. package/dist/audio.cjs +77 -0
  3. package/dist/audio.cjs.map +1 -0
  4. package/dist/audio.js +48 -37
  5. package/dist/audio.js.map +1 -1
  6. package/dist/cli.cjs +131 -0
  7. package/dist/cli.cjs.map +1 -0
  8. package/dist/cli.js +96 -122
  9. package/dist/cli.js.map +1 -1
  10. package/dist/generator.cjs +36 -0
  11. package/dist/generator.cjs.map +1 -0
  12. package/dist/generator.js +8 -22
  13. package/dist/generator.js.map +1 -1
  14. package/dist/http_server.cjs +72 -0
  15. package/dist/http_server.cjs.map +1 -0
  16. package/dist/http_server.d.ts +1 -1
  17. package/dist/http_server.js +44 -47
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.cjs +78 -0
  20. package/dist/index.cjs.map +1 -0
  21. package/dist/index.js +26 -28
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_executor.cjs +33 -0
  24. package/dist/ipc/job_executor.cjs.map +1 -0
  25. package/dist/ipc/job_executor.js +7 -4
  26. package/dist/ipc/job_executor.js.map +1 -1
  27. package/dist/ipc/job_main.cjs +147 -0
  28. package/dist/ipc/job_main.cjs.map +1 -0
  29. package/dist/ipc/job_main.d.ts +1 -1
  30. package/dist/ipc/job_main.js +103 -103
  31. package/dist/ipc/job_main.js.map +1 -1
  32. package/dist/ipc/message.cjs +17 -0
  33. package/dist/ipc/message.cjs.map +1 -0
  34. package/dist/ipc/message.js +0 -1
  35. package/dist/ipc/message.js.map +1 -1
  36. package/dist/ipc/proc_job_executor.cjs +174 -0
  37. package/dist/ipc/proc_job_executor.cjs.map +1 -0
  38. package/dist/ipc/proc_job_executor.js +130 -126
  39. package/dist/ipc/proc_job_executor.js.map +1 -1
  40. package/dist/ipc/proc_pool.cjs +126 -0
  41. package/dist/ipc/proc_pool.cjs.map +1 -0
  42. package/dist/ipc/proc_pool.js +93 -96
  43. package/dist/ipc/proc_pool.js.map +1 -1
  44. package/dist/job.cjs +230 -0
  45. package/dist/job.cjs.map +1 -0
  46. package/dist/job.js +195 -198
  47. package/dist/job.js.map +1 -1
  48. package/dist/llm/chat_context.cjs +131 -0
  49. package/dist/llm/chat_context.cjs.map +1 -0
  50. package/dist/llm/chat_context.js +98 -86
  51. package/dist/llm/chat_context.js.map +1 -1
  52. package/dist/llm/function_context.cjs +103 -0
  53. package/dist/llm/function_context.cjs.map +1 -0
  54. package/dist/llm/function_context.js +72 -81
  55. package/dist/llm/function_context.js.map +1 -1
  56. package/dist/llm/function_context.test.cjs +218 -0
  57. package/dist/llm/function_context.test.cjs.map +1 -0
  58. package/dist/llm/function_context.test.js +209 -210
  59. package/dist/llm/function_context.test.js.map +1 -1
  60. package/dist/llm/index.cjs +43 -0
  61. package/dist/llm/index.cjs.map +1 -0
  62. package/dist/llm/index.js +22 -6
  63. package/dist/llm/index.js.map +1 -1
  64. package/dist/llm/llm.cjs +76 -0
  65. package/dist/llm/llm.cjs.map +1 -0
  66. package/dist/llm/llm.js +48 -42
  67. package/dist/llm/llm.js.map +1 -1
  68. package/dist/log.cjs +57 -0
  69. package/dist/log.cjs.map +1 -0
  70. package/dist/log.js +27 -26
  71. package/dist/log.js.map +1 -1
  72. package/dist/multimodal/agent_playout.cjs +228 -0
  73. package/dist/multimodal/agent_playout.cjs.map +1 -0
  74. package/dist/multimodal/agent_playout.d.ts +1 -1
  75. package/dist/multimodal/agent_playout.js +193 -180
  76. package/dist/multimodal/agent_playout.js.map +1 -1
  77. package/dist/multimodal/index.cjs +25 -0
  78. package/dist/multimodal/index.cjs.map +1 -0
  79. package/dist/multimodal/index.js +2 -5
  80. package/dist/multimodal/index.js.map +1 -1
  81. package/dist/multimodal/multimodal_agent.cjs +404 -0
  82. package/dist/multimodal/multimodal_agent.cjs.map +1 -0
  83. package/dist/multimodal/multimodal_agent.d.ts +1 -1
  84. package/dist/multimodal/multimodal_agent.js +351 -330
  85. package/dist/multimodal/multimodal_agent.js.map +1 -1
  86. package/dist/pipeline/agent_output.cjs +172 -0
  87. package/dist/pipeline/agent_output.cjs.map +1 -0
  88. package/dist/pipeline/agent_output.js +136 -138
  89. package/dist/pipeline/agent_output.js.map +1 -1
  90. package/dist/pipeline/agent_playout.cjs +169 -0
  91. package/dist/pipeline/agent_playout.cjs.map +1 -0
  92. package/dist/pipeline/agent_playout.js +126 -136
  93. package/dist/pipeline/agent_playout.js.map +1 -1
  94. package/dist/pipeline/human_input.cjs +158 -0
  95. package/dist/pipeline/human_input.cjs.map +1 -0
  96. package/dist/pipeline/human_input.js +124 -125
  97. package/dist/pipeline/human_input.js.map +1 -1
  98. package/dist/pipeline/index.cjs +31 -0
  99. package/dist/pipeline/index.cjs.map +1 -0
  100. package/dist/pipeline/index.js +8 -4
  101. package/dist/pipeline/index.js.map +1 -1
  102. package/dist/pipeline/pipeline_agent.cjs +642 -0
  103. package/dist/pipeline/pipeline_agent.cjs.map +1 -0
  104. package/dist/pipeline/pipeline_agent.js +595 -651
  105. package/dist/pipeline/pipeline_agent.js.map +1 -1
  106. package/dist/pipeline/speech_handle.cjs +128 -0
  107. package/dist/pipeline/speech_handle.cjs.map +1 -0
  108. package/dist/pipeline/speech_handle.js +102 -100
  109. package/dist/pipeline/speech_handle.js.map +1 -1
  110. package/dist/plugin.cjs +46 -0
  111. package/dist/plugin.cjs.map +1 -0
  112. package/dist/plugin.js +20 -20
  113. package/dist/plugin.js.map +1 -1
  114. package/dist/stt/index.cjs +38 -0
  115. package/dist/stt/index.cjs.map +1 -0
  116. package/dist/stt/index.js +13 -5
  117. package/dist/stt/index.js.map +1 -1
  118. package/dist/stt/stream_adapter.cjs +87 -0
  119. package/dist/stt/stream_adapter.cjs.map +1 -0
  120. package/dist/stt/stream_adapter.js +58 -55
  121. package/dist/stt/stream_adapter.js.map +1 -1
  122. package/dist/stt/stt.cjs +98 -0
  123. package/dist/stt/stt.cjs.map +1 -0
  124. package/dist/stt/stt.js +63 -98
  125. package/dist/stt/stt.js.map +1 -1
  126. package/dist/tokenize/basic/basic.cjs +98 -0
  127. package/dist/tokenize/basic/basic.cjs.map +1 -0
  128. package/dist/tokenize/basic/basic.js +56 -45
  129. package/dist/tokenize/basic/basic.js.map +1 -1
  130. package/dist/tokenize/basic/hyphenator.cjs +425 -0
  131. package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
  132. package/dist/tokenize/basic/hyphenator.js +66 -82
  133. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  134. package/dist/tokenize/basic/index.cjs +35 -0
  135. package/dist/tokenize/basic/index.cjs.map +1 -0
  136. package/dist/tokenize/basic/index.js +7 -4
  137. package/dist/tokenize/basic/index.js.map +1 -1
  138. package/dist/tokenize/basic/paragraph.cjs +57 -0
  139. package/dist/tokenize/basic/paragraph.cjs.map +1 -0
  140. package/dist/tokenize/basic/paragraph.js +30 -35
  141. package/dist/tokenize/basic/paragraph.js.map +1 -1
  142. package/dist/tokenize/basic/sentence.cjs +83 -0
  143. package/dist/tokenize/basic/sentence.cjs.map +1 -0
  144. package/dist/tokenize/basic/sentence.js +56 -57
  145. package/dist/tokenize/basic/sentence.js.map +1 -1
  146. package/dist/tokenize/basic/word.cjs +44 -0
  147. package/dist/tokenize/basic/word.cjs.map +1 -0
  148. package/dist/tokenize/basic/word.js +17 -20
  149. package/dist/tokenize/basic/word.js.map +1 -1
  150. package/dist/tokenize/index.cjs +55 -0
  151. package/dist/tokenize/index.cjs.map +1 -0
  152. package/dist/tokenize/index.js +18 -7
  153. package/dist/tokenize/index.js.map +1 -1
  154. package/dist/tokenize/token_stream.cjs +164 -0
  155. package/dist/tokenize/token_stream.cjs.map +1 -0
  156. package/dist/tokenize/token_stream.js +133 -139
  157. package/dist/tokenize/token_stream.js.map +1 -1
  158. package/dist/tokenize/tokenizer.cjs +184 -0
  159. package/dist/tokenize/tokenizer.cjs.map +1 -0
  160. package/dist/tokenize/tokenizer.js +138 -99
  161. package/dist/tokenize/tokenizer.js.map +1 -1
  162. package/dist/transcription.cjs +131 -0
  163. package/dist/transcription.cjs.map +1 -0
  164. package/dist/transcription.js +99 -96
  165. package/dist/transcription.js.map +1 -1
  166. package/dist/tts/index.cjs +38 -0
  167. package/dist/tts/index.cjs.map +1 -0
  168. package/dist/tts/index.js +13 -5
  169. package/dist/tts/index.js.map +1 -1
  170. package/dist/tts/stream_adapter.cjs +78 -0
  171. package/dist/tts/stream_adapter.cjs.map +1 -0
  172. package/dist/tts/stream_adapter.js +50 -47
  173. package/dist/tts/stream_adapter.js.map +1 -1
  174. package/dist/tts/tts.cjs +127 -0
  175. package/dist/tts/tts.cjs.map +1 -0
  176. package/dist/tts/tts.js +90 -120
  177. package/dist/tts/tts.js.map +1 -1
  178. package/dist/utils.cjs +284 -0
  179. package/dist/utils.cjs.map +1 -0
  180. package/dist/utils.js +242 -247
  181. package/dist/utils.js.map +1 -1
  182. package/dist/vad.cjs +92 -0
  183. package/dist/vad.cjs.map +1 -0
  184. package/dist/vad.js +57 -52
  185. package/dist/vad.js.map +1 -1
  186. package/dist/version.cjs +29 -0
  187. package/dist/version.cjs.map +1 -0
  188. package/dist/version.js +4 -4
  189. package/dist/version.js.map +1 -1
  190. package/dist/worker.cjs +576 -0
  191. package/dist/worker.cjs.map +1 -0
  192. package/dist/worker.d.ts +1 -1
  193. package/dist/worker.js +511 -484
  194. package/dist/worker.js.map +1 -1
  195. package/package.json +18 -8
  196. package/src/ipc/job_main.ts +66 -64
  197. package/src/pipeline/pipeline_agent.ts +23 -23
@@ -0,0 +1,642 @@
"use strict";
// esbuild-generated CommonJS interop helpers (standard preamble).
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Define lazy enumerable getters on `target` for every key of `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as getters (skipping `except`
// and keys already present on `to`), preserving enumerability.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wrap a required CommonJS module so it can be consumed like an ES module.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Mark the export object as an ES module and copy the exports onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// Public exports of this module.
var pipeline_agent_exports = {};
__export(pipeline_agent_exports, {
  AGENT_STATE_ATTRIBUTE: () => AGENT_STATE_ATTRIBUTE,
  AgentCallContext: () => AgentCallContext,
  VPAEvent: () => VPAEvent,
  VoicePipelineAgent: () => VoicePipelineAgent
});
module.exports = __toCommonJS(pipeline_agent_exports);

// Module dependencies; relative paths resolve to sibling compiled dist files.
var import_rtc_node = require("@livekit/rtc-node");
var import_node_events = __toESM(require("node:events"), 1);
var import_llm = require("../llm/index.cjs");
var import_llm2 = require("../llm/index.cjs");
var import_log = require("../log.cjs");
var import_stt = require("../stt/index.cjs");
var import_basic = require("../tokenize/basic/index.cjs");
var import_tts = require("../tts/index.cjs");
var import_utils = require("../utils.cjs");
var import_agent_output = require("./agent_output.cjs");
var import_agent_playout = require("./agent_playout.cjs");
var import_human_input = require("./human_input.cjs");
var import_speech_handle = require("./speech_handle.cjs");

// Room participant attribute carrying the agent's current state.
const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
/**
 * Events emitted by the voice pipeline agent. Compiled TypeScript numeric
 * enum: maps each name to its ordinal AND each ordinal back to its name.
 */
var VPAEvent = /* @__PURE__ */ ((enumObj) => {
  const names = [
    "USER_STARTED_SPEAKING",
    "USER_STOPPED_SPEAKING",
    "AGENT_STARTED_SPEAKING",
    "AGENT_STOPPED_SPEAKING",
    "USER_SPEECH_COMMITTED",
    "AGENT_SPEECH_COMMITTED",
    "AGENT_SPEECH_INTERRUPTED",
    "FUNCTION_CALLS_COLLECTED",
    "FUNCTION_CALLS_FINISHED"
  ];
  // Forward mapping (name -> value) and reverse mapping (value -> name).
  names.forEach((name, value) => {
    enumObj[enumObj[name] = value] = name;
  });
  return enumObj;
})(VPAEvent || {});
/**
 * Per-call context for AI function invocations: holds the owning agent, the
 * active LLM stream, and arbitrary per-call metadata. The most recently
 * constructed instance is globally retrievable via `getCurrent()`.
 */
class AgentCallContext {
  #agent;
  #llmStream;
  #metadata = /* @__PURE__ */ new Map();
  static #current;

  constructor(agent, llmStream) {
    this.#agent = agent;
    this.#llmStream = llmStream;
    // Constructing a context makes it the "current" one.
    AgentCallContext.#current = this;
  }

  /** The most recently created call context. */
  static getCurrent() {
    return AgentCallContext.#current;
  }

  get agent() {
    return this.#agent;
  }

  get llmStream() {
    return this.#llmStream;
  }

  /** Attach a metadata value to this call. */
  storeMetadata(key, value) {
    this.#metadata.set(key, value);
  }

  /** Read a metadata value; falls back to `orDefault` for missing or falsy entries. */
  getMetadata(key, orDefault = void 0) {
    return this.#metadata.get(key) || orDefault;
  }
}
/** Default LLM hook: forward the chat context and function context straight to the agent's LLM. */
const defaultBeforeLLMCallback = (agent, chatCtx) => agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });

/** Default TTS hook: pass the synthesized text through unchanged. */
const defaultBeforeTTSCallback = (_, text) => text;
// Default transcription options: forward both user and agent transcriptions,
// pace agent transcription at 1x speech speed, and use the basic tokenizers.
const defaultAgentTranscriptionOptions = {
  userTranscription: true,
  agentTranscription: true,
  agentTranscriptionSpeech: 1,
  sentenceTokenizer: new import_basic.SentenceTokenizer(),
  wordTokenizer: new import_basic.WordTokenizer(false),
  hyphenateWord: import_basic.hyphenateWord
};
// Default VoicePipelineAgent options.
// NOTE(review): minEndpointingDelay feeds a setTimeout, so it is in ms;
// interruptSpeechDuration is compared against VAD `speechDuration` — confirm
// the two use the same unit.
const defaultVPAOptions = {
  chatCtx: new import_llm2.ChatContext(),
  allowInterruptions: true,
  interruptSpeechDuration: 50,
  interruptMinWords: 0,
  minEndpointingDelay: 500,
  maxRecursiveFncCalls: 1,
  preemptiveSynthesis: false,
  beforeLLMCallback: defaultBeforeLLMCallback,
  beforeTTSCallback: defaultBeforeTTSCallback,
  transcription: defaultAgentTranscriptionOptions
};
/**
 * Voice agent pipeline: VAD -> STT -> LLM -> TTS, with interruption handling,
 * deferred endpointing validation, and AI function-call execution.
 * Extends EventEmitter and emits `VPAEvent` values.
 */
class VoicePipelineAgent extends import_node_events.default {
  /** Minimum time played for the user speech to be committed to the chat context. */
  MIN_TIME_PLAYED_FOR_COMMIT = 1.5;
  /** Sentinel queued after each speech handle to close the current playout batch. */
  static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
  #vad;
  #stt;
  #llm;
  #tts;
  #opts;
  #humanInput;
  #agentOutput;
  // Resolved once the agent's audio track has been published.
  #trackPublishedFut = new import_utils.Future();
  #pendingAgentReply;
  #agentReplyTask;
  #playingSpeech;
  // Accumulated final user transcript not yet committed to the chat context.
  #transcribedText = "";
  #transcribedInterimText = "";
  #speechQueueOpen = new import_utils.Future();
  #speechQueue = new import_utils.AsyncIterableQueue();
  #lastEndOfSpeechTime;
  #updateStateTask;
  #started = false;
  #room;
  #participant = null;
  #deferredValidation;
  #logger = (0, import_log.log)();
  #agentPublication;

  /**
   * @param vad - voice activity detector
   * @param stt - speech-to-text; wrapped in a StreamAdapter when not streaming
   * @param llm - language model
   * @param tts - text-to-speech; wrapped in a StreamAdapter when not streaming
   * @param opts - pipeline options, merged over `defaultVPAOptions`
   */
  constructor(vad, stt, llm, tts, opts = defaultVPAOptions) {
    super();
    this.#opts = { ...defaultVPAOptions, ...opts };
    // Wrap non-streaming components so the rest of the pipeline can always
    // assume a streaming interface.
    if (!stt.capabilities.streaming) {
      stt = new import_stt.StreamAdapter(stt, vad);
    }
    if (!tts.capabilities.streaming) {
      tts = new import_tts.StreamAdapter(tts, new import_basic.SentenceTokenizer());
    }
    this.#vad = vad;
    this.#stt = stt;
    this.#llm = llm;
    this.#tts = tts;
    this.#deferredValidation = new DeferredReplyValidation(
      this.#validateReplyIfPossible.bind(this),
      this.#opts.minEndpointingDelay
    );
  }

  get fncCtx() {
    return this.#opts.fncCtx;
  }

  set fncCtx(ctx) {
    this.#opts.fncCtx = ctx;
  }

  get chatCtx() {
    return this.#opts.chatCtx;
  }

  get llm() {
    return this.#llm;
  }

  get tts() {
    return this.#tts;
  }

  get stt() {
    return this.#stt;
  }

  get vad() {
    return this.#vad;
  }

  /**
   * Start the voice assistant.
   * @param room - room to attach to
   * @param participant - participant (or identity string) to listen to; when
   *   null, the first participant to connect is linked automatically.
   */
  start(room, participant = null) {
    if (this.#started) {
      throw new Error("voice assistant already started");
    }
    room.on(import_rtc_node.RoomEvent.ParticipantConnected, (joined) => {
      // Only auto-link when no participant has been linked yet.
      if (this.#participant) {
        return;
      }
      this.#linkParticipant(joined.identity);
    });
    this.#room = room;
    this.#participant = participant;
    if (participant) {
      if (typeof participant === "string") {
        this.#linkParticipant(participant);
      } else {
        this.#linkParticipant(participant.identity);
      }
    }
    // BUG FIX: #started was never set, so the double-start guard above could
    // never fire and close() always returned early.
    this.#started = true;
    this.#run(); // deliberately not awaited: runs for the agent's lifetime
  }

  /** Play a speech source through the voice assistant. */
  async say(source, allowInterruptions = true, addToChatCtx = true) {
    await this.#trackPublishedFut.await;
    const newHandle = import_speech_handle.SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
    const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
    newHandle.initialize(source, synthesisHandle);
    this.#addSpeechForPlayout(newHandle);
  }

  /** Publish the agent state attribute, optionally after `delay` ms, cancelling any pending update. */
  #updateState(state, delay = 0) {
    const runTask = (delayMs) =>
      new import_utils.CancellablePromise(async (resolve, _, onCancel) => {
        let cancelled = false;
        onCancel(() => {
          cancelled = true;
        });
        await new Promise((wake) => setTimeout(wake, delayMs));
        if (this.#room?.isConnected && !cancelled) {
          await this.#room.localParticipant?.setAttributes({ [AGENT_STATE_ATTRIBUTE]: state });
        }
        resolve();
      });
    this.#updateStateTask?.cancel();
    this.#updateStateTask = runTask(delay);
  }

  /** Wire up VAD/STT event handling for the given remote participant. */
  #linkParticipant(participantIdentity) {
    if (!this.#room) {
      this.#logger.error("Room is not set");
      return;
    }
    this.#participant = this.#room.remoteParticipants.get(participantIdentity) || null;
    if (!this.#participant) {
      this.#logger.error(`Participant with identity ${participantIdentity} not found`);
      return;
    }
    this.#humanInput = new import_human_input.HumanInput(this.#room, this.#vad, this.#stt, this.#participant);
    this.#humanInput.on(import_human_input.HumanInputEvent.START_OF_SPEECH, (event) => {
      this.emit(0 /* USER_STARTED_SPEAKING */);
      this.#deferredValidation.onHumanStartOfSpeech(event);
    });
    this.#humanInput.on(import_human_input.HumanInputEvent.VAD_INFERENCE_DONE, (event) => {
      if (!this.#trackPublishedFut.done) {
        return;
      }
      if (!this.#agentOutput) {
        throw new Error("agent output is undefined");
      }
      // Duck the agent's playout volume against the user's speech probability.
      let tv = 1;
      if (this.#opts.allowInterruptions) {
        tv = Math.max(0, 1 - event.probability);
        this.#agentOutput.playout.targetVolume = tv;
      }
      if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
        this.#interruptIfPossible();
      }
    });
    this.#humanInput.on(import_human_input.HumanInputEvent.END_OF_SPEECH, (event) => {
      // BUG FIX: previously emitted USER_STARTED_SPEAKING (0) again on end of speech.
      this.emit(1 /* USER_STOPPED_SPEAKING */);
      this.#deferredValidation.onHumanEndOfSpeech(event);
      this.#lastEndOfSpeechTime = Date.now();
    });
    this.#humanInput.on(import_human_input.HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
      this.#transcribedInterimText = event.alternatives[0].text;
    });
    this.#humanInput.on(import_human_input.HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
      const newTranscript = event.alternatives[0].text;
      if (!newTranscript) return;
      this.#logger.child({ userTranscript: newTranscript }).debug("received user transcript");
      this.#transcribedText += (this.#transcribedText ? " " : "") + newTranscript;
      if (
        this.#opts.preemptiveSynthesis &&
        (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)
      ) {
        this.#synthesizeAgentReply();
      }
      this.#deferredValidation.onHumanFinalTranscript(newTranscript);
      // A sufficiently long utterance is itself grounds for an interruption check.
      const words = this.#opts.transcription.wordTokenizer.tokenize(newTranscript);
      if (words.length >= 3) {
        this.#interruptIfPossible();
      }
    });
  }

  /** Publish the agent's audio track, then drain the speech queue forever. */
  async #run() {
    this.#updateState("initializing");
    const audioSource = new import_rtc_node.AudioSource(this.#tts.sampleRate, this.#tts.numChannels);
    const track = import_rtc_node.LocalAudioTrack.createAudioTrack("assistant_voice", audioSource);
    this.#agentPublication = await this.#room?.localParticipant?.publishTrack(
      track,
      new import_rtc_node.TrackPublishOptions({ source: import_rtc_node.TrackSource.SOURCE_MICROPHONE })
    );
    const agentPlayout = new import_agent_playout.AgentPlayout(audioSource);
    this.#agentOutput = new import_agent_output.AgentOutput(agentPlayout, this.#tts);
    agentPlayout.on(import_agent_playout.AgentPlayoutEvent.PLAYOUT_STARTED, () => {
      this.emit(2 /* AGENT_STARTED_SPEAKING */);
      this.#updateState("speaking");
    });
    agentPlayout.on(import_agent_playout.AgentPlayoutEvent.PLAYOUT_STOPPED, (_) => {
      this.emit(3 /* AGENT_STOPPED_SPEAKING */);
      this.#updateState("listening");
    });
    this.#trackPublishedFut.resolve();
    while (true) {
      await this.#speechQueueOpen.await;
      for await (const speech of this.#speechQueue) {
        if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
        this.#playingSpeech = speech;
        await this.#playSpeech(speech);
        this.#playingSpeech = void 0;
      }
      this.#speechQueueOpen = new import_utils.Future();
    }
  }

  /** Begin synthesizing a reply to the transcribed user text, cancelling any pending one. */
  #synthesizeAgentReply() {
    this.#pendingAgentReply?.cancel();
    if (this.#humanInput && this.#humanInput.speaking) {
      this.#updateState("thinking", 200);
    }
    this.#pendingAgentReply = import_speech_handle.SpeechHandle.createAssistantReply(
      this.#opts.allowInterruptions,
      true,
      this.#transcribedText
    );
    const newHandle = this.#pendingAgentReply;
    this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
  }

  /** Build the LLM answer for `handle`, first gracefully cancelling `oldTask`. */
  #synthesizeAnswerTask(oldTask, handle) {
    return new import_utils.CancellablePromise(async (resolve, _, onCancel) => {
      let cancelled = false;
      onCancel(() => {
        cancelled = true;
      });
      if (oldTask) {
        await (0, import_utils.gracefullyCancel)(oldTask);
      }
      const copiedCtx = this.chatCtx.copy();
      const playingSpeech = this.#playingSpeech;
      if (playingSpeech && playingSpeech.initialized) {
        if (
          (!playingSpeech.userQuestion || playingSpeech.userCommitted) &&
          !playingSpeech.speechCommitted
        ) {
          copiedCtx.messages.push(
            import_llm2.ChatMessage.create({
              // TODO(nbsp): uhhh unsure where to get the played text here
              // text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
              role: import_llm2.ChatRole.ASSISTANT
            })
          );
        }
      }
      copiedCtx.messages.push(
        import_llm2.ChatMessage.create({
          text: handle?.userQuestion,
          role: import_llm2.ChatRole.USER
        })
      );
      // NOTE(review): on cancellation this resolves but keeps executing the
      // remaining steps, mirroring the original control flow — confirm whether
      // an early return was intended here.
      if (cancelled) resolve();
      let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
      if (llmStream === false) {
        handle?.cancel();
        return;
      }
      if (cancelled) resolve();
      if (!(llmStream instanceof import_llm.LLMStream)) {
        // The callback returned something else; fall back to the default chat call.
        llmStream = await defaultBeforeLLMCallback(this, copiedCtx);
      }
      if (handle.interrupted) {
        return;
      }
      const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
      handle.initialize(llmStream, synthesisHandle);
      const elapsed = this.#lastEndOfSpeechTime
        ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1e3) / 1e3
        : -1;
      this.#logger.child({ speechId: handle.id, elapsed }).debug("synthesizing agent reply");
      resolve();
    });
  }

  /** Play one speech handle: commit transcripts, run AI tools, update the chat context. */
  async #playSpeech(handle) {
    try {
      await handle.waitForInitialization();
    } catch {
      return;
    }
    await this.#agentPublication.waitForSubscription();
    const synthesisHandle = handle.synthesisHandle;
    if (synthesisHandle.interrupted) return;
    const userQuestion = handle.userQuestion;
    const playHandle = synthesisHandle.play();
    const joinFut = playHandle.join();
    const commitUserQuestionIfNeeded = () => {
      if (!userQuestion || synthesisHandle.interrupted || handle.userCommitted) return;
      const usingTools = handle.source instanceof import_llm.LLMStream && !!handle.source.functionCalls.length;
      // With interruptions allowed (and no tools in flight), wait until enough
      // audio has played before committing the user turn.
      if (
        handle.allowInterruptions &&
        !usingTools &&
        playHandle.timePlayed < this.MIN_TIME_PLAYED_FOR_COMMIT &&
        !joinFut.done
      ) {
        return;
      }
      this.#logger.child({ userTranscript: userQuestion }).debug("committed user transcript");
      const userMsg = import_llm2.ChatMessage.create({ text: userQuestion, role: import_llm2.ChatRole.USER });
      this.chatCtx.messages.push(userMsg);
      this.emit(4 /* USER_SPEECH_COMMITTED */, userMsg);
      this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
      handle.markUserCommitted();
    };
    commitUserQuestionIfNeeded();
    while (!joinFut.done) {
      // Wake at least every 500ms to re-check commit/interruption state.
      await new Promise((resolve) => {
        setTimeout(resolve, 500);
        joinFut.await.then(resolve);
      });
      commitUserQuestionIfNeeded();
      if (handle.interrupted) break;
    }
    commitUserQuestionIfNeeded();
    let collectedText = "";
    const isUsingTools = handle.source instanceof import_llm.LLMStream && !!handle.source.functionCalls.length;
    const extraToolsMessages = [];
    let interrupted = handle.interrupted;
    if (isUsingTools && !interrupted) {
      if (!userQuestion || !handle.userCommitted) {
        throw new Error("user speech should have been committed before using tools");
      }
      const llmStream = handle.source;
      let newFunctionCalls = llmStream.functionCalls;
      // Execute tool calls, feeding results back to the LLM up to the
      // configured recursion limit.
      for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {
        this.emit(7 /* FUNCTION_CALLS_COLLECTED */, newFunctionCalls);
        const calledFuncs = [];
        for (const func of newFunctionCalls) {
          const task = func.func.execute(func.params).then(
            (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
            (error) => ({ name: func.name, toolCallId: func.toolCallId, error })
          );
          calledFuncs.push({ ...func, task });
          this.#logger.child({ function: func.name, speechId: handle.id }).debug("executing AI function");
          try {
            await task;
          } catch {
            this.#logger.child({ function: func.name, speechId: handle.id }).error("error executing AI function");
          }
        }
        const toolCallsInfo = [];
        const toolCallsResults = [];
        for (const fnc of calledFuncs) {
          const task = await fnc.task;
          if (!task || task.result === void 0) continue;
          toolCallsInfo.push(fnc);
          toolCallsResults.push(import_llm2.ChatMessage.createToolFromFunctionResult(task));
        }
        if (!toolCallsInfo.length) break;
        extraToolsMessages.push(import_llm2.ChatMessage.createToolCalls(toolCallsInfo, collectedText));
        extraToolsMessages.push(...toolCallsResults);
        const chatCtx = handle.source.chatCtx.copy();
        chatCtx.messages.push(...extraToolsMessages);
        const answerLLMStream = this.llm.chat({
          chatCtx,
          fncCtx: this.fncCtx
        });
        const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
        handle.synthesisHandle = answerSynthesis;
        const answerPlayHandle = answerSynthesis.play();
        await answerPlayHandle.join().await;
        collectedText = "";
        interrupted = answerSynthesis.interrupted;
        newFunctionCalls = answerLLMStream.functionCalls;
        this.emit(8 /* FUNCTION_CALLS_FINISHED */, calledFuncs);
        if (!newFunctionCalls) break;
      }
    }
    if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
      this.chatCtx.messages.push(...extraToolsMessages);
      if (interrupted) {
        // BUG FIX: was `collectedText + "\u2026"` — an expression with no
        // effect; the ellipsis marking the interruption was never appended.
        collectedText += "\u2026";
      }
      const msg = import_llm2.ChatMessage.create({ text: collectedText, role: import_llm2.ChatRole.ASSISTANT });
      this.chatCtx.messages.push(msg);
      handle.markSpeechCommitted();
      if (interrupted) {
        this.emit(6 /* AGENT_SPEECH_INTERRUPTED */, msg);
      } else {
        this.emit(5 /* AGENT_SPEECH_COMMITTED */, msg);
      }
      this.#logger
        .child({
          agentTranscript: collectedText,
          interrupted,
          speechId: handle.id
        })
        .debug("committed agent speech");
    }
  }

  /** Convert an LLM stream or text source into a TTS synthesis handle. */
  #synthesizeAgentSpeech(speechId, source) {
    if (!this.#agentOutput) {
      throw new Error("agent output should be initialized when ready");
    }
    if (source instanceof import_llm.LLMStream) {
      source = llmStreamToStringIterable(speechId, source);
    }
    const ogSource = source;
    // NOTE(review): the original contained an empty `if` branch here for
    // non-string sources (likely a stripped tee of the async iterable);
    // removed as a no-op.
    const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);
    if (!ttsSource) {
      throw new Error("beforeTTSCallback must return string or AsyncIterable<string>");
    }
    return this.#agentOutput.synthesize(speechId, ttsSource);
  }

  /** Decide whether the pending agent reply should be promoted to playout. */
  async #validateReplyIfPossible() {
    if (this.#playingSpeech && !this.#playingSpeech.allowInterruptions) {
      this.#logger
        .child({ speechId: this.#playingSpeech.id })
        .debug("skipping validation, agent is speaking and does not allow interruptions");
      return;
    }
    if (!this.#pendingAgentReply) {
      if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
        return;
      }
      this.#synthesizeAgentReply();
    }
    if (!this.#pendingAgentReply) {
      throw new Error("pending agent reply is undefined");
    }
    if (this.#speechQueueOpen.done) {
      // NOTE(review): this consumes entries from the shared speech queue while
      // #run may also be iterating it — confirm this is intended.
      for await (const speech of this.#speechQueue) {
        if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
        if (!speech.isReply) continue;
        if (speech.allowInterruptions) speech.interrupt();
      }
    }
    this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug("validated agent reply");
    this.#addSpeechForPlayout(this.#pendingAgentReply);
    this.#pendingAgentReply = void 0;
    this.#transcribedInterimText = "";
  }

  /** Interrupt the currently playing speech if options and state allow it. */
  #interruptIfPossible() {
    if (!this.#playingSpeech || !this.#playingSpeech.allowInterruptions || this.#playingSpeech.interrupted) {
      return;
    }
    if (this.#opts.interruptMinWords !== 0) {
      // Require a minimum number of interim words before interrupting.
      const interimWords = this.#opts.transcription.wordTokenizer.tokenize(
        this.#transcribedInterimText
      );
      if (interimWords.length < this.#opts.interruptMinWords) {
        return;
      }
    }
    this.#playingSpeech.interrupt();
  }

  /** Queue a speech handle followed by the flush sentinel, and open the queue. */
  #addSpeechForPlayout(handle) {
    this.#speechQueue.put(handle);
    this.#speechQueue.put(VoicePipelineAgent.FLUSH_SENTINEL);
    this.#speechQueueOpen.resolve();
  }

  /** Close the voice assistant. */
  async close() {
    if (!this.#started) {
      return;
    }
    this.#room?.removeAllListeners(import_rtc_node.RoomEvent.ParticipantConnected);
  }
}
/**
 * Adapt an LLM stream into an async iterable of non-empty text deltas,
 * logging the time to the first token once.
 */
async function* llmStreamToStringIterable(speechId, stream) {
  const startTime = Date.now();
  let awaitingFirstToken = true;
  for await (const chunk of stream) {
    const delta = chunk.choices[0]?.delta.content;
    if (!delta) continue;
    if (awaitingFirstToken) {
      awaitingFirstToken = false;
      (0, import_log.log)()
        .child({ speechId, elapsed: Math.round(Date.now() - startTime) })
        .debug("received first LLM token");
    }
    yield delta;
  }
}
/**
 * Defers validation of the user's turn until we are reasonably confident they
 * have finished speaking, based on end-of-speech events and final STT
 * transcripts. Punctuated transcripts validate faster.
 */
class DeferredReplyValidation {
  // if the STT gives us punctuation, we can try to validate the reply faster.
  PUNCTUATION = ".!?";
  PUNCTUATION_REDUCE_FACTOR = 0.75;
  /** Seconds a final transcript may arrive late compared to end of speech. */
  LATE_TRANSCRIPT_TOLERANCE = 1.5;
  #validateFunc;
  #validatingPromise;
  #validatingFuture = new import_utils.Future();
  #lastFinalTranscript = "";
  #lastRecvEndOfSpeechTime = 0;
  #speaking = false;
  #endOfSpeechDelay;
  #finalTranscriptDelay;

  /**
   * @param validateFunc - async callback invoked once the deferral elapses
   * @param minEndpointingDelay - base deferral delay in milliseconds
   */
  constructor(validateFunc, minEndpointingDelay) {
    this.#validateFunc = validateFunc;
    this.#endOfSpeechDelay = minEndpointingDelay;
    this.#finalTranscriptDelay = minEndpointingDelay;
  }

  get validating() {
    return !this.#validatingFuture.done;
  }

  onHumanFinalTranscript(transcript) {
    this.#lastFinalTranscript = transcript.trim();
    if (this.#speaking) return;
    // BUG FIX: the tolerance is in seconds but Date.now() differences are in
    // milliseconds; convert before comparing (was effectively always false).
    const hasRecentEndOfSpeech =
      Date.now() - this.#lastRecvEndOfSpeechTime < this.LATE_TRANSCRIPT_TOLERANCE * 1000;
    let delay = hasRecentEndOfSpeech ? this.#endOfSpeechDelay : this.#finalTranscriptDelay;
    // BUG FIX: the ternary previously replaced the delay with the constant 1
    // (ms) for unpunctuated transcripts instead of leaving it unscaled.
    delay = delay * (this.#endWithPunctuation() ? this.PUNCTUATION_REDUCE_FACTOR : 1);
    this.#run(delay);
  }

  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  onHumanStartOfSpeech(_) {
    this.#speaking = true;
  }

  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  onHumanEndOfSpeech(_) {
    this.#speaking = false;
    this.#lastRecvEndOfSpeechTime = Date.now();
    if (this.#lastFinalTranscript) {
      // BUG FIX: as above, the punctuation ternary must scale the delay,
      // not collapse it to 1ms.
      const delay =
        this.#endOfSpeechDelay * (this.#endWithPunctuation() ? this.PUNCTUATION_REDUCE_FACTOR : 1);
      this.#run(delay);
    }
  }

  // TODO(nbsp): aclose

  #endWithPunctuation() {
    return (
      this.#lastFinalTranscript.length > 0 &&
      this.PUNCTUATION.includes(this.#lastFinalTranscript[this.#lastFinalTranscript.length - 1])
    );
  }

  #resetStates() {
    this.#lastFinalTranscript = "";
    this.#lastRecvEndOfSpeechTime = 0;
  }

  #run(delay) {
    const runTask = async (delayMs) => {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
      this.#resetStates();
      await this.#validateFunc();
    };
    this.#validatingFuture = new import_utils.Future();
    // NOTE(review): the promise is stored but never awaited or cancelled;
    // a newer #run does not stop an earlier timer.
    this.#validatingPromise = runTask(delay);
  }
}
635
+ // Annotate the CommonJS export names for ESM import in node:
636
+ 0 && (module.exports = {
637
+ AGENT_STATE_ATTRIBUTE,
638
+ AgentCallContext,
639
+ VPAEvent,
640
+ VoicePipelineAgent
641
+ });
642
+ //# sourceMappingURL=pipeline_agent.cjs.map