@livekit/agents 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. package/README.md +17 -0
  2. package/dist/audio.cjs +77 -0
  3. package/dist/audio.cjs.map +1 -0
  4. package/dist/audio.js +48 -37
  5. package/dist/audio.js.map +1 -1
  6. package/dist/cli.cjs +131 -0
  7. package/dist/cli.cjs.map +1 -0
  8. package/dist/cli.js +96 -122
  9. package/dist/cli.js.map +1 -1
  10. package/dist/generator.cjs +36 -0
  11. package/dist/generator.cjs.map +1 -0
  12. package/dist/generator.js +8 -22
  13. package/dist/generator.js.map +1 -1
  14. package/dist/http_server.cjs +72 -0
  15. package/dist/http_server.cjs.map +1 -0
  16. package/dist/http_server.d.ts +1 -1
  17. package/dist/http_server.js +44 -47
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.cjs +78 -0
  20. package/dist/index.cjs.map +1 -0
  21. package/dist/index.js +26 -28
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_executor.cjs +33 -0
  24. package/dist/ipc/job_executor.cjs.map +1 -0
  25. package/dist/ipc/job_executor.js +7 -4
  26. package/dist/ipc/job_executor.js.map +1 -1
  27. package/dist/ipc/job_main.cjs +147 -0
  28. package/dist/ipc/job_main.cjs.map +1 -0
  29. package/dist/ipc/job_main.d.ts +1 -1
  30. package/dist/ipc/job_main.js +103 -103
  31. package/dist/ipc/job_main.js.map +1 -1
  32. package/dist/ipc/message.cjs +17 -0
  33. package/dist/ipc/message.cjs.map +1 -0
  34. package/dist/ipc/message.js +0 -1
  35. package/dist/ipc/message.js.map +1 -1
  36. package/dist/ipc/proc_job_executor.cjs +174 -0
  37. package/dist/ipc/proc_job_executor.cjs.map +1 -0
  38. package/dist/ipc/proc_job_executor.js +130 -126
  39. package/dist/ipc/proc_job_executor.js.map +1 -1
  40. package/dist/ipc/proc_pool.cjs +126 -0
  41. package/dist/ipc/proc_pool.cjs.map +1 -0
  42. package/dist/ipc/proc_pool.js +93 -96
  43. package/dist/ipc/proc_pool.js.map +1 -1
  44. package/dist/job.cjs +230 -0
  45. package/dist/job.cjs.map +1 -0
  46. package/dist/job.js +195 -198
  47. package/dist/job.js.map +1 -1
  48. package/dist/llm/chat_context.cjs +131 -0
  49. package/dist/llm/chat_context.cjs.map +1 -0
  50. package/dist/llm/chat_context.js +98 -86
  51. package/dist/llm/chat_context.js.map +1 -1
  52. package/dist/llm/function_context.cjs +103 -0
  53. package/dist/llm/function_context.cjs.map +1 -0
  54. package/dist/llm/function_context.js +72 -81
  55. package/dist/llm/function_context.js.map +1 -1
  56. package/dist/llm/function_context.test.cjs +218 -0
  57. package/dist/llm/function_context.test.cjs.map +1 -0
  58. package/dist/llm/function_context.test.js +209 -210
  59. package/dist/llm/function_context.test.js.map +1 -1
  60. package/dist/llm/index.cjs +43 -0
  61. package/dist/llm/index.cjs.map +1 -0
  62. package/dist/llm/index.js +22 -6
  63. package/dist/llm/index.js.map +1 -1
  64. package/dist/llm/llm.cjs +76 -0
  65. package/dist/llm/llm.cjs.map +1 -0
  66. package/dist/llm/llm.js +48 -42
  67. package/dist/llm/llm.js.map +1 -1
  68. package/dist/log.cjs +57 -0
  69. package/dist/log.cjs.map +1 -0
  70. package/dist/log.js +27 -26
  71. package/dist/log.js.map +1 -1
  72. package/dist/multimodal/agent_playout.cjs +228 -0
  73. package/dist/multimodal/agent_playout.cjs.map +1 -0
  74. package/dist/multimodal/agent_playout.d.ts +1 -1
  75. package/dist/multimodal/agent_playout.js +193 -180
  76. package/dist/multimodal/agent_playout.js.map +1 -1
  77. package/dist/multimodal/index.cjs +25 -0
  78. package/dist/multimodal/index.cjs.map +1 -0
  79. package/dist/multimodal/index.js +2 -5
  80. package/dist/multimodal/index.js.map +1 -1
  81. package/dist/multimodal/multimodal_agent.cjs +404 -0
  82. package/dist/multimodal/multimodal_agent.cjs.map +1 -0
  83. package/dist/multimodal/multimodal_agent.d.ts +2 -2
  84. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  85. package/dist/multimodal/multimodal_agent.js +351 -303
  86. package/dist/multimodal/multimodal_agent.js.map +1 -1
  87. package/dist/pipeline/agent_output.cjs +172 -0
  88. package/dist/pipeline/agent_output.cjs.map +1 -0
  89. package/dist/pipeline/agent_output.js +136 -138
  90. package/dist/pipeline/agent_output.js.map +1 -1
  91. package/dist/pipeline/agent_playout.cjs +169 -0
  92. package/dist/pipeline/agent_playout.cjs.map +1 -0
  93. package/dist/pipeline/agent_playout.js +126 -136
  94. package/dist/pipeline/agent_playout.js.map +1 -1
  95. package/dist/pipeline/human_input.cjs +158 -0
  96. package/dist/pipeline/human_input.cjs.map +1 -0
  97. package/dist/pipeline/human_input.js +124 -125
  98. package/dist/pipeline/human_input.js.map +1 -1
  99. package/dist/pipeline/index.cjs +31 -0
  100. package/dist/pipeline/index.cjs.map +1 -0
  101. package/dist/pipeline/index.js +8 -4
  102. package/dist/pipeline/index.js.map +1 -1
  103. package/dist/pipeline/pipeline_agent.cjs +642 -0
  104. package/dist/pipeline/pipeline_agent.cjs.map +1 -0
  105. package/dist/pipeline/pipeline_agent.d.ts +1 -0
  106. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  107. package/dist/pipeline/pipeline_agent.js +595 -650
  108. package/dist/pipeline/pipeline_agent.js.map +1 -1
  109. package/dist/pipeline/speech_handle.cjs +128 -0
  110. package/dist/pipeline/speech_handle.cjs.map +1 -0
  111. package/dist/pipeline/speech_handle.js +102 -100
  112. package/dist/pipeline/speech_handle.js.map +1 -1
  113. package/dist/plugin.cjs +46 -0
  114. package/dist/plugin.cjs.map +1 -0
  115. package/dist/plugin.js +20 -20
  116. package/dist/plugin.js.map +1 -1
  117. package/dist/stt/index.cjs +38 -0
  118. package/dist/stt/index.cjs.map +1 -0
  119. package/dist/stt/index.js +13 -5
  120. package/dist/stt/index.js.map +1 -1
  121. package/dist/stt/stream_adapter.cjs +87 -0
  122. package/dist/stt/stream_adapter.cjs.map +1 -0
  123. package/dist/stt/stream_adapter.js +58 -55
  124. package/dist/stt/stream_adapter.js.map +1 -1
  125. package/dist/stt/stt.cjs +98 -0
  126. package/dist/stt/stt.cjs.map +1 -0
  127. package/dist/stt/stt.js +63 -98
  128. package/dist/stt/stt.js.map +1 -1
  129. package/dist/tokenize/basic/basic.cjs +98 -0
  130. package/dist/tokenize/basic/basic.cjs.map +1 -0
  131. package/dist/tokenize/basic/basic.js +56 -45
  132. package/dist/tokenize/basic/basic.js.map +1 -1
  133. package/dist/tokenize/basic/hyphenator.cjs +425 -0
  134. package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
  135. package/dist/tokenize/basic/hyphenator.js +66 -82
  136. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  137. package/dist/tokenize/basic/index.cjs +35 -0
  138. package/dist/tokenize/basic/index.cjs.map +1 -0
  139. package/dist/tokenize/basic/index.js +7 -4
  140. package/dist/tokenize/basic/index.js.map +1 -1
  141. package/dist/tokenize/basic/paragraph.cjs +57 -0
  142. package/dist/tokenize/basic/paragraph.cjs.map +1 -0
  143. package/dist/tokenize/basic/paragraph.js +30 -35
  144. package/dist/tokenize/basic/paragraph.js.map +1 -1
  145. package/dist/tokenize/basic/sentence.cjs +83 -0
  146. package/dist/tokenize/basic/sentence.cjs.map +1 -0
  147. package/dist/tokenize/basic/sentence.js +56 -57
  148. package/dist/tokenize/basic/sentence.js.map +1 -1
  149. package/dist/tokenize/basic/word.cjs +44 -0
  150. package/dist/tokenize/basic/word.cjs.map +1 -0
  151. package/dist/tokenize/basic/word.js +17 -20
  152. package/dist/tokenize/basic/word.js.map +1 -1
  153. package/dist/tokenize/index.cjs +55 -0
  154. package/dist/tokenize/index.cjs.map +1 -0
  155. package/dist/tokenize/index.js +18 -7
  156. package/dist/tokenize/index.js.map +1 -1
  157. package/dist/tokenize/token_stream.cjs +164 -0
  158. package/dist/tokenize/token_stream.cjs.map +1 -0
  159. package/dist/tokenize/token_stream.js +133 -139
  160. package/dist/tokenize/token_stream.js.map +1 -1
  161. package/dist/tokenize/tokenizer.cjs +184 -0
  162. package/dist/tokenize/tokenizer.cjs.map +1 -0
  163. package/dist/tokenize/tokenizer.js +138 -99
  164. package/dist/tokenize/tokenizer.js.map +1 -1
  165. package/dist/transcription.cjs +131 -0
  166. package/dist/transcription.cjs.map +1 -0
  167. package/dist/transcription.d.ts +2 -0
  168. package/dist/transcription.d.ts.map +1 -1
  169. package/dist/transcription.js +99 -93
  170. package/dist/transcription.js.map +1 -1
  171. package/dist/tts/index.cjs +38 -0
  172. package/dist/tts/index.cjs.map +1 -0
  173. package/dist/tts/index.js +13 -5
  174. package/dist/tts/index.js.map +1 -1
  175. package/dist/tts/stream_adapter.cjs +78 -0
  176. package/dist/tts/stream_adapter.cjs.map +1 -0
  177. package/dist/tts/stream_adapter.js +50 -47
  178. package/dist/tts/stream_adapter.js.map +1 -1
  179. package/dist/tts/tts.cjs +127 -0
  180. package/dist/tts/tts.cjs.map +1 -0
  181. package/dist/tts/tts.js +90 -120
  182. package/dist/tts/tts.js.map +1 -1
  183. package/dist/utils.cjs +284 -0
  184. package/dist/utils.cjs.map +1 -0
  185. package/dist/utils.js +242 -247
  186. package/dist/utils.js.map +1 -1
  187. package/dist/vad.cjs +92 -0
  188. package/dist/vad.cjs.map +1 -0
  189. package/dist/vad.js +57 -52
  190. package/dist/vad.js.map +1 -1
  191. package/dist/version.cjs +29 -0
  192. package/dist/version.cjs.map +1 -0
  193. package/dist/version.js +4 -4
  194. package/dist/version.js.map +1 -1
  195. package/dist/worker.cjs +576 -0
  196. package/dist/worker.cjs.map +1 -0
  197. package/dist/worker.d.ts +1 -1
  198. package/dist/worker.js +511 -484
  199. package/dist/worker.js.map +1 -1
  200. package/package.json +23 -7
  201. package/src/ipc/job_main.ts +66 -64
  202. package/src/multimodal/multimodal_agent.ts +29 -2
  203. package/src/pipeline/pipeline_agent.ts +25 -24
  204. package/src/transcription.ts +5 -0
  205. package/.turbo/turbo-build.log +0 -4
  206. package/CHANGELOG.md +0 -165
  207. package/api-extractor.json +0 -20
  208. package/tsconfig.json +0 -16
  209. package/tsconfig.tsbuildinfo +0 -1
@@ -1,326 +1,374 @@
1
- import { AudioSource, AudioStream, LocalAudioTrack, RoomEvent, TrackPublishOptions, TrackSource, } from '@livekit/rtc-node';
2
- import { EventEmitter } from 'node:events';
3
- import { AudioByteStream } from '../audio.js';
4
- import { log } from '../log.js';
5
- import { BasicTranscriptionForwarder } from '../transcription.js';
6
- import { findMicroTrackId } from '../utils.js';
7
- import { AgentPlayout } from './agent_playout.js';
8
- /**
9
- * @internal
10
- * @beta
11
- */
12
- export class RealtimeSession extends EventEmitter {
1
+ import {
2
+ AudioSource,
3
+ AudioStream,
4
+ LocalAudioTrack,
5
+ RoomEvent,
6
+ TrackPublishOptions,
7
+ TrackSource
8
+ } from "@livekit/rtc-node";
9
+ import { EventEmitter } from "node:events";
10
+ import { AudioByteStream } from "../audio.js";
11
+ import * as llm from "../llm/index.js";
12
+ import { log } from "../log.js";
13
+ import { BasicTranscriptionForwarder } from "../transcription.js";
14
+ import { findMicroTrackId } from "../utils.js";
15
+ import { AgentPlayout } from "./agent_playout.js";
16
+ class RealtimeSession extends EventEmitter {
13
17
  }
14
- /**
15
- * @internal
16
- * @beta
17
- */
18
- export class RealtimeModel {
18
+ class RealtimeModel {
19
19
  }
20
- export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
21
- /** @beta */
22
- export class MultimodalAgent extends EventEmitter {
23
- model;
24
- room = null;
25
- linkedParticipant = null;
26
- subscribedTrack = null;
27
- readMicroTask = null;
28
- constructor({ model, chatCtx, fncCtx, }) {
29
- super();
30
- this.model = model;
31
- this.#chatCtx = chatCtx;
32
- this.#fncCtx = fncCtx;
20
+ const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
21
+ class MultimodalAgent extends EventEmitter {
22
+ model;
23
+ room = null;
24
+ linkedParticipant = null;
25
+ subscribedTrack = null;
26
+ readMicroTask = null;
27
+ constructor({
28
+ model,
29
+ chatCtx,
30
+ fncCtx
31
+ }) {
32
+ super();
33
+ this.model = model;
34
+ this.#chatCtx = chatCtx;
35
+ this.#fncCtx = fncCtx;
36
+ }
37
+ #participant = null;
38
+ #agentPublication = null;
39
+ #localTrackSid = null;
40
+ #localSource = null;
41
+ #agentPlayout = null;
42
+ #playingHandle = void 0;
43
+ #logger = log();
44
+ #session = null;
45
+ #fncCtx = void 0;
46
+ #chatCtx = void 0;
47
+ #_started = false;
48
+ #_pendingFunctionCalls = /* @__PURE__ */ new Set();
49
+ #_speaking = false;
50
+ get fncCtx() {
51
+ return this.#fncCtx;
52
+ }
53
+ set fncCtx(ctx) {
54
+ this.#fncCtx = ctx;
55
+ if (this.#session) {
56
+ this.#session.fncCtx = ctx;
33
57
  }
34
- #participant = null;
35
- #agentPublication = null;
36
- #localTrackSid = null;
37
- #localSource = null;
38
- #agentPlayout = null;
39
- #playingHandle = undefined;
40
- #logger = log();
41
- #session = null;
42
- #fncCtx = undefined;
43
- #chatCtx = undefined;
44
- #_started = false;
45
- #_pendingFunctionCalls = new Set();
46
- #_speaking = false;
47
- get fncCtx() {
48
- return this.#fncCtx;
49
- }
50
- set fncCtx(ctx) {
51
- this.#fncCtx = ctx;
52
- if (this.#session) {
53
- this.#session.fncCtx = ctx;
58
+ }
59
+ get #pendingFunctionCalls() {
60
+ return this.#_pendingFunctionCalls;
61
+ }
62
+ set #pendingFunctionCalls(calls) {
63
+ this.#_pendingFunctionCalls = calls;
64
+ this.#updateState();
65
+ }
66
+ get #speaking() {
67
+ return this.#_speaking;
68
+ }
69
+ set #speaking(isSpeaking) {
70
+ this.#_speaking = isSpeaking;
71
+ this.#updateState();
72
+ }
73
+ get #started() {
74
+ return this.#_started;
75
+ }
76
+ set #started(started) {
77
+ this.#_started = started;
78
+ this.#updateState();
79
+ }
80
+ start(room, participant = null) {
81
+ return new Promise(async (resolve, reject) => {
82
+ var _a;
83
+ if (this.#started) {
84
+ reject(new Error("MultimodalAgent already started"));
85
+ }
86
+ this.#updateState();
87
+ room.on(RoomEvent.ParticipantConnected, (participant2) => {
88
+ if (this.linkedParticipant) {
89
+ return;
90
+ }
91
+ this.#linkParticipant(participant2.identity);
92
+ });
93
+ room.on(
94
+ RoomEvent.TrackPublished,
95
+ (trackPublication, participant2) => {
96
+ if (this.linkedParticipant && participant2.identity === this.linkedParticipant.identity && trackPublication.source === TrackSource.SOURCE_MICROPHONE && !trackPublication.subscribed) {
97
+ trackPublication.setSubscribed(true);
98
+ }
99
+ }
100
+ );
101
+ room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
102
+ this.room = room;
103
+ this.#participant = participant;
104
+ this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
105
+ this.#agentPlayout = new AgentPlayout(
106
+ this.#localSource,
107
+ this.model.sampleRate,
108
+ this.model.numChannels,
109
+ this.model.inFrameSize,
110
+ this.model.outFrameSize
111
+ );
112
+ const onPlayoutStarted = () => {
113
+ this.emit("agent_started_speaking");
114
+ this.#speaking = true;
115
+ };
116
+ const onPlayoutStopped = (interrupted) => {
117
+ this.emit("agent_stopped_speaking");
118
+ this.#speaking = false;
119
+ if (this.#playingHandle) {
120
+ let text = this.#playingHandle.transcriptionFwd.text;
121
+ if (interrupted) {
122
+ text += "\u2026";
123
+ }
124
+ const msg = llm.ChatMessage.create({
125
+ role: llm.ChatRole.ASSISTANT,
126
+ text
127
+ });
128
+ if (interrupted) {
129
+ this.emit("agent_speech_interrupted", msg);
130
+ } else {
131
+ this.emit("agent_speech_committed", msg);
132
+ }
133
+ this.#logger.child({ transcription: text, interrupted }).debug("committed agent speech");
134
+ }
135
+ };
136
+ this.#agentPlayout.on("playout_started", onPlayoutStarted);
137
+ this.#agentPlayout.on("playout_stopped", onPlayoutStopped);
138
+ const track = LocalAudioTrack.createAudioTrack("assistant_voice", this.#localSource);
139
+ const options = new TrackPublishOptions();
140
+ options.source = TrackSource.SOURCE_MICROPHONE;
141
+ this.#agentPublication = await ((_a = room.localParticipant) == null ? void 0 : _a.publishTrack(track, options)) || null;
142
+ if (!this.#agentPublication) {
143
+ this.#logger.error("Failed to publish track");
144
+ reject(new Error("Failed to publish track"));
145
+ return;
146
+ }
147
+ await this.#agentPublication.waitForSubscription();
148
+ if (participant) {
149
+ if (typeof participant === "string") {
150
+ this.#linkParticipant(participant);
151
+ } else {
152
+ this.#linkParticipant(participant.identity);
153
+ }
154
+ } else {
155
+ for (const participant2 of room.remoteParticipants.values()) {
156
+ this.#linkParticipant(participant2.identity);
157
+ break;
158
+ }
159
+ }
160
+ this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
161
+ this.#started = true;
162
+ this.#session.on("response_content_added", (message) => {
163
+ var _a2;
164
+ const trFwd = new BasicTranscriptionForwarder(
165
+ this.room,
166
+ this.room.localParticipant.identity,
167
+ this.#getLocalTrackSid(),
168
+ message.responseId
169
+ );
170
+ const handle = (_a2 = this.#agentPlayout) == null ? void 0 : _a2.play(
171
+ message.itemId,
172
+ message.contentIndex,
173
+ trFwd,
174
+ message.textStream,
175
+ message.audioStream
176
+ );
177
+ this.#playingHandle = handle;
178
+ });
179
+ this.#session.on("input_speech_committed", (ev) => {
180
+ var _a2, _b;
181
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
182
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
183
+ if (participantIdentity && trackSid) {
184
+ this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
185
+ } else {
186
+ this.#logger.error("Participant or track not set");
187
+ }
188
+ });
189
+ this.#session.on("input_speech_transcription_completed", (ev) => {
190
+ var _a2, _b;
191
+ const transcription = ev.transcript;
192
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
193
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
194
+ if (participantIdentity && trackSid) {
195
+ this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
196
+ } else {
197
+ this.#logger.error("Participant or track not set");
198
+ }
199
+ const userMsg = llm.ChatMessage.create({
200
+ role: llm.ChatRole.USER,
201
+ text: transcription
202
+ });
203
+ this.emit("user_speech_committed", userMsg);
204
+ this.#logger.child({ transcription }).debug("committed user speech");
205
+ });
206
+ this.#session.on("input_speech_started", (ev) => {
207
+ var _a2, _b;
208
+ if (this.#playingHandle && !this.#playingHandle.done) {
209
+ this.#playingHandle.interrupt();
210
+ this.#session.conversation.item.truncate(
211
+ this.#playingHandle.itemId,
212
+ this.#playingHandle.contentIndex,
213
+ Math.floor(this.#playingHandle.audioSamples / 24e3 * 1e3)
214
+ );
215
+ this.#playingHandle = void 0;
54
216
  }
217
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
218
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
219
+ if (participantIdentity && trackSid) {
220
+ this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
221
+ }
222
+ });
223
+ this.#session.on("input_speech_stopped", (ev) => {
224
+ this.emit("user_stopped_speaking");
225
+ });
226
+ this.#session.on("function_call_started", (ev) => {
227
+ this.#pendingFunctionCalls.add(ev.callId);
228
+ this.#updateState();
229
+ });
230
+ this.#session.on("function_call_completed", (ev) => {
231
+ this.#pendingFunctionCalls.delete(ev.callId);
232
+ this.#updateState();
233
+ });
234
+ this.#session.on("function_call_failed", (ev) => {
235
+ this.#pendingFunctionCalls.delete(ev.callId);
236
+ this.#updateState();
237
+ });
238
+ resolve(this.#session);
239
+ });
240
+ }
241
+ #linkParticipant(participantIdentity) {
242
+ if (!this.room) {
243
+ this.#logger.error("Room is not set");
244
+ return;
55
245
  }
56
- get #pendingFunctionCalls() {
57
- return this.#_pendingFunctionCalls;
246
+ this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
247
+ if (!this.linkedParticipant) {
248
+ this.#logger.error(`Participant with identity ${participantIdentity} not found`);
249
+ return;
58
250
  }
59
- set #pendingFunctionCalls(calls) {
60
- this.#_pendingFunctionCalls = calls;
61
- this.#updateState();
251
+ if (this.linkedParticipant.trackPublications.size > 0) {
252
+ this.#subscribeToMicrophone();
62
253
  }
63
- get #speaking() {
64
- return this.#_speaking;
254
+ for (const publication of this.linkedParticipant.trackPublications.values()) {
255
+ if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
256
+ this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
257
+ break;
258
+ }
65
259
  }
66
- set #speaking(isSpeaking) {
67
- this.#_speaking = isSpeaking;
68
- this.#updateState();
260
+ }
261
+ #subscribeToMicrophone() {
262
+ if (!this.linkedParticipant) {
263
+ this.#logger.error("Participant is not set");
264
+ return;
69
265
  }
70
- get #started() {
71
- return this.#_started;
266
+ let microphonePublication = void 0;
267
+ for (const publication of this.linkedParticipant.trackPublications.values()) {
268
+ if (publication.source === TrackSource.SOURCE_MICROPHONE) {
269
+ microphonePublication = publication;
270
+ break;
271
+ }
72
272
  }
73
- set #started(started) {
74
- this.#_started = started;
75
- this.#updateState();
273
+ if (!microphonePublication) {
274
+ return;
76
275
  }
77
- start(room, participant = null) {
78
- return new Promise(async (resolve, reject) => {
79
- if (this.#started) {
80
- reject(new Error('MultimodalAgent already started'));
81
- }
82
- this.#updateState();
83
- room.on(RoomEvent.ParticipantConnected, (participant) => {
84
- // automatically link to the first participant that connects, if not already linked
85
- if (this.linkedParticipant) {
86
- return;
87
- }
88
- this.#linkParticipant(participant.identity);
89
- });
90
- room.on(RoomEvent.TrackPublished, (trackPublication, participant) => {
91
- if (this.linkedParticipant &&
92
- participant.identity === this.linkedParticipant.identity &&
93
- trackPublication.source === TrackSource.SOURCE_MICROPHONE &&
94
- !trackPublication.subscribed) {
95
- trackPublication.setSubscribed(true);
96
- }
97
- });
98
- room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
99
- this.room = room;
100
- this.#participant = participant;
101
- this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
102
- this.#agentPlayout = new AgentPlayout(this.#localSource, this.model.sampleRate, this.model.numChannels, this.model.inFrameSize, this.model.outFrameSize);
103
- const onPlayoutStarted = () => {
104
- this.emit('agent_started_speaking');
105
- this.#speaking = true;
106
- };
107
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
108
- const onPlayoutStopped = (interrupted) => {
109
- this.emit('agent_stopped_speaking');
110
- this.#speaking = false;
111
- };
112
- this.#agentPlayout.on('playout_started', onPlayoutStarted);
113
- this.#agentPlayout.on('playout_stopped', onPlayoutStopped);
114
- const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);
115
- const options = new TrackPublishOptions();
116
- options.source = TrackSource.SOURCE_MICROPHONE;
117
- this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;
118
- if (!this.#agentPublication) {
119
- this.#logger.error('Failed to publish track');
120
- reject(new Error('Failed to publish track'));
121
- return;
122
- }
123
- await this.#agentPublication.waitForSubscription();
124
- if (participant) {
125
- if (typeof participant === 'string') {
126
- this.#linkParticipant(participant);
127
- }
128
- else {
129
- this.#linkParticipant(participant.identity);
130
- }
131
- }
132
- else {
133
- // No participant specified, try to find the first participant in the room
134
- for (const participant of room.remoteParticipants.values()) {
135
- this.#linkParticipant(participant.identity);
136
- break;
137
- }
138
- }
139
- this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
140
- this.#started = true;
141
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
142
- this.#session.on('response_content_added', (message) => {
143
- // openai.realtime.RealtimeContent
144
- const trFwd = new BasicTranscriptionForwarder(this.room, this.room.localParticipant.identity, this.#getLocalTrackSid(), message.responseId);
145
- const handle = this.#agentPlayout?.play(message.itemId, message.contentIndex, trFwd, message.textStream, message.audioStream);
146
- this.#playingHandle = handle;
147
- });
148
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
149
- this.#session.on('input_speech_committed', (ev) => {
150
- // openai.realtime.InputSpeechCommittedEvent
151
- const participantIdentity = this.linkedParticipant?.identity;
152
- const trackSid = this.subscribedTrack?.sid;
153
- if (participantIdentity && trackSid) {
154
- this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
155
- }
156
- else {
157
- this.#logger.error('Participant or track not set');
158
- }
159
- });
160
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
161
- this.#session.on('input_speech_transcription_completed', (ev) => {
162
- // openai.realtime.InputSpeechTranscriptionCompletedEvent
163
- const transcription = ev.transcript;
164
- const participantIdentity = this.linkedParticipant?.identity;
165
- const trackSid = this.subscribedTrack?.sid;
166
- if (participantIdentity && trackSid) {
167
- this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
168
- }
169
- else {
170
- this.#logger.error('Participant or track not set');
171
- }
172
- });
173
- this.#session.on('input_speech_started', (ev) => {
174
- if (this.#playingHandle && !this.#playingHandle.done) {
175
- this.#playingHandle.interrupt();
176
- this.#session.conversation.item.truncate(this.#playingHandle.itemId, this.#playingHandle.contentIndex, Math.floor((this.#playingHandle.audioSamples / 24000) * 1000));
177
- this.#playingHandle = undefined;
178
- }
179
- const participantIdentity = this.linkedParticipant?.identity;
180
- const trackSid = this.subscribedTrack?.sid;
181
- if (participantIdentity && trackSid) {
182
- this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
183
- }
184
- });
185
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
186
- this.#session.on('function_call_started', (ev) => {
187
- this.#pendingFunctionCalls.add(ev.callId);
188
- this.#updateState();
189
- });
190
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
191
- this.#session.on('function_call_completed', (ev) => {
192
- this.#pendingFunctionCalls.delete(ev.callId);
193
- this.#updateState();
194
- });
195
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
196
- this.#session.on('function_call_failed', (ev) => {
197
- this.#pendingFunctionCalls.delete(ev.callId);
198
- this.#updateState();
199
- });
200
- resolve(this.#session);
201
- });
276
+ if (!microphonePublication.subscribed) {
277
+ microphonePublication.setSubscribed(true);
202
278
  }
203
- #linkParticipant(participantIdentity) {
204
- if (!this.room) {
205
- this.#logger.error('Room is not set');
206
- return;
207
- }
208
- this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
209
- if (!this.linkedParticipant) {
210
- this.#logger.error(`Participant with identity ${participantIdentity} not found`);
211
- return;
212
- }
213
- if (this.linkedParticipant.trackPublications.size > 0) {
214
- this.#subscribeToMicrophone();
215
- }
216
- // also check if already subscribed
217
- for (const publication of this.linkedParticipant.trackPublications.values()) {
218
- if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
219
- this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
220
- break;
221
- }
222
- }
279
+ }
280
+ #handleTrackSubscription(track, publication, participant) {
281
+ var _a;
282
+ if (publication.source !== TrackSource.SOURCE_MICROPHONE || participant.identity !== ((_a = this.linkedParticipant) == null ? void 0 : _a.identity)) {
283
+ return;
223
284
  }
224
- #subscribeToMicrophone() {
225
- if (!this.linkedParticipant) {
226
- this.#logger.error('Participant is not set');
227
- return;
228
- }
229
- let microphonePublication = undefined;
230
- for (const publication of this.linkedParticipant.trackPublications.values()) {
231
- if (publication.source === TrackSource.SOURCE_MICROPHONE) {
232
- microphonePublication = publication;
233
- break;
234
- }
235
- }
236
- if (!microphonePublication) {
237
- return;
238
- }
239
- if (!microphonePublication.subscribed) {
240
- microphonePublication.setSubscribed(true);
285
+ const readAudioStreamTask = async (audioStream) => {
286
+ const bstream = new AudioByteStream(
287
+ this.model.sampleRate,
288
+ this.model.numChannels,
289
+ this.model.inFrameSize
290
+ );
291
+ for await (const frame of audioStream) {
292
+ const audioData = frame.data;
293
+ for (const frame2 of bstream.write(audioData.buffer)) {
294
+ this.#session.inputAudioBuffer.append(frame2);
241
295
  }
296
+ }
297
+ };
298
+ this.subscribedTrack = track;
299
+ if (this.readMicroTask) {
300
+ this.readMicroTask.cancel();
242
301
  }
243
- #handleTrackSubscription(track, publication, participant) {
244
- if (publication.source !== TrackSource.SOURCE_MICROPHONE ||
245
- participant.identity !== this.linkedParticipant?.identity) {
246
- return;
247
- }
248
- const readAudioStreamTask = async (audioStream) => {
249
- const bstream = new AudioByteStream(this.model.sampleRate, this.model.numChannels, this.model.inFrameSize);
250
- for await (const frame of audioStream) {
251
- const audioData = frame.data;
252
- for (const frame of bstream.write(audioData.buffer)) {
253
- this.#session.inputAudioBuffer.append(frame);
254
- }
255
- }
256
- };
257
- this.subscribedTrack = track;
258
- if (this.readMicroTask) {
259
- this.readMicroTask.cancel();
260
- }
261
- let cancel;
262
- this.readMicroTask = {
263
- promise: new Promise((resolve, reject) => {
264
- cancel = () => {
265
- reject(new Error('Task cancelled'));
266
- };
267
- readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
268
- .then(resolve)
269
- .catch(reject);
270
- }),
271
- cancel: () => cancel(),
302
+ let cancel;
303
+ this.readMicroTask = {
304
+ promise: new Promise((resolve, reject) => {
305
+ cancel = () => {
306
+ reject(new Error("Task cancelled"));
272
307
  };
308
+ readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels)).then(resolve).catch(reject);
309
+ }),
310
+ cancel: () => cancel()
311
+ };
312
+ }
313
+ #getLocalTrackSid() {
314
+ var _a;
315
+ if (!this.#localTrackSid && this.room && this.room.localParticipant) {
316
+ this.#localTrackSid = findMicroTrackId(this.room, (_a = this.room.localParticipant) == null ? void 0 : _a.identity);
273
317
  }
274
- #getLocalTrackSid() {
275
- if (!this.#localTrackSid && this.room && this.room.localParticipant) {
276
- this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
277
- }
278
- return this.#localTrackSid;
279
- }
280
- #publishTranscription(participantIdentity, trackSid, text, isFinal, id) {
281
- this.#logger.debug(`Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`);
282
- if (!this.room?.localParticipant) {
283
- this.#logger.error('Room or local participant not set');
284
- return;
285
- }
286
- this.room.localParticipant.publishTranscription({
287
- participantIdentity,
288
- trackSid,
289
- segments: [
290
- {
291
- text,
292
- final: isFinal,
293
- id,
294
- startTime: BigInt(0),
295
- endTime: BigInt(0),
296
- language: '',
297
- },
298
- ],
299
- });
318
+ return this.#localTrackSid;
319
+ }
320
+ #publishTranscription(participantIdentity, trackSid, text, isFinal, id) {
321
+ var _a;
322
+ this.#logger.debug(
323
+ `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`
324
+ );
325
+ if (!((_a = this.room) == null ? void 0 : _a.localParticipant)) {
326
+ this.#logger.error("Room or local participant not set");
327
+ return;
300
328
  }
301
- #updateState() {
302
- let newState = 'initializing';
303
- if (this.#pendingFunctionCalls.size > 0) {
304
- newState = 'thinking';
329
+ this.room.localParticipant.publishTranscription({
330
+ participantIdentity,
331
+ trackSid,
332
+ segments: [
333
+ {
334
+ text,
335
+ final: isFinal,
336
+ id,
337
+ startTime: BigInt(0),
338
+ endTime: BigInt(0),
339
+ language: ""
305
340
  }
306
- else if (this.#speaking) {
307
- newState = 'speaking';
308
- }
309
- else if (this.#started) {
310
- newState = 'listening';
311
- }
312
- this.#setState(newState);
341
+ ]
342
+ });
343
+ }
344
+ #updateState() {
345
+ let newState = "initializing";
346
+ if (this.#pendingFunctionCalls.size > 0) {
347
+ newState = "thinking";
348
+ } else if (this.#speaking) {
349
+ newState = "speaking";
350
+ } else if (this.#started) {
351
+ newState = "listening";
313
352
  }
314
- #setState(state) {
315
- if (this.room?.isConnected && this.room.localParticipant) {
316
- const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
317
- if (currentState !== state) {
318
- this.room.localParticipant.setAttributes({
319
- [AGENT_STATE_ATTRIBUTE]: state,
320
- });
321
- this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
322
- }
323
- }
353
+ this.#setState(newState);
354
+ }
355
+ #setState(state) {
356
+ var _a;
357
+ if (((_a = this.room) == null ? void 0 : _a.isConnected) && this.room.localParticipant) {
358
+ const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
359
+ if (currentState !== state) {
360
+ this.room.localParticipant.setAttributes({
361
+ [AGENT_STATE_ATTRIBUTE]: state
362
+ });
363
+ this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
364
+ }
324
365
  }
366
+ }
325
367
  }
368
+ export {
369
+ AGENT_STATE_ATTRIBUTE,
370
+ MultimodalAgent,
371
+ RealtimeModel,
372
+ RealtimeSession
373
+ };
326
374
  //# sourceMappingURL=multimodal_agent.js.map