@livekit/agents 0.4.6 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +17 -0
  2. package/dist/audio.cjs +77 -0
  3. package/dist/audio.cjs.map +1 -0
  4. package/dist/audio.js +48 -37
  5. package/dist/audio.js.map +1 -1
  6. package/dist/cli.cjs +131 -0
  7. package/dist/cli.cjs.map +1 -0
  8. package/dist/cli.js +96 -122
  9. package/dist/cli.js.map +1 -1
  10. package/dist/generator.cjs +36 -0
  11. package/dist/generator.cjs.map +1 -0
  12. package/dist/generator.js +8 -22
  13. package/dist/generator.js.map +1 -1
  14. package/dist/http_server.cjs +72 -0
  15. package/dist/http_server.cjs.map +1 -0
  16. package/dist/http_server.d.ts +1 -1
  17. package/dist/http_server.js +44 -47
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.cjs +78 -0
  20. package/dist/index.cjs.map +1 -0
  21. package/dist/index.js +26 -28
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_executor.cjs +33 -0
  24. package/dist/ipc/job_executor.cjs.map +1 -0
  25. package/dist/ipc/job_executor.js +7 -4
  26. package/dist/ipc/job_executor.js.map +1 -1
  27. package/dist/ipc/job_main.cjs +147 -0
  28. package/dist/ipc/job_main.cjs.map +1 -0
  29. package/dist/ipc/job_main.d.ts +1 -1
  30. package/dist/ipc/job_main.js +103 -103
  31. package/dist/ipc/job_main.js.map +1 -1
  32. package/dist/ipc/message.cjs +17 -0
  33. package/dist/ipc/message.cjs.map +1 -0
  34. package/dist/ipc/message.js +0 -1
  35. package/dist/ipc/message.js.map +1 -1
  36. package/dist/ipc/proc_job_executor.cjs +174 -0
  37. package/dist/ipc/proc_job_executor.cjs.map +1 -0
  38. package/dist/ipc/proc_job_executor.js +130 -126
  39. package/dist/ipc/proc_job_executor.js.map +1 -1
  40. package/dist/ipc/proc_pool.cjs +126 -0
  41. package/dist/ipc/proc_pool.cjs.map +1 -0
  42. package/dist/ipc/proc_pool.js +93 -96
  43. package/dist/ipc/proc_pool.js.map +1 -1
  44. package/dist/job.cjs +230 -0
  45. package/dist/job.cjs.map +1 -0
  46. package/dist/job.d.ts +6 -1
  47. package/dist/job.d.ts.map +1 -1
  48. package/dist/job.js +195 -198
  49. package/dist/job.js.map +1 -1
  50. package/dist/llm/chat_context.cjs +131 -0
  51. package/dist/llm/chat_context.cjs.map +1 -0
  52. package/dist/llm/chat_context.js +98 -86
  53. package/dist/llm/chat_context.js.map +1 -1
  54. package/dist/llm/function_context.cjs +103 -0
  55. package/dist/llm/function_context.cjs.map +1 -0
  56. package/dist/llm/function_context.js +72 -81
  57. package/dist/llm/function_context.js.map +1 -1
  58. package/dist/llm/function_context.test.cjs +218 -0
  59. package/dist/llm/function_context.test.cjs.map +1 -0
  60. package/dist/llm/function_context.test.js +209 -210
  61. package/dist/llm/function_context.test.js.map +1 -1
  62. package/dist/llm/index.cjs +43 -0
  63. package/dist/llm/index.cjs.map +1 -0
  64. package/dist/llm/index.js +22 -6
  65. package/dist/llm/index.js.map +1 -1
  66. package/dist/llm/llm.cjs +76 -0
  67. package/dist/llm/llm.cjs.map +1 -0
  68. package/dist/llm/llm.js +48 -42
  69. package/dist/llm/llm.js.map +1 -1
  70. package/dist/log.cjs +57 -0
  71. package/dist/log.cjs.map +1 -0
  72. package/dist/log.js +27 -26
  73. package/dist/log.js.map +1 -1
  74. package/dist/multimodal/agent_playout.cjs +228 -0
  75. package/dist/multimodal/agent_playout.cjs.map +1 -0
  76. package/dist/multimodal/agent_playout.d.ts +1 -1
  77. package/dist/multimodal/agent_playout.js +193 -180
  78. package/dist/multimodal/agent_playout.js.map +1 -1
  79. package/dist/multimodal/index.cjs +25 -0
  80. package/dist/multimodal/index.cjs.map +1 -0
  81. package/dist/multimodal/index.js +2 -5
  82. package/dist/multimodal/index.js.map +1 -1
  83. package/dist/multimodal/multimodal_agent.cjs +404 -0
  84. package/dist/multimodal/multimodal_agent.cjs.map +1 -0
  85. package/dist/multimodal/multimodal_agent.d.ts +1 -1
  86. package/dist/multimodal/multimodal_agent.js +351 -330
  87. package/dist/multimodal/multimodal_agent.js.map +1 -1
  88. package/dist/pipeline/agent_output.cjs +172 -0
  89. package/dist/pipeline/agent_output.cjs.map +1 -0
  90. package/dist/pipeline/agent_output.js +136 -138
  91. package/dist/pipeline/agent_output.js.map +1 -1
  92. package/dist/pipeline/agent_playout.cjs +169 -0
  93. package/dist/pipeline/agent_playout.cjs.map +1 -0
  94. package/dist/pipeline/agent_playout.js +126 -136
  95. package/dist/pipeline/agent_playout.js.map +1 -1
  96. package/dist/pipeline/human_input.cjs +158 -0
  97. package/dist/pipeline/human_input.cjs.map +1 -0
  98. package/dist/pipeline/human_input.js +124 -125
  99. package/dist/pipeline/human_input.js.map +1 -1
  100. package/dist/pipeline/index.cjs +31 -0
  101. package/dist/pipeline/index.cjs.map +1 -0
  102. package/dist/pipeline/index.js +8 -4
  103. package/dist/pipeline/index.js.map +1 -1
  104. package/dist/pipeline/pipeline_agent.cjs +642 -0
  105. package/dist/pipeline/pipeline_agent.cjs.map +1 -0
  106. package/dist/pipeline/pipeline_agent.js +595 -651
  107. package/dist/pipeline/pipeline_agent.js.map +1 -1
  108. package/dist/pipeline/speech_handle.cjs +128 -0
  109. package/dist/pipeline/speech_handle.cjs.map +1 -0
  110. package/dist/pipeline/speech_handle.js +102 -100
  111. package/dist/pipeline/speech_handle.js.map +1 -1
  112. package/dist/plugin.cjs +46 -0
  113. package/dist/plugin.cjs.map +1 -0
  114. package/dist/plugin.js +20 -20
  115. package/dist/plugin.js.map +1 -1
  116. package/dist/stt/index.cjs +38 -0
  117. package/dist/stt/index.cjs.map +1 -0
  118. package/dist/stt/index.js +13 -5
  119. package/dist/stt/index.js.map +1 -1
  120. package/dist/stt/stream_adapter.cjs +87 -0
  121. package/dist/stt/stream_adapter.cjs.map +1 -0
  122. package/dist/stt/stream_adapter.js +58 -55
  123. package/dist/stt/stream_adapter.js.map +1 -1
  124. package/dist/stt/stt.cjs +98 -0
  125. package/dist/stt/stt.cjs.map +1 -0
  126. package/dist/stt/stt.js +63 -98
  127. package/dist/stt/stt.js.map +1 -1
  128. package/dist/tokenize/basic/basic.cjs +98 -0
  129. package/dist/tokenize/basic/basic.cjs.map +1 -0
  130. package/dist/tokenize/basic/basic.d.ts +1 -1
  131. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  132. package/dist/tokenize/basic/basic.js +56 -45
  133. package/dist/tokenize/basic/basic.js.map +1 -1
  134. package/dist/tokenize/basic/hyphenator.cjs +425 -0
  135. package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
  136. package/dist/tokenize/basic/hyphenator.js +66 -82
  137. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  138. package/dist/tokenize/basic/index.cjs +35 -0
  139. package/dist/tokenize/basic/index.cjs.map +1 -0
  140. package/dist/tokenize/basic/index.js +7 -4
  141. package/dist/tokenize/basic/index.js.map +1 -1
  142. package/dist/tokenize/basic/paragraph.cjs +57 -0
  143. package/dist/tokenize/basic/paragraph.cjs.map +1 -0
  144. package/dist/tokenize/basic/paragraph.js +30 -35
  145. package/dist/tokenize/basic/paragraph.js.map +1 -1
  146. package/dist/tokenize/basic/sentence.cjs +89 -0
  147. package/dist/tokenize/basic/sentence.cjs.map +1 -0
  148. package/dist/tokenize/basic/sentence.d.ts.map +1 -1
  149. package/dist/tokenize/basic/sentence.js +62 -57
  150. package/dist/tokenize/basic/sentence.js.map +1 -1
  151. package/dist/tokenize/basic/word.cjs +44 -0
  152. package/dist/tokenize/basic/word.cjs.map +1 -0
  153. package/dist/tokenize/basic/word.js +17 -20
  154. package/dist/tokenize/basic/word.js.map +1 -1
  155. package/dist/tokenize/index.cjs +55 -0
  156. package/dist/tokenize/index.cjs.map +1 -0
  157. package/dist/tokenize/index.js +18 -7
  158. package/dist/tokenize/index.js.map +1 -1
  159. package/dist/tokenize/token_stream.cjs +164 -0
  160. package/dist/tokenize/token_stream.cjs.map +1 -0
  161. package/dist/tokenize/token_stream.js +133 -139
  162. package/dist/tokenize/token_stream.js.map +1 -1
  163. package/dist/tokenize/tokenizer.cjs +184 -0
  164. package/dist/tokenize/tokenizer.cjs.map +1 -0
  165. package/dist/tokenize/tokenizer.js +138 -99
  166. package/dist/tokenize/tokenizer.js.map +1 -1
  167. package/dist/tokenize/tokenizer.test.cjs +220 -0
  168. package/dist/tokenize/tokenizer.test.cjs.map +1 -0
  169. package/dist/tokenize/tokenizer.test.d.ts +2 -0
  170. package/dist/tokenize/tokenizer.test.d.ts.map +1 -0
  171. package/dist/tokenize/tokenizer.test.js +219 -0
  172. package/dist/tokenize/tokenizer.test.js.map +1 -0
  173. package/dist/transcription.cjs +131 -0
  174. package/dist/transcription.cjs.map +1 -0
  175. package/dist/transcription.js +99 -96
  176. package/dist/transcription.js.map +1 -1
  177. package/dist/tts/index.cjs +38 -0
  178. package/dist/tts/index.cjs.map +1 -0
  179. package/dist/tts/index.js +13 -5
  180. package/dist/tts/index.js.map +1 -1
  181. package/dist/tts/stream_adapter.cjs +78 -0
  182. package/dist/tts/stream_adapter.cjs.map +1 -0
  183. package/dist/tts/stream_adapter.js +50 -47
  184. package/dist/tts/stream_adapter.js.map +1 -1
  185. package/dist/tts/tts.cjs +127 -0
  186. package/dist/tts/tts.cjs.map +1 -0
  187. package/dist/tts/tts.js +90 -120
  188. package/dist/tts/tts.js.map +1 -1
  189. package/dist/utils.cjs +284 -0
  190. package/dist/utils.cjs.map +1 -0
  191. package/dist/utils.js +242 -247
  192. package/dist/utils.js.map +1 -1
  193. package/dist/vad.cjs +92 -0
  194. package/dist/vad.cjs.map +1 -0
  195. package/dist/vad.js +57 -52
  196. package/dist/vad.js.map +1 -1
  197. package/dist/version.cjs +29 -0
  198. package/dist/version.cjs.map +1 -0
  199. package/dist/version.js +4 -4
  200. package/dist/version.js.map +1 -1
  201. package/dist/worker.cjs +577 -0
  202. package/dist/worker.cjs.map +1 -0
  203. package/dist/worker.d.ts +1 -1
  204. package/dist/worker.d.ts.map +1 -1
  205. package/dist/worker.js +512 -484
  206. package/dist/worker.js.map +1 -1
  207. package/package.json +18 -8
  208. package/src/ipc/job_main.ts +66 -64
  209. package/src/job.ts +3 -2
  210. package/src/pipeline/pipeline_agent.ts +23 -23
  211. package/src/tokenize/basic/basic.ts +1 -1
  212. package/src/tokenize/basic/sentence.ts +14 -8
  213. package/src/tokenize/tokenizer.test.ts +255 -0
  214. package/src/worker.ts +1 -0
@@ -1,353 +1,374 @@
1
- import { AudioSource, AudioStream, LocalAudioTrack, RoomEvent, TrackPublishOptions, TrackSource, } from '@livekit/rtc-node';
2
- import { EventEmitter } from 'node:events';
3
- import { AudioByteStream } from '../audio.js';
4
- import * as llm from '../llm/index.js';
5
- import { log } from '../log.js';
6
- import { BasicTranscriptionForwarder } from '../transcription.js';
7
- import { findMicroTrackId } from '../utils.js';
8
- import { AgentPlayout } from './agent_playout.js';
9
- /**
10
- * @internal
11
- * @beta
12
- */
13
- export class RealtimeSession extends EventEmitter {
1
+ import {
2
+ AudioSource,
3
+ AudioStream,
4
+ LocalAudioTrack,
5
+ RoomEvent,
6
+ TrackPublishOptions,
7
+ TrackSource
8
+ } from "@livekit/rtc-node";
9
+ import { EventEmitter } from "node:events";
10
+ import { AudioByteStream } from "../audio.js";
11
+ import * as llm from "../llm/index.js";
12
+ import { log } from "../log.js";
13
+ import { BasicTranscriptionForwarder } from "../transcription.js";
14
+ import { findMicroTrackId } from "../utils.js";
15
+ import { AgentPlayout } from "./agent_playout.js";
16
+ class RealtimeSession extends EventEmitter {
14
17
  }
15
- /**
16
- * @internal
17
- * @beta
18
- */
19
- export class RealtimeModel {
18
+ class RealtimeModel {
20
19
  }
21
- export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
22
- /** @beta */
23
- export class MultimodalAgent extends EventEmitter {
24
- model;
25
- room = null;
26
- linkedParticipant = null;
27
- subscribedTrack = null;
28
- readMicroTask = null;
29
- constructor({ model, chatCtx, fncCtx, }) {
30
- super();
31
- this.model = model;
32
- this.#chatCtx = chatCtx;
33
- this.#fncCtx = fncCtx;
20
+ const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
21
+ class MultimodalAgent extends EventEmitter {
22
+ model;
23
+ room = null;
24
+ linkedParticipant = null;
25
+ subscribedTrack = null;
26
+ readMicroTask = null;
27
+ constructor({
28
+ model,
29
+ chatCtx,
30
+ fncCtx
31
+ }) {
32
+ super();
33
+ this.model = model;
34
+ this.#chatCtx = chatCtx;
35
+ this.#fncCtx = fncCtx;
36
+ }
37
+ #participant = null;
38
+ #agentPublication = null;
39
+ #localTrackSid = null;
40
+ #localSource = null;
41
+ #agentPlayout = null;
42
+ #playingHandle = void 0;
43
+ #logger = log();
44
+ #session = null;
45
+ #fncCtx = void 0;
46
+ #chatCtx = void 0;
47
+ #_started = false;
48
+ #_pendingFunctionCalls = /* @__PURE__ */ new Set();
49
+ #_speaking = false;
50
+ get fncCtx() {
51
+ return this.#fncCtx;
52
+ }
53
+ set fncCtx(ctx) {
54
+ this.#fncCtx = ctx;
55
+ if (this.#session) {
56
+ this.#session.fncCtx = ctx;
34
57
  }
35
- #participant = null;
36
- #agentPublication = null;
37
- #localTrackSid = null;
38
- #localSource = null;
39
- #agentPlayout = null;
40
- #playingHandle = undefined;
41
- #logger = log();
42
- #session = null;
43
- #fncCtx = undefined;
44
- #chatCtx = undefined;
45
- #_started = false;
46
- #_pendingFunctionCalls = new Set();
47
- #_speaking = false;
48
- get fncCtx() {
49
- return this.#fncCtx;
50
- }
51
- set fncCtx(ctx) {
52
- this.#fncCtx = ctx;
53
- if (this.#session) {
54
- this.#session.fncCtx = ctx;
58
+ }
59
+ get #pendingFunctionCalls() {
60
+ return this.#_pendingFunctionCalls;
61
+ }
62
+ set #pendingFunctionCalls(calls) {
63
+ this.#_pendingFunctionCalls = calls;
64
+ this.#updateState();
65
+ }
66
+ get #speaking() {
67
+ return this.#_speaking;
68
+ }
69
+ set #speaking(isSpeaking) {
70
+ this.#_speaking = isSpeaking;
71
+ this.#updateState();
72
+ }
73
+ get #started() {
74
+ return this.#_started;
75
+ }
76
+ set #started(started) {
77
+ this.#_started = started;
78
+ this.#updateState();
79
+ }
80
+ start(room, participant = null) {
81
+ return new Promise(async (resolve, reject) => {
82
+ var _a;
83
+ if (this.#started) {
84
+ reject(new Error("MultimodalAgent already started"));
85
+ }
86
+ this.#updateState();
87
+ room.on(RoomEvent.ParticipantConnected, (participant2) => {
88
+ if (this.linkedParticipant) {
89
+ return;
90
+ }
91
+ this.#linkParticipant(participant2.identity);
92
+ });
93
+ room.on(
94
+ RoomEvent.TrackPublished,
95
+ (trackPublication, participant2) => {
96
+ if (this.linkedParticipant && participant2.identity === this.linkedParticipant.identity && trackPublication.source === TrackSource.SOURCE_MICROPHONE && !trackPublication.subscribed) {
97
+ trackPublication.setSubscribed(true);
98
+ }
99
+ }
100
+ );
101
+ room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
102
+ this.room = room;
103
+ this.#participant = participant;
104
+ this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
105
+ this.#agentPlayout = new AgentPlayout(
106
+ this.#localSource,
107
+ this.model.sampleRate,
108
+ this.model.numChannels,
109
+ this.model.inFrameSize,
110
+ this.model.outFrameSize
111
+ );
112
+ const onPlayoutStarted = () => {
113
+ this.emit("agent_started_speaking");
114
+ this.#speaking = true;
115
+ };
116
+ const onPlayoutStopped = (interrupted) => {
117
+ this.emit("agent_stopped_speaking");
118
+ this.#speaking = false;
119
+ if (this.#playingHandle) {
120
+ let text = this.#playingHandle.transcriptionFwd.text;
121
+ if (interrupted) {
122
+ text += "\u2026";
123
+ }
124
+ const msg = llm.ChatMessage.create({
125
+ role: llm.ChatRole.ASSISTANT,
126
+ text
127
+ });
128
+ if (interrupted) {
129
+ this.emit("agent_speech_interrupted", msg);
130
+ } else {
131
+ this.emit("agent_speech_committed", msg);
132
+ }
133
+ this.#logger.child({ transcription: text, interrupted }).debug("committed agent speech");
134
+ }
135
+ };
136
+ this.#agentPlayout.on("playout_started", onPlayoutStarted);
137
+ this.#agentPlayout.on("playout_stopped", onPlayoutStopped);
138
+ const track = LocalAudioTrack.createAudioTrack("assistant_voice", this.#localSource);
139
+ const options = new TrackPublishOptions();
140
+ options.source = TrackSource.SOURCE_MICROPHONE;
141
+ this.#agentPublication = await ((_a = room.localParticipant) == null ? void 0 : _a.publishTrack(track, options)) || null;
142
+ if (!this.#agentPublication) {
143
+ this.#logger.error("Failed to publish track");
144
+ reject(new Error("Failed to publish track"));
145
+ return;
146
+ }
147
+ await this.#agentPublication.waitForSubscription();
148
+ if (participant) {
149
+ if (typeof participant === "string") {
150
+ this.#linkParticipant(participant);
151
+ } else {
152
+ this.#linkParticipant(participant.identity);
153
+ }
154
+ } else {
155
+ for (const participant2 of room.remoteParticipants.values()) {
156
+ this.#linkParticipant(participant2.identity);
157
+ break;
158
+ }
159
+ }
160
+ this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
161
+ this.#started = true;
162
+ this.#session.on("response_content_added", (message) => {
163
+ var _a2;
164
+ const trFwd = new BasicTranscriptionForwarder(
165
+ this.room,
166
+ this.room.localParticipant.identity,
167
+ this.#getLocalTrackSid(),
168
+ message.responseId
169
+ );
170
+ const handle = (_a2 = this.#agentPlayout) == null ? void 0 : _a2.play(
171
+ message.itemId,
172
+ message.contentIndex,
173
+ trFwd,
174
+ message.textStream,
175
+ message.audioStream
176
+ );
177
+ this.#playingHandle = handle;
178
+ });
179
+ this.#session.on("input_speech_committed", (ev) => {
180
+ var _a2, _b;
181
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
182
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
183
+ if (participantIdentity && trackSid) {
184
+ this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
185
+ } else {
186
+ this.#logger.error("Participant or track not set");
187
+ }
188
+ });
189
+ this.#session.on("input_speech_transcription_completed", (ev) => {
190
+ var _a2, _b;
191
+ const transcription = ev.transcript;
192
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
193
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
194
+ if (participantIdentity && trackSid) {
195
+ this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
196
+ } else {
197
+ this.#logger.error("Participant or track not set");
198
+ }
199
+ const userMsg = llm.ChatMessage.create({
200
+ role: llm.ChatRole.USER,
201
+ text: transcription
202
+ });
203
+ this.emit("user_speech_committed", userMsg);
204
+ this.#logger.child({ transcription }).debug("committed user speech");
205
+ });
206
+ this.#session.on("input_speech_started", (ev) => {
207
+ var _a2, _b;
208
+ if (this.#playingHandle && !this.#playingHandle.done) {
209
+ this.#playingHandle.interrupt();
210
+ this.#session.conversation.item.truncate(
211
+ this.#playingHandle.itemId,
212
+ this.#playingHandle.contentIndex,
213
+ Math.floor(this.#playingHandle.audioSamples / 24e3 * 1e3)
214
+ );
215
+ this.#playingHandle = void 0;
55
216
  }
217
+ const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
218
+ const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
219
+ if (participantIdentity && trackSid) {
220
+ this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
221
+ }
222
+ });
223
+ this.#session.on("input_speech_stopped", (ev) => {
224
+ this.emit("user_stopped_speaking");
225
+ });
226
+ this.#session.on("function_call_started", (ev) => {
227
+ this.#pendingFunctionCalls.add(ev.callId);
228
+ this.#updateState();
229
+ });
230
+ this.#session.on("function_call_completed", (ev) => {
231
+ this.#pendingFunctionCalls.delete(ev.callId);
232
+ this.#updateState();
233
+ });
234
+ this.#session.on("function_call_failed", (ev) => {
235
+ this.#pendingFunctionCalls.delete(ev.callId);
236
+ this.#updateState();
237
+ });
238
+ resolve(this.#session);
239
+ });
240
+ }
241
+ #linkParticipant(participantIdentity) {
242
+ if (!this.room) {
243
+ this.#logger.error("Room is not set");
244
+ return;
56
245
  }
57
- get #pendingFunctionCalls() {
58
- return this.#_pendingFunctionCalls;
246
+ this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
247
+ if (!this.linkedParticipant) {
248
+ this.#logger.error(`Participant with identity ${participantIdentity} not found`);
249
+ return;
59
250
  }
60
- set #pendingFunctionCalls(calls) {
61
- this.#_pendingFunctionCalls = calls;
62
- this.#updateState();
251
+ if (this.linkedParticipant.trackPublications.size > 0) {
252
+ this.#subscribeToMicrophone();
63
253
  }
64
- get #speaking() {
65
- return this.#_speaking;
254
+ for (const publication of this.linkedParticipant.trackPublications.values()) {
255
+ if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
256
+ this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
257
+ break;
258
+ }
66
259
  }
67
- set #speaking(isSpeaking) {
68
- this.#_speaking = isSpeaking;
69
- this.#updateState();
260
+ }
261
+ #subscribeToMicrophone() {
262
+ if (!this.linkedParticipant) {
263
+ this.#logger.error("Participant is not set");
264
+ return;
70
265
  }
71
- get #started() {
72
- return this.#_started;
266
+ let microphonePublication = void 0;
267
+ for (const publication of this.linkedParticipant.trackPublications.values()) {
268
+ if (publication.source === TrackSource.SOURCE_MICROPHONE) {
269
+ microphonePublication = publication;
270
+ break;
271
+ }
73
272
  }
74
- set #started(started) {
75
- this.#_started = started;
76
- this.#updateState();
273
+ if (!microphonePublication) {
274
+ return;
77
275
  }
78
- start(room, participant = null) {
79
- return new Promise(async (resolve, reject) => {
80
- if (this.#started) {
81
- reject(new Error('MultimodalAgent already started'));
82
- }
83
- this.#updateState();
84
- room.on(RoomEvent.ParticipantConnected, (participant) => {
85
- // automatically link to the first participant that connects, if not already linked
86
- if (this.linkedParticipant) {
87
- return;
88
- }
89
- this.#linkParticipant(participant.identity);
90
- });
91
- room.on(RoomEvent.TrackPublished, (trackPublication, participant) => {
92
- if (this.linkedParticipant &&
93
- participant.identity === this.linkedParticipant.identity &&
94
- trackPublication.source === TrackSource.SOURCE_MICROPHONE &&
95
- !trackPublication.subscribed) {
96
- trackPublication.setSubscribed(true);
97
- }
98
- });
99
- room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
100
- this.room = room;
101
- this.#participant = participant;
102
- this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
103
- this.#agentPlayout = new AgentPlayout(this.#localSource, this.model.sampleRate, this.model.numChannels, this.model.inFrameSize, this.model.outFrameSize);
104
- const onPlayoutStarted = () => {
105
- this.emit('agent_started_speaking');
106
- this.#speaking = true;
107
- };
108
- const onPlayoutStopped = (interrupted) => {
109
- this.emit('agent_stopped_speaking');
110
- this.#speaking = false;
111
- if (this.#playingHandle) {
112
- let text = this.#playingHandle.transcriptionFwd.text;
113
- if (interrupted) {
114
- text += '…';
115
- }
116
- const msg = llm.ChatMessage.create({
117
- role: llm.ChatRole.ASSISTANT,
118
- text,
119
- });
120
- if (interrupted) {
121
- this.emit('agent_speech_interrupted', msg);
122
- }
123
- else {
124
- this.emit('agent_speech_committed', msg);
125
- }
126
- this.#logger.child({ transcription: text, interrupted }).debug('committed agent speech');
127
- }
128
- };
129
- this.#agentPlayout.on('playout_started', onPlayoutStarted);
130
- this.#agentPlayout.on('playout_stopped', onPlayoutStopped);
131
- const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);
132
- const options = new TrackPublishOptions();
133
- options.source = TrackSource.SOURCE_MICROPHONE;
134
- this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;
135
- if (!this.#agentPublication) {
136
- this.#logger.error('Failed to publish track');
137
- reject(new Error('Failed to publish track'));
138
- return;
139
- }
140
- await this.#agentPublication.waitForSubscription();
141
- if (participant) {
142
- if (typeof participant === 'string') {
143
- this.#linkParticipant(participant);
144
- }
145
- else {
146
- this.#linkParticipant(participant.identity);
147
- }
148
- }
149
- else {
150
- // No participant specified, try to find the first participant in the room
151
- for (const participant of room.remoteParticipants.values()) {
152
- this.#linkParticipant(participant.identity);
153
- break;
154
- }
155
- }
156
- this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
157
- this.#started = true;
158
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
159
- this.#session.on('response_content_added', (message) => {
160
- // openai.realtime.RealtimeContent
161
- const trFwd = new BasicTranscriptionForwarder(this.room, this.room.localParticipant.identity, this.#getLocalTrackSid(), message.responseId);
162
- const handle = this.#agentPlayout?.play(message.itemId, message.contentIndex, trFwd, message.textStream, message.audioStream);
163
- this.#playingHandle = handle;
164
- });
165
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
166
- this.#session.on('input_speech_committed', (ev) => {
167
- // openai.realtime.InputSpeechCommittedEvent
168
- const participantIdentity = this.linkedParticipant?.identity;
169
- const trackSid = this.subscribedTrack?.sid;
170
- if (participantIdentity && trackSid) {
171
- this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
172
- }
173
- else {
174
- this.#logger.error('Participant or track not set');
175
- }
176
- });
177
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
178
- this.#session.on('input_speech_transcription_completed', (ev) => {
179
- // openai.realtime.InputSpeechTranscriptionCompletedEvent
180
- const transcription = ev.transcript;
181
- const participantIdentity = this.linkedParticipant?.identity;
182
- const trackSid = this.subscribedTrack?.sid;
183
- if (participantIdentity && trackSid) {
184
- this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
185
- }
186
- else {
187
- this.#logger.error('Participant or track not set');
188
- }
189
- const userMsg = llm.ChatMessage.create({
190
- role: llm.ChatRole.USER,
191
- text: transcription,
192
- });
193
- this.emit('user_speech_committed', userMsg);
194
- this.#logger.child({ transcription }).debug('committed user speech');
195
- });
196
- this.#session.on('input_speech_started', (ev) => {
197
- if (this.#playingHandle && !this.#playingHandle.done) {
198
- this.#playingHandle.interrupt();
199
- this.#session.conversation.item.truncate(this.#playingHandle.itemId, this.#playingHandle.contentIndex, Math.floor((this.#playingHandle.audioSamples / 24000) * 1000));
200
- this.#playingHandle = undefined;
201
- }
202
- const participantIdentity = this.linkedParticipant?.identity;
203
- const trackSid = this.subscribedTrack?.sid;
204
- if (participantIdentity && trackSid) {
205
- this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
206
- }
207
- });
208
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
209
- this.#session.on('input_speech_stopped', (ev) => {
210
- this.emit('user_stopped_speaking');
211
- });
212
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
213
- this.#session.on('function_call_started', (ev) => {
214
- this.#pendingFunctionCalls.add(ev.callId);
215
- this.#updateState();
216
- });
217
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
218
- this.#session.on('function_call_completed', (ev) => {
219
- this.#pendingFunctionCalls.delete(ev.callId);
220
- this.#updateState();
221
- });
222
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
223
- this.#session.on('function_call_failed', (ev) => {
224
- this.#pendingFunctionCalls.delete(ev.callId);
225
- this.#updateState();
226
- });
227
- resolve(this.#session);
228
- });
276
+ if (!microphonePublication.subscribed) {
277
+ microphonePublication.setSubscribed(true);
229
278
  }
230
- #linkParticipant(participantIdentity) {
231
- if (!this.room) {
232
- this.#logger.error('Room is not set');
233
- return;
234
- }
235
- this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
236
- if (!this.linkedParticipant) {
237
- this.#logger.error(`Participant with identity ${participantIdentity} not found`);
238
- return;
239
- }
240
- if (this.linkedParticipant.trackPublications.size > 0) {
241
- this.#subscribeToMicrophone();
242
- }
243
- // also check if already subscribed
244
- for (const publication of this.linkedParticipant.trackPublications.values()) {
245
- if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
246
- this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
247
- break;
248
- }
249
- }
279
+ }
280
+ #handleTrackSubscription(track, publication, participant) {
281
+ var _a;
282
+ if (publication.source !== TrackSource.SOURCE_MICROPHONE || participant.identity !== ((_a = this.linkedParticipant) == null ? void 0 : _a.identity)) {
283
+ return;
250
284
  }
251
- #subscribeToMicrophone() {
252
- if (!this.linkedParticipant) {
253
- this.#logger.error('Participant is not set');
254
- return;
255
- }
256
- let microphonePublication = undefined;
257
- for (const publication of this.linkedParticipant.trackPublications.values()) {
258
- if (publication.source === TrackSource.SOURCE_MICROPHONE) {
259
- microphonePublication = publication;
260
- break;
261
- }
262
- }
263
- if (!microphonePublication) {
264
- return;
265
- }
266
- if (!microphonePublication.subscribed) {
267
- microphonePublication.setSubscribed(true);
285
+ const readAudioStreamTask = async (audioStream) => {
286
+ const bstream = new AudioByteStream(
287
+ this.model.sampleRate,
288
+ this.model.numChannels,
289
+ this.model.inFrameSize
290
+ );
291
+ for await (const frame of audioStream) {
292
+ const audioData = frame.data;
293
+ for (const frame2 of bstream.write(audioData.buffer)) {
294
+ this.#session.inputAudioBuffer.append(frame2);
268
295
  }
296
+ }
297
+ };
298
+ this.subscribedTrack = track;
299
+ if (this.readMicroTask) {
300
+ this.readMicroTask.cancel();
269
301
  }
270
- #handleTrackSubscription(track, publication, participant) {
271
- if (publication.source !== TrackSource.SOURCE_MICROPHONE ||
272
- participant.identity !== this.linkedParticipant?.identity) {
273
- return;
274
- }
275
- const readAudioStreamTask = async (audioStream) => {
276
- const bstream = new AudioByteStream(this.model.sampleRate, this.model.numChannels, this.model.inFrameSize);
277
- for await (const frame of audioStream) {
278
- const audioData = frame.data;
279
- for (const frame of bstream.write(audioData.buffer)) {
280
- this.#session.inputAudioBuffer.append(frame);
281
- }
282
- }
283
- };
284
- this.subscribedTrack = track;
285
- if (this.readMicroTask) {
286
- this.readMicroTask.cancel();
287
- }
288
- let cancel;
289
- this.readMicroTask = {
290
- promise: new Promise((resolve, reject) => {
291
- cancel = () => {
292
- reject(new Error('Task cancelled'));
293
- };
294
- readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
295
- .then(resolve)
296
- .catch(reject);
297
- }),
298
- cancel: () => cancel(),
302
+ let cancel;
303
+ this.readMicroTask = {
304
+ promise: new Promise((resolve, reject) => {
305
+ cancel = () => {
306
+ reject(new Error("Task cancelled"));
299
307
  };
308
+ readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels)).then(resolve).catch(reject);
309
+ }),
310
+ cancel: () => cancel()
311
+ };
312
+ }
313
+ #getLocalTrackSid() {
314
+ var _a;
315
+ if (!this.#localTrackSid && this.room && this.room.localParticipant) {
316
+ this.#localTrackSid = findMicroTrackId(this.room, (_a = this.room.localParticipant) == null ? void 0 : _a.identity);
300
317
  }
301
- #getLocalTrackSid() {
302
- if (!this.#localTrackSid && this.room && this.room.localParticipant) {
303
- this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
304
- }
305
- return this.#localTrackSid;
306
- }
307
- #publishTranscription(participantIdentity, trackSid, text, isFinal, id) {
308
- this.#logger.debug(`Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`);
309
- if (!this.room?.localParticipant) {
310
- this.#logger.error('Room or local participant not set');
311
- return;
312
- }
313
- this.room.localParticipant.publishTranscription({
314
- participantIdentity,
315
- trackSid,
316
- segments: [
317
- {
318
- text,
319
- final: isFinal,
320
- id,
321
- startTime: BigInt(0),
322
- endTime: BigInt(0),
323
- language: '',
324
- },
325
- ],
326
- });
318
+ return this.#localTrackSid;
319
+ }
320
+ #publishTranscription(participantIdentity, trackSid, text, isFinal, id) {
321
+ var _a;
322
+ this.#logger.debug(
323
+ `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`
324
+ );
325
+ if (!((_a = this.room) == null ? void 0 : _a.localParticipant)) {
326
+ this.#logger.error("Room or local participant not set");
327
+ return;
327
328
  }
328
- #updateState() {
329
- let newState = 'initializing';
330
- if (this.#pendingFunctionCalls.size > 0) {
331
- newState = 'thinking';
329
+ this.room.localParticipant.publishTranscription({
330
+ participantIdentity,
331
+ trackSid,
332
+ segments: [
333
+ {
334
+ text,
335
+ final: isFinal,
336
+ id,
337
+ startTime: BigInt(0),
338
+ endTime: BigInt(0),
339
+ language: ""
332
340
  }
333
- else if (this.#speaking) {
334
- newState = 'speaking';
335
- }
336
- else if (this.#started) {
337
- newState = 'listening';
338
- }
339
- this.#setState(newState);
341
+ ]
342
+ });
343
+ }
344
+ #updateState() {
345
+ let newState = "initializing";
346
+ if (this.#pendingFunctionCalls.size > 0) {
347
+ newState = "thinking";
348
+ } else if (this.#speaking) {
349
+ newState = "speaking";
350
+ } else if (this.#started) {
351
+ newState = "listening";
340
352
  }
341
- #setState(state) {
342
- if (this.room?.isConnected && this.room.localParticipant) {
343
- const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
344
- if (currentState !== state) {
345
- this.room.localParticipant.setAttributes({
346
- [AGENT_STATE_ATTRIBUTE]: state,
347
- });
348
- this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
349
- }
350
- }
353
+ this.#setState(newState);
354
+ }
355
+ #setState(state) {
356
+ var _a;
357
+ if (((_a = this.room) == null ? void 0 : _a.isConnected) && this.room.localParticipant) {
358
+ const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
359
+ if (currentState !== state) {
360
+ this.room.localParticipant.setAttributes({
361
+ [AGENT_STATE_ATTRIBUTE]: state
362
+ });
363
+ this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
364
+ }
351
365
  }
366
+ }
352
367
  }
368
+ export {
369
+ AGENT_STATE_ATTRIBUTE,
370
+ MultimodalAgent,
371
+ RealtimeModel,
372
+ RealtimeSession
373
+ };
353
374
  //# sourceMappingURL=multimodal_agent.js.map