@livekit/agents 0.4.6 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -0
- package/dist/audio.cjs +77 -0
- package/dist/audio.cjs.map +1 -0
- package/dist/audio.js +48 -37
- package/dist/audio.js.map +1 -1
- package/dist/cli.cjs +131 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.js +96 -122
- package/dist/cli.js.map +1 -1
- package/dist/generator.cjs +36 -0
- package/dist/generator.cjs.map +1 -0
- package/dist/generator.js +8 -22
- package/dist/generator.js.map +1 -1
- package/dist/http_server.cjs +72 -0
- package/dist/http_server.cjs.map +1 -0
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.js +44 -47
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +78 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +26 -28
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.cjs +33 -0
- package/dist/ipc/job_executor.cjs.map +1 -0
- package/dist/ipc/job_executor.js +7 -4
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_main.cjs +147 -0
- package/dist/ipc/job_main.cjs.map +1 -0
- package/dist/ipc/job_main.d.ts +1 -1
- package/dist/ipc/job_main.js +103 -103
- package/dist/ipc/job_main.js.map +1 -1
- package/dist/ipc/message.cjs +17 -0
- package/dist/ipc/message.cjs.map +1 -0
- package/dist/ipc/message.js +0 -1
- package/dist/ipc/message.js.map +1 -1
- package/dist/ipc/proc_job_executor.cjs +174 -0
- package/dist/ipc/proc_job_executor.cjs.map +1 -0
- package/dist/ipc/proc_job_executor.js +130 -126
- package/dist/ipc/proc_job_executor.js.map +1 -1
- package/dist/ipc/proc_pool.cjs +126 -0
- package/dist/ipc/proc_pool.cjs.map +1 -0
- package/dist/ipc/proc_pool.js +93 -96
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/job.cjs +230 -0
- package/dist/job.cjs.map +1 -0
- package/dist/job.js +195 -198
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.cjs +131 -0
- package/dist/llm/chat_context.cjs.map +1 -0
- package/dist/llm/chat_context.js +98 -86
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/function_context.cjs +103 -0
- package/dist/llm/function_context.cjs.map +1 -0
- package/dist/llm/function_context.js +72 -81
- package/dist/llm/function_context.js.map +1 -1
- package/dist/llm/function_context.test.cjs +218 -0
- package/dist/llm/function_context.test.cjs.map +1 -0
- package/dist/llm/function_context.test.js +209 -210
- package/dist/llm/function_context.test.js.map +1 -1
- package/dist/llm/index.cjs +43 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.js +22 -6
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +76 -0
- package/dist/llm/llm.cjs.map +1 -0
- package/dist/llm/llm.js +48 -42
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +57 -0
- package/dist/log.cjs.map +1 -0
- package/dist/log.js +27 -26
- package/dist/log.js.map +1 -1
- package/dist/multimodal/agent_playout.cjs +228 -0
- package/dist/multimodal/agent_playout.cjs.map +1 -0
- package/dist/multimodal/agent_playout.d.ts +1 -1
- package/dist/multimodal/agent_playout.js +193 -180
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/index.cjs +25 -0
- package/dist/multimodal/index.cjs.map +1 -0
- package/dist/multimodal/index.js +2 -5
- package/dist/multimodal/index.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +404 -0
- package/dist/multimodal/multimodal_agent.cjs.map +1 -0
- package/dist/multimodal/multimodal_agent.d.ts +1 -1
- package/dist/multimodal/multimodal_agent.js +351 -330
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +172 -0
- package/dist/pipeline/agent_output.cjs.map +1 -0
- package/dist/pipeline/agent_output.js +136 -138
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.cjs +169 -0
- package/dist/pipeline/agent_playout.cjs.map +1 -0
- package/dist/pipeline/agent_playout.js +126 -136
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/human_input.cjs +158 -0
- package/dist/pipeline/human_input.cjs.map +1 -0
- package/dist/pipeline/human_input.js +124 -125
- package/dist/pipeline/human_input.js.map +1 -1
- package/dist/pipeline/index.cjs +31 -0
- package/dist/pipeline/index.cjs.map +1 -0
- package/dist/pipeline/index.js +8 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +642 -0
- package/dist/pipeline/pipeline_agent.cjs.map +1 -0
- package/dist/pipeline/pipeline_agent.js +595 -651
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +128 -0
- package/dist/pipeline/speech_handle.cjs.map +1 -0
- package/dist/pipeline/speech_handle.js +102 -100
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/plugin.cjs +46 -0
- package/dist/plugin.cjs.map +1 -0
- package/dist/plugin.js +20 -20
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.cjs +38 -0
- package/dist/stt/index.cjs.map +1 -0
- package/dist/stt/index.js +13 -5
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +87 -0
- package/dist/stt/stream_adapter.cjs.map +1 -0
- package/dist/stt/stream_adapter.js +58 -55
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +98 -0
- package/dist/stt/stt.cjs.map +1 -0
- package/dist/stt/stt.js +63 -98
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +98 -0
- package/dist/tokenize/basic/basic.cjs.map +1 -0
- package/dist/tokenize/basic/basic.js +56 -45
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/hyphenator.cjs +425 -0
- package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
- package/dist/tokenize/basic/hyphenator.js +66 -82
- package/dist/tokenize/basic/hyphenator.js.map +1 -1
- package/dist/tokenize/basic/index.cjs +35 -0
- package/dist/tokenize/basic/index.cjs.map +1 -0
- package/dist/tokenize/basic/index.js +7 -4
- package/dist/tokenize/basic/index.js.map +1 -1
- package/dist/tokenize/basic/paragraph.cjs +57 -0
- package/dist/tokenize/basic/paragraph.cjs.map +1 -0
- package/dist/tokenize/basic/paragraph.js +30 -35
- package/dist/tokenize/basic/paragraph.js.map +1 -1
- package/dist/tokenize/basic/sentence.cjs +83 -0
- package/dist/tokenize/basic/sentence.cjs.map +1 -0
- package/dist/tokenize/basic/sentence.js +56 -57
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/basic/word.cjs +44 -0
- package/dist/tokenize/basic/word.cjs.map +1 -0
- package/dist/tokenize/basic/word.js +17 -20
- package/dist/tokenize/basic/word.js.map +1 -1
- package/dist/tokenize/index.cjs +55 -0
- package/dist/tokenize/index.cjs.map +1 -0
- package/dist/tokenize/index.js +18 -7
- package/dist/tokenize/index.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +164 -0
- package/dist/tokenize/token_stream.cjs.map +1 -0
- package/dist/tokenize/token_stream.js +133 -139
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/tokenize/tokenizer.cjs +184 -0
- package/dist/tokenize/tokenizer.cjs.map +1 -0
- package/dist/tokenize/tokenizer.js +138 -99
- package/dist/tokenize/tokenizer.js.map +1 -1
- package/dist/transcription.cjs +131 -0
- package/dist/transcription.cjs.map +1 -0
- package/dist/transcription.js +99 -96
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.cjs +38 -0
- package/dist/tts/index.cjs.map +1 -0
- package/dist/tts/index.js +13 -5
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +78 -0
- package/dist/tts/stream_adapter.cjs.map +1 -0
- package/dist/tts/stream_adapter.js +50 -47
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +127 -0
- package/dist/tts/tts.cjs.map +1 -0
- package/dist/tts/tts.js +90 -120
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +284 -0
- package/dist/utils.cjs.map +1 -0
- package/dist/utils.js +242 -247
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +92 -0
- package/dist/vad.cjs.map +1 -0
- package/dist/vad.js +57 -52
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +29 -0
- package/dist/version.cjs.map +1 -0
- package/dist/version.js +4 -4
- package/dist/version.js.map +1 -1
- package/dist/worker.cjs +576 -0
- package/dist/worker.cjs.map +1 -0
- package/dist/worker.d.ts +1 -1
- package/dist/worker.js +511 -484
- package/dist/worker.js.map +1 -1
- package/package.json +18 -8
- package/src/ipc/job_main.ts +66 -64
- package/src/pipeline/pipeline_agent.ts +23 -23
|
@@ -1,353 +1,374 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
*
|
|
12
|
-
|
|
13
|
-
|
|
1
|
+
import {
|
|
2
|
+
AudioSource,
|
|
3
|
+
AudioStream,
|
|
4
|
+
LocalAudioTrack,
|
|
5
|
+
RoomEvent,
|
|
6
|
+
TrackPublishOptions,
|
|
7
|
+
TrackSource
|
|
8
|
+
} from "@livekit/rtc-node";
|
|
9
|
+
import { EventEmitter } from "node:events";
|
|
10
|
+
import { AudioByteStream } from "../audio.js";
|
|
11
|
+
import * as llm from "../llm/index.js";
|
|
12
|
+
import { log } from "../log.js";
|
|
13
|
+
import { BasicTranscriptionForwarder } from "../transcription.js";
|
|
14
|
+
import { findMicroTrackId } from "../utils.js";
|
|
15
|
+
import { AgentPlayout } from "./agent_playout.js";
|
|
16
|
+
class RealtimeSession extends EventEmitter {
|
|
14
17
|
}
|
|
15
|
-
|
|
16
|
-
* @internal
|
|
17
|
-
* @beta
|
|
18
|
-
*/
|
|
19
|
-
export class RealtimeModel {
|
|
18
|
+
class RealtimeModel {
|
|
20
19
|
}
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
20
|
+
const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
21
|
+
class MultimodalAgent extends EventEmitter {
|
|
22
|
+
model;
|
|
23
|
+
room = null;
|
|
24
|
+
linkedParticipant = null;
|
|
25
|
+
subscribedTrack = null;
|
|
26
|
+
readMicroTask = null;
|
|
27
|
+
constructor({
|
|
28
|
+
model,
|
|
29
|
+
chatCtx,
|
|
30
|
+
fncCtx
|
|
31
|
+
}) {
|
|
32
|
+
super();
|
|
33
|
+
this.model = model;
|
|
34
|
+
this.#chatCtx = chatCtx;
|
|
35
|
+
this.#fncCtx = fncCtx;
|
|
36
|
+
}
|
|
37
|
+
#participant = null;
|
|
38
|
+
#agentPublication = null;
|
|
39
|
+
#localTrackSid = null;
|
|
40
|
+
#localSource = null;
|
|
41
|
+
#agentPlayout = null;
|
|
42
|
+
#playingHandle = void 0;
|
|
43
|
+
#logger = log();
|
|
44
|
+
#session = null;
|
|
45
|
+
#fncCtx = void 0;
|
|
46
|
+
#chatCtx = void 0;
|
|
47
|
+
#_started = false;
|
|
48
|
+
#_pendingFunctionCalls = /* @__PURE__ */ new Set();
|
|
49
|
+
#_speaking = false;
|
|
50
|
+
get fncCtx() {
|
|
51
|
+
return this.#fncCtx;
|
|
52
|
+
}
|
|
53
|
+
set fncCtx(ctx) {
|
|
54
|
+
this.#fncCtx = ctx;
|
|
55
|
+
if (this.#session) {
|
|
56
|
+
this.#session.fncCtx = ctx;
|
|
34
57
|
}
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
58
|
+
}
|
|
59
|
+
get #pendingFunctionCalls() {
|
|
60
|
+
return this.#_pendingFunctionCalls;
|
|
61
|
+
}
|
|
62
|
+
set #pendingFunctionCalls(calls) {
|
|
63
|
+
this.#_pendingFunctionCalls = calls;
|
|
64
|
+
this.#updateState();
|
|
65
|
+
}
|
|
66
|
+
get #speaking() {
|
|
67
|
+
return this.#_speaking;
|
|
68
|
+
}
|
|
69
|
+
set #speaking(isSpeaking) {
|
|
70
|
+
this.#_speaking = isSpeaking;
|
|
71
|
+
this.#updateState();
|
|
72
|
+
}
|
|
73
|
+
get #started() {
|
|
74
|
+
return this.#_started;
|
|
75
|
+
}
|
|
76
|
+
set #started(started) {
|
|
77
|
+
this.#_started = started;
|
|
78
|
+
this.#updateState();
|
|
79
|
+
}
|
|
80
|
+
start(room, participant = null) {
|
|
81
|
+
return new Promise(async (resolve, reject) => {
|
|
82
|
+
var _a;
|
|
83
|
+
if (this.#started) {
|
|
84
|
+
reject(new Error("MultimodalAgent already started"));
|
|
85
|
+
}
|
|
86
|
+
this.#updateState();
|
|
87
|
+
room.on(RoomEvent.ParticipantConnected, (participant2) => {
|
|
88
|
+
if (this.linkedParticipant) {
|
|
89
|
+
return;
|
|
90
|
+
}
|
|
91
|
+
this.#linkParticipant(participant2.identity);
|
|
92
|
+
});
|
|
93
|
+
room.on(
|
|
94
|
+
RoomEvent.TrackPublished,
|
|
95
|
+
(trackPublication, participant2) => {
|
|
96
|
+
if (this.linkedParticipant && participant2.identity === this.linkedParticipant.identity && trackPublication.source === TrackSource.SOURCE_MICROPHONE && !trackPublication.subscribed) {
|
|
97
|
+
trackPublication.setSubscribed(true);
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
);
|
|
101
|
+
room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
|
|
102
|
+
this.room = room;
|
|
103
|
+
this.#participant = participant;
|
|
104
|
+
this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
|
|
105
|
+
this.#agentPlayout = new AgentPlayout(
|
|
106
|
+
this.#localSource,
|
|
107
|
+
this.model.sampleRate,
|
|
108
|
+
this.model.numChannels,
|
|
109
|
+
this.model.inFrameSize,
|
|
110
|
+
this.model.outFrameSize
|
|
111
|
+
);
|
|
112
|
+
const onPlayoutStarted = () => {
|
|
113
|
+
this.emit("agent_started_speaking");
|
|
114
|
+
this.#speaking = true;
|
|
115
|
+
};
|
|
116
|
+
const onPlayoutStopped = (interrupted) => {
|
|
117
|
+
this.emit("agent_stopped_speaking");
|
|
118
|
+
this.#speaking = false;
|
|
119
|
+
if (this.#playingHandle) {
|
|
120
|
+
let text = this.#playingHandle.transcriptionFwd.text;
|
|
121
|
+
if (interrupted) {
|
|
122
|
+
text += "\u2026";
|
|
123
|
+
}
|
|
124
|
+
const msg = llm.ChatMessage.create({
|
|
125
|
+
role: llm.ChatRole.ASSISTANT,
|
|
126
|
+
text
|
|
127
|
+
});
|
|
128
|
+
if (interrupted) {
|
|
129
|
+
this.emit("agent_speech_interrupted", msg);
|
|
130
|
+
} else {
|
|
131
|
+
this.emit("agent_speech_committed", msg);
|
|
132
|
+
}
|
|
133
|
+
this.#logger.child({ transcription: text, interrupted }).debug("committed agent speech");
|
|
134
|
+
}
|
|
135
|
+
};
|
|
136
|
+
this.#agentPlayout.on("playout_started", onPlayoutStarted);
|
|
137
|
+
this.#agentPlayout.on("playout_stopped", onPlayoutStopped);
|
|
138
|
+
const track = LocalAudioTrack.createAudioTrack("assistant_voice", this.#localSource);
|
|
139
|
+
const options = new TrackPublishOptions();
|
|
140
|
+
options.source = TrackSource.SOURCE_MICROPHONE;
|
|
141
|
+
this.#agentPublication = await ((_a = room.localParticipant) == null ? void 0 : _a.publishTrack(track, options)) || null;
|
|
142
|
+
if (!this.#agentPublication) {
|
|
143
|
+
this.#logger.error("Failed to publish track");
|
|
144
|
+
reject(new Error("Failed to publish track"));
|
|
145
|
+
return;
|
|
146
|
+
}
|
|
147
|
+
await this.#agentPublication.waitForSubscription();
|
|
148
|
+
if (participant) {
|
|
149
|
+
if (typeof participant === "string") {
|
|
150
|
+
this.#linkParticipant(participant);
|
|
151
|
+
} else {
|
|
152
|
+
this.#linkParticipant(participant.identity);
|
|
153
|
+
}
|
|
154
|
+
} else {
|
|
155
|
+
for (const participant2 of room.remoteParticipants.values()) {
|
|
156
|
+
this.#linkParticipant(participant2.identity);
|
|
157
|
+
break;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
|
|
161
|
+
this.#started = true;
|
|
162
|
+
this.#session.on("response_content_added", (message) => {
|
|
163
|
+
var _a2;
|
|
164
|
+
const trFwd = new BasicTranscriptionForwarder(
|
|
165
|
+
this.room,
|
|
166
|
+
this.room.localParticipant.identity,
|
|
167
|
+
this.#getLocalTrackSid(),
|
|
168
|
+
message.responseId
|
|
169
|
+
);
|
|
170
|
+
const handle = (_a2 = this.#agentPlayout) == null ? void 0 : _a2.play(
|
|
171
|
+
message.itemId,
|
|
172
|
+
message.contentIndex,
|
|
173
|
+
trFwd,
|
|
174
|
+
message.textStream,
|
|
175
|
+
message.audioStream
|
|
176
|
+
);
|
|
177
|
+
this.#playingHandle = handle;
|
|
178
|
+
});
|
|
179
|
+
this.#session.on("input_speech_committed", (ev) => {
|
|
180
|
+
var _a2, _b;
|
|
181
|
+
const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
|
|
182
|
+
const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
|
|
183
|
+
if (participantIdentity && trackSid) {
|
|
184
|
+
this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
|
|
185
|
+
} else {
|
|
186
|
+
this.#logger.error("Participant or track not set");
|
|
187
|
+
}
|
|
188
|
+
});
|
|
189
|
+
this.#session.on("input_speech_transcription_completed", (ev) => {
|
|
190
|
+
var _a2, _b;
|
|
191
|
+
const transcription = ev.transcript;
|
|
192
|
+
const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
|
|
193
|
+
const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
|
|
194
|
+
if (participantIdentity && trackSid) {
|
|
195
|
+
this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
|
|
196
|
+
} else {
|
|
197
|
+
this.#logger.error("Participant or track not set");
|
|
198
|
+
}
|
|
199
|
+
const userMsg = llm.ChatMessage.create({
|
|
200
|
+
role: llm.ChatRole.USER,
|
|
201
|
+
text: transcription
|
|
202
|
+
});
|
|
203
|
+
this.emit("user_speech_committed", userMsg);
|
|
204
|
+
this.#logger.child({ transcription }).debug("committed user speech");
|
|
205
|
+
});
|
|
206
|
+
this.#session.on("input_speech_started", (ev) => {
|
|
207
|
+
var _a2, _b;
|
|
208
|
+
if (this.#playingHandle && !this.#playingHandle.done) {
|
|
209
|
+
this.#playingHandle.interrupt();
|
|
210
|
+
this.#session.conversation.item.truncate(
|
|
211
|
+
this.#playingHandle.itemId,
|
|
212
|
+
this.#playingHandle.contentIndex,
|
|
213
|
+
Math.floor(this.#playingHandle.audioSamples / 24e3 * 1e3)
|
|
214
|
+
);
|
|
215
|
+
this.#playingHandle = void 0;
|
|
55
216
|
}
|
|
217
|
+
const participantIdentity = (_a2 = this.linkedParticipant) == null ? void 0 : _a2.identity;
|
|
218
|
+
const trackSid = (_b = this.subscribedTrack) == null ? void 0 : _b.sid;
|
|
219
|
+
if (participantIdentity && trackSid) {
|
|
220
|
+
this.#publishTranscription(participantIdentity, trackSid, "\u2026", false, ev.itemId);
|
|
221
|
+
}
|
|
222
|
+
});
|
|
223
|
+
this.#session.on("input_speech_stopped", (ev) => {
|
|
224
|
+
this.emit("user_stopped_speaking");
|
|
225
|
+
});
|
|
226
|
+
this.#session.on("function_call_started", (ev) => {
|
|
227
|
+
this.#pendingFunctionCalls.add(ev.callId);
|
|
228
|
+
this.#updateState();
|
|
229
|
+
});
|
|
230
|
+
this.#session.on("function_call_completed", (ev) => {
|
|
231
|
+
this.#pendingFunctionCalls.delete(ev.callId);
|
|
232
|
+
this.#updateState();
|
|
233
|
+
});
|
|
234
|
+
this.#session.on("function_call_failed", (ev) => {
|
|
235
|
+
this.#pendingFunctionCalls.delete(ev.callId);
|
|
236
|
+
this.#updateState();
|
|
237
|
+
});
|
|
238
|
+
resolve(this.#session);
|
|
239
|
+
});
|
|
240
|
+
}
|
|
241
|
+
#linkParticipant(participantIdentity) {
|
|
242
|
+
if (!this.room) {
|
|
243
|
+
this.#logger.error("Room is not set");
|
|
244
|
+
return;
|
|
56
245
|
}
|
|
57
|
-
get
|
|
58
|
-
|
|
246
|
+
this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
|
|
247
|
+
if (!this.linkedParticipant) {
|
|
248
|
+
this.#logger.error(`Participant with identity ${participantIdentity} not found`);
|
|
249
|
+
return;
|
|
59
250
|
}
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
this.#updateState();
|
|
251
|
+
if (this.linkedParticipant.trackPublications.size > 0) {
|
|
252
|
+
this.#subscribeToMicrophone();
|
|
63
253
|
}
|
|
64
|
-
|
|
65
|
-
|
|
254
|
+
for (const publication of this.linkedParticipant.trackPublications.values()) {
|
|
255
|
+
if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
|
|
256
|
+
this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
|
|
257
|
+
break;
|
|
258
|
+
}
|
|
66
259
|
}
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
260
|
+
}
|
|
261
|
+
#subscribeToMicrophone() {
|
|
262
|
+
if (!this.linkedParticipant) {
|
|
263
|
+
this.#logger.error("Participant is not set");
|
|
264
|
+
return;
|
|
70
265
|
}
|
|
71
|
-
|
|
72
|
-
|
|
266
|
+
let microphonePublication = void 0;
|
|
267
|
+
for (const publication of this.linkedParticipant.trackPublications.values()) {
|
|
268
|
+
if (publication.source === TrackSource.SOURCE_MICROPHONE) {
|
|
269
|
+
microphonePublication = publication;
|
|
270
|
+
break;
|
|
271
|
+
}
|
|
73
272
|
}
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
this.#updateState();
|
|
273
|
+
if (!microphonePublication) {
|
|
274
|
+
return;
|
|
77
275
|
}
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
if (this.#started) {
|
|
81
|
-
reject(new Error('MultimodalAgent already started'));
|
|
82
|
-
}
|
|
83
|
-
this.#updateState();
|
|
84
|
-
room.on(RoomEvent.ParticipantConnected, (participant) => {
|
|
85
|
-
// automatically link to the first participant that connects, if not already linked
|
|
86
|
-
if (this.linkedParticipant) {
|
|
87
|
-
return;
|
|
88
|
-
}
|
|
89
|
-
this.#linkParticipant(participant.identity);
|
|
90
|
-
});
|
|
91
|
-
room.on(RoomEvent.TrackPublished, (trackPublication, participant) => {
|
|
92
|
-
if (this.linkedParticipant &&
|
|
93
|
-
participant.identity === this.linkedParticipant.identity &&
|
|
94
|
-
trackPublication.source === TrackSource.SOURCE_MICROPHONE &&
|
|
95
|
-
!trackPublication.subscribed) {
|
|
96
|
-
trackPublication.setSubscribed(true);
|
|
97
|
-
}
|
|
98
|
-
});
|
|
99
|
-
room.on(RoomEvent.TrackSubscribed, this.#handleTrackSubscription.bind(this));
|
|
100
|
-
this.room = room;
|
|
101
|
-
this.#participant = participant;
|
|
102
|
-
this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
|
|
103
|
-
this.#agentPlayout = new AgentPlayout(this.#localSource, this.model.sampleRate, this.model.numChannels, this.model.inFrameSize, this.model.outFrameSize);
|
|
104
|
-
const onPlayoutStarted = () => {
|
|
105
|
-
this.emit('agent_started_speaking');
|
|
106
|
-
this.#speaking = true;
|
|
107
|
-
};
|
|
108
|
-
const onPlayoutStopped = (interrupted) => {
|
|
109
|
-
this.emit('agent_stopped_speaking');
|
|
110
|
-
this.#speaking = false;
|
|
111
|
-
if (this.#playingHandle) {
|
|
112
|
-
let text = this.#playingHandle.transcriptionFwd.text;
|
|
113
|
-
if (interrupted) {
|
|
114
|
-
text += '…';
|
|
115
|
-
}
|
|
116
|
-
const msg = llm.ChatMessage.create({
|
|
117
|
-
role: llm.ChatRole.ASSISTANT,
|
|
118
|
-
text,
|
|
119
|
-
});
|
|
120
|
-
if (interrupted) {
|
|
121
|
-
this.emit('agent_speech_interrupted', msg);
|
|
122
|
-
}
|
|
123
|
-
else {
|
|
124
|
-
this.emit('agent_speech_committed', msg);
|
|
125
|
-
}
|
|
126
|
-
this.#logger.child({ transcription: text, interrupted }).debug('committed agent speech');
|
|
127
|
-
}
|
|
128
|
-
};
|
|
129
|
-
this.#agentPlayout.on('playout_started', onPlayoutStarted);
|
|
130
|
-
this.#agentPlayout.on('playout_stopped', onPlayoutStopped);
|
|
131
|
-
const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);
|
|
132
|
-
const options = new TrackPublishOptions();
|
|
133
|
-
options.source = TrackSource.SOURCE_MICROPHONE;
|
|
134
|
-
this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;
|
|
135
|
-
if (!this.#agentPublication) {
|
|
136
|
-
this.#logger.error('Failed to publish track');
|
|
137
|
-
reject(new Error('Failed to publish track'));
|
|
138
|
-
return;
|
|
139
|
-
}
|
|
140
|
-
await this.#agentPublication.waitForSubscription();
|
|
141
|
-
if (participant) {
|
|
142
|
-
if (typeof participant === 'string') {
|
|
143
|
-
this.#linkParticipant(participant);
|
|
144
|
-
}
|
|
145
|
-
else {
|
|
146
|
-
this.#linkParticipant(participant.identity);
|
|
147
|
-
}
|
|
148
|
-
}
|
|
149
|
-
else {
|
|
150
|
-
// No participant specified, try to find the first participant in the room
|
|
151
|
-
for (const participant of room.remoteParticipants.values()) {
|
|
152
|
-
this.#linkParticipant(participant.identity);
|
|
153
|
-
break;
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx });
|
|
157
|
-
this.#started = true;
|
|
158
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
159
|
-
this.#session.on('response_content_added', (message) => {
|
|
160
|
-
// openai.realtime.RealtimeContent
|
|
161
|
-
const trFwd = new BasicTranscriptionForwarder(this.room, this.room.localParticipant.identity, this.#getLocalTrackSid(), message.responseId);
|
|
162
|
-
const handle = this.#agentPlayout?.play(message.itemId, message.contentIndex, trFwd, message.textStream, message.audioStream);
|
|
163
|
-
this.#playingHandle = handle;
|
|
164
|
-
});
|
|
165
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
166
|
-
this.#session.on('input_speech_committed', (ev) => {
|
|
167
|
-
// openai.realtime.InputSpeechCommittedEvent
|
|
168
|
-
const participantIdentity = this.linkedParticipant?.identity;
|
|
169
|
-
const trackSid = this.subscribedTrack?.sid;
|
|
170
|
-
if (participantIdentity && trackSid) {
|
|
171
|
-
this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
|
|
172
|
-
}
|
|
173
|
-
else {
|
|
174
|
-
this.#logger.error('Participant or track not set');
|
|
175
|
-
}
|
|
176
|
-
});
|
|
177
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
178
|
-
this.#session.on('input_speech_transcription_completed', (ev) => {
|
|
179
|
-
// openai.realtime.InputSpeechTranscriptionCompletedEvent
|
|
180
|
-
const transcription = ev.transcript;
|
|
181
|
-
const participantIdentity = this.linkedParticipant?.identity;
|
|
182
|
-
const trackSid = this.subscribedTrack?.sid;
|
|
183
|
-
if (participantIdentity && trackSid) {
|
|
184
|
-
this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
|
|
185
|
-
}
|
|
186
|
-
else {
|
|
187
|
-
this.#logger.error('Participant or track not set');
|
|
188
|
-
}
|
|
189
|
-
const userMsg = llm.ChatMessage.create({
|
|
190
|
-
role: llm.ChatRole.USER,
|
|
191
|
-
text: transcription,
|
|
192
|
-
});
|
|
193
|
-
this.emit('user_speech_committed', userMsg);
|
|
194
|
-
this.#logger.child({ transcription }).debug('committed user speech');
|
|
195
|
-
});
|
|
196
|
-
this.#session.on('input_speech_started', (ev) => {
|
|
197
|
-
if (this.#playingHandle && !this.#playingHandle.done) {
|
|
198
|
-
this.#playingHandle.interrupt();
|
|
199
|
-
this.#session.conversation.item.truncate(this.#playingHandle.itemId, this.#playingHandle.contentIndex, Math.floor((this.#playingHandle.audioSamples / 24000) * 1000));
|
|
200
|
-
this.#playingHandle = undefined;
|
|
201
|
-
}
|
|
202
|
-
const participantIdentity = this.linkedParticipant?.identity;
|
|
203
|
-
const trackSid = this.subscribedTrack?.sid;
|
|
204
|
-
if (participantIdentity && trackSid) {
|
|
205
|
-
this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
|
|
206
|
-
}
|
|
207
|
-
});
|
|
208
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
209
|
-
this.#session.on('input_speech_stopped', (ev) => {
|
|
210
|
-
this.emit('user_stopped_speaking');
|
|
211
|
-
});
|
|
212
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
213
|
-
this.#session.on('function_call_started', (ev) => {
|
|
214
|
-
this.#pendingFunctionCalls.add(ev.callId);
|
|
215
|
-
this.#updateState();
|
|
216
|
-
});
|
|
217
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
218
|
-
this.#session.on('function_call_completed', (ev) => {
|
|
219
|
-
this.#pendingFunctionCalls.delete(ev.callId);
|
|
220
|
-
this.#updateState();
|
|
221
|
-
});
|
|
222
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
223
|
-
this.#session.on('function_call_failed', (ev) => {
|
|
224
|
-
this.#pendingFunctionCalls.delete(ev.callId);
|
|
225
|
-
this.#updateState();
|
|
226
|
-
});
|
|
227
|
-
resolve(this.#session);
|
|
228
|
-
});
|
|
276
|
+
if (!microphonePublication.subscribed) {
|
|
277
|
+
microphonePublication.setSubscribed(true);
|
|
229
278
|
}
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
|
|
236
|
-
if (!this.linkedParticipant) {
|
|
237
|
-
this.#logger.error(`Participant with identity ${participantIdentity} not found`);
|
|
238
|
-
return;
|
|
239
|
-
}
|
|
240
|
-
if (this.linkedParticipant.trackPublications.size > 0) {
|
|
241
|
-
this.#subscribeToMicrophone();
|
|
242
|
-
}
|
|
243
|
-
// also check if already subscribed
|
|
244
|
-
for (const publication of this.linkedParticipant.trackPublications.values()) {
|
|
245
|
-
if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.track) {
|
|
246
|
-
this.#handleTrackSubscription(publication.track, publication, this.linkedParticipant);
|
|
247
|
-
break;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
279
|
+
}
|
|
280
|
+
#handleTrackSubscription(track, publication, participant) {
|
|
281
|
+
var _a;
|
|
282
|
+
if (publication.source !== TrackSource.SOURCE_MICROPHONE || participant.identity !== ((_a = this.linkedParticipant) == null ? void 0 : _a.identity)) {
|
|
283
|
+
return;
|
|
250
284
|
}
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
}
|
|
262
|
-
}
|
|
263
|
-
if (!microphonePublication) {
|
|
264
|
-
return;
|
|
265
|
-
}
|
|
266
|
-
if (!microphonePublication.subscribed) {
|
|
267
|
-
microphonePublication.setSubscribed(true);
|
|
285
|
+
const readAudioStreamTask = async (audioStream) => {
|
|
286
|
+
const bstream = new AudioByteStream(
|
|
287
|
+
this.model.sampleRate,
|
|
288
|
+
this.model.numChannels,
|
|
289
|
+
this.model.inFrameSize
|
|
290
|
+
);
|
|
291
|
+
for await (const frame of audioStream) {
|
|
292
|
+
const audioData = frame.data;
|
|
293
|
+
for (const frame2 of bstream.write(audioData.buffer)) {
|
|
294
|
+
this.#session.inputAudioBuffer.append(frame2);
|
|
268
295
|
}
|
|
296
|
+
}
|
|
297
|
+
};
|
|
298
|
+
this.subscribedTrack = track;
|
|
299
|
+
if (this.readMicroTask) {
|
|
300
|
+
this.readMicroTask.cancel();
|
|
269
301
|
}
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
const readAudioStreamTask = async (audioStream) => {
|
|
276
|
-
const bstream = new AudioByteStream(this.model.sampleRate, this.model.numChannels, this.model.inFrameSize);
|
|
277
|
-
for await (const frame of audioStream) {
|
|
278
|
-
const audioData = frame.data;
|
|
279
|
-
for (const frame of bstream.write(audioData.buffer)) {
|
|
280
|
-
this.#session.inputAudioBuffer.append(frame);
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
};
|
|
284
|
-
this.subscribedTrack = track;
|
|
285
|
-
if (this.readMicroTask) {
|
|
286
|
-
this.readMicroTask.cancel();
|
|
287
|
-
}
|
|
288
|
-
let cancel;
|
|
289
|
-
this.readMicroTask = {
|
|
290
|
-
promise: new Promise((resolve, reject) => {
|
|
291
|
-
cancel = () => {
|
|
292
|
-
reject(new Error('Task cancelled'));
|
|
293
|
-
};
|
|
294
|
-
readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels))
|
|
295
|
-
.then(resolve)
|
|
296
|
-
.catch(reject);
|
|
297
|
-
}),
|
|
298
|
-
cancel: () => cancel(),
|
|
302
|
+
let cancel;
|
|
303
|
+
this.readMicroTask = {
|
|
304
|
+
promise: new Promise((resolve, reject) => {
|
|
305
|
+
cancel = () => {
|
|
306
|
+
reject(new Error("Task cancelled"));
|
|
299
307
|
};
|
|
308
|
+
readAudioStreamTask(new AudioStream(track, this.model.sampleRate, this.model.numChannels)).then(resolve).catch(reject);
|
|
309
|
+
}),
|
|
310
|
+
cancel: () => cancel()
|
|
311
|
+
};
|
|
312
|
+
}
|
|
313
|
+
#getLocalTrackSid() {
|
|
314
|
+
var _a;
|
|
315
|
+
if (!this.#localTrackSid && this.room && this.room.localParticipant) {
|
|
316
|
+
this.#localTrackSid = findMicroTrackId(this.room, (_a = this.room.localParticipant) == null ? void 0 : _a.identity);
|
|
300
317
|
}
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
return;
|
|
312
|
-
}
|
|
313
|
-
this.room.localParticipant.publishTranscription({
|
|
314
|
-
participantIdentity,
|
|
315
|
-
trackSid,
|
|
316
|
-
segments: [
|
|
317
|
-
{
|
|
318
|
-
text,
|
|
319
|
-
final: isFinal,
|
|
320
|
-
id,
|
|
321
|
-
startTime: BigInt(0),
|
|
322
|
-
endTime: BigInt(0),
|
|
323
|
-
language: '',
|
|
324
|
-
},
|
|
325
|
-
],
|
|
326
|
-
});
|
|
318
|
+
return this.#localTrackSid;
|
|
319
|
+
}
|
|
320
|
+
#publishTranscription(participantIdentity, trackSid, text, isFinal, id) {
|
|
321
|
+
var _a;
|
|
322
|
+
this.#logger.debug(
|
|
323
|
+
`Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`
|
|
324
|
+
);
|
|
325
|
+
if (!((_a = this.room) == null ? void 0 : _a.localParticipant)) {
|
|
326
|
+
this.#logger.error("Room or local participant not set");
|
|
327
|
+
return;
|
|
327
328
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
329
|
+
this.room.localParticipant.publishTranscription({
|
|
330
|
+
participantIdentity,
|
|
331
|
+
trackSid,
|
|
332
|
+
segments: [
|
|
333
|
+
{
|
|
334
|
+
text,
|
|
335
|
+
final: isFinal,
|
|
336
|
+
id,
|
|
337
|
+
startTime: BigInt(0),
|
|
338
|
+
endTime: BigInt(0),
|
|
339
|
+
language: ""
|
|
332
340
|
}
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
341
|
+
]
|
|
342
|
+
});
|
|
343
|
+
}
|
|
344
|
+
#updateState() {
|
|
345
|
+
let newState = "initializing";
|
|
346
|
+
if (this.#pendingFunctionCalls.size > 0) {
|
|
347
|
+
newState = "thinking";
|
|
348
|
+
} else if (this.#speaking) {
|
|
349
|
+
newState = "speaking";
|
|
350
|
+
} else if (this.#started) {
|
|
351
|
+
newState = "listening";
|
|
340
352
|
}
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
}
|
|
353
|
+
this.#setState(newState);
|
|
354
|
+
}
|
|
355
|
+
#setState(state) {
|
|
356
|
+
var _a;
|
|
357
|
+
if (((_a = this.room) == null ? void 0 : _a.isConnected) && this.room.localParticipant) {
|
|
358
|
+
const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
|
|
359
|
+
if (currentState !== state) {
|
|
360
|
+
this.room.localParticipant.setAttributes({
|
|
361
|
+
[AGENT_STATE_ATTRIBUTE]: state
|
|
362
|
+
});
|
|
363
|
+
this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
|
|
364
|
+
}
|
|
351
365
|
}
|
|
366
|
+
}
|
|
352
367
|
}
|
|
368
|
+
export {
|
|
369
|
+
AGENT_STATE_ATTRIBUTE,
|
|
370
|
+
MultimodalAgent,
|
|
371
|
+
RealtimeModel,
|
|
372
|
+
RealtimeSession
|
|
373
|
+
};
|
|
353
374
|
//# sourceMappingURL=multimodal_agent.js.map
|