@livekit/agents 0.4.6 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -0
- package/dist/audio.cjs +77 -0
- package/dist/audio.cjs.map +1 -0
- package/dist/audio.js +48 -37
- package/dist/audio.js.map +1 -1
- package/dist/cli.cjs +131 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.js +96 -122
- package/dist/cli.js.map +1 -1
- package/dist/generator.cjs +36 -0
- package/dist/generator.cjs.map +1 -0
- package/dist/generator.js +8 -22
- package/dist/generator.js.map +1 -1
- package/dist/http_server.cjs +72 -0
- package/dist/http_server.cjs.map +1 -0
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.js +44 -47
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +78 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +26 -28
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.cjs +33 -0
- package/dist/ipc/job_executor.cjs.map +1 -0
- package/dist/ipc/job_executor.js +7 -4
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_main.cjs +147 -0
- package/dist/ipc/job_main.cjs.map +1 -0
- package/dist/ipc/job_main.d.ts +1 -1
- package/dist/ipc/job_main.js +103 -103
- package/dist/ipc/job_main.js.map +1 -1
- package/dist/ipc/message.cjs +17 -0
- package/dist/ipc/message.cjs.map +1 -0
- package/dist/ipc/message.js +0 -1
- package/dist/ipc/message.js.map +1 -1
- package/dist/ipc/proc_job_executor.cjs +174 -0
- package/dist/ipc/proc_job_executor.cjs.map +1 -0
- package/dist/ipc/proc_job_executor.js +130 -126
- package/dist/ipc/proc_job_executor.js.map +1 -1
- package/dist/ipc/proc_pool.cjs +126 -0
- package/dist/ipc/proc_pool.cjs.map +1 -0
- package/dist/ipc/proc_pool.js +93 -96
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/job.cjs +230 -0
- package/dist/job.cjs.map +1 -0
- package/dist/job.d.ts +6 -1
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +195 -198
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.cjs +131 -0
- package/dist/llm/chat_context.cjs.map +1 -0
- package/dist/llm/chat_context.js +98 -86
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/function_context.cjs +103 -0
- package/dist/llm/function_context.cjs.map +1 -0
- package/dist/llm/function_context.js +72 -81
- package/dist/llm/function_context.js.map +1 -1
- package/dist/llm/function_context.test.cjs +218 -0
- package/dist/llm/function_context.test.cjs.map +1 -0
- package/dist/llm/function_context.test.js +209 -210
- package/dist/llm/function_context.test.js.map +1 -1
- package/dist/llm/index.cjs +43 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.js +22 -6
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +76 -0
- package/dist/llm/llm.cjs.map +1 -0
- package/dist/llm/llm.js +48 -42
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +57 -0
- package/dist/log.cjs.map +1 -0
- package/dist/log.js +27 -26
- package/dist/log.js.map +1 -1
- package/dist/multimodal/agent_playout.cjs +228 -0
- package/dist/multimodal/agent_playout.cjs.map +1 -0
- package/dist/multimodal/agent_playout.d.ts +1 -1
- package/dist/multimodal/agent_playout.js +193 -180
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/index.cjs +25 -0
- package/dist/multimodal/index.cjs.map +1 -0
- package/dist/multimodal/index.js +2 -5
- package/dist/multimodal/index.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +404 -0
- package/dist/multimodal/multimodal_agent.cjs.map +1 -0
- package/dist/multimodal/multimodal_agent.d.ts +1 -1
- package/dist/multimodal/multimodal_agent.js +351 -330
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +172 -0
- package/dist/pipeline/agent_output.cjs.map +1 -0
- package/dist/pipeline/agent_output.js +136 -138
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.cjs +169 -0
- package/dist/pipeline/agent_playout.cjs.map +1 -0
- package/dist/pipeline/agent_playout.js +126 -136
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/human_input.cjs +158 -0
- package/dist/pipeline/human_input.cjs.map +1 -0
- package/dist/pipeline/human_input.js +124 -125
- package/dist/pipeline/human_input.js.map +1 -1
- package/dist/pipeline/index.cjs +31 -0
- package/dist/pipeline/index.cjs.map +1 -0
- package/dist/pipeline/index.js +8 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +642 -0
- package/dist/pipeline/pipeline_agent.cjs.map +1 -0
- package/dist/pipeline/pipeline_agent.js +595 -651
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +128 -0
- package/dist/pipeline/speech_handle.cjs.map +1 -0
- package/dist/pipeline/speech_handle.js +102 -100
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/plugin.cjs +46 -0
- package/dist/plugin.cjs.map +1 -0
- package/dist/plugin.js +20 -20
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.cjs +38 -0
- package/dist/stt/index.cjs.map +1 -0
- package/dist/stt/index.js +13 -5
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +87 -0
- package/dist/stt/stream_adapter.cjs.map +1 -0
- package/dist/stt/stream_adapter.js +58 -55
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +98 -0
- package/dist/stt/stt.cjs.map +1 -0
- package/dist/stt/stt.js +63 -98
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +98 -0
- package/dist/tokenize/basic/basic.cjs.map +1 -0
- package/dist/tokenize/basic/basic.d.ts +1 -1
- package/dist/tokenize/basic/basic.d.ts.map +1 -1
- package/dist/tokenize/basic/basic.js +56 -45
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/hyphenator.cjs +425 -0
- package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
- package/dist/tokenize/basic/hyphenator.js +66 -82
- package/dist/tokenize/basic/hyphenator.js.map +1 -1
- package/dist/tokenize/basic/index.cjs +35 -0
- package/dist/tokenize/basic/index.cjs.map +1 -0
- package/dist/tokenize/basic/index.js +7 -4
- package/dist/tokenize/basic/index.js.map +1 -1
- package/dist/tokenize/basic/paragraph.cjs +57 -0
- package/dist/tokenize/basic/paragraph.cjs.map +1 -0
- package/dist/tokenize/basic/paragraph.js +30 -35
- package/dist/tokenize/basic/paragraph.js.map +1 -1
- package/dist/tokenize/basic/sentence.cjs +89 -0
- package/dist/tokenize/basic/sentence.cjs.map +1 -0
- package/dist/tokenize/basic/sentence.d.ts.map +1 -1
- package/dist/tokenize/basic/sentence.js +62 -57
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/basic/word.cjs +44 -0
- package/dist/tokenize/basic/word.cjs.map +1 -0
- package/dist/tokenize/basic/word.js +17 -20
- package/dist/tokenize/basic/word.js.map +1 -1
- package/dist/tokenize/index.cjs +55 -0
- package/dist/tokenize/index.cjs.map +1 -0
- package/dist/tokenize/index.js +18 -7
- package/dist/tokenize/index.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +164 -0
- package/dist/tokenize/token_stream.cjs.map +1 -0
- package/dist/tokenize/token_stream.js +133 -139
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/tokenize/tokenizer.cjs +184 -0
- package/dist/tokenize/tokenizer.cjs.map +1 -0
- package/dist/tokenize/tokenizer.js +138 -99
- package/dist/tokenize/tokenizer.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +220 -0
- package/dist/tokenize/tokenizer.test.cjs.map +1 -0
- package/dist/tokenize/tokenizer.test.d.ts +2 -0
- package/dist/tokenize/tokenizer.test.d.ts.map +1 -0
- package/dist/tokenize/tokenizer.test.js +219 -0
- package/dist/tokenize/tokenizer.test.js.map +1 -0
- package/dist/transcription.cjs +131 -0
- package/dist/transcription.cjs.map +1 -0
- package/dist/transcription.js +99 -96
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.cjs +38 -0
- package/dist/tts/index.cjs.map +1 -0
- package/dist/tts/index.js +13 -5
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +78 -0
- package/dist/tts/stream_adapter.cjs.map +1 -0
- package/dist/tts/stream_adapter.js +50 -47
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +127 -0
- package/dist/tts/tts.cjs.map +1 -0
- package/dist/tts/tts.js +90 -120
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +284 -0
- package/dist/utils.cjs.map +1 -0
- package/dist/utils.js +242 -247
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +92 -0
- package/dist/vad.cjs.map +1 -0
- package/dist/vad.js +57 -52
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +29 -0
- package/dist/version.cjs.map +1 -0
- package/dist/version.js +4 -4
- package/dist/version.js.map +1 -1
- package/dist/worker.cjs +577 -0
- package/dist/worker.cjs.map +1 -0
- package/dist/worker.d.ts +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +512 -484
- package/dist/worker.js.map +1 -1
- package/package.json +18 -8
- package/src/ipc/job_main.ts +66 -64
- package/src/job.ts +3 -2
- package/src/pipeline/pipeline_agent.ts +23 -23
- package/src/tokenize/basic/basic.ts +1 -1
- package/src/tokenize/basic/sentence.ts +14 -8
- package/src/tokenize/tokenizer.test.ts +255 -0
- package/src/worker.ts +1 -0
|
@@ -1,671 +1,615 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
1
|
+
import {
|
|
2
|
+
AudioSource,
|
|
3
|
+
LocalAudioTrack,
|
|
4
|
+
RoomEvent,
|
|
5
|
+
TrackPublishOptions,
|
|
6
|
+
TrackSource
|
|
7
|
+
} from "@livekit/rtc-node";
|
|
8
|
+
import EventEmitter from "node:events";
|
|
9
|
+
import { LLMStream } from "../llm/index.js";
|
|
10
|
+
import { ChatContext, ChatMessage, ChatRole } from "../llm/index.js";
|
|
11
|
+
import { log } from "../log.js";
|
|
12
|
+
import { StreamAdapter as STTStreamAdapter } from "../stt/index.js";
|
|
13
|
+
import {
|
|
14
|
+
SentenceTokenizer as BasicSentenceTokenizer,
|
|
15
|
+
WordTokenizer as BasicWordTokenizer,
|
|
16
|
+
hyphenateWord
|
|
17
|
+
} from "../tokenize/basic/index.js";
|
|
18
|
+
import { StreamAdapter as TTSStreamAdapter } from "../tts/index.js";
|
|
19
|
+
import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from "../utils.js";
|
|
20
|
+
import { AgentOutput } from "./agent_output.js";
|
|
21
|
+
import { AgentPlayout, AgentPlayoutEvent } from "./agent_playout.js";
|
|
22
|
+
import { HumanInput, HumanInputEvent } from "./human_input.js";
|
|
23
|
+
import { SpeechHandle } from "./speech_handle.js";
|
|
24
|
+
const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
25
|
+
var VPAEvent = /* @__PURE__ */ ((VPAEvent2) => {
|
|
26
|
+
VPAEvent2[VPAEvent2["USER_STARTED_SPEAKING"] = 0] = "USER_STARTED_SPEAKING";
|
|
27
|
+
VPAEvent2[VPAEvent2["USER_STOPPED_SPEAKING"] = 1] = "USER_STOPPED_SPEAKING";
|
|
28
|
+
VPAEvent2[VPAEvent2["AGENT_STARTED_SPEAKING"] = 2] = "AGENT_STARTED_SPEAKING";
|
|
29
|
+
VPAEvent2[VPAEvent2["AGENT_STOPPED_SPEAKING"] = 3] = "AGENT_STOPPED_SPEAKING";
|
|
30
|
+
VPAEvent2[VPAEvent2["USER_SPEECH_COMMITTED"] = 4] = "USER_SPEECH_COMMITTED";
|
|
31
|
+
VPAEvent2[VPAEvent2["AGENT_SPEECH_COMMITTED"] = 5] = "AGENT_SPEECH_COMMITTED";
|
|
32
|
+
VPAEvent2[VPAEvent2["AGENT_SPEECH_INTERRUPTED"] = 6] = "AGENT_SPEECH_INTERRUPTED";
|
|
33
|
+
VPAEvent2[VPAEvent2["FUNCTION_CALLS_COLLECTED"] = 7] = "FUNCTION_CALLS_COLLECTED";
|
|
34
|
+
VPAEvent2[VPAEvent2["FUNCTION_CALLS_FINISHED"] = 8] = "FUNCTION_CALLS_FINISHED";
|
|
35
|
+
return VPAEvent2;
|
|
36
|
+
})(VPAEvent || {});
|
|
37
|
+
class AgentCallContext {
|
|
38
|
+
#agent;
|
|
39
|
+
#llmStream;
|
|
40
|
+
#metadata = /* @__PURE__ */ new Map();
|
|
41
|
+
static #current;
|
|
42
|
+
constructor(agent, llmStream) {
|
|
43
|
+
this.#agent = agent;
|
|
44
|
+
this.#llmStream = llmStream;
|
|
45
|
+
AgentCallContext.#current = this;
|
|
46
|
+
}
|
|
47
|
+
static getCurrent() {
|
|
48
|
+
return AgentCallContext.#current;
|
|
49
|
+
}
|
|
50
|
+
get agent() {
|
|
51
|
+
return this.#agent;
|
|
52
|
+
}
|
|
53
|
+
storeMetadata(key, value) {
|
|
54
|
+
this.#metadata.set(key, value);
|
|
55
|
+
}
|
|
56
|
+
getMetadata(key, orDefault = void 0) {
|
|
57
|
+
return this.#metadata.get(key) || orDefault;
|
|
58
|
+
}
|
|
59
|
+
get llmStream() {
|
|
60
|
+
return this.#llmStream;
|
|
61
|
+
}
|
|
53
62
|
}
|
|
54
63
|
const defaultBeforeLLMCallback = (agent, chatCtx) => {
|
|
55
|
-
|
|
64
|
+
return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });
|
|
56
65
|
};
|
|
57
|
-
const defaultBeforeTTSCallback = (
|
|
58
|
-
|
|
59
|
-
_, text) => {
|
|
60
|
-
return text;
|
|
66
|
+
const defaultBeforeTTSCallback = (_, text) => {
|
|
67
|
+
return text;
|
|
61
68
|
};
|
|
62
69
|
const defaultAgentTranscriptionOptions = {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
70
|
+
userTranscription: true,
|
|
71
|
+
agentTranscription: true,
|
|
72
|
+
agentTranscriptionSpeech: 1,
|
|
73
|
+
sentenceTokenizer: new BasicSentenceTokenizer(),
|
|
74
|
+
wordTokenizer: new BasicWordTokenizer(false),
|
|
75
|
+
hyphenateWord
|
|
69
76
|
};
|
|
70
77
|
const defaultVPAOptions = {
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
78
|
+
chatCtx: new ChatContext(),
|
|
79
|
+
allowInterruptions: true,
|
|
80
|
+
interruptSpeechDuration: 50,
|
|
81
|
+
interruptMinWords: 0,
|
|
82
|
+
minEndpointingDelay: 500,
|
|
83
|
+
maxRecursiveFncCalls: 1,
|
|
84
|
+
preemptiveSynthesis: false,
|
|
85
|
+
beforeLLMCallback: defaultBeforeLLMCallback,
|
|
86
|
+
beforeTTSCallback: defaultBeforeTTSCallback,
|
|
87
|
+
transcription: defaultAgentTranscriptionOptions
|
|
81
88
|
};
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
room
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
89
|
+
class VoicePipelineAgent extends EventEmitter {
|
|
90
|
+
/** Minimum time played for the user speech to be committed to the chat context. */
|
|
91
|
+
MIN_TIME_PLAYED_FOR_COMMIT = 1.5;
|
|
92
|
+
static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
|
|
93
|
+
#vad;
|
|
94
|
+
#stt;
|
|
95
|
+
#llm;
|
|
96
|
+
#tts;
|
|
97
|
+
#opts;
|
|
98
|
+
#humanInput;
|
|
99
|
+
#agentOutput;
|
|
100
|
+
#trackPublishedFut = new Future();
|
|
101
|
+
#pendingAgentReply;
|
|
102
|
+
#agentReplyTask;
|
|
103
|
+
#playingSpeech;
|
|
104
|
+
#transcribedText = "";
|
|
105
|
+
#transcribedInterimText = "";
|
|
106
|
+
#speechQueueOpen = new Future();
|
|
107
|
+
#speechQueue = new AsyncIterableQueue();
|
|
108
|
+
#lastEndOfSpeechTime;
|
|
109
|
+
#updateStateTask;
|
|
110
|
+
#started = false;
|
|
111
|
+
#room;
|
|
112
|
+
#participant = null;
|
|
113
|
+
#deferredValidation;
|
|
114
|
+
#logger = log();
|
|
115
|
+
#agentPublication;
|
|
116
|
+
constructor(vad, stt, llm, tts, opts = defaultVPAOptions) {
|
|
117
|
+
super();
|
|
118
|
+
this.#opts = { ...defaultVPAOptions, ...opts };
|
|
119
|
+
if (!stt.capabilities.streaming) {
|
|
120
|
+
stt = new STTStreamAdapter(stt, vad);
|
|
121
|
+
}
|
|
122
|
+
if (!tts.capabilities.streaming) {
|
|
123
|
+
tts = new TTSStreamAdapter(tts, new BasicSentenceTokenizer());
|
|
124
|
+
}
|
|
125
|
+
this.#vad = vad;
|
|
126
|
+
this.#stt = stt;
|
|
127
|
+
this.#llm = llm;
|
|
128
|
+
this.#tts = tts;
|
|
129
|
+
this.#deferredValidation = new DeferredReplyValidation(
|
|
130
|
+
this.#validateReplyIfPossible.bind(this),
|
|
131
|
+
this.#opts.minEndpointingDelay
|
|
132
|
+
);
|
|
133
|
+
}
|
|
134
|
+
get fncCtx() {
|
|
135
|
+
return this.#opts.fncCtx;
|
|
136
|
+
}
|
|
137
|
+
set fncCtx(ctx) {
|
|
138
|
+
this.#opts.fncCtx = ctx;
|
|
139
|
+
}
|
|
140
|
+
get chatCtx() {
|
|
141
|
+
return this.#opts.chatCtx;
|
|
142
|
+
}
|
|
143
|
+
get llm() {
|
|
144
|
+
return this.#llm;
|
|
145
|
+
}
|
|
146
|
+
get tts() {
|
|
147
|
+
return this.#tts;
|
|
148
|
+
}
|
|
149
|
+
get stt() {
|
|
150
|
+
return this.#stt;
|
|
151
|
+
}
|
|
152
|
+
get vad() {
|
|
153
|
+
return this.#vad;
|
|
154
|
+
}
|
|
155
|
+
/** Start the voice assistant. */
|
|
156
|
+
start(room, participant = null) {
|
|
157
|
+
if (this.#started) {
|
|
158
|
+
throw new Error("voice assistant already started");
|
|
159
|
+
}
|
|
160
|
+
room.on(RoomEvent.ParticipantConnected, (participant2) => {
|
|
161
|
+
if (this.#participant) {
|
|
162
|
+
return;
|
|
163
|
+
}
|
|
164
|
+
this.#linkParticipant.call(this, participant2.identity);
|
|
165
|
+
});
|
|
166
|
+
this.#room = room;
|
|
167
|
+
this.#participant = participant;
|
|
168
|
+
if (participant) {
|
|
169
|
+
if (typeof participant === "string") {
|
|
170
|
+
this.#linkParticipant(participant);
|
|
171
|
+
} else {
|
|
172
|
+
this.#linkParticipant(participant.identity);
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
this.#run();
|
|
176
|
+
}
|
|
177
|
+
/** Play a speech source through the voice assistant. */
|
|
178
|
+
async say(source, allowInterruptions = true, addToChatCtx = true) {
|
|
179
|
+
await this.#trackPublishedFut.await;
|
|
180
|
+
const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
|
|
181
|
+
const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
|
|
182
|
+
newHandle.initialize(source, synthesisHandle);
|
|
183
|
+
this.#addSpeechForPlayout(newHandle);
|
|
184
|
+
}
|
|
185
|
+
#updateState(state, delay = 0) {
|
|
186
|
+
const runTask = (delay2) => {
|
|
187
|
+
return new CancellablePromise(async (resolve, _, onCancel) => {
|
|
188
|
+
var _a, _b;
|
|
189
|
+
let cancelled = false;
|
|
190
|
+
onCancel(() => {
|
|
191
|
+
cancelled = true;
|
|
177
192
|
});
|
|
178
|
-
|
|
179
|
-
this.#
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
}
|
|
184
|
-
else {
|
|
185
|
-
this.#linkParticipant(participant.identity);
|
|
186
|
-
}
|
|
193
|
+
await new Promise((resolve2) => setTimeout(resolve2, delay2));
|
|
194
|
+
if ((_a = this.#room) == null ? void 0 : _a.isConnected) {
|
|
195
|
+
if (!cancelled) {
|
|
196
|
+
await ((_b = this.#room.localParticipant) == null ? void 0 : _b.setAttributes({ [AGENT_STATE_ATTRIBUTE]: state }));
|
|
197
|
+
}
|
|
187
198
|
}
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
199
|
+
resolve();
|
|
200
|
+
});
|
|
201
|
+
};
|
|
202
|
+
if (this.#updateStateTask) {
|
|
203
|
+
this.#updateStateTask.cancel();
|
|
204
|
+
}
|
|
205
|
+
this.#updateStateTask = runTask(delay);
|
|
206
|
+
}
|
|
207
|
+
#linkParticipant(participantIdentity) {
|
|
208
|
+
if (!this.#room) {
|
|
209
|
+
this.#logger.error("Room is not set");
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
this.#participant = this.#room.remoteParticipants.get(participantIdentity) || null;
|
|
213
|
+
if (!this.#participant) {
|
|
214
|
+
this.#logger.error(`Participant with identity ${participantIdentity} not found`);
|
|
215
|
+
return;
|
|
216
|
+
}
|
|
217
|
+
this.#humanInput = new HumanInput(this.#room, this.#vad, this.#stt, this.#participant);
|
|
218
|
+
this.#humanInput.on(HumanInputEvent.START_OF_SPEECH, (event) => {
|
|
219
|
+
this.emit(0 /* USER_STARTED_SPEAKING */);
|
|
220
|
+
this.#deferredValidation.onHumanStartOfSpeech(event);
|
|
221
|
+
});
|
|
222
|
+
this.#humanInput.on(HumanInputEvent.VAD_INFERENCE_DONE, (event) => {
|
|
223
|
+
if (!this.#trackPublishedFut.done) {
|
|
224
|
+
return;
|
|
225
|
+
}
|
|
226
|
+
if (!this.#agentOutput) {
|
|
227
|
+
throw new Error("agent output is undefined");
|
|
228
|
+
}
|
|
229
|
+
let tv = 1;
|
|
230
|
+
if (this.#opts.allowInterruptions) {
|
|
231
|
+
tv = Math.max(0, 1 - event.probability);
|
|
232
|
+
this.#agentOutput.playout.targetVolume = tv;
|
|
233
|
+
}
|
|
234
|
+
if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
|
|
235
|
+
this.#interruptIfPossible();
|
|
236
|
+
}
|
|
237
|
+
});
|
|
238
|
+
this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
|
|
239
|
+
this.emit(0 /* USER_STARTED_SPEAKING */);
|
|
240
|
+
this.#deferredValidation.onHumanEndOfSpeech(event);
|
|
241
|
+
this.#lastEndOfSpeechTime = Date.now();
|
|
242
|
+
});
|
|
243
|
+
this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
|
|
244
|
+
this.#transcribedInterimText = event.alternatives[0].text;
|
|
245
|
+
});
|
|
246
|
+
this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
|
|
247
|
+
const newTranscript = event.alternatives[0].text;
|
|
248
|
+
if (!newTranscript) return;
|
|
249
|
+
this.#logger.child({ userTranscript: newTranscript }).debug("received user transcript");
|
|
250
|
+
this.#transcribedText += (this.#transcribedText ? " " : "") + newTranscript;
|
|
251
|
+
if (this.#opts.preemptiveSynthesis && (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)) {
|
|
252
|
+
this.#synthesizeAgentReply();
|
|
253
|
+
}
|
|
254
|
+
this.#deferredValidation.onHumanFinalTranscript(newTranscript);
|
|
255
|
+
const words = this.#opts.transcription.wordTokenizer.tokenize(newTranscript);
|
|
256
|
+
if (words.length >= 3) {
|
|
257
|
+
this.#interruptIfPossible();
|
|
258
|
+
}
|
|
259
|
+
});
|
|
260
|
+
}
|
|
261
|
+
async #run() {
|
|
262
|
+
var _a, _b;
|
|
263
|
+
this.#updateState("initializing");
|
|
264
|
+
const audioSource = new AudioSource(this.#tts.sampleRate, this.#tts.numChannels);
|
|
265
|
+
const track = LocalAudioTrack.createAudioTrack("assistant_voice", audioSource);
|
|
266
|
+
this.#agentPublication = await ((_b = (_a = this.#room) == null ? void 0 : _a.localParticipant) == null ? void 0 : _b.publishTrack(
|
|
267
|
+
track,
|
|
268
|
+
new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE })
|
|
269
|
+
));
|
|
270
|
+
const agentPlayout = new AgentPlayout(audioSource);
|
|
271
|
+
this.#agentOutput = new AgentOutput(agentPlayout, this.#tts);
|
|
272
|
+
agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STARTED, () => {
|
|
273
|
+
this.emit(2 /* AGENT_STARTED_SPEAKING */);
|
|
274
|
+
this.#updateState("speaking");
|
|
275
|
+
});
|
|
276
|
+
agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STOPPED, (_) => {
|
|
277
|
+
this.emit(3 /* AGENT_STOPPED_SPEAKING */);
|
|
278
|
+
this.#updateState("listening");
|
|
279
|
+
});
|
|
280
|
+
this.#trackPublishedFut.resolve();
|
|
281
|
+
while (true) {
|
|
282
|
+
await this.#speechQueueOpen.await;
|
|
283
|
+
for await (const speech of this.#speechQueue) {
|
|
284
|
+
if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
|
|
285
|
+
this.#playingSpeech = speech;
|
|
286
|
+
await this.#playSpeech(speech);
|
|
287
|
+
this.#playingSpeech = void 0;
|
|
288
|
+
}
|
|
289
|
+
this.#speechQueueOpen = new Future();
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
#synthesizeAgentReply() {
|
|
293
|
+
var _a;
|
|
294
|
+
(_a = this.#pendingAgentReply) == null ? void 0 : _a.cancel();
|
|
295
|
+
if (this.#humanInput && this.#humanInput.speaking) {
|
|
296
|
+
this.#updateState("thinking", 200);
|
|
297
|
+
}
|
|
298
|
+
this.#pendingAgentReply = SpeechHandle.createAssistantReply(
|
|
299
|
+
this.#opts.allowInterruptions,
|
|
300
|
+
true,
|
|
301
|
+
this.#transcribedText
|
|
302
|
+
);
|
|
303
|
+
const newHandle = this.#pendingAgentReply;
|
|
304
|
+
this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);
|
|
305
|
+
}
|
|
306
|
+
#synthesizeAnswerTask(oldTask, handle) {
|
|
307
|
+
return new CancellablePromise(async (resolve, _, onCancel) => {
|
|
308
|
+
let cancelled = false;
|
|
309
|
+
onCancel(() => {
|
|
310
|
+
cancelled = true;
|
|
311
|
+
});
|
|
312
|
+
if (oldTask) {
|
|
313
|
+
await gracefullyCancel(oldTask);
|
|
314
|
+
}
|
|
315
|
+
const copiedCtx = this.chatCtx.copy();
|
|
316
|
+
const playingSpeech = this.#playingSpeech;
|
|
317
|
+
if (playingSpeech && playingSpeech.initialized) {
|
|
318
|
+
if ((!playingSpeech.userQuestion || playingSpeech.userCommitted) && !playingSpeech.speechCommitted) {
|
|
319
|
+
copiedCtx.messages.push(
|
|
320
|
+
ChatMessage.create({
|
|
321
|
+
// TODO(nbsp): uhhh unsure where to get the played text here
|
|
322
|
+
// text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
|
|
323
|
+
role: ChatRole.ASSISTANT
|
|
324
|
+
})
|
|
325
|
+
);
|
|
223
326
|
}
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
}
|
|
303
|
-
|
|
327
|
+
}
|
|
328
|
+
copiedCtx.messages.push(
|
|
329
|
+
ChatMessage.create({
|
|
330
|
+
text: handle == null ? void 0 : handle.userQuestion,
|
|
331
|
+
role: ChatRole.USER
|
|
332
|
+
})
|
|
333
|
+
);
|
|
334
|
+
if (cancelled) resolve();
|
|
335
|
+
let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
|
|
336
|
+
if (llmStream === false) {
|
|
337
|
+
handle == null ? void 0 : handle.cancel();
|
|
338
|
+
return;
|
|
339
|
+
}
|
|
340
|
+
if (cancelled) resolve();
|
|
341
|
+
if (!(llmStream instanceof LLMStream)) {
|
|
342
|
+
llmStream = await defaultBeforeLLMCallback(this, copiedCtx);
|
|
343
|
+
}
|
|
344
|
+
if (handle.interrupted) {
|
|
345
|
+
return;
|
|
346
|
+
}
|
|
347
|
+
const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
|
|
348
|
+
handle.initialize(llmStream, synthesisHandle);
|
|
349
|
+
const elapsed = !!this.#lastEndOfSpeechTime ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1e3) / 1e3 : -1;
|
|
350
|
+
this.#logger.child({ speechId: handle.id, elapsed }).debug("synthesizing agent reply");
|
|
351
|
+
resolve();
|
|
352
|
+
});
|
|
353
|
+
}
|
|
354
|
+
async #playSpeech(handle) {
|
|
355
|
+
try {
|
|
356
|
+
await handle.waitForInitialization();
|
|
357
|
+
} catch {
|
|
358
|
+
return;
|
|
359
|
+
}
|
|
360
|
+
await this.#agentPublication.waitForSubscription();
|
|
361
|
+
const synthesisHandle = handle.synthesisHandle;
|
|
362
|
+
if (synthesisHandle.interrupted) return;
|
|
363
|
+
const userQuestion = handle.userQuestion;
|
|
364
|
+
const playHandle = synthesisHandle.play();
|
|
365
|
+
const joinFut = playHandle.join();
|
|
366
|
+
const commitUserQuestionIfNeeded = () => {
|
|
367
|
+
if (!userQuestion || synthesisHandle.interrupted || handle.userCommitted) return;
|
|
368
|
+
const isUsingTools2 = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
|
|
369
|
+
if (handle.allowInterruptions && !isUsingTools2 && playHandle.timePlayed < this.MIN_TIME_PLAYED_FOR_COMMIT && !joinFut.done) {
|
|
370
|
+
return;
|
|
371
|
+
}
|
|
372
|
+
this.#logger.child({ userTranscript: userQuestion }).debug("committed user transcript");
|
|
373
|
+
const userMsg = ChatMessage.create({ text: userQuestion, role: ChatRole.USER });
|
|
374
|
+
this.chatCtx.messages.push(userMsg);
|
|
375
|
+
this.emit(4 /* USER_SPEECH_COMMITTED */, userMsg);
|
|
376
|
+
this.#transcribedText = this.#transcribedText.slice(userQuestion.length);
|
|
377
|
+
handle.markUserCommitted();
|
|
378
|
+
};
|
|
379
|
+
commitUserQuestionIfNeeded();
|
|
380
|
+
while (!joinFut.done) {
|
|
381
|
+
await new Promise(async (resolve) => {
|
|
382
|
+
setTimeout(resolve, 500);
|
|
383
|
+
await joinFut.await;
|
|
384
|
+
resolve();
|
|
385
|
+
});
|
|
386
|
+
commitUserQuestionIfNeeded();
|
|
387
|
+
if (handle.interrupted) break;
|
|
388
|
+
}
|
|
389
|
+
commitUserQuestionIfNeeded();
|
|
390
|
+
let collectedText = "";
|
|
391
|
+
const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
|
|
392
|
+
const extraToolsMessages = [];
|
|
393
|
+
let interrupted = handle.interrupted;
|
|
394
|
+
if (isUsingTools && !interrupted) {
|
|
395
|
+
if (!userQuestion || !handle.userCommitted) {
|
|
396
|
+
throw new Error("user speech should have been committed before using tools");
|
|
397
|
+
}
|
|
398
|
+
const llmStream = handle.source;
|
|
399
|
+
let newFunctionCalls = llmStream.functionCalls;
|
|
400
|
+
for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {
|
|
401
|
+
this.emit(7 /* FUNCTION_CALLS_COLLECTED */, newFunctionCalls);
|
|
402
|
+
const calledFuncs = [];
|
|
403
|
+
for (const func of newFunctionCalls) {
|
|
404
|
+
const task = func.func.execute(func.params).then(
|
|
405
|
+
(result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
|
|
406
|
+
(error) => ({ name: func.name, toolCallId: func.toolCallId, error })
|
|
407
|
+
);
|
|
408
|
+
calledFuncs.push({ ...func, task });
|
|
409
|
+
this.#logger.child({ function: func.name, speechId: handle.id }).debug("executing AI function");
|
|
410
|
+
try {
|
|
411
|
+
await task;
|
|
412
|
+
} catch {
|
|
413
|
+
this.#logger.child({ function: func.name, speechId: handle.id }).error("error executing AI function");
|
|
414
|
+
}
|
|
304
415
|
}
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
416
|
+
const toolCallsInfo = [];
|
|
417
|
+
const toolCallsResults = [];
|
|
418
|
+
for (const fnc of calledFuncs) {
|
|
419
|
+
const task = await fnc.task;
|
|
420
|
+
if (!task || task.result === void 0) continue;
|
|
421
|
+
toolCallsInfo.push(fnc);
|
|
422
|
+
toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
|
|
310
423
|
}
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
cancelled = true;
|
|
320
|
-
});
|
|
321
|
-
if (oldTask) {
|
|
322
|
-
await gracefullyCancel(oldTask);
|
|
323
|
-
}
|
|
324
|
-
const copiedCtx = this.chatCtx.copy();
|
|
325
|
-
const playingSpeech = this.#playingSpeech;
|
|
326
|
-
if (playingSpeech && playingSpeech.initialized) {
|
|
327
|
-
if ((!playingSpeech.userQuestion || playingSpeech.userCommitted) &&
|
|
328
|
-
!playingSpeech.speechCommitted) {
|
|
329
|
-
// the speech is playing but not committed yet,
|
|
330
|
-
// add it to the chat context for this new reply synthesis
|
|
331
|
-
copiedCtx.messages.push(ChatMessage.create({
|
|
332
|
-
// TODO(nbsp): uhhh unsure where to get the played text here
|
|
333
|
-
// text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
|
|
334
|
-
role: ChatRole.ASSISTANT,
|
|
335
|
-
}));
|
|
336
|
-
}
|
|
337
|
-
}
|
|
338
|
-
copiedCtx.messages.push(ChatMessage.create({
|
|
339
|
-
text: handle?.userQuestion,
|
|
340
|
-
role: ChatRole.USER,
|
|
341
|
-
}));
|
|
342
|
-
if (cancelled)
|
|
343
|
-
resolve();
|
|
344
|
-
let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
|
|
345
|
-
if (llmStream === false) {
|
|
346
|
-
handle?.cancel();
|
|
347
|
-
return;
|
|
348
|
-
}
|
|
349
|
-
if (cancelled)
|
|
350
|
-
resolve();
|
|
351
|
-
// fallback to default impl if no custom/user stream is returned
|
|
352
|
-
if (!(llmStream instanceof LLMStream)) {
|
|
353
|
-
llmStream = (await defaultBeforeLLMCallback(this, copiedCtx));
|
|
354
|
-
}
|
|
355
|
-
if (handle.interrupted) {
|
|
356
|
-
return;
|
|
357
|
-
}
|
|
358
|
-
const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
|
|
359
|
-
handle.initialize(llmStream, synthesisHandle);
|
|
360
|
-
// TODO(theomonnom): find a more reliable way to get the elapsed time from the last EOS
|
|
361
|
-
// (VAD could not have detected any speech — maybe unlikely?)
|
|
362
|
-
const elapsed = !!this.#lastEndOfSpeechTime
|
|
363
|
-
? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000
|
|
364
|
-
: -1;
|
|
365
|
-
this.#logger.child({ speechId: handle.id, elapsed }).debug('synthesizing agent reply');
|
|
366
|
-
resolve();
|
|
424
|
+
if (!toolCallsInfo.length) break;
|
|
425
|
+
extraToolsMessages.push(ChatMessage.createToolCalls(toolCallsInfo, collectedText));
|
|
426
|
+
extraToolsMessages.push(...toolCallsResults);
|
|
427
|
+
const chatCtx = handle.source.chatCtx.copy();
|
|
428
|
+
chatCtx.messages.push(...extraToolsMessages);
|
|
429
|
+
const answerLLMStream = this.llm.chat({
|
|
430
|
+
chatCtx,
|
|
431
|
+
fncCtx: this.fncCtx
|
|
367
432
|
});
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
|
|
469
|
-
// replace the synthesis handle with the new one to allow interruption
|
|
470
|
-
handle.synthesisHandle = answerSynthesis;
|
|
471
|
-
const playHandle = answerSynthesis.play();
|
|
472
|
-
await playHandle.join().await;
|
|
473
|
-
// TODO(nbsp): what text goes here
|
|
474
|
-
collectedText = '';
|
|
475
|
-
interrupted = answerSynthesis.interrupted;
|
|
476
|
-
newFunctionCalls = answerLLMStream.functionCalls;
|
|
477
|
-
this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
|
|
478
|
-
if (!newFunctionCalls)
|
|
479
|
-
break;
|
|
480
|
-
}
|
|
481
|
-
if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
|
|
482
|
-
this.chatCtx.messages.push(...extraToolsMessages);
|
|
483
|
-
if (interrupted) {
|
|
484
|
-
collectedText + '…';
|
|
485
|
-
}
|
|
486
|
-
const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
|
|
487
|
-
this.chatCtx.messages.push(msg);
|
|
488
|
-
handle.markSpeechCommitted();
|
|
489
|
-
if (interrupted) {
|
|
490
|
-
this.emit(VPAEvent.AGENT_SPEECH_INTERRUPTED, msg);
|
|
491
|
-
}
|
|
492
|
-
else {
|
|
493
|
-
this.emit(VPAEvent.AGENT_SPEECH_COMMITTED, msg);
|
|
494
|
-
}
|
|
495
|
-
this.#logger
|
|
496
|
-
.child({
|
|
497
|
-
agentTranscript: collectedText,
|
|
498
|
-
interrupted,
|
|
499
|
-
speechId: handle.id,
|
|
500
|
-
})
|
|
501
|
-
.debug('committed agent speech');
|
|
502
|
-
}
|
|
503
|
-
}
|
|
504
|
-
}
|
|
505
|
-
#synthesizeAgentSpeech(speechId, source) {
|
|
506
|
-
if (!this.#agentOutput) {
|
|
507
|
-
throw new Error('agent output should be initialized when ready');
|
|
508
|
-
}
|
|
509
|
-
if (source instanceof LLMStream) {
|
|
510
|
-
source = llmStreamToStringIterable(speechId, source);
|
|
511
|
-
}
|
|
512
|
-
const ogSource = source;
|
|
513
|
-
if (!(typeof source === 'string')) {
|
|
514
|
-
// TODO(nbsp): itertools.tee
|
|
515
|
-
}
|
|
516
|
-
const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);
|
|
517
|
-
if (!ttsSource) {
|
|
518
|
-
throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');
|
|
519
|
-
}
|
|
520
|
-
return this.#agentOutput.synthesize(speechId, ttsSource);
|
|
521
|
-
}
|
|
522
|
-
async #validateReplyIfPossible() {
|
|
523
|
-
if (this.#playingSpeech && this.#playingSpeech.allowInterruptions) {
|
|
524
|
-
this.#logger
|
|
525
|
-
.child({ speechId: this.#playingSpeech.id })
|
|
526
|
-
.debug('skipping validation, agent is speaking and does not allow interruptions');
|
|
527
|
-
return;
|
|
528
|
-
}
|
|
529
|
-
if (!this.#pendingAgentReply) {
|
|
530
|
-
if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
|
|
531
|
-
return;
|
|
532
|
-
}
|
|
533
|
-
this.#synthesizeAgentReply();
|
|
534
|
-
}
|
|
535
|
-
if (!this.#pendingAgentReply) {
|
|
536
|
-
throw new Error('pending agent reply is undefined');
|
|
537
|
-
}
|
|
538
|
-
// in some bad timimg, we could end up with two pushed agent replies inside the speech queue.
|
|
539
|
-
// so make sure we directly interrupt every reply when validating a new one
|
|
540
|
-
if (this.#speechQueueOpen.done) {
|
|
541
|
-
for await (const speech of this.#speechQueue) {
|
|
542
|
-
if (speech === _a.FLUSH_SENTINEL)
|
|
543
|
-
break;
|
|
544
|
-
if (!speech.isReply)
|
|
545
|
-
continue;
|
|
546
|
-
if (!speech.allowInterruptions)
|
|
547
|
-
speech.interrupt();
|
|
548
|
-
}
|
|
549
|
-
}
|
|
550
|
-
this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');
|
|
551
|
-
this.#addSpeechForPlayout(this.#pendingAgentReply);
|
|
552
|
-
this.#pendingAgentReply = undefined;
|
|
553
|
-
this.#transcribedInterimText = '';
|
|
554
|
-
}
|
|
555
|
-
#interruptIfPossible() {
|
|
556
|
-
if (!this.#playingSpeech ||
|
|
557
|
-
!this.#playingSpeech.allowInterruptions ||
|
|
558
|
-
this.#playingSpeech.interrupted) {
|
|
559
|
-
return;
|
|
560
|
-
}
|
|
561
|
-
if (this.#opts.interruptMinWords !== 0) {
|
|
562
|
-
// check the final/interim transcribed text for the minimum word count
|
|
563
|
-
// to interrupt the agent speech
|
|
564
|
-
const interimWords = this.#opts.transcription.wordTokenizer.tokenize(this.#transcribedInterimText);
|
|
565
|
-
if (interimWords.length < this.#opts.interruptMinWords) {
|
|
566
|
-
return;
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
this.#playingSpeech.interrupt();
|
|
570
|
-
}
|
|
571
|
-
#addSpeechForPlayout(handle) {
|
|
572
|
-
this.#speechQueue.put(handle);
|
|
573
|
-
this.#speechQueue.put(_a.FLUSH_SENTINEL);
|
|
574
|
-
this.#speechQueueOpen.resolve();
|
|
575
|
-
}
|
|
576
|
-
/** Close the voice assistant. */
|
|
577
|
-
async close() {
|
|
578
|
-
if (!this.#started) {
|
|
579
|
-
return;
|
|
580
|
-
}
|
|
581
|
-
this.#room?.removeAllListeners(RoomEvent.ParticipantConnected);
|
|
582
|
-
// TODO(nbsp): await this.#deferredValidation.close()
|
|
583
|
-
}
|
|
433
|
+
const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
|
|
434
|
+
handle.synthesisHandle = answerSynthesis;
|
|
435
|
+
const playHandle2 = answerSynthesis.play();
|
|
436
|
+
await playHandle2.join().await;
|
|
437
|
+
collectedText = "";
|
|
438
|
+
interrupted = answerSynthesis.interrupted;
|
|
439
|
+
newFunctionCalls = answerLLMStream.functionCalls;
|
|
440
|
+
this.emit(8 /* FUNCTION_CALLS_FINISHED */, calledFuncs);
|
|
441
|
+
if (!newFunctionCalls) break;
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
|
|
445
|
+
this.chatCtx.messages.push(...extraToolsMessages);
|
|
446
|
+
if (interrupted) {
|
|
447
|
+
collectedText + "\u2026";
|
|
448
|
+
}
|
|
449
|
+
const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });
|
|
450
|
+
this.chatCtx.messages.push(msg);
|
|
451
|
+
handle.markSpeechCommitted();
|
|
452
|
+
if (interrupted) {
|
|
453
|
+
this.emit(6 /* AGENT_SPEECH_INTERRUPTED */, msg);
|
|
454
|
+
} else {
|
|
455
|
+
this.emit(5 /* AGENT_SPEECH_COMMITTED */, msg);
|
|
456
|
+
}
|
|
457
|
+
this.#logger.child({
|
|
458
|
+
agentTranscript: collectedText,
|
|
459
|
+
interrupted,
|
|
460
|
+
speechId: handle.id
|
|
461
|
+
}).debug("committed agent speech");
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
#synthesizeAgentSpeech(speechId, source) {
|
|
465
|
+
if (!this.#agentOutput) {
|
|
466
|
+
throw new Error("agent output should be initialized when ready");
|
|
467
|
+
}
|
|
468
|
+
if (source instanceof LLMStream) {
|
|
469
|
+
source = llmStreamToStringIterable(speechId, source);
|
|
470
|
+
}
|
|
471
|
+
const ogSource = source;
|
|
472
|
+
if (!(typeof source === "string")) {
|
|
473
|
+
}
|
|
474
|
+
const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);
|
|
475
|
+
if (!ttsSource) {
|
|
476
|
+
throw new Error("beforeTTSCallback must return string or AsyncIterable<string>");
|
|
477
|
+
}
|
|
478
|
+
return this.#agentOutput.synthesize(speechId, ttsSource);
|
|
479
|
+
}
|
|
480
|
+
async #validateReplyIfPossible() {
|
|
481
|
+
if (this.#playingSpeech && !this.#playingSpeech.allowInterruptions) {
|
|
482
|
+
this.#logger.child({ speechId: this.#playingSpeech.id }).debug("skipping validation, agent is speaking and does not allow interruptions");
|
|
483
|
+
return;
|
|
484
|
+
}
|
|
485
|
+
if (!this.#pendingAgentReply) {
|
|
486
|
+
if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {
|
|
487
|
+
return;
|
|
488
|
+
}
|
|
489
|
+
this.#synthesizeAgentReply();
|
|
490
|
+
}
|
|
491
|
+
if (!this.#pendingAgentReply) {
|
|
492
|
+
throw new Error("pending agent reply is undefined");
|
|
493
|
+
}
|
|
494
|
+
if (this.#speechQueueOpen.done) {
|
|
495
|
+
for await (const speech of this.#speechQueue) {
|
|
496
|
+
if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;
|
|
497
|
+
if (!speech.isReply) continue;
|
|
498
|
+
if (speech.allowInterruptions) speech.interrupt();
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug("validated agent reply");
|
|
502
|
+
this.#addSpeechForPlayout(this.#pendingAgentReply);
|
|
503
|
+
this.#pendingAgentReply = void 0;
|
|
504
|
+
this.#transcribedInterimText = "";
|
|
505
|
+
}
|
|
506
|
+
#interruptIfPossible() {
|
|
507
|
+
if (!this.#playingSpeech || !this.#playingSpeech.allowInterruptions || this.#playingSpeech.interrupted) {
|
|
508
|
+
return;
|
|
509
|
+
}
|
|
510
|
+
if (this.#opts.interruptMinWords !== 0) {
|
|
511
|
+
const interimWords = this.#opts.transcription.wordTokenizer.tokenize(
|
|
512
|
+
this.#transcribedInterimText
|
|
513
|
+
);
|
|
514
|
+
if (interimWords.length < this.#opts.interruptMinWords) {
|
|
515
|
+
return;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
this.#playingSpeech.interrupt();
|
|
519
|
+
}
|
|
520
|
+
#addSpeechForPlayout(handle) {
|
|
521
|
+
this.#speechQueue.put(handle);
|
|
522
|
+
this.#speechQueue.put(VoicePipelineAgent.FLUSH_SENTINEL);
|
|
523
|
+
this.#speechQueueOpen.resolve();
|
|
524
|
+
}
|
|
525
|
+
/** Close the voice assistant. */
|
|
526
|
+
async close() {
|
|
527
|
+
var _a;
|
|
528
|
+
if (!this.#started) {
|
|
529
|
+
return;
|
|
530
|
+
}
|
|
531
|
+
(_a = this.#room) == null ? void 0 : _a.removeAllListeners(RoomEvent.ParticipantConnected);
|
|
532
|
+
}
|
|
584
533
|
}
|
|
585
|
-
_a = VoicePipelineAgent;
|
|
586
534
|
async function* llmStreamToStringIterable(speechId, stream) {
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
yield content;
|
|
600
|
-
}
|
|
535
|
+
var _a;
|
|
536
|
+
const startTime = Date.now();
|
|
537
|
+
let firstFrame = true;
|
|
538
|
+
for await (const chunk of stream) {
|
|
539
|
+
const content = (_a = chunk.choices[0]) == null ? void 0 : _a.delta.content;
|
|
540
|
+
if (!content) continue;
|
|
541
|
+
if (firstFrame) {
|
|
542
|
+
firstFrame = false;
|
|
543
|
+
log().child({ speechId, elapsed: Math.round(Date.now() - startTime) }).debug("received first LLM token");
|
|
544
|
+
}
|
|
545
|
+
yield content;
|
|
546
|
+
}
|
|
601
547
|
}
|
|
602
|
-
/** This class is used to try to find the best time to validate the agent reply. */
|
|
603
548
|
class DeferredReplyValidation {
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
664
|
-
this.#resetStates();
|
|
665
|
-
await this.#validateFunc();
|
|
666
|
-
};
|
|
667
|
-
this.#validatingFuture = new Future();
|
|
668
|
-
this.#validatingPromise = runTask(delay);
|
|
669
|
-
}
|
|
549
|
+
// if the STT gives us punctuation, we can try to validate the reply faster.
|
|
550
|
+
PUNCTUATION = ".!?";
|
|
551
|
+
PUNCTUATION_REDUCE_FACTOR = 0.75;
|
|
552
|
+
LATE_TRANSCRIPT_TOLERANCE = 1.5;
|
|
553
|
+
// late compared to end of speech
|
|
554
|
+
#validateFunc;
|
|
555
|
+
#validatingPromise;
|
|
556
|
+
#validatingFuture = new Future();
|
|
557
|
+
#lastFinalTranscript = "";
|
|
558
|
+
#lastRecvEndOfSpeechTime = 0;
|
|
559
|
+
#speaking = false;
|
|
560
|
+
#endOfSpeechDelay;
|
|
561
|
+
#finalTranscriptDelay;
|
|
562
|
+
constructor(validateFunc, minEndpointingDelay) {
|
|
563
|
+
this.#validateFunc = validateFunc;
|
|
564
|
+
this.#endOfSpeechDelay = minEndpointingDelay;
|
|
565
|
+
this.#finalTranscriptDelay = minEndpointingDelay;
|
|
566
|
+
}
|
|
567
|
+
get validating() {
|
|
568
|
+
return !this.#validatingFuture.done;
|
|
569
|
+
}
|
|
570
|
+
onHumanFinalTranscript(transcript) {
|
|
571
|
+
this.#lastFinalTranscript = transcript.trim();
|
|
572
|
+
if (this.#speaking) return;
|
|
573
|
+
const hasRecentEndOfSpeech = Date.now() - this.#lastRecvEndOfSpeechTime < this.LATE_TRANSCRIPT_TOLERANCE;
|
|
574
|
+
let delay = hasRecentEndOfSpeech ? this.#endOfSpeechDelay : this.#finalTranscriptDelay;
|
|
575
|
+
delay = this.#endWithPunctuation() ? delay * this.PUNCTUATION_REDUCE_FACTOR : 1;
|
|
576
|
+
this.#run(delay);
|
|
577
|
+
}
|
|
578
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
579
|
+
onHumanStartOfSpeech(_) {
|
|
580
|
+
this.#speaking = true;
|
|
581
|
+
}
|
|
582
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
583
|
+
onHumanEndOfSpeech(_) {
|
|
584
|
+
this.#speaking = false;
|
|
585
|
+
this.#lastRecvEndOfSpeechTime = Date.now();
|
|
586
|
+
if (this.#lastFinalTranscript) {
|
|
587
|
+
const delay = this.#endWithPunctuation() ? this.#endOfSpeechDelay * this.PUNCTUATION_REDUCE_FACTOR : 1;
|
|
588
|
+
this.#run(delay);
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
// TODO(nbsp): aclose
|
|
592
|
+
#endWithPunctuation() {
|
|
593
|
+
return this.#lastFinalTranscript.length > 0 && this.PUNCTUATION.includes(this.#lastFinalTranscript[this.#lastFinalTranscript.length - 1]);
|
|
594
|
+
}
|
|
595
|
+
#resetStates() {
|
|
596
|
+
this.#lastFinalTranscript = "";
|
|
597
|
+
this.#lastRecvEndOfSpeechTime = 0;
|
|
598
|
+
}
|
|
599
|
+
#run(delay) {
|
|
600
|
+
const runTask = async (delay2) => {
|
|
601
|
+
await new Promise((resolve) => setTimeout(resolve, delay2));
|
|
602
|
+
this.#resetStates();
|
|
603
|
+
await this.#validateFunc();
|
|
604
|
+
};
|
|
605
|
+
this.#validatingFuture = new Future();
|
|
606
|
+
this.#validatingPromise = runTask(delay);
|
|
607
|
+
}
|
|
670
608
|
}
|
|
609
|
+
export {
|
|
610
|
+
AGENT_STATE_ATTRIBUTE,
|
|
611
|
+
AgentCallContext,
|
|
612
|
+
VPAEvent,
|
|
613
|
+
VoicePipelineAgent
|
|
614
|
+
};
|
|
671
615
|
//# sourceMappingURL=pipeline_agent.js.map
|