@livekit/agents 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +47 -0
- package/LICENSE +201 -0
- package/dist/audio.d.ts +9 -0
- package/dist/audio.d.ts.map +1 -0
- package/dist/audio.js +54 -0
- package/dist/audio.js.map +1 -0
- package/dist/cli.d.ts +12 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +102 -19
- package/dist/cli.js.map +1 -1
- package/dist/generator.d.ts +17 -6
- package/dist/generator.d.ts.map +1 -1
- package/dist/generator.js +20 -3
- package/dist/generator.js.map +1 -1
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.d.ts.map +1 -1
- package/dist/http_server.js +5 -3
- package/dist/http_server.js.map +1 -1
- package/dist/index.d.ts +14 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -3
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.d.ts +19 -0
- package/dist/ipc/job_executor.d.ts.map +1 -0
- package/dist/ipc/job_executor.js +8 -0
- package/dist/ipc/job_executor.js.map +1 -0
- package/dist/ipc/job_main.d.ts +7 -4
- package/dist/ipc/job_main.d.ts.map +1 -1
- package/dist/ipc/job_main.js +102 -59
- package/dist/ipc/job_main.js.map +1 -1
- package/dist/ipc/message.d.ts +41 -0
- package/dist/ipc/message.d.ts.map +1 -0
- package/dist/ipc/message.js +2 -0
- package/dist/ipc/message.js.map +1 -0
- package/dist/ipc/proc_job_executor.d.ts +15 -0
- package/dist/ipc/proc_job_executor.d.ts.map +1 -0
- package/dist/ipc/proc_job_executor.js +150 -0
- package/dist/ipc/proc_job_executor.js.map +1 -0
- package/dist/ipc/proc_pool.d.ts +26 -0
- package/dist/ipc/proc_pool.d.ts.map +1 -0
- package/dist/ipc/proc_pool.js +83 -0
- package/dist/ipc/proc_pool.js.map +1 -0
- package/dist/job.d.ts +100 -0
- package/dist/job.d.ts.map +1 -0
- package/dist/job.js +213 -0
- package/dist/job.js.map +1 -0
- package/dist/llm/function_context.d.ts +20 -0
- package/dist/llm/function_context.d.ts.map +1 -0
- package/dist/llm/function_context.js +37 -0
- package/dist/llm/function_context.js.map +1 -0
- package/dist/llm/index.d.ts +3 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +6 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/log.d.ts +12 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +28 -11
- package/dist/log.js.map +1 -1
- package/dist/multimodal/agent_playout.d.ts +34 -0
- package/dist/multimodal/agent_playout.d.ts.map +1 -0
- package/dist/multimodal/agent_playout.js +221 -0
- package/dist/multimodal/agent_playout.js.map +1 -0
- package/dist/multimodal/index.d.ts +3 -0
- package/dist/multimodal/index.d.ts.map +1 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/multimodal/index.js.map +1 -0
- package/dist/multimodal/multimodal_agent.d.ts +47 -0
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -0
- package/dist/multimodal/multimodal_agent.js +331 -0
- package/dist/multimodal/multimodal_agent.js.map +1 -0
- package/dist/plugin.js +20 -7
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.d.ts +2 -11
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +47 -33
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.d.ts +27 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +32 -5
- package/dist/stt/stt.js.map +1 -1
- package/dist/transcription.d.ts +22 -0
- package/dist/transcription.d.ts.map +1 -0
- package/dist/transcription.js +111 -0
- package/dist/transcription.js.map +1 -0
- package/dist/tts/stream_adapter.d.ts +4 -11
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +66 -32
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.d.ts +10 -0
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +48 -7
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.d.ts +59 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +212 -6
- package/dist/utils.js.map +1 -1
- package/dist/vad.d.ts +29 -0
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js.map +1 -1
- package/dist/worker.d.ts +69 -50
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +414 -213
- package/dist/worker.js.map +1 -1
- package/package.json +12 -10
- package/src/audio.ts +62 -0
- package/src/cli.ts +108 -20
- package/src/generator.ts +27 -7
- package/src/http_server.ts +5 -0
- package/src/index.ts +15 -3
- package/src/ipc/job_executor.ts +25 -0
- package/src/ipc/job_main.ts +141 -61
- package/src/ipc/message.ts +39 -0
- package/src/ipc/proc_job_executor.ts +162 -0
- package/src/ipc/proc_pool.ts +109 -0
- package/src/job.ts +278 -0
- package/src/llm/function_context.ts +61 -0
- package/src/llm/index.ts +11 -0
- package/src/log.ts +40 -8
- package/src/multimodal/agent_playout.ts +254 -0
- package/src/multimodal/index.ts +5 -0
- package/src/multimodal/multimodal_agent.ts +428 -0
- package/src/stt/index.ts +1 -1
- package/src/stt/stream_adapter.ts +32 -32
- package/src/stt/stt.ts +27 -0
- package/src/transcription.ts +128 -0
- package/src/tts/stream_adapter.ts +32 -31
- package/src/tts/tts.ts +10 -0
- package/src/utils.ts +257 -3
- package/src/vad.ts +29 -0
- package/src/worker.ts +465 -172
- package/tsconfig.json +7 -1
- package/dist/ipc/job_process.d.ts +0 -22
- package/dist/ipc/job_process.d.ts.map +0 -1
- package/dist/ipc/job_process.js +0 -73
- package/dist/ipc/job_process.js.map +0 -1
- package/dist/ipc/protocol.d.ts +0 -40
- package/dist/ipc/protocol.d.ts.map +0 -1
- package/dist/ipc/protocol.js +0 -14
- package/dist/ipc/protocol.js.map +0 -1
- package/dist/job_context.d.ts +0 -16
- package/dist/job_context.d.ts.map +0 -1
- package/dist/job_context.js +0 -31
- package/dist/job_context.js.map +0 -1
- package/dist/job_request.d.ts +0 -42
- package/dist/job_request.d.ts.map +0 -1
- package/dist/job_request.js +0 -79
- package/dist/job_request.js.map +0 -1
- package/src/ipc/job_process.ts +0 -96
- package/src/ipc/protocol.ts +0 -51
- package/src/job_context.ts +0 -49
- package/src/job_request.ts +0 -118
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type {
|
|
5
|
+
LocalTrackPublication,
|
|
6
|
+
RemoteAudioTrack,
|
|
7
|
+
RemoteParticipant,
|
|
8
|
+
Room,
|
|
9
|
+
} from '@livekit/rtc-node';
|
|
10
|
+
import {
|
|
11
|
+
AudioSource,
|
|
12
|
+
AudioStream,
|
|
13
|
+
LocalAudioTrack,
|
|
14
|
+
RoomEvent,
|
|
15
|
+
TrackPublishOptions,
|
|
16
|
+
TrackSource,
|
|
17
|
+
} from '@livekit/rtc-node';
|
|
18
|
+
import { EventEmitter } from 'events';
|
|
19
|
+
import { AudioByteStream } from '../audio.js';
|
|
20
|
+
import type * as llm from '../llm/index.js';
|
|
21
|
+
import { log } from '../log.js';
|
|
22
|
+
import { BasicTranscriptionForwarder } from '../transcription.js';
|
|
23
|
+
import { findMicroTrackId } from '../utils.js';
|
|
24
|
+
import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
|
|
25
|
+
|
|
26
|
+
/**
 * Contract for a live session with a realtime model.
 *
 * The session is an `EventEmitter`; `MultimodalAgent` listens on it for
 * `response_content_added`, `input_speech_*` and `function_call_*` events.
 * The concrete shapes of the members typed `any` are owned by the
 * implementing plugin (see the trailing comments).
 *
 * @internal
 * @beta
 */
export abstract class RealtimeSession extends EventEmitter {
  /** Function context exposed to the model; may be replaced while the session is live. */
  abstract fncCtx: llm.FunctionContext | undefined;

  /** Sink that receives microphone frames through `append(frame)`. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer

  /** Conversation handle; the agent calls `item.truncate(...)` on it when playback is interrupted. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract defaultConversation: any; // openai.realtime.Conversation
}
|
|
37
|
+
|
|
38
|
+
/**
 * Contract for a realtime model that can open {@link RealtimeSession}s.
 *
 * The numeric members describe the audio format the model works with;
 * `MultimodalAgent` passes them straight to `AudioSource`, `AudioStream`,
 * `AudioByteStream` and `AgentPlayout`.
 *
 * @internal
 * @beta
 */
export abstract class RealtimeModel {
  /** Sample rate used for both the published agent track and the microphone stream. */
  abstract sampleRate: number;

  /** Channel count used for both the published agent track and the microphone stream. */
  abstract numChannels: number;

  /** Frame size used when re-chunking microphone audio before it is appended to the session. */
  abstract inFrameSize: number;

  /** Frame size handed to `AgentPlayout` for the model's audio output. */
  abstract outFrameSize: number;

  /** Open a new session configured by `options`. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract session(options: any): RealtimeSession; // openai.realtime.ModelOptions

  /** Release the resources held by the model. */
  abstract close(): Promise<void>;
}
|
|
51
|
+
|
|
52
|
+
/** States an agent reports to the room through its participant attributes. */
export type AgentState =
  | 'initializing'
  | 'thinking'
  | 'listening'
  | 'speaking';

/** Key of the local participant attribute that carries the current {@link AgentState}. */
export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
|
|
54
|
+
|
|
55
|
+
/**
 * Connects a LiveKit room to a realtime multimodal model.
 *
 * After {@link MultimodalAgent.start} resolves, the agent:
 * - publishes an `assistant_voice` audio track and plays model responses
 *   through it via `AgentPlayout`,
 * - forwards the linked participant's microphone audio into the session's
 *   `inputAudioBuffer`,
 * - publishes transcription segments for the linked participant's speech, and
 * - mirrors its {@link AgentState} into the `lk.agent.state` attribute of the
 *   local participant.
 *
 * @beta
 */
export class MultimodalAgent {
  model: RealtimeModel;
  room: Room | null = null;
  /** Remote participant whose microphone is forwarded to the model. */
  linkedParticipant: RemoteParticipant | null = null;
  /** Microphone track currently being read. */
  subscribedTrack: RemoteAudioTrack | null = null;
  /**
   * Handle on the task that pumps microphone audio into the session.
   * `promise` rejects with `Error('Task cancelled')` when `cancel()` is called.
   */
  readMicroTask: { promise: Promise<void>; cancel: () => void } | null = null;

  #participant: RemoteParticipant | string | null = null;
  #agentPublication: LocalTrackPublication | null = null;
  #localTrackSid: string | null = null;
  #localSource: AudioSource | null = null;
  #agentPlayout: AgentPlayout | null = null;
  #playingHandle: PlayoutHandle | undefined = undefined;
  #logger = log();
  #session: RealtimeSession | null = null;
  #fncCtx: llm.FunctionContext | undefined = undefined;

  #_started: boolean = false;
  #_pendingFunctionCalls: Set<string> = new Set();
  #_speaking: boolean = false;
  /** Ensures the room-level track listeners are installed only once. */
  #trackListenersAttached: boolean = false;

  /**
   * @param model - realtime model used to open the session in {@link MultimodalAgent.start}
   * @param fncCtx - optional function context handed to the session
   */
  constructor({
    model,
    fncCtx,
  }: {
    model: RealtimeModel;
    fncCtx?: llm.FunctionContext | undefined;
  }) {
    this.model = model;
    this.#fncCtx = fncCtx;
  }

  /** Function context currently exposed to the model, if any. */
  get fncCtx(): llm.FunctionContext | undefined {
    return this.#fncCtx;
  }

  /** Replaces the function context and propagates it to the live session, if one exists. */
  set fncCtx(ctx: llm.FunctionContext | undefined) {
    this.#fncCtx = ctx;
    if (this.#session) {
      this.#session.fncCtx = ctx;
    }
  }

  get #pendingFunctionCalls(): Set<string> {
    return this.#_pendingFunctionCalls;
  }

  // Only replacing the whole set goes through this setter; in-place add/delete
  // callers must invoke #updateState() themselves.
  set #pendingFunctionCalls(calls: Set<string>) {
    this.#_pendingFunctionCalls = calls;
    this.#updateState();
  }

  get #speaking(): boolean {
    return this.#_speaking;
  }

  set #speaking(isSpeaking: boolean) {
    this.#_speaking = isSpeaking;
    this.#updateState();
  }

  get #started(): boolean {
    return this.#_started;
  }

  set #started(started: boolean) {
    this.#_started = started;
    this.#updateState();
  }

  /**
   * Publishes the agent's audio track, links a participant and opens the model session.
   *
   * @param room - room to operate in
   * @param participant - participant (or identity) to link to; when omitted,
   * the first remote participant in the room is used, and failing that the
   * first participant to connect later
   * @returns the realtime session; if the agent was already started, the
   * existing session is returned and a warning is logged
   * @throws Error if the agent's audio track could not be published. Any
   * failure while publishing or waiting for subscription rejects the returned
   * promise.
   */
  async start(
    room: Room,
    participant: RemoteParticipant | string | null = null,
  ): Promise<RealtimeSession> {
    if (this.#started) {
      this.#logger.warn('MultimodalAgent already started');
      return this.#session!; // TODO: throw error?
    }

    // Assign the room before the first state update so that 'initializing'
    // can actually be published.
    this.room = room;
    this.#participant = participant;
    this.#updateState();

    this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
    this.#agentPlayout = new AgentPlayout(
      this.#localSource,
      this.model.sampleRate,
      this.model.numChannels,
      this.model.inFrameSize,
      this.model.outFrameSize,
    );
    const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);
    const options = new TrackPublishOptions();
    options.source = TrackSource.SOURCE_MICROPHONE;
    this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) ?? null;
    if (!this.#agentPublication) {
      this.#logger.error('Failed to publish track');
      throw new Error('Failed to publish track');
    }

    await this.#agentPublication.waitForSubscription();

    if (participant) {
      this.#linkParticipant(typeof participant === 'string' ? participant : participant.identity);
    } else {
      // No participant specified, try to find the first participant in the room
      const [firstParticipant] = room.remoteParticipants.values();
      if (firstParticipant) {
        this.#linkParticipant(firstParticipant.identity);
      }
    }

    const session = this.model.session({ fncCtx: this.#fncCtx });
    this.#session = session;
    this.#started = true;
    this.#attachSessionHandlers(session);

    // Registered only once the session exists, so a participant linked from
    // this handler can never feed audio into a missing session.
    room.on(RoomEvent.ParticipantConnected, (connected: RemoteParticipant) => {
      // Automatically link to the first participant that connects, unless one
      // is already linked.
      if (this.linkedParticipant) {
        return;
      }
      this.#linkParticipant(connected.identity);
    });

    return session;
  }

  // TODO
  // close() {
  //   if (!this.connected || !this.ws) return;
  //   this.logger.debug('stopping assistant');
  //   this.ws.close();
  // }

  /** Wires the session's events to playout, transcription and state tracking. */
  #attachSessionHandlers(session: RealtimeSession): void {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('response_content_added', (message: any) => {
      // openai.realtime.RealtimeContent
      const trFwd = new BasicTranscriptionForwarder(
        this.room!,
        this.room!.localParticipant!.identity,
        this.#getLocalTrackSid()!,
        message.responseId,
      );

      const handle = this.#agentPlayout?.play(
        message.itemId,
        message.contentIndex,
        trFwd,
        message.textStream,
        message.audioStream,
      );
      if (handle) {
        this.#speaking = true;
        handle.on('done', () => {
          // Only the most recent playout may clear the speaking flag.
          // NOTE(review): after an interruption #playingHandle is reset to
          // undefined, so if 'done' fires later the flag is not cleared here —
          // confirm against AgentPlayout.
          if (this.#playingHandle === handle) {
            this.#speaking = false;
          }
        });
      }
      this.#playingHandle = handle;
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('input_speech_committed', (ev: any) => {
      // openai.realtime.InputSpeechCommittedEvent
      this.#publishLinkedTranscription('…', false, ev.itemId, true);
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('input_speech_transcription_completed', (ev: any) => {
      // openai.realtime.InputSpeechTranscriptionCompletedEvent
      this.#publishLinkedTranscription(ev.transcript, true, ev.itemId, true);
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('input_speech_started', (ev: any) => {
      const playing = this.#playingHandle;
      if (playing && !playing.done) {
        // The user started talking over the agent: stop playback and truncate
        // the item at the position that was actually played.
        playing.interrupt();

        this.#session!.defaultConversation.item.truncate(
          playing.itemId,
          playing.contentIndex,
          // NOTE(review): 24000 is hard-coded; presumably it should follow
          // this.model.sampleRate — confirm what audioSamples counts.
          Math.floor((playing.audioSamples / 24000) * 1000),
        );

        this.#playingHandle = undefined;
      }

      this.#publishLinkedTranscription('…', false, ev.itemId, false);
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('function_call_started', (ev: any) => {
      this.#pendingFunctionCalls.add(ev.callId);
      this.#updateState();
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('function_call_completed', (ev: any) => {
      this.#pendingFunctionCalls.delete(ev.callId);
      this.#updateState();
    });

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    session.on('function_call_failed', (ev: any) => {
      this.#pendingFunctionCalls.delete(ev.callId);
      this.#updateState();
    });
  }

  /**
   * Publishes a transcription segment attributed to the linked participant's
   * subscribed track. When either is missing nothing is published, and an
   * error is logged only if `reportMissing` is set.
   */
  #publishLinkedTranscription(
    text: string,
    isFinal: boolean,
    id: string,
    reportMissing: boolean,
  ): void {
    const participantIdentity = this.linkedParticipant?.identity;
    const trackSid = this.subscribedTrack?.sid;
    if (participantIdentity && trackSid) {
      this.#publishTranscription(participantIdentity, trackSid, text, isFinal, id);
    } else if (reportMissing) {
      this.#logger.error('Participant or track not set');
    }
  }

  /**
   * Links the agent to the remote participant with the given identity and
   * starts reading its microphone as soon as a track is available.
   */
  #linkParticipant(participantIdentity: string): void {
    if (!this.room) {
      this.#logger.error('Room is not set');
      return;
    }

    this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
    if (!this.linkedParticipant) {
      this.#logger.error(`Participant with identity ${participantIdentity} not found`);
      return;
    }

    // A no-op when the participant has not published a microphone yet.
    this.#subscribeToMicrophone();

    if (!this.#trackListenersAttached) {
      this.#trackListenersAttached = true;
      // #subscribeToMicrophone is idempotent, so it is safe to retry on every
      // track event. TrackSubscribed covers publications whose `track` only
      // becomes available after the subscription completes.
      const retrySubscribe = () => {
        if (this.linkedParticipant) {
          this.#subscribeToMicrophone();
        }
      };
      this.room.on(RoomEvent.TrackPublished, retrySubscribe);
      this.room.on(RoomEvent.TrackSubscribed, retrySubscribe);
    }
  }

  /**
   * Subscribes to the linked participant's microphone publication(s) and, when
   * a new track is available, replaces the running read task with one for it.
   */
  #subscribeToMicrophone(): void {
    if (!this.linkedParticipant) {
      this.#logger.error('Participant is not set');
      return;
    }

    for (const publication of this.linkedParticipant.trackPublications.values()) {
      if (publication.source !== TrackSource.SOURCE_MICROPHONE) {
        continue;
      }

      if (!publication.subscribed) {
        publication.setSubscribed(true);
      }

      const track = publication.track;
      if (!track || track === this.subscribedTrack) {
        continue;
      }

      this.subscribedTrack = track;
      this.readMicroTask?.cancel();
      this.readMicroTask = this.#startMicrophoneReader(
        new AudioStream(track, this.model.sampleRate, this.model.numChannels),
      );
    }
  }

  /**
   * Starts pumping `audioStream` into the session's input audio buffer.
   *
   * @returns a handle whose `cancel()` stops the pump at the next frame and
   * rejects `promise` with `Error('Task cancelled')`
   */
  #startMicrophoneReader(audioStream: AudioStream): {
    promise: Promise<void>;
    cancel: () => void;
  } {
    let cancelled = false;
    let rejectTask: (reason: Error) => void = () => {};

    const pump = async (): Promise<void> => {
      const bstream = new AudioByteStream(
        this.model.sampleRate,
        this.model.numChannels,
        this.model.inFrameSize,
      );

      for await (const frame of audioStream) {
        if (cancelled) {
          break;
        }
        for (const chunk of bstream.write(frame.data.buffer)) {
          // Frames that arrive before the session exists are dropped.
          this.#session?.inputAudioBuffer.append(chunk);
        }
      }
    };

    const promise = new Promise<void>((resolve, reject) => {
      rejectTask = reject;
      pump().then(resolve, reject);
    });
    // Keep a handler attached so that cancelling (or a stream failure) never
    // surfaces as an unhandled rejection; real failures are still logged.
    promise.catch((err: unknown) => {
      if (!cancelled) {
        const reason = err instanceof Error ? err.message : String(err);
        this.#logger.error(`Microphone read task failed: ${reason}`);
      }
    });

    return {
      promise,
      cancel: () => {
        cancelled = true;
        rejectTask(new Error('Task cancelled'));
      },
    };
  }

  /** Looks up (and caches) the track SID that `findMicroTrackId` returns for the local participant. */
  #getLocalTrackSid(): string | null {
    if (!this.#localTrackSid && this.room && this.room.localParticipant) {
      this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant.identity);
    }
    return this.#localTrackSid;
  }

  /** Publishes a single transcription segment through the local participant. */
  #publishTranscription(
    participantIdentity: string,
    trackSid: string,
    text: string,
    isFinal: boolean,
    id: string,
  ): void {
    this.#logger.debug(
      `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`,
    );
    if (!this.room?.localParticipant) {
      this.#logger.error('Room or local participant not set');
      return;
    }

    this.room.localParticipant.publishTranscription({
      participantIdentity,
      trackSid,
      segments: [
        {
          text,
          final: isFinal,
          id,
          startTime: BigInt(0),
          endTime: BigInt(0),
          language: '',
        },
      ],
    });
  }

  /**
   * Derives the agent state from the internal flags and publishes it.
   * Priority: pending function calls, then speaking, then started.
   */
  #updateState() {
    let newState: AgentState = 'initializing';
    if (this.#pendingFunctionCalls.size > 0) {
      newState = 'thinking';
    } else if (this.#speaking) {
      newState = 'speaking';
    } else if (this.#started) {
      newState = 'listening';
    }

    this.#setState(newState);
  }

  /** Writes `state` to the local participant's attributes if the room is connected and the value changed. */
  #setState(state: AgentState) {
    if (this.room?.isConnected && this.room.localParticipant) {
      const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
      if (currentState !== state) {
        this.room.localParticipant.setAttributes({
          [AGENT_STATE_ATTRIBUTE]: state,
        });
        this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
      }
    }
  }
}
|
package/src/stt/index.ts
CHANGED
|
@@ -2,5 +2,5 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
export { STT, SpeechEvent, SpeechEventType, SpeechStream } from './stt.js';
|
|
5
|
+
export { STT, SpeechEvent, SpeechEventType, SpeechStream, type SpeechData } from './stt.js';
|
|
6
6
|
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
|
|
@@ -7,24 +7,24 @@ import { VADEventType, type VADStream } from '../vad.js';
|
|
|
7
7
|
import { STT, SpeechEvent, SpeechEventType, SpeechStream } from './stt.js';
|
|
8
8
|
|
|
9
9
|
export class StreamAdapterWrapper extends SpeechStream {
|
|
10
|
-
closed: boolean;
|
|
11
|
-
stt: STT;
|
|
12
|
-
vadStream: VADStream;
|
|
13
|
-
eventQueue: (SpeechEvent | undefined)[];
|
|
14
|
-
language?: string;
|
|
15
|
-
task: {
|
|
10
|
+
#closed: boolean;
|
|
11
|
+
#stt: STT;
|
|
12
|
+
#vadStream: VADStream;
|
|
13
|
+
#eventQueue: (SpeechEvent | undefined)[];
|
|
14
|
+
#language?: string;
|
|
15
|
+
#task: {
|
|
16
16
|
run: Promise<void>;
|
|
17
17
|
cancel: () => void;
|
|
18
18
|
};
|
|
19
19
|
|
|
20
20
|
constructor(stt: STT, vadStream: VADStream, language: string | undefined = undefined) {
|
|
21
21
|
super();
|
|
22
|
-
this
|
|
23
|
-
this
|
|
24
|
-
this
|
|
25
|
-
this
|
|
26
|
-
this
|
|
27
|
-
this
|
|
22
|
+
this.#closed = false;
|
|
23
|
+
this.#stt = stt;
|
|
24
|
+
this.#vadStream = vadStream;
|
|
25
|
+
this.#eventQueue = [];
|
|
26
|
+
this.#language = language;
|
|
27
|
+
this.#task = {
|
|
28
28
|
run: new Promise((_, reject) => {
|
|
29
29
|
this.run(reject);
|
|
30
30
|
}),
|
|
@@ -33,46 +33,46 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
33
33
|
}
|
|
34
34
|
|
|
35
35
|
async run(reject: (arg: Error) => void) {
|
|
36
|
-
this
|
|
37
|
-
this
|
|
36
|
+
this.#task.cancel = () => {
|
|
37
|
+
this.#closed = true;
|
|
38
38
|
reject(new Error('cancelled'));
|
|
39
39
|
};
|
|
40
40
|
|
|
41
|
-
for (const event of this
|
|
41
|
+
for (const event of this.#vadStream) {
|
|
42
42
|
if (event.type == VADEventType.START_OF_SPEECH) {
|
|
43
43
|
const startEvent = new SpeechEvent(SpeechEventType.START_OF_SPEECH);
|
|
44
|
-
this
|
|
44
|
+
this.#eventQueue.push(startEvent);
|
|
45
45
|
} else if (event.type == VADEventType.END_OF_SPEECH) {
|
|
46
46
|
const mergedFrames = mergeFrames(event.speech);
|
|
47
|
-
const endEvent = await this
|
|
48
|
-
this
|
|
47
|
+
const endEvent = await this.#stt.recognize(mergedFrames, this.#language);
|
|
48
|
+
this.#eventQueue.push(endEvent);
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
this
|
|
52
|
+
this.#eventQueue.push(undefined);
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
pushFrame(frame: AudioFrame) {
|
|
56
|
-
if (this
|
|
56
|
+
if (this.#closed) {
|
|
57
57
|
throw new TypeError('cannot push frame to closed stream');
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
this
|
|
60
|
+
this.#vadStream.pushFrame(frame);
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
async close(wait: boolean = true): Promise<void> {
|
|
64
|
-
this
|
|
64
|
+
this.#closed = true;
|
|
65
65
|
|
|
66
66
|
if (!wait) {
|
|
67
|
-
this
|
|
67
|
+
this.#task.cancel();
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
-
await this
|
|
71
|
-
await this
|
|
70
|
+
await this.#vadStream.close(wait);
|
|
71
|
+
await this.#task.run;
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
next(): IteratorResult<SpeechEvent> {
|
|
75
|
-
const item = this
|
|
75
|
+
const item = this.#eventQueue.shift();
|
|
76
76
|
if (item) {
|
|
77
77
|
return { done: false, value: item };
|
|
78
78
|
} else {
|
|
@@ -82,23 +82,23 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
82
82
|
}
|
|
83
83
|
|
|
84
84
|
export class StreamAdapter extends STT {
|
|
85
|
-
stt: STT;
|
|
86
|
-
vadStream: VADStream;
|
|
85
|
+
#stt: STT;
|
|
86
|
+
#vadStream: VADStream;
|
|
87
87
|
|
|
88
88
|
constructor(stt: STT, vadStream: VADStream) {
|
|
89
89
|
super(true);
|
|
90
|
-
this
|
|
91
|
-
this
|
|
90
|
+
this.#stt = stt;
|
|
91
|
+
this.#vadStream = vadStream;
|
|
92
92
|
}
|
|
93
93
|
|
|
94
94
|
async recognize(
|
|
95
95
|
buffer: AudioBuffer,
|
|
96
96
|
language: string | undefined = undefined,
|
|
97
97
|
): Promise<SpeechEvent> {
|
|
98
|
-
return await this
|
|
98
|
+
return await this.#stt.recognize(buffer, language);
|
|
99
99
|
}
|
|
100
100
|
|
|
101
101
|
stream(language: string | undefined = undefined) {
|
|
102
|
-
return new StreamAdapterWrapper(this
|
|
102
|
+
return new StreamAdapterWrapper(this.#stt, this.#vadStream, language);
|
|
103
103
|
}
|
|
104
104
|
}
|
package/src/stt/stt.ts
CHANGED
|
@@ -5,9 +5,25 @@ import type { AudioFrame } from '@livekit/rtc-node';
|
|
|
5
5
|
import type { AudioBuffer } from '../utils.js';
|
|
6
6
|
|
|
7
7
|
export enum SpeechEventType {
|
|
8
|
+
/**
|
|
9
|
+
* Indicate the start of speech.
|
|
10
|
+
* If the STT doesn't support this event, this will be emitted at the same time
|
|
11
|
+
* as the first INTERIM_TRANSCRIPT.
|
|
12
|
+
*/
|
|
8
13
|
START_OF_SPEECH = 0,
|
|
14
|
+
/**
|
|
15
|
+
* Interim transcript, useful for real-time transcription.
|
|
16
|
+
*/
|
|
9
17
|
INTERIM_TRANSCRIPT = 1,
|
|
18
|
+
/**
|
|
19
|
+
* Final transcript, emitted when the STT is confident enough that a certain
|
|
20
|
+
* portion of the speech will not change.
|
|
21
|
+
*/
|
|
10
22
|
FINAL_TRANSCRIPT = 2,
|
|
23
|
+
/**
|
|
24
|
+
* Indicate the end of speech, emitted when the user stops speaking.
|
|
25
|
+
* The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.
|
|
26
|
+
*/
|
|
11
27
|
END_OF_SPEECH = 3,
|
|
12
28
|
}
|
|
13
29
|
|
|
@@ -30,8 +46,19 @@ export class SpeechEvent {
|
|
|
30
46
|
}
|
|
31
47
|
|
|
32
48
|
export abstract class SpeechStream implements IterableIterator<SpeechEvent> {
|
|
49
|
+
/**
|
|
50
|
+
* Push a frame to be recognised.
|
|
51
|
+
* It is recommended to push frames as soon as they are available.
|
|
52
|
+
*/
|
|
33
53
|
abstract pushFrame(token: AudioFrame): void;
|
|
34
54
|
|
|
55
|
+
/**
|
|
56
|
+
* Close the stream.
|
|
57
|
+
*
|
|
58
|
+
* @param wait
|
|
59
|
+
* Whether to wait for the STT to finish processing the remaining
|
|
60
|
+
* frames before closing
|
|
61
|
+
*/
|
|
35
62
|
abstract close(wait: boolean): Promise<void>;
|
|
36
63
|
|
|
37
64
|
abstract next(): IteratorResult<SpeechEvent>;
|