@lokutor/sdk 1.1.10 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +17 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +455 -384
- package/dist/index.mjs +455 -384
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -34,287 +34,6 @@ var DEFAULT_URLS = {
|
|
|
34
34
|
TTS: "wss://api.lokutor.com/ws/tts"
|
|
35
35
|
};
|
|
36
36
|
|
|
37
|
-
// src/client.ts
|
|
38
|
-
/**
 * Decode a base64 string into its raw bytes.
 *
 * @param {string} base64 Base64-encoded payload.
 * @returns {Uint8Array} The decoded bytes, one per character of the decoded string.
 * @throws When `base64` is not valid base64 (the error is raised by `atob`).
 */
function base64ToUint8Array(base64) {
  // atob() yields a "binary string": every char code is a byte value (0-255).
  return Uint8Array.from(atob(base64), (char) => char.charCodeAt(0));
}
|
|
46
|
-
/**
 * WebSocket client for a Lokutor voice-agent conversation.
 *
 * After the socket opens it sends the prompt, voice, language and viseme
 * settings, then dispatches incoming transcripts, status changes, visemes and
 * audio chunks to the callbacks supplied in the constructor config and to
 * listeners registered through onAudio()/onVisemes().
 */
var VoiceAgentClient = class {
  ws = null;
  apiKey;
  prompt;
  voice;
  language;
  // Callbacks supplied through the constructor config
  onTranscription;
  onResponse;
  onAudioCallback;
  onVisemesCallback;
  onStatus;
  onError;
  isConnected = false;
  messages = [];
  audioListeners = [];
  visemeListeners = [];
  wantVisemes = false;

  /**
   * @param {object} config Client configuration.
   * @param {string} [config.apiKey] Key appended to the connection URL as `api_key`.
   * @param {string} config.prompt System prompt sent once the socket opens.
   * @param {string} [config.voice] Voice id; falls back to "F1".
   * @param {string} [config.language] Language code; falls back to "en".
   * @param {boolean} [config.visemes] Whether to ask the server for visemes.
   * @param {Function} [config.onTranscription] Receives user transcript text.
   * @param {Function} [config.onResponse] Receives agent transcript text.
   * @param {Function} [config.onAudio] Receives audio chunks as Uint8Array.
   * @param {Function} [config.onVisemes] Receives non-empty viseme arrays.
   * @param {Function} [config.onStatus] Receives status strings from the server.
   * @param {Function} [config.onError] Receives socket and server errors.
   */
  constructor(config) {
    this.apiKey = config.apiKey;
    this.prompt = config.prompt;
    this.voice = config.voice || "F1" /* F1 */;
    this.language = config.language || "en" /* ENGLISH */;
    this.onTranscription = config.onTranscription;
    this.onResponse = config.onResponse;
    this.onAudioCallback = config.onAudio;
    this.onVisemesCallback = config.onVisemes;
    this.onStatus = config.onStatus;
    this.onError = config.onError;
    this.wantVisemes = config.visemes || false;
  }

  /**
   * Connect to the Lokutor Voice Agent server.
   *
   * @returns {Promise<boolean>} Resolves with `true` once the socket is open;
   *   rejects when the socket errors before opening or cannot be created.
   */
  async connect() {
    return new Promise((resolve, reject) => {
      try {
        let url = DEFAULT_URLS.VOICE_AGENT;
        if (this.apiKey) {
          const separator = url.includes("?") ? "&" : "?";
          // Encode the key so reserved characters cannot corrupt the query string.
          url += `${separator}api_key=${encodeURIComponent(this.apiKey)}`;
        }
        // Log the bare endpoint only, never the URL that carries the key.
        console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
        const socket = new WebSocket(url);
        socket.binaryType = "arraybuffer";
        this.ws = socket;
        socket.onopen = () => {
          this.isConnected = true;
          console.log("\u2705 Connected to voice agent!");
          this.sendConfig();
          resolve(true);
        };
        socket.onmessage = (event) => {
          if (event.data instanceof ArrayBuffer) {
            this.handleBinaryMessage(new Uint8Array(event.data));
          } else {
            this.handleTextMessage(event.data.toString());
          }
        };
        socket.onerror = (err) => {
          console.error("\u274C WebSocket error:", err);
          if (this.onError) this.onError(err);
          // Only a failure before the socket opened can still settle connect().
          if (!this.isConnected) reject(err);
        };
        socket.onclose = () => {
          this.isConnected = false;
          console.log("Disconnected");
        };
      } catch (err) {
        if (this.onError) this.onError(err);
        reject(err);
      }
    });
  }

  /**
   * Send the initial configuration (prompt, voice, language, visemes) to the
   * server. Does nothing while disconnected.
   */
  sendConfig() {
    if (!this.ws || !this.isConnected) return;
    this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
    this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
    this.ws.send(JSON.stringify({ type: "language", data: this.language }));
    this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
    console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
  }

  /**
   * Send raw PCM audio data to the server. Dropped while disconnected.
   * @param audioData Int16 PCM audio buffer
   */
  sendAudio(audioData) {
    if (this.ws && this.isConnected) {
      this.ws.send(audioData);
    }
  }

  /**
   * Handle incoming binary data (audio response) by fanning it out to the
   * audio callback and listeners.
   * @param {Uint8Array} data Audio bytes received from the server.
   */
  handleBinaryMessage(data) {
    this.emit("audio", data);
  }

  /**
   * Handle incoming text messages (metadata/transcriptions).
   *
   * Malformed JSON is logged and ignored. Errors raised while dispatching
   * (including from user callbacks) are logged instead of being discarded.
   * @param {string} text Raw text frame.
   */
  handleTextMessage(text) {
    let msg;
    try {
      msg = JSON.parse(text);
    } catch (err) {
      console.warn("Ignoring malformed server message:", err);
      return;
    }
    if (msg === null || typeof msg !== "object") return;
    try {
      switch (msg.type) {
        case "audio": {
          // Audio may also arrive base64-encoded inside a JSON frame.
          if (msg.data) {
            this.handleBinaryMessage(base64ToUint8Array(msg.data));
          }
          break;
        }
        case "transcript": {
          // Any role other than "user" is recorded as the agent.
          const role = msg.role === "user" ? "user" : "agent";
          this.messages.push({
            role,
            text: msg.data,
            timestamp: Date.now()
          });
          if (role === "user") {
            if (this.onTranscription) this.onTranscription(msg.data);
            console.log(`\u{1F4AC} You: ${msg.data}`);
          } else {
            if (this.onResponse) this.onResponse(msg.data);
            console.log(`\u{1F916} Agent: ${msg.data}`);
          }
          break;
        }
        case "status": {
          if (this.onStatus) this.onStatus(msg.data);
          const icons = {
            "interrupted": "\u26A1",
            "thinking": "\u{1F9E0}",
            "speaking": "\u{1F50A}",
            "listening": "\u{1F442}"
          };
          console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
          break;
        }
        case "visemes": {
          if (Array.isArray(msg.data) && msg.data.length > 0) {
            this.emit("visemes", msg.data);
          }
          break;
        }
        case "error": {
          if (this.onError) this.onError(msg.data);
          console.error(`\u274C Server error: ${msg.data}`);
          break;
        }
      }
    } catch (err) {
      console.error("Error while handling server message:", err);
    }
  }

  /**
   * Deliver an "audio" or "visemes" payload to the config callback first,
   * then to every registered listener. Other event names are ignored.
   */
  emit(event, data) {
    if (event === "audio") {
      if (this.onAudioCallback) this.onAudioCallback(data);
      this.audioListeners.forEach((listener) => listener(data));
    } else if (event === "visemes") {
      if (this.onVisemesCallback) this.onVisemesCallback(data);
      this.visemeListeners.forEach((listener) => listener(data));
    }
  }

  /** Register an additional audio listener. */
  onAudio(callback) {
    this.audioListeners.push(callback);
  }

  /** Register an additional viseme listener. */
  onVisemes(callback) {
    this.visemeListeners.push(callback);
  }

  /**
   * Disconnect from the server.
   */
  disconnect() {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    // Reflect the state immediately rather than waiting for the close event.
    this.isConnected = false;
  }

  /**
   * Update the system prompt mid-conversation. While disconnected the prompt
   * is only stored and goes out with the next sendConfig().
   */
  updatePrompt(newPrompt) {
    this.prompt = newPrompt;
    if (this.ws && this.isConnected) {
      try {
        this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
        console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
      } catch (error) {
        console.error("Error updating prompt:", error);
      }
    } else {
      console.warn("Not connected - prompt will be updated on next connection");
    }
  }

  /**
   * Get full conversation transcript.
   * @returns {Array<{role: string, text: string, timestamp: number}>} Shallow copy of the message log.
   */
  getTranscript() {
    return [...this.messages];
  }

  /**
   * Get conversation as formatted text, one "You: ..." / "Agent: ..." line per message.
   */
  getTranscriptText() {
    return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
  }
};
|
|
249
|
-
/**
 * One-shot text-to-speech client: each synthesize() call opens its own
 * WebSocket to the TTS endpoint.
 */
var TTSClient = class {
  apiKey;

  /**
   * @param {object} config
   * @param {string} [config.apiKey] Key appended to the connection URL as `api_key`.
   */
  constructor(config) {
    this.apiKey = config.apiKey;
  }

  /**
   * Synthesize text to speech.
   *
   * This opens a temporary WebSocket connection, sends the request,
   * and streams back the audio.
   *
   * @param {object} options
   * @param {string} options.text Text to speak.
   * @param {string} [options.voice] Voice id; falls back to "F1".
   * @param {string} [options.language] Language code; falls back to "en".
   * @param {number} [options.speed] Falls back to 1.05.
   * @param {number} [options.steps] Falls back to 24.
   * @param {boolean} [options.visemes] Whether to request visemes.
   * @param {Function} [options.onAudio] Receives each binary frame as a Uint8Array.
   * @param {Function} [options.onVisemes] Receives each JSON-array text frame.
   * @param {Function} [options.onError] Receives socket/setup errors.
   * @returns {Promise<void>} Resolves when the socket closes; rejects on a
   *   socket error or when the socket cannot be created.
   */
  synthesize(options) {
    return new Promise((resolve, reject) => {
      try {
        let url = DEFAULT_URLS.TTS;
        if (this.apiKey) {
          const separator = url.includes("?") ? "&" : "?";
          // Encode the key so reserved characters cannot corrupt the query string.
          url += `${separator}api_key=${encodeURIComponent(this.apiKey)}`;
        }
        const ws = new WebSocket(url);
        ws.binaryType = "arraybuffer";
        ws.onopen = () => {
          const req = {
            text: options.text,
            voice: options.voice || "F1" /* F1 */,
            lang: options.language || "en" /* ENGLISH */,
            speed: options.speed || 1.05,
            steps: options.steps || 24,
            visemes: options.visemes || false
          };
          ws.send(JSON.stringify(req));
        };
        ws.onmessage = (event) => {
          if (event.data instanceof ArrayBuffer) {
            if (options.onAudio) options.onAudio(new Uint8Array(event.data));
            return;
          }
          let msg;
          try {
            msg = JSON.parse(event.data.toString());
          } catch (err) {
            console.warn("Ignoring malformed TTS message:", err);
            return;
          }
          // Only array payloads are forwarded, as visemes.
          if (Array.isArray(msg) && options.onVisemes) {
            options.onVisemes(msg);
          }
        };
        ws.onerror = (err) => {
          if (options.onError) options.onError(err);
          reject(err);
        };
        // The promise settles once: a close after an error is a no-op here.
        ws.onclose = () => {
          resolve();
        };
      } catch (err) {
        if (options.onError) options.onError(err);
        reject(err);
      }
    });
  }
};
|
|
308
|
-
/**
 * Create a VoiceAgentClient from `config` and connect it.
 *
 * @param {object} config Passed unchanged to the VoiceAgentClient constructor.
 * @returns {Promise<object>} The connected client.
 * @throws Re-throws the connection error after disconnecting the client.
 */
async function simpleConversation(config) {
  const client = new VoiceAgentClient(config);
  try {
    await client.connect();
  } catch (err) {
    // The caller never receives the client on failure, so release it here
    // instead of leaving an unreachable socket behind.
    client.disconnect();
    throw err;
  }
  return client;
}
|
|
313
|
-
/**
 * One-call text-to-speech: build a TTSClient from `options.apiKey` and run
 * synthesize() with the same options object.
 *
 * @param {object} options Synthesis options, including `apiKey`.
 * @returns {Promise} The promise returned by TTSClient#synthesize.
 */
async function simpleTTS(options) {
  const { apiKey } = options;
  return new TTSClient({ apiKey }).synthesize(options);
}
|
|
317
|
-
|
|
318
37
|
// src/audio-utils.ts
|
|
319
38
|
function pcm16ToFloat32(int16Data) {
|
|
320
39
|
const float32 = new Float32Array(int16Data.length);
|
|
@@ -454,6 +173,7 @@ var BrowserAudioManager = class {
|
|
|
454
173
|
scriptProcessor = null;
|
|
455
174
|
analyserNode = null;
|
|
456
175
|
mediaStream = null;
|
|
176
|
+
resampler = null;
|
|
457
177
|
// Playback scheduling
|
|
458
178
|
nextPlaybackTime = 0;
|
|
459
179
|
activeSources = [];
|
|
@@ -531,6 +251,12 @@ var BrowserAudioManager = class {
|
|
|
531
251
|
if (this.analyserNode) {
|
|
532
252
|
this.mediaStreamAudioSourceNode.connect(this.analyserNode);
|
|
533
253
|
}
|
|
254
|
+
const hardwareRate = this.audioContext.sampleRate;
|
|
255
|
+
if (hardwareRate !== this.inputSampleRate) {
|
|
256
|
+
this.resampler = new StreamResampler(hardwareRate, this.inputSampleRate);
|
|
257
|
+
} else {
|
|
258
|
+
this.resampler = null;
|
|
259
|
+
}
|
|
534
260
|
this.scriptProcessor.onaudioprocess = (event) => {
|
|
535
261
|
this._processAudioInput(event);
|
|
536
262
|
};
|
|
@@ -553,15 +279,11 @@ var BrowserAudioManager = class {
|
|
|
553
279
|
for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
|
|
554
280
|
outputBuffer.getChannelData(0)[i] = 0;
|
|
555
281
|
}
|
|
556
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
557
282
|
let processedData = new Float32Array(inputData);
|
|
558
|
-
if (
|
|
559
|
-
processedData =
|
|
560
|
-
processedData,
|
|
561
|
-
hardwareRate,
|
|
562
|
-
this.inputSampleRate
|
|
563
|
-
);
|
|
283
|
+
if (this.resampler) {
|
|
284
|
+
processedData = this.resampler.process(processedData);
|
|
564
285
|
}
|
|
286
|
+
if (processedData.length === 0) return;
|
|
565
287
|
const int16Data = float32ToPcm16(processedData);
|
|
566
288
|
const uint8Data = new Uint8Array(
|
|
567
289
|
int16Data.buffer,
|
|
@@ -593,137 +315,486 @@ var BrowserAudioManager = class {
|
|
|
593
315
|
* Play back audio received from the server
|
|
594
316
|
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
595
317
|
*/
|
|
596
|
-
playAudio(pcm16Data) {
|
|
597
|
-
if (!this.audioContext) {
|
|
598
|
-
console.warn("AudioContext not initialized");
|
|
599
|
-
return;
|
|
318
|
+
playAudio(pcm16Data) {
|
|
319
|
+
if (!this.audioContext) {
|
|
320
|
+
console.warn("AudioContext not initialized");
|
|
321
|
+
return;
|
|
322
|
+
}
|
|
323
|
+
const int16Array = new Int16Array(
|
|
324
|
+
pcm16Data.buffer,
|
|
325
|
+
pcm16Data.byteOffset,
|
|
326
|
+
pcm16Data.length / 2
|
|
327
|
+
);
|
|
328
|
+
const float32Data = pcm16ToFloat32(int16Array);
|
|
329
|
+
const audioBuffer = this.audioContext.createBuffer(
|
|
330
|
+
1,
|
|
331
|
+
float32Data.length,
|
|
332
|
+
this.outputSampleRate
|
|
333
|
+
);
|
|
334
|
+
audioBuffer.getChannelData(0).set(float32Data);
|
|
335
|
+
this._schedulePlayback(audioBuffer);
|
|
336
|
+
}
|
|
337
|
+
/**
|
|
338
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
339
|
+
*/
|
|
340
|
+
_schedulePlayback(audioBuffer) {
|
|
341
|
+
if (!this.audioContext) return;
|
|
342
|
+
const currentTime = this.audioContext.currentTime;
|
|
343
|
+
const duration = audioBuffer.length / this.outputSampleRate;
|
|
344
|
+
const startTime = Math.max(
|
|
345
|
+
currentTime + 0.01,
|
|
346
|
+
// Minimum 10ms delay
|
|
347
|
+
this.nextPlaybackTime
|
|
348
|
+
);
|
|
349
|
+
this.nextPlaybackTime = startTime + duration;
|
|
350
|
+
const source = this.audioContext.createBufferSource();
|
|
351
|
+
source.buffer = audioBuffer;
|
|
352
|
+
source.connect(this.audioContext.destination);
|
|
353
|
+
if (this.analyserNode) {
|
|
354
|
+
source.connect(this.analyserNode);
|
|
355
|
+
}
|
|
356
|
+
source.start(startTime);
|
|
357
|
+
this.activeSources.push(source);
|
|
358
|
+
source.onended = () => {
|
|
359
|
+
const index = this.activeSources.indexOf(source);
|
|
360
|
+
if (index > -1) {
|
|
361
|
+
this.activeSources.splice(index, 1);
|
|
362
|
+
}
|
|
363
|
+
};
|
|
364
|
+
}
|
|
365
|
+
/**
|
|
366
|
+
* Stop all currently playing audio and clear the queue
|
|
367
|
+
*/
|
|
368
|
+
stopPlayback() {
|
|
369
|
+
this.activeSources.forEach((source) => {
|
|
370
|
+
try {
|
|
371
|
+
source.stop();
|
|
372
|
+
} catch (e) {
|
|
373
|
+
}
|
|
374
|
+
});
|
|
375
|
+
this.activeSources = [];
|
|
376
|
+
this.playbackQueue = [];
|
|
377
|
+
this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
|
|
378
|
+
console.log("\u{1F507} Playback stopped");
|
|
379
|
+
}
|
|
380
|
+
/**
|
|
381
|
+
* Toggle mute state
|
|
382
|
+
*/
|
|
383
|
+
setMuted(muted) {
|
|
384
|
+
this.isMuted = muted;
|
|
385
|
+
}
|
|
386
|
+
/**
|
|
387
|
+
* Get current mute state
|
|
388
|
+
*/
|
|
389
|
+
isMicMuted() {
|
|
390
|
+
return this.isMuted;
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Get current amplitude from analyser (for visualization)
|
|
394
|
+
* Returns value between 0 and 1
|
|
395
|
+
*/
|
|
396
|
+
getAmplitude() {
|
|
397
|
+
if (!this.analyserNode) return 0;
|
|
398
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
399
|
+
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
400
|
+
const rms = calculateRMS(dataArray);
|
|
401
|
+
return Math.min(rms * 10, 1);
|
|
402
|
+
}
|
|
403
|
+
/**
|
|
404
|
+
* Get frequency data from analyser for visualization
|
|
405
|
+
*/
|
|
406
|
+
getFrequencyData() {
|
|
407
|
+
if (!this.analyserNode) {
|
|
408
|
+
return new Uint8Array(0);
|
|
409
|
+
}
|
|
410
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
411
|
+
this.analyserNode.getByteFrequencyData(dataArray);
|
|
412
|
+
return dataArray;
|
|
413
|
+
}
|
|
414
|
+
/**
|
|
415
|
+
* Get time-domain data from analyser for waveform visualization
|
|
416
|
+
*/
|
|
417
|
+
getWaveformData() {
|
|
418
|
+
if (!this.analyserNode) {
|
|
419
|
+
return new Uint8Array(0);
|
|
420
|
+
}
|
|
421
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
422
|
+
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
423
|
+
return dataArray;
|
|
424
|
+
}
|
|
425
|
+
/**
|
|
426
|
+
* Cleanup and close AudioContext
|
|
427
|
+
*/
|
|
428
|
+
cleanup() {
|
|
429
|
+
this.stopMicrophone();
|
|
430
|
+
this.stopPlayback();
|
|
431
|
+
if (this.analyserNode) {
|
|
432
|
+
this.analyserNode.disconnect();
|
|
433
|
+
this.analyserNode = null;
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
/**
|
|
437
|
+
* Get current audio context state
|
|
438
|
+
*/
|
|
439
|
+
getState() {
|
|
440
|
+
return this.audioContext?.state ?? null;
|
|
441
|
+
}
|
|
442
|
+
/**
|
|
443
|
+
* Check if microphone is currently listening
|
|
444
|
+
*/
|
|
445
|
+
isRecording() {
|
|
446
|
+
return this.isListening;
|
|
447
|
+
}
|
|
448
|
+
};
|
|
449
|
+
|
|
450
|
+
// src/client.ts
|
|
451
|
+
/**
 * Decode a base64 string into its raw bytes.
 *
 * @param {string} base64 Base64-encoded payload.
 * @returns {Uint8Array} The decoded bytes, one per character of the decoded string.
 * @throws When `base64` is not valid base64 (the error is raised by `atob`).
 */
function base64ToUint8Array(base64) {
  // atob() yields a "binary string": every char code is a byte value (0-255).
  return Uint8Array.from(atob(base64), (char) => char.charCodeAt(0));
}
|
|
459
|
+
/**
 * WebSocket client for a Lokutor voice-agent conversation.
 *
 * After the socket opens it sends the prompt, voice, language and viseme
 * settings, then dispatches incoming transcripts, status changes, visemes and
 * audio chunks to the callbacks supplied in the constructor config. With
 * `enableAudio` set, a BrowserAudioManager captures the microphone and plays
 * the agent's audio. An unexpected close triggers up to
 * `maxReconnectAttempts` reconnects with exponential backoff.
 */
var VoiceAgentClient = class {
  ws = null;
  apiKey;
  prompt;
  voice;
  language;
  // Callbacks supplied through the constructor config
  onTranscription;
  onResponse;
  onAudioCallback;
  onVisemesCallback;
  onStatus;
  onError;
  isConnected = false;
  messages = [];
  audioListeners = [];
  visemeListeners = [];
  wantVisemes = false;
  audioManager = null;
  enableAudio = false;
  // Connection resilience
  isUserDisconnect = false;
  reconnecting = false;
  reconnectAttempts = 0;
  maxReconnectAttempts = 5;
  // Handle of the pending reconnect timer, or null when none is scheduled
  reconnectTimer = null;

  /**
   * @param {object} config Client configuration.
   * @param {string} [config.apiKey] Key appended to the connection URL as `api_key`.
   * @param {string} config.prompt System prompt sent once the socket opens.
   * @param {string} [config.voice] Voice id; falls back to "F1".
   * @param {string} [config.language] Language code; falls back to "en".
   * @param {boolean} [config.visemes] Whether to ask the server for visemes.
   * @param {boolean} [config.enableAudio] Let the client manage microphone and playback.
   * @param {Function} [config.onTranscription] Receives user transcript text.
   * @param {Function} [config.onResponse] Receives agent transcript text.
   * @param {Function} [config.onAudio] Receives audio chunks as Uint8Array.
   * @param {Function} [config.onVisemes] Receives non-empty viseme arrays.
   * @param {Function} [config.onStatus] Receives status strings, including the
   *   client-generated "reconnecting" and "disconnected".
   * @param {Function} [config.onError] Receives socket, microphone and server errors.
   */
  constructor(config) {
    this.apiKey = config.apiKey;
    this.prompt = config.prompt;
    this.voice = config.voice || "F1" /* F1 */;
    this.language = config.language || "en" /* ENGLISH */;
    this.onTranscription = config.onTranscription;
    this.onResponse = config.onResponse;
    this.onAudioCallback = config.onAudio;
    this.onVisemesCallback = config.onVisemes;
    this.onStatus = config.onStatus;
    this.onError = config.onError;
    this.wantVisemes = config.visemes || false;
    this.enableAudio = config.enableAudio ?? false;
  }

  /**
   * Connect to the Lokutor Voice Agent server.
   *
   * @returns {Promise<boolean>} Resolves with `true` once the socket is open
   *   (and, with audio enabled, the microphone has started); rejects when the
   *   socket errors before opening, the microphone fails to start, or the
   *   socket cannot be created.
   */
  async connect() {
    this.isUserDisconnect = false;
    // An explicit connect supersedes any reconnect that is still waiting.
    if (this.reconnectTimer !== null) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = null;
    }
    if (this.enableAudio) {
      this.audioManager ??= new BrowserAudioManager();
      await this.audioManager.init();
    }
    return new Promise((resolve, reject) => {
      try {
        let url = DEFAULT_URLS.VOICE_AGENT;
        if (this.apiKey) {
          const separator = url.includes("?") ? "&" : "?";
          // Encode the key so reserved characters cannot corrupt the query string.
          url += `${separator}api_key=${encodeURIComponent(this.apiKey)}`;
        }
        // Log the bare endpoint only, never the URL that carries the key.
        console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
        const socket = new WebSocket(url);
        socket.binaryType = "arraybuffer";
        this.ws = socket;
        socket.onopen = async () => {
          this.isConnected = true;
          this.reconnectAttempts = 0;
          this.reconnecting = false;
          console.log("\u2705 Connected to voice agent!");
          this.sendConfig();
          if (this.audioManager) {
            try {
              await this.audioManager.startMicrophone((data) => {
                if (this.isConnected) {
                  this.sendAudio(data);
                }
              });
            } catch (err) {
              // Without this the failure would be an unhandled rejection and
              // connect() would stay pending forever.
              console.error("\u274C Failed to start microphone:", err);
              if (this.onError) this.onError(err);
              reject(err);
              return;
            }
          }
          resolve(true);
        };
        socket.onmessage = (event) => {
          if (event.data instanceof ArrayBuffer) {
            this.handleBinaryMessage(new Uint8Array(event.data));
          } else {
            this.handleTextMessage(event.data.toString());
          }
        };
        socket.onerror = (err) => {
          console.error("\u274C WebSocket error:", err);
          if (this.onError) this.onError(err);
          // Only a failure before the socket opened can still settle connect().
          if (!this.isConnected) reject(err);
        };
        socket.onclose = () => {
          this.isConnected = false;
          const shouldRetry = !this.isUserDisconnect && this.reconnectAttempts < this.maxReconnectAttempts;
          if (!shouldRetry) {
            this.reconnecting = false;
            console.log("Disconnected");
            if (this.onStatus) this.onStatus("disconnected");
            return;
          }
          this.reconnecting = true;
          this.reconnectAttempts++;
          // Exponential backoff: 2s, 4s, 8s, then capped at 10s.
          const backoffDelay = Math.min(1e3 * Math.pow(2, this.reconnectAttempts), 1e4);
          console.warn(`Connection lost. Reconnecting in ${backoffDelay}ms (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})`);
          if (this.onStatus) this.onStatus("reconnecting");
          this.reconnectTimer = setTimeout(() => {
            this.reconnectTimer = null;
            // The user may have disconnected while the backoff was pending.
            if (this.isUserDisconnect) return;
            this.connect().catch((e) => console.error("Reconnect failed", e));
          }, backoffDelay);
        };
      } catch (err) {
        if (this.onError) this.onError(err);
        reject(err);
      }
    });
  }

  /**
   * Send the initial configuration (prompt, voice, language, visemes) to the
   * server. Does nothing while disconnected.
   */
  sendConfig() {
    if (!this.ws || !this.isConnected) return;
    this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
    this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
    this.ws.send(JSON.stringify({ type: "language", data: this.language }));
    this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
    console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
  }

  /**
   * Send raw PCM audio data to the server. Dropped unless the socket is open.
   * @param audioData Int16 PCM audio buffer
   */
  sendAudio(audioData) {
    if (this.ws && this.ws.readyState === WebSocket.OPEN && this.isConnected) {
      this.ws.send(audioData);
    }
  }

  /**
   * Handle incoming binary data (audio response): play it through the audio
   * manager when one is active, then fan it out to the audio callback and
   * listeners.
   * @param {Uint8Array} data Audio bytes received from the server.
   */
  handleBinaryMessage(data) {
    if (this.audioManager) {
      this.audioManager.playAudio(data);
    }
    this.emit("audio", data);
  }

  /**
   * Handle incoming text messages (metadata/transcriptions).
   *
   * Malformed JSON is logged and ignored. Errors raised while dispatching
   * (including from user callbacks) are logged instead of being discarded.
   * @param {string} text Raw text frame.
   */
  handleTextMessage(text) {
    let msg;
    try {
      msg = JSON.parse(text);
    } catch (err) {
      console.warn("Ignoring malformed server message:", err);
      return;
    }
    if (msg === null || typeof msg !== "object") return;
    try {
      switch (msg.type) {
        case "audio": {
          // Audio may also arrive base64-encoded inside a JSON frame.
          if (msg.data) {
            this.handleBinaryMessage(base64ToUint8Array(msg.data));
          }
          break;
        }
        case "transcript": {
          // Any role other than "user" is recorded as the agent.
          const role = msg.role === "user" ? "user" : "agent";
          this.messages.push({
            role,
            text: msg.data,
            timestamp: Date.now()
          });
          if (role === "user") {
            if (this.onTranscription) this.onTranscription(msg.data);
            console.log(`\u{1F4AC} You: ${msg.data}`);
          } else {
            if (this.onResponse) this.onResponse(msg.data);
            console.log(`\u{1F916} Agent: ${msg.data}`);
          }
          break;
        }
        case "status": {
          // Cut queued playback as soon as the server reports an interruption.
          if (msg.data === "interrupted" && this.audioManager) {
            this.audioManager.stopPlayback();
          }
          if (this.onStatus) this.onStatus(msg.data);
          const icons = {
            "interrupted": "\u26A1",
            "thinking": "\u{1F9E0}",
            "speaking": "\u{1F50A}",
            "listening": "\u{1F442}"
          };
          console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
          break;
        }
        case "visemes": {
          if (Array.isArray(msg.data) && msg.data.length > 0) {
            this.emit("visemes", msg.data);
          }
          break;
        }
        case "error": {
          if (this.onError) this.onError(msg.data);
          console.error(`\u274C Server error: ${msg.data}`);
          break;
        }
      }
    } catch (err) {
      console.error("Error while handling server message:", err);
    }
  }

  /**
   * Deliver an "audio" or "visemes" payload to the config callback first,
   * then to every registered listener. Other event names are ignored.
   */
  emit(event, data) {
    if (event === "audio") {
      if (this.onAudioCallback) this.onAudioCallback(data);
      this.audioListeners.forEach((listener) => listener(data));
    } else if (event === "visemes") {
      if (this.onVisemesCallback) this.onVisemesCallback(data);
      this.visemeListeners.forEach((listener) => listener(data));
    }
  }

  /** Register an additional audio listener. */
  onAudio(callback) {
    this.audioListeners.push(callback);
  }

  /** Register an additional viseme listener. */
  onVisemes(callback) {
    this.visemeListeners.push(callback);
  }

  /**
   * Disconnect from the server and stop any pending automatic reconnect.
   */
  disconnect() {
    this.isUserDisconnect = true;
    // Cancel a scheduled reconnect so it cannot reopen the connection later.
    if (this.reconnectTimer !== null) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = null;
    }
    this.reconnecting = false;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    if (this.audioManager) {
      this.audioManager.cleanup();
    }
    this.isConnected = false;
  }

  /**
   * Toggles the microphone mute state (if managed by client).
   * @returns {boolean} The new mute state; always `false` without an audio manager.
   */
  toggleMute() {
    if (!this.audioManager) return false;
    const nextMuted = !this.audioManager.isMicMuted();
    this.audioManager.setMuted(nextMuted);
    return nextMuted;
  }

  /**
   * Gets the amplitude (0-1) reported by the audio manager, or 0 when audio
   * is not managed by the client.
   */
  getAmplitude() {
    return this.audioManager ? this.audioManager.getAmplitude() : 0;
  }

  /**
   * Update the system prompt mid-conversation. While disconnected the prompt
   * is only stored and goes out with the next sendConfig().
   */
  updatePrompt(newPrompt) {
    this.prompt = newPrompt;
    if (this.ws && this.isConnected) {
      try {
        this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
        console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
      } catch (error) {
        console.error("Error updating prompt:", error);
      }
    } else {
      console.warn("Not connected - prompt will be updated on next connection");
    }
  }

  /**
   * Get full conversation transcript.
   * @returns {Array<{role: string, text: string, timestamp: number}>} Shallow copy of the message log.
   */
  getTranscript() {
    return [...this.messages];
  }

  /**
   * Get conversation as formatted text, one "You: ..." / "Agent: ..." line per message.
   */
  getTranscriptText() {
    return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
  }
};
|
|
730
|
+
/**
 * One-shot text-to-speech client: each synthesize() call opens its own
 * WebSocket to the TTS endpoint.
 */
var TTSClient = class {
  apiKey;

  /**
   * @param {object} config
   * @param {string} [config.apiKey] Key appended to the connection URL as `api_key`.
   */
  constructor(config) {
    this.apiKey = config.apiKey;
  }

  /**
   * Synthesize text to speech.
   *
   * This opens a temporary WebSocket connection, sends the request,
   * and streams back the audio.
   *
   * @param {object} options
   * @param {string} options.text Text to speak.
   * @param {string} [options.voice] Voice id; falls back to "F1".
   * @param {string} [options.language] Language code; falls back to "en".
   * @param {number} [options.speed] Falls back to 1.05.
   * @param {number} [options.steps] Falls back to 24.
   * @param {boolean} [options.visemes] Whether to request visemes.
   * @param {Function} [options.onAudio] Receives each binary frame as a Uint8Array.
   * @param {Function} [options.onVisemes] Receives each JSON-array text frame.
   * @param {Function} [options.onError] Receives socket/setup errors.
   * @returns {Promise<void>} Resolves when the socket closes; rejects on a
   *   socket error or when the socket cannot be created.
   */
  synthesize(options) {
    return new Promise((resolve, reject) => {
      try {
        let url = DEFAULT_URLS.TTS;
        if (this.apiKey) {
          const separator = url.includes("?") ? "&" : "?";
          // Encode the key so reserved characters cannot corrupt the query string.
          url += `${separator}api_key=${encodeURIComponent(this.apiKey)}`;
        }
        const ws = new WebSocket(url);
        ws.binaryType = "arraybuffer";
        ws.onopen = () => {
          const req = {
            text: options.text,
            voice: options.voice || "F1" /* F1 */,
            lang: options.language || "en" /* ENGLISH */,
            speed: options.speed || 1.05,
            steps: options.steps || 24,
            visemes: options.visemes || false
          };
          ws.send(JSON.stringify(req));
        };
        ws.onmessage = (event) => {
          if (event.data instanceof ArrayBuffer) {
            if (options.onAudio) options.onAudio(new Uint8Array(event.data));
            return;
          }
          let msg;
          try {
            msg = JSON.parse(event.data.toString());
          } catch (err) {
            console.warn("Ignoring malformed TTS message:", err);
            return;
          }
          // Only array payloads are forwarded, as visemes.
          if (Array.isArray(msg) && options.onVisemes) {
            options.onVisemes(msg);
          }
        };
        ws.onerror = (err) => {
          if (options.onError) options.onError(err);
          reject(err);
        };
        // The promise settles once: a close after an error is a no-op here.
        ws.onclose = () => {
          resolve();
        };
      } catch (err) {
        if (options.onError) options.onError(err);
        reject(err);
      }
    });
  }
};
|
|
789
|
+
/**
 * Create a VoiceAgentClient from `config` and connect it.
 *
 * @param {object} config Passed unchanged to the VoiceAgentClient constructor.
 * @returns {Promise<object>} The connected client.
 * @throws Re-throws the connection error after disconnecting the client.
 */
async function simpleConversation(config) {
  const client = new VoiceAgentClient(config);
  try {
    await client.connect();
  } catch (err) {
    // The caller never receives the client on failure, so shut it down here;
    // otherwise it would keep auto-reconnecting with nobody able to stop it.
    client.disconnect();
    throw err;
  }
  return client;
}
|
|
794
|
+
/**
 * One-call text-to-speech: build a TTSClient from `options.apiKey` and run
 * synthesize() with the same options object.
 *
 * @param {object} options Synthesis options, including `apiKey`.
 * @returns {Promise} The promise returned by TTSClient#synthesize.
 */
async function simpleTTS(options) {
  const { apiKey } = options;
  return new TTSClient({ apiKey }).synthesize(options);
}
|
|
727
798
|
export {
|
|
728
799
|
AUDIO_CONFIG,
|
|
729
800
|
BrowserAudioManager,
|