@lokutor/sdk 1.1.10 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +17 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +455 -384
- package/dist/index.mjs +455 -384
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -78,287 +78,6 @@ var DEFAULT_URLS = {
|
|
|
78
78
|
TTS: "wss://api.lokutor.com/ws/tts"
|
|
79
79
|
};
|
|
80
80
|
|
|
81
|
-
// src/client.ts
|
|
82
|
-
function base64ToUint8Array(base64) {
|
|
83
|
-
const binaryString = atob(base64);
|
|
84
|
-
const bytes = new Uint8Array(binaryString.length);
|
|
85
|
-
for (let i = 0; i < binaryString.length; i++) {
|
|
86
|
-
bytes[i] = binaryString.charCodeAt(i);
|
|
87
|
-
}
|
|
88
|
-
return bytes;
|
|
89
|
-
}
|
|
90
|
-
var VoiceAgentClient = class {
|
|
91
|
-
ws = null;
|
|
92
|
-
apiKey;
|
|
93
|
-
prompt;
|
|
94
|
-
voice;
|
|
95
|
-
language;
|
|
96
|
-
// Callbacks
|
|
97
|
-
onTranscription;
|
|
98
|
-
onResponse;
|
|
99
|
-
onAudioCallback;
|
|
100
|
-
onVisemesCallback;
|
|
101
|
-
onStatus;
|
|
102
|
-
onError;
|
|
103
|
-
isConnected = false;
|
|
104
|
-
messages = [];
|
|
105
|
-
visemeListeners = [];
|
|
106
|
-
wantVisemes = false;
|
|
107
|
-
constructor(config) {
|
|
108
|
-
this.apiKey = config.apiKey;
|
|
109
|
-
this.prompt = config.prompt;
|
|
110
|
-
this.voice = config.voice || "F1" /* F1 */;
|
|
111
|
-
this.language = config.language || "en" /* ENGLISH */;
|
|
112
|
-
this.onTranscription = config.onTranscription;
|
|
113
|
-
this.onResponse = config.onResponse;
|
|
114
|
-
this.onAudioCallback = config.onAudio;
|
|
115
|
-
this.onVisemesCallback = config.onVisemes;
|
|
116
|
-
this.onStatus = config.onStatus;
|
|
117
|
-
this.onError = config.onError;
|
|
118
|
-
this.wantVisemes = config.visemes || false;
|
|
119
|
-
}
|
|
120
|
-
/**
|
|
121
|
-
* Connect to the Lokutor Voice Agent server
|
|
122
|
-
*/
|
|
123
|
-
async connect() {
|
|
124
|
-
return new Promise((resolve, reject) => {
|
|
125
|
-
try {
|
|
126
|
-
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
127
|
-
if (this.apiKey) {
|
|
128
|
-
const separator = url.includes("?") ? "&" : "?";
|
|
129
|
-
url += `${separator}api_key=${this.apiKey}`;
|
|
130
|
-
}
|
|
131
|
-
console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
|
|
132
|
-
this.ws = new WebSocket(url);
|
|
133
|
-
this.ws.binaryType = "arraybuffer";
|
|
134
|
-
this.ws.onopen = () => {
|
|
135
|
-
this.isConnected = true;
|
|
136
|
-
console.log("\u2705 Connected to voice agent!");
|
|
137
|
-
this.sendConfig();
|
|
138
|
-
resolve(true);
|
|
139
|
-
};
|
|
140
|
-
this.ws.onmessage = async (event) => {
|
|
141
|
-
if (event.data instanceof ArrayBuffer) {
|
|
142
|
-
this.handleBinaryMessage(new Uint8Array(event.data));
|
|
143
|
-
} else {
|
|
144
|
-
this.handleTextMessage(event.data.toString());
|
|
145
|
-
}
|
|
146
|
-
};
|
|
147
|
-
this.ws.onerror = (err) => {
|
|
148
|
-
console.error("\u274C WebSocket error:", err);
|
|
149
|
-
if (this.onError) this.onError(err);
|
|
150
|
-
if (!this.isConnected) reject(err);
|
|
151
|
-
};
|
|
152
|
-
this.ws.onclose = () => {
|
|
153
|
-
this.isConnected = false;
|
|
154
|
-
console.log("Disconnected");
|
|
155
|
-
};
|
|
156
|
-
} catch (err) {
|
|
157
|
-
if (this.onError) this.onError(err);
|
|
158
|
-
reject(err);
|
|
159
|
-
}
|
|
160
|
-
});
|
|
161
|
-
}
|
|
162
|
-
/**
|
|
163
|
-
* Send initial configuration to the server
|
|
164
|
-
*/
|
|
165
|
-
sendConfig() {
|
|
166
|
-
if (!this.ws || !this.isConnected) return;
|
|
167
|
-
this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
|
|
168
|
-
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
169
|
-
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
170
|
-
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
171
|
-
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
|
|
172
|
-
}
|
|
173
|
-
/**
|
|
174
|
-
* Send raw PCM audio data to the server
|
|
175
|
-
* @param audioData Int16 PCM audio buffer
|
|
176
|
-
*/
|
|
177
|
-
sendAudio(audioData) {
|
|
178
|
-
if (this.ws && this.isConnected) {
|
|
179
|
-
this.ws.send(audioData);
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
/**
|
|
183
|
-
* Handle incoming binary data (audio response)
|
|
184
|
-
*/
|
|
185
|
-
handleBinaryMessage(data) {
|
|
186
|
-
this.emit("audio", data);
|
|
187
|
-
}
|
|
188
|
-
/**
|
|
189
|
-
* Handle incoming text messages (metadata/transcriptions)
|
|
190
|
-
*/
|
|
191
|
-
handleTextMessage(text) {
|
|
192
|
-
try {
|
|
193
|
-
const msg = JSON.parse(text);
|
|
194
|
-
switch (msg.type) {
|
|
195
|
-
case "audio":
|
|
196
|
-
if (msg.data) {
|
|
197
|
-
const buffer = base64ToUint8Array(msg.data);
|
|
198
|
-
this.handleBinaryMessage(buffer);
|
|
199
|
-
}
|
|
200
|
-
break;
|
|
201
|
-
case "transcript":
|
|
202
|
-
const role = msg.role === "user" ? "user" : "agent";
|
|
203
|
-
this.messages.push({
|
|
204
|
-
role,
|
|
205
|
-
text: msg.data,
|
|
206
|
-
timestamp: Date.now()
|
|
207
|
-
});
|
|
208
|
-
if (msg.role === "user") {
|
|
209
|
-
if (this.onTranscription) this.onTranscription(msg.data);
|
|
210
|
-
console.log(`\u{1F4AC} You: ${msg.data}`);
|
|
211
|
-
} else {
|
|
212
|
-
if (this.onResponse) this.onResponse(msg.data);
|
|
213
|
-
console.log(`\u{1F916} Agent: ${msg.data}`);
|
|
214
|
-
}
|
|
215
|
-
break;
|
|
216
|
-
case "status":
|
|
217
|
-
if (this.onStatus) this.onStatus(msg.data);
|
|
218
|
-
const icons = {
|
|
219
|
-
"interrupted": "\u26A1",
|
|
220
|
-
"thinking": "\u{1F9E0}",
|
|
221
|
-
"speaking": "\u{1F50A}",
|
|
222
|
-
"listening": "\u{1F442}"
|
|
223
|
-
};
|
|
224
|
-
console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
|
|
225
|
-
break;
|
|
226
|
-
case "visemes":
|
|
227
|
-
if (Array.isArray(msg.data) && msg.data.length > 0) {
|
|
228
|
-
this.emit("visemes", msg.data);
|
|
229
|
-
}
|
|
230
|
-
break;
|
|
231
|
-
case "error":
|
|
232
|
-
if (this.onError) this.onError(msg.data);
|
|
233
|
-
console.error(`\u274C Server error: ${msg.data}`);
|
|
234
|
-
break;
|
|
235
|
-
}
|
|
236
|
-
} catch (e) {
|
|
237
|
-
}
|
|
238
|
-
}
|
|
239
|
-
audioListeners = [];
|
|
240
|
-
emit(event, data) {
|
|
241
|
-
if (event === "audio") {
|
|
242
|
-
if (this.onAudioCallback) this.onAudioCallback(data);
|
|
243
|
-
this.audioListeners.forEach((l) => l(data));
|
|
244
|
-
} else if (event === "visemes") {
|
|
245
|
-
if (this.onVisemesCallback) this.onVisemesCallback(data);
|
|
246
|
-
this.visemeListeners.forEach((l) => l(data));
|
|
247
|
-
}
|
|
248
|
-
}
|
|
249
|
-
onAudio(callback) {
|
|
250
|
-
this.audioListeners.push(callback);
|
|
251
|
-
}
|
|
252
|
-
onVisemes(callback) {
|
|
253
|
-
this.visemeListeners.push(callback);
|
|
254
|
-
}
|
|
255
|
-
/**
|
|
256
|
-
* Disconnect from the server
|
|
257
|
-
*/
|
|
258
|
-
disconnect() {
|
|
259
|
-
if (this.ws) {
|
|
260
|
-
this.ws.close();
|
|
261
|
-
this.ws = null;
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
/**
|
|
265
|
-
* Update the system prompt mid-conversation
|
|
266
|
-
*/
|
|
267
|
-
updatePrompt(newPrompt) {
|
|
268
|
-
this.prompt = newPrompt;
|
|
269
|
-
if (this.ws && this.isConnected) {
|
|
270
|
-
try {
|
|
271
|
-
this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
|
|
272
|
-
console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
|
|
273
|
-
} catch (error) {
|
|
274
|
-
console.error("Error updating prompt:", error);
|
|
275
|
-
}
|
|
276
|
-
} else {
|
|
277
|
-
console.warn("Not connected - prompt will be updated on next connection");
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
/**
|
|
281
|
-
* Get full conversation transcript
|
|
282
|
-
*/
|
|
283
|
-
getTranscript() {
|
|
284
|
-
return this.messages.slice();
|
|
285
|
-
}
|
|
286
|
-
/**
|
|
287
|
-
* Get conversation as formatted text
|
|
288
|
-
*/
|
|
289
|
-
getTranscriptText() {
|
|
290
|
-
return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
|
|
291
|
-
}
|
|
292
|
-
};
|
|
293
|
-
var TTSClient = class {
|
|
294
|
-
apiKey;
|
|
295
|
-
constructor(config) {
|
|
296
|
-
this.apiKey = config.apiKey;
|
|
297
|
-
}
|
|
298
|
-
/**
|
|
299
|
-
* Synthesize text to speech
|
|
300
|
-
*
|
|
301
|
-
* This opens a temporary WebSocket connection, sends the request,
|
|
302
|
-
* and streams back the audio.
|
|
303
|
-
*/
|
|
304
|
-
synthesize(options) {
|
|
305
|
-
return new Promise((resolve, reject) => {
|
|
306
|
-
try {
|
|
307
|
-
let url = DEFAULT_URLS.TTS;
|
|
308
|
-
if (this.apiKey) {
|
|
309
|
-
const separator = url.includes("?") ? "&" : "?";
|
|
310
|
-
url += `${separator}api_key=${this.apiKey}`;
|
|
311
|
-
}
|
|
312
|
-
const ws = new WebSocket(url);
|
|
313
|
-
ws.binaryType = "arraybuffer";
|
|
314
|
-
ws.onopen = () => {
|
|
315
|
-
const req = {
|
|
316
|
-
text: options.text,
|
|
317
|
-
voice: options.voice || "F1" /* F1 */,
|
|
318
|
-
lang: options.language || "en" /* ENGLISH */,
|
|
319
|
-
speed: options.speed || 1.05,
|
|
320
|
-
steps: options.steps || 24,
|
|
321
|
-
visemes: options.visemes || false
|
|
322
|
-
};
|
|
323
|
-
ws.send(JSON.stringify(req));
|
|
324
|
-
};
|
|
325
|
-
ws.onmessage = async (event) => {
|
|
326
|
-
if (event.data instanceof ArrayBuffer) {
|
|
327
|
-
if (options.onAudio) options.onAudio(new Uint8Array(event.data));
|
|
328
|
-
} else {
|
|
329
|
-
try {
|
|
330
|
-
const msg = JSON.parse(event.data.toString());
|
|
331
|
-
if (Array.isArray(msg) && options.onVisemes) {
|
|
332
|
-
options.onVisemes(msg);
|
|
333
|
-
}
|
|
334
|
-
} catch (e) {
|
|
335
|
-
}
|
|
336
|
-
}
|
|
337
|
-
};
|
|
338
|
-
ws.onerror = (err) => {
|
|
339
|
-
if (options.onError) options.onError(err);
|
|
340
|
-
reject(err);
|
|
341
|
-
};
|
|
342
|
-
ws.onclose = () => {
|
|
343
|
-
resolve();
|
|
344
|
-
};
|
|
345
|
-
} catch (err) {
|
|
346
|
-
if (options.onError) options.onError(err);
|
|
347
|
-
reject(err);
|
|
348
|
-
}
|
|
349
|
-
});
|
|
350
|
-
}
|
|
351
|
-
};
|
|
352
|
-
async function simpleConversation(config) {
|
|
353
|
-
const client = new VoiceAgentClient(config);
|
|
354
|
-
await client.connect();
|
|
355
|
-
return client;
|
|
356
|
-
}
|
|
357
|
-
async function simpleTTS(options) {
|
|
358
|
-
const client = new TTSClient({ apiKey: options.apiKey });
|
|
359
|
-
return client.synthesize(options);
|
|
360
|
-
}
|
|
361
|
-
|
|
362
81
|
// src/audio-utils.ts
|
|
363
82
|
function pcm16ToFloat32(int16Data) {
|
|
364
83
|
const float32 = new Float32Array(int16Data.length);
|
|
@@ -498,6 +217,7 @@ var BrowserAudioManager = class {
|
|
|
498
217
|
scriptProcessor = null;
|
|
499
218
|
analyserNode = null;
|
|
500
219
|
mediaStream = null;
|
|
220
|
+
resampler = null;
|
|
501
221
|
// Playback scheduling
|
|
502
222
|
nextPlaybackTime = 0;
|
|
503
223
|
activeSources = [];
|
|
@@ -575,6 +295,12 @@ var BrowserAudioManager = class {
|
|
|
575
295
|
if (this.analyserNode) {
|
|
576
296
|
this.mediaStreamAudioSourceNode.connect(this.analyserNode);
|
|
577
297
|
}
|
|
298
|
+
const hardwareRate = this.audioContext.sampleRate;
|
|
299
|
+
if (hardwareRate !== this.inputSampleRate) {
|
|
300
|
+
this.resampler = new StreamResampler(hardwareRate, this.inputSampleRate);
|
|
301
|
+
} else {
|
|
302
|
+
this.resampler = null;
|
|
303
|
+
}
|
|
578
304
|
this.scriptProcessor.onaudioprocess = (event) => {
|
|
579
305
|
this._processAudioInput(event);
|
|
580
306
|
};
|
|
@@ -597,15 +323,11 @@ var BrowserAudioManager = class {
|
|
|
597
323
|
for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
|
|
598
324
|
outputBuffer.getChannelData(0)[i] = 0;
|
|
599
325
|
}
|
|
600
|
-
const hardwareRate = this.audioContext.sampleRate;
|
|
601
326
|
let processedData = new Float32Array(inputData);
|
|
602
|
-
if (
|
|
603
|
-
processedData =
|
|
604
|
-
processedData,
|
|
605
|
-
hardwareRate,
|
|
606
|
-
this.inputSampleRate
|
|
607
|
-
);
|
|
327
|
+
if (this.resampler) {
|
|
328
|
+
processedData = this.resampler.process(processedData);
|
|
608
329
|
}
|
|
330
|
+
if (processedData.length === 0) return;
|
|
609
331
|
const int16Data = float32ToPcm16(processedData);
|
|
610
332
|
const uint8Data = new Uint8Array(
|
|
611
333
|
int16Data.buffer,
|
|
@@ -637,137 +359,486 @@ var BrowserAudioManager = class {
|
|
|
637
359
|
* Play back audio received from the server
|
|
638
360
|
* @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
|
|
639
361
|
*/
|
|
640
|
-
playAudio(pcm16Data) {
|
|
641
|
-
if (!this.audioContext) {
|
|
642
|
-
console.warn("AudioContext not initialized");
|
|
643
|
-
return;
|
|
362
|
+
playAudio(pcm16Data) {
|
|
363
|
+
if (!this.audioContext) {
|
|
364
|
+
console.warn("AudioContext not initialized");
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
const int16Array = new Int16Array(
|
|
368
|
+
pcm16Data.buffer,
|
|
369
|
+
pcm16Data.byteOffset,
|
|
370
|
+
pcm16Data.length / 2
|
|
371
|
+
);
|
|
372
|
+
const float32Data = pcm16ToFloat32(int16Array);
|
|
373
|
+
const audioBuffer = this.audioContext.createBuffer(
|
|
374
|
+
1,
|
|
375
|
+
float32Data.length,
|
|
376
|
+
this.outputSampleRate
|
|
377
|
+
);
|
|
378
|
+
audioBuffer.getChannelData(0).set(float32Data);
|
|
379
|
+
this._schedulePlayback(audioBuffer);
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Internal method to schedule and play audio with sample-accurate timing
|
|
383
|
+
*/
|
|
384
|
+
_schedulePlayback(audioBuffer) {
|
|
385
|
+
if (!this.audioContext) return;
|
|
386
|
+
const currentTime = this.audioContext.currentTime;
|
|
387
|
+
const duration = audioBuffer.length / this.outputSampleRate;
|
|
388
|
+
const startTime = Math.max(
|
|
389
|
+
currentTime + 0.01,
|
|
390
|
+
// Minimum 10ms delay
|
|
391
|
+
this.nextPlaybackTime
|
|
392
|
+
);
|
|
393
|
+
this.nextPlaybackTime = startTime + duration;
|
|
394
|
+
const source = this.audioContext.createBufferSource();
|
|
395
|
+
source.buffer = audioBuffer;
|
|
396
|
+
source.connect(this.audioContext.destination);
|
|
397
|
+
if (this.analyserNode) {
|
|
398
|
+
source.connect(this.analyserNode);
|
|
399
|
+
}
|
|
400
|
+
source.start(startTime);
|
|
401
|
+
this.activeSources.push(source);
|
|
402
|
+
source.onended = () => {
|
|
403
|
+
const index = this.activeSources.indexOf(source);
|
|
404
|
+
if (index > -1) {
|
|
405
|
+
this.activeSources.splice(index, 1);
|
|
406
|
+
}
|
|
407
|
+
};
|
|
408
|
+
}
|
|
409
|
+
/**
|
|
410
|
+
* Stop all currently playing audio and clear the queue
|
|
411
|
+
*/
|
|
412
|
+
stopPlayback() {
|
|
413
|
+
this.activeSources.forEach((source) => {
|
|
414
|
+
try {
|
|
415
|
+
source.stop();
|
|
416
|
+
} catch (e) {
|
|
417
|
+
}
|
|
418
|
+
});
|
|
419
|
+
this.activeSources = [];
|
|
420
|
+
this.playbackQueue = [];
|
|
421
|
+
this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
|
|
422
|
+
console.log("\u{1F507} Playback stopped");
|
|
423
|
+
}
|
|
424
|
+
/**
|
|
425
|
+
* Toggle mute state
|
|
426
|
+
*/
|
|
427
|
+
setMuted(muted) {
|
|
428
|
+
this.isMuted = muted;
|
|
429
|
+
}
|
|
430
|
+
/**
|
|
431
|
+
* Get current mute state
|
|
432
|
+
*/
|
|
433
|
+
isMicMuted() {
|
|
434
|
+
return this.isMuted;
|
|
435
|
+
}
|
|
436
|
+
/**
|
|
437
|
+
* Get current amplitude from analyser (for visualization)
|
|
438
|
+
* Returns value between 0 and 1
|
|
439
|
+
*/
|
|
440
|
+
getAmplitude() {
|
|
441
|
+
if (!this.analyserNode) return 0;
|
|
442
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
443
|
+
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
444
|
+
const rms = calculateRMS(dataArray);
|
|
445
|
+
return Math.min(rms * 10, 1);
|
|
446
|
+
}
|
|
447
|
+
/**
|
|
448
|
+
* Get frequency data from analyser for visualization
|
|
449
|
+
*/
|
|
450
|
+
getFrequencyData() {
|
|
451
|
+
if (!this.analyserNode) {
|
|
452
|
+
return new Uint8Array(0);
|
|
453
|
+
}
|
|
454
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
455
|
+
this.analyserNode.getByteFrequencyData(dataArray);
|
|
456
|
+
return dataArray;
|
|
457
|
+
}
|
|
458
|
+
/**
|
|
459
|
+
* Get time-domain data from analyser for waveform visualization
|
|
460
|
+
*/
|
|
461
|
+
getWaveformData() {
|
|
462
|
+
if (!this.analyserNode) {
|
|
463
|
+
return new Uint8Array(0);
|
|
464
|
+
}
|
|
465
|
+
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
466
|
+
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
467
|
+
return dataArray;
|
|
468
|
+
}
|
|
469
|
+
/**
|
|
470
|
+
* Cleanup and close AudioContext
|
|
471
|
+
*/
|
|
472
|
+
cleanup() {
|
|
473
|
+
this.stopMicrophone();
|
|
474
|
+
this.stopPlayback();
|
|
475
|
+
if (this.analyserNode) {
|
|
476
|
+
this.analyserNode.disconnect();
|
|
477
|
+
this.analyserNode = null;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
/**
|
|
481
|
+
* Get current audio context state
|
|
482
|
+
*/
|
|
483
|
+
getState() {
|
|
484
|
+
return this.audioContext?.state ?? null;
|
|
485
|
+
}
|
|
486
|
+
/**
|
|
487
|
+
* Check if microphone is currently listening
|
|
488
|
+
*/
|
|
489
|
+
isRecording() {
|
|
490
|
+
return this.isListening;
|
|
491
|
+
}
|
|
492
|
+
};
|
|
493
|
+
|
|
494
|
+
// src/client.ts
|
|
495
|
+
function base64ToUint8Array(base64) {
|
|
496
|
+
const binaryString = atob(base64);
|
|
497
|
+
const bytes = new Uint8Array(binaryString.length);
|
|
498
|
+
for (let i = 0; i < binaryString.length; i++) {
|
|
499
|
+
bytes[i] = binaryString.charCodeAt(i);
|
|
500
|
+
}
|
|
501
|
+
return bytes;
|
|
502
|
+
}
|
|
503
|
+
var VoiceAgentClient = class {
|
|
504
|
+
ws = null;
|
|
505
|
+
apiKey;
|
|
506
|
+
prompt;
|
|
507
|
+
voice;
|
|
508
|
+
language;
|
|
509
|
+
// Callbacks
|
|
510
|
+
onTranscription;
|
|
511
|
+
onResponse;
|
|
512
|
+
onAudioCallback;
|
|
513
|
+
onVisemesCallback;
|
|
514
|
+
onStatus;
|
|
515
|
+
onError;
|
|
516
|
+
isConnected = false;
|
|
517
|
+
messages = [];
|
|
518
|
+
visemeListeners = [];
|
|
519
|
+
wantVisemes = false;
|
|
520
|
+
audioManager = null;
|
|
521
|
+
enableAudio = false;
|
|
522
|
+
// Connection resilience
|
|
523
|
+
isUserDisconnect = false;
|
|
524
|
+
reconnecting = false;
|
|
525
|
+
reconnectAttempts = 0;
|
|
526
|
+
maxReconnectAttempts = 5;
|
|
527
|
+
constructor(config) {
|
|
528
|
+
this.apiKey = config.apiKey;
|
|
529
|
+
this.prompt = config.prompt;
|
|
530
|
+
this.voice = config.voice || "F1" /* F1 */;
|
|
531
|
+
this.language = config.language || "en" /* ENGLISH */;
|
|
532
|
+
this.onTranscription = config.onTranscription;
|
|
533
|
+
this.onResponse = config.onResponse;
|
|
534
|
+
this.onAudioCallback = config.onAudio;
|
|
535
|
+
this.onVisemesCallback = config.onVisemes;
|
|
536
|
+
this.onStatus = config.onStatus;
|
|
537
|
+
this.onError = config.onError;
|
|
538
|
+
this.wantVisemes = config.visemes || false;
|
|
539
|
+
this.enableAudio = config.enableAudio ?? false;
|
|
540
|
+
}
|
|
541
|
+
/**
|
|
542
|
+
* Connect to the Lokutor Voice Agent server
|
|
543
|
+
*/
|
|
544
|
+
async connect() {
|
|
545
|
+
this.isUserDisconnect = false;
|
|
546
|
+
if (this.enableAudio) {
|
|
547
|
+
if (!this.audioManager) {
|
|
548
|
+
this.audioManager = new BrowserAudioManager();
|
|
549
|
+
}
|
|
550
|
+
await this.audioManager.init();
|
|
551
|
+
}
|
|
552
|
+
return new Promise((resolve, reject) => {
|
|
553
|
+
try {
|
|
554
|
+
let url = DEFAULT_URLS.VOICE_AGENT;
|
|
555
|
+
if (this.apiKey) {
|
|
556
|
+
const separator = url.includes("?") ? "&" : "?";
|
|
557
|
+
url += `${separator}api_key=${this.apiKey}`;
|
|
558
|
+
}
|
|
559
|
+
console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
|
|
560
|
+
this.ws = new WebSocket(url);
|
|
561
|
+
this.ws.binaryType = "arraybuffer";
|
|
562
|
+
this.ws.onopen = async () => {
|
|
563
|
+
this.isConnected = true;
|
|
564
|
+
this.reconnectAttempts = 0;
|
|
565
|
+
this.reconnecting = false;
|
|
566
|
+
console.log("\u2705 Connected to voice agent!");
|
|
567
|
+
this.sendConfig();
|
|
568
|
+
if (this.audioManager) {
|
|
569
|
+
await this.audioManager.startMicrophone((data) => {
|
|
570
|
+
if (this.isConnected) {
|
|
571
|
+
this.sendAudio(data);
|
|
572
|
+
}
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
resolve(true);
|
|
576
|
+
};
|
|
577
|
+
this.ws.onmessage = async (event) => {
|
|
578
|
+
if (event.data instanceof ArrayBuffer) {
|
|
579
|
+
this.handleBinaryMessage(new Uint8Array(event.data));
|
|
580
|
+
} else {
|
|
581
|
+
this.handleTextMessage(event.data.toString());
|
|
582
|
+
}
|
|
583
|
+
};
|
|
584
|
+
this.ws.onerror = (err) => {
|
|
585
|
+
console.error("\u274C WebSocket error:", err);
|
|
586
|
+
if (this.onError) this.onError(err);
|
|
587
|
+
if (!this.isConnected) reject(err);
|
|
588
|
+
};
|
|
589
|
+
this.ws.onclose = () => {
|
|
590
|
+
this.isConnected = false;
|
|
591
|
+
if (!this.isUserDisconnect && this.reconnectAttempts < this.maxReconnectAttempts) {
|
|
592
|
+
this.reconnecting = true;
|
|
593
|
+
this.reconnectAttempts++;
|
|
594
|
+
const backoffDelay = Math.min(1e3 * Math.pow(2, this.reconnectAttempts), 1e4);
|
|
595
|
+
console.warn(`Connection lost. Reconnecting in ${backoffDelay}ms (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})`);
|
|
596
|
+
if (this.onStatus) this.onStatus("reconnecting");
|
|
597
|
+
setTimeout(() => {
|
|
598
|
+
this.connect().catch((e) => console.error("Reconnect failed", e));
|
|
599
|
+
}, backoffDelay);
|
|
600
|
+
} else {
|
|
601
|
+
console.log("Disconnected");
|
|
602
|
+
if (this.onStatus) this.onStatus("disconnected");
|
|
603
|
+
}
|
|
604
|
+
};
|
|
605
|
+
} catch (err) {
|
|
606
|
+
if (this.onError) this.onError(err);
|
|
607
|
+
reject(err);
|
|
608
|
+
}
|
|
609
|
+
});
|
|
610
|
+
}
|
|
611
|
+
/**
|
|
612
|
+
* Send initial configuration to the server
|
|
613
|
+
*/
|
|
614
|
+
sendConfig() {
|
|
615
|
+
if (!this.ws || !this.isConnected) return;
|
|
616
|
+
this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
|
|
617
|
+
this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
|
|
618
|
+
this.ws.send(JSON.stringify({ type: "language", data: this.language }));
|
|
619
|
+
this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
|
|
620
|
+
console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
|
|
621
|
+
}
|
|
622
|
+
/**
|
|
623
|
+
* Send raw PCM audio data to the server
|
|
624
|
+
* @param audioData Int16 PCM audio buffer
|
|
625
|
+
*/
|
|
626
|
+
sendAudio(audioData) {
|
|
627
|
+
if (this.ws && this.ws.readyState === WebSocket.OPEN && this.isConnected) {
|
|
628
|
+
this.ws.send(audioData);
|
|
644
629
|
}
|
|
645
|
-
const int16Array = new Int16Array(
|
|
646
|
-
pcm16Data.buffer,
|
|
647
|
-
pcm16Data.byteOffset,
|
|
648
|
-
pcm16Data.length / 2
|
|
649
|
-
);
|
|
650
|
-
const float32Data = pcm16ToFloat32(int16Array);
|
|
651
|
-
const audioBuffer = this.audioContext.createBuffer(
|
|
652
|
-
1,
|
|
653
|
-
float32Data.length,
|
|
654
|
-
this.outputSampleRate
|
|
655
|
-
);
|
|
656
|
-
audioBuffer.getChannelData(0).set(float32Data);
|
|
657
|
-
this._schedulePlayback(audioBuffer);
|
|
658
630
|
}
|
|
659
631
|
/**
|
|
660
|
-
*
|
|
632
|
+
* Handle incoming binary data (audio response)
|
|
661
633
|
*/
|
|
662
|
-
|
|
663
|
-
if (
|
|
664
|
-
|
|
665
|
-
const duration = audioBuffer.length / this.outputSampleRate;
|
|
666
|
-
const startTime = Math.max(
|
|
667
|
-
currentTime + 0.01,
|
|
668
|
-
// Minimum 10ms delay
|
|
669
|
-
this.nextPlaybackTime
|
|
670
|
-
);
|
|
671
|
-
this.nextPlaybackTime = startTime + duration;
|
|
672
|
-
const source = this.audioContext.createBufferSource();
|
|
673
|
-
source.buffer = audioBuffer;
|
|
674
|
-
source.connect(this.audioContext.destination);
|
|
675
|
-
if (this.analyserNode) {
|
|
676
|
-
source.connect(this.analyserNode);
|
|
634
|
+
handleBinaryMessage(data) {
|
|
635
|
+
if (this.audioManager) {
|
|
636
|
+
this.audioManager.playAudio(data);
|
|
677
637
|
}
|
|
678
|
-
|
|
679
|
-
this.activeSources.push(source);
|
|
680
|
-
source.onended = () => {
|
|
681
|
-
const index = this.activeSources.indexOf(source);
|
|
682
|
-
if (index > -1) {
|
|
683
|
-
this.activeSources.splice(index, 1);
|
|
684
|
-
}
|
|
685
|
-
};
|
|
638
|
+
this.emit("audio", data);
|
|
686
639
|
}
|
|
687
640
|
/**
|
|
688
|
-
*
|
|
641
|
+
* Handle incoming text messages (metadata/transcriptions)
|
|
689
642
|
*/
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
643
|
+
handleTextMessage(text) {
|
|
644
|
+
try {
|
|
645
|
+
const msg = JSON.parse(text);
|
|
646
|
+
switch (msg.type) {
|
|
647
|
+
case "audio":
|
|
648
|
+
if (msg.data) {
|
|
649
|
+
const buffer = base64ToUint8Array(msg.data);
|
|
650
|
+
this.handleBinaryMessage(buffer);
|
|
651
|
+
}
|
|
652
|
+
break;
|
|
653
|
+
case "transcript":
|
|
654
|
+
const role = msg.role === "user" ? "user" : "agent";
|
|
655
|
+
this.messages.push({
|
|
656
|
+
role,
|
|
657
|
+
text: msg.data,
|
|
658
|
+
timestamp: Date.now()
|
|
659
|
+
});
|
|
660
|
+
if (msg.role === "user") {
|
|
661
|
+
if (this.onTranscription) this.onTranscription(msg.data);
|
|
662
|
+
console.log(`\u{1F4AC} You: ${msg.data}`);
|
|
663
|
+
} else {
|
|
664
|
+
if (this.onResponse) this.onResponse(msg.data);
|
|
665
|
+
console.log(`\u{1F916} Agent: ${msg.data}`);
|
|
666
|
+
}
|
|
667
|
+
break;
|
|
668
|
+
case "status":
|
|
669
|
+
if (msg.data === "interrupted" && this.audioManager) {
|
|
670
|
+
this.audioManager.stopPlayback();
|
|
671
|
+
}
|
|
672
|
+
if (this.onStatus) this.onStatus(msg.data);
|
|
673
|
+
const icons = {
|
|
674
|
+
"interrupted": "\u26A1",
|
|
675
|
+
"thinking": "\u{1F9E0}",
|
|
676
|
+
"speaking": "\u{1F50A}",
|
|
677
|
+
"listening": "\u{1F442}"
|
|
678
|
+
};
|
|
679
|
+
console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
|
|
680
|
+
break;
|
|
681
|
+
case "visemes":
|
|
682
|
+
if (Array.isArray(msg.data) && msg.data.length > 0) {
|
|
683
|
+
this.emit("visemes", msg.data);
|
|
684
|
+
}
|
|
685
|
+
break;
|
|
686
|
+
case "error":
|
|
687
|
+
if (this.onError) this.onError(msg.data);
|
|
688
|
+
console.error(`\u274C Server error: ${msg.data}`);
|
|
689
|
+
break;
|
|
695
690
|
}
|
|
696
|
-
})
|
|
697
|
-
|
|
698
|
-
this.playbackQueue = [];
|
|
699
|
-
this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
|
|
700
|
-
console.log("\u{1F507} Playback stopped");
|
|
691
|
+
} catch (e) {
|
|
692
|
+
}
|
|
701
693
|
}
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
694
|
+
audioListeners = [];
|
|
695
|
+
emit(event, data) {
|
|
696
|
+
if (event === "audio") {
|
|
697
|
+
if (this.onAudioCallback) this.onAudioCallback(data);
|
|
698
|
+
this.audioListeners.forEach((l) => l(data));
|
|
699
|
+
} else if (event === "visemes") {
|
|
700
|
+
if (this.onVisemesCallback) this.onVisemesCallback(data);
|
|
701
|
+
this.visemeListeners.forEach((l) => l(data));
|
|
702
|
+
}
|
|
703
|
+
}
|
|
704
|
+
onAudio(callback) {
|
|
705
|
+
this.audioListeners.push(callback);
|
|
706
|
+
}
|
|
707
|
+
onVisemes(callback) {
|
|
708
|
+
this.visemeListeners.push(callback);
|
|
707
709
|
}
|
|
708
710
|
/**
|
|
709
|
-
*
|
|
711
|
+
* Disconnect from the server
|
|
710
712
|
*/
|
|
711
|
-
|
|
712
|
-
|
|
713
|
+
disconnect() {
|
|
714
|
+
this.isUserDisconnect = true;
|
|
715
|
+
if (this.ws) {
|
|
716
|
+
this.ws.close();
|
|
717
|
+
this.ws = null;
|
|
718
|
+
}
|
|
719
|
+
if (this.audioManager) {
|
|
720
|
+
this.audioManager.cleanup();
|
|
721
|
+
}
|
|
722
|
+
this.isConnected = false;
|
|
713
723
|
}
|
|
714
724
|
/**
|
|
715
|
-
*
|
|
716
|
-
*
|
|
725
|
+
* Toggles the microphone mute state (if managed by client)
|
|
726
|
+
* returns the new mute state
|
|
717
727
|
*/
|
|
718
|
-
|
|
719
|
-
if (
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
728
|
+
toggleMute() {
|
|
729
|
+
if (this.audioManager) {
|
|
730
|
+
const isMuted = this.audioManager.isMicMuted();
|
|
731
|
+
this.audioManager.setMuted(!isMuted);
|
|
732
|
+
return !isMuted;
|
|
733
|
+
}
|
|
734
|
+
return false;
|
|
724
735
|
}
|
|
725
736
|
/**
|
|
726
|
-
*
|
|
737
|
+
* Gets the microphone volume amplitude 0-1 (if managed by client)
|
|
727
738
|
*/
|
|
728
|
-
|
|
729
|
-
if (
|
|
730
|
-
return
|
|
739
|
+
getAmplitude() {
|
|
740
|
+
if (this.audioManager) {
|
|
741
|
+
return this.audioManager.getAmplitude();
|
|
731
742
|
}
|
|
732
|
-
|
|
733
|
-
this.analyserNode.getByteFrequencyData(dataArray);
|
|
734
|
-
return dataArray;
|
|
743
|
+
return 0;
|
|
735
744
|
}
|
|
736
745
|
/**
|
|
737
|
-
*
|
|
746
|
+
* Update the system prompt mid-conversation
|
|
738
747
|
*/
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
748
|
+
updatePrompt(newPrompt) {
|
|
749
|
+
this.prompt = newPrompt;
|
|
750
|
+
if (this.ws && this.isConnected) {
|
|
751
|
+
try {
|
|
752
|
+
this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
|
|
753
|
+
console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
|
|
754
|
+
} catch (error) {
|
|
755
|
+
console.error("Error updating prompt:", error);
|
|
756
|
+
}
|
|
757
|
+
} else {
|
|
758
|
+
console.warn("Not connected - prompt will be updated on next connection");
|
|
742
759
|
}
|
|
743
|
-
const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
|
|
744
|
-
this.analyserNode.getByteTimeDomainData(dataArray);
|
|
745
|
-
return dataArray;
|
|
746
760
|
}
|
|
747
761
|
/**
|
|
748
|
-
*
|
|
762
|
+
* Get full conversation transcript
|
|
749
763
|
*/
|
|
750
|
-
|
|
751
|
-
this.
|
|
752
|
-
this.stopPlayback();
|
|
753
|
-
if (this.analyserNode) {
|
|
754
|
-
this.analyserNode.disconnect();
|
|
755
|
-
this.analyserNode = null;
|
|
756
|
-
}
|
|
764
|
+
getTranscript() {
|
|
765
|
+
return this.messages.slice();
|
|
757
766
|
}
|
|
758
767
|
/**
|
|
759
|
-
* Get
|
|
768
|
+
* Get conversation as formatted text
|
|
760
769
|
*/
|
|
761
|
-
|
|
762
|
-
return this.
|
|
770
|
+
getTranscriptText() {
|
|
771
|
+
return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
|
|
772
|
+
}
|
|
773
|
+
};
|
|
774
|
+
var TTSClient = class {
|
|
775
|
+
apiKey;
|
|
776
|
+
constructor(config) {
|
|
777
|
+
this.apiKey = config.apiKey;
|
|
763
778
|
}
|
|
764
779
|
/**
|
|
765
|
-
*
|
|
780
|
+
* Synthesize text to speech
|
|
781
|
+
*
|
|
782
|
+
* This opens a temporary WebSocket connection, sends the request,
|
|
783
|
+
* and streams back the audio.
|
|
766
784
|
*/
|
|
767
|
-
|
|
768
|
-
return
|
|
785
|
+
synthesize(options) {
|
|
786
|
+
return new Promise((resolve, reject) => {
|
|
787
|
+
try {
|
|
788
|
+
let url = DEFAULT_URLS.TTS;
|
|
789
|
+
if (this.apiKey) {
|
|
790
|
+
const separator = url.includes("?") ? "&" : "?";
|
|
791
|
+
url += `${separator}api_key=${this.apiKey}`;
|
|
792
|
+
}
|
|
793
|
+
const ws = new WebSocket(url);
|
|
794
|
+
ws.binaryType = "arraybuffer";
|
|
795
|
+
ws.onopen = () => {
|
|
796
|
+
const req = {
|
|
797
|
+
text: options.text,
|
|
798
|
+
voice: options.voice || "F1" /* F1 */,
|
|
799
|
+
lang: options.language || "en" /* ENGLISH */,
|
|
800
|
+
speed: options.speed || 1.05,
|
|
801
|
+
steps: options.steps || 24,
|
|
802
|
+
visemes: options.visemes || false
|
|
803
|
+
};
|
|
804
|
+
ws.send(JSON.stringify(req));
|
|
805
|
+
};
|
|
806
|
+
ws.onmessage = async (event) => {
|
|
807
|
+
if (event.data instanceof ArrayBuffer) {
|
|
808
|
+
if (options.onAudio) options.onAudio(new Uint8Array(event.data));
|
|
809
|
+
} else {
|
|
810
|
+
try {
|
|
811
|
+
const msg = JSON.parse(event.data.toString());
|
|
812
|
+
if (Array.isArray(msg) && options.onVisemes) {
|
|
813
|
+
options.onVisemes(msg);
|
|
814
|
+
}
|
|
815
|
+
} catch (e) {
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
};
|
|
819
|
+
ws.onerror = (err) => {
|
|
820
|
+
if (options.onError) options.onError(err);
|
|
821
|
+
reject(err);
|
|
822
|
+
};
|
|
823
|
+
ws.onclose = () => {
|
|
824
|
+
resolve();
|
|
825
|
+
};
|
|
826
|
+
} catch (err) {
|
|
827
|
+
if (options.onError) options.onError(err);
|
|
828
|
+
reject(err);
|
|
829
|
+
}
|
|
830
|
+
});
|
|
769
831
|
}
|
|
770
832
|
};
|
|
833
|
+
async function simpleConversation(config) {
|
|
834
|
+
const client = new VoiceAgentClient(config);
|
|
835
|
+
await client.connect();
|
|
836
|
+
return client;
|
|
837
|
+
}
|
|
838
|
+
async function simpleTTS(options) {
|
|
839
|
+
const client = new TTSClient({ apiKey: options.apiKey });
|
|
840
|
+
return client.synthesize(options);
|
|
841
|
+
}
|
|
771
842
|
// Annotate the CommonJS export names for ESM import in node:
|
|
772
843
|
0 && (module.exports = {
|
|
773
844
|
AUDIO_CONFIG,
|