@lokutor/sdk 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -26,7 +26,6 @@ __export(index_exports, {
26
26
  Language: () => Language,
27
27
  StreamResampler: () => StreamResampler,
28
28
  TTSClient: () => TTSClient,
29
- VoiceAgent: () => VoiceAgent,
30
29
  VoiceAgentClient: () => VoiceAgentClient,
31
30
  VoiceStyle: () => VoiceStyle,
32
31
  applyLowPassFilter: () => applyLowPassFilter,
@@ -79,289 +78,6 @@ var DEFAULT_URLS = {
79
78
  TTS: "wss://api.lokutor.com/ws/tts"
80
79
  };
81
80
 
82
- // src/client.ts
83
- function base64ToUint8Array(base64) {
84
- const binaryString = atob(base64);
85
- const bytes = new Uint8Array(binaryString.length);
86
- for (let i = 0; i < binaryString.length; i++) {
87
- bytes[i] = binaryString.charCodeAt(i);
88
- }
89
- return bytes;
90
- }
91
- var VoiceAgentClient = class {
92
- ws = null;
93
- apiKey;
94
- prompt;
95
- voice;
96
- language;
97
- // Callbacks
98
- onTranscription;
99
- onResponse;
100
- onAudioCallback;
101
- onVisemesCallback;
102
- onStatus;
103
- onError;
104
- isConnected = false;
105
- messages = [];
106
- visemeListeners = [];
107
- wantVisemes = false;
108
- serverUrl = null;
109
- constructor(config) {
110
- this.apiKey = config.apiKey;
111
- this.prompt = config.prompt;
112
- this.voice = config.voice || "F1" /* F1 */;
113
- this.language = config.language || "en" /* ENGLISH */;
114
- this.serverUrl = config.serverUrl || null;
115
- this.onTranscription = config.onTranscription;
116
- this.onResponse = config.onResponse;
117
- this.onAudioCallback = config.onAudio;
118
- this.onVisemesCallback = config.onVisemes;
119
- this.onStatus = config.onStatus;
120
- this.onError = config.onError;
121
- this.wantVisemes = config.visemes || false;
122
- }
123
- /**
124
- * Connect to the Lokutor Voice Agent server
125
- */
126
- async connect() {
127
- return new Promise((resolve, reject) => {
128
- try {
129
- let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
130
- if (this.apiKey) {
131
- const separator = url.includes("?") ? "&" : "?";
132
- url += `${separator}api_key=${this.apiKey}`;
133
- }
134
- console.log(`\u{1F517} Connecting to ${url}...`);
135
- this.ws = new WebSocket(url);
136
- this.ws.binaryType = "arraybuffer";
137
- this.ws.onopen = () => {
138
- this.isConnected = true;
139
- console.log("\u2705 Connected to voice agent!");
140
- this.sendConfig();
141
- resolve(true);
142
- };
143
- this.ws.onmessage = async (event) => {
144
- if (event.data instanceof ArrayBuffer) {
145
- this.handleBinaryMessage(new Uint8Array(event.data));
146
- } else {
147
- this.handleTextMessage(event.data.toString());
148
- }
149
- };
150
- this.ws.onerror = (err) => {
151
- console.error("\u274C WebSocket error:", err);
152
- if (this.onError) this.onError(err);
153
- if (!this.isConnected) reject(err);
154
- };
155
- this.ws.onclose = () => {
156
- this.isConnected = false;
157
- console.log("Disconnected");
158
- };
159
- } catch (err) {
160
- if (this.onError) this.onError(err);
161
- reject(err);
162
- }
163
- });
164
- }
165
- /**
166
- * Send initial configuration to the server
167
- */
168
- sendConfig() {
169
- if (!this.ws || !this.isConnected) return;
170
- this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
171
- this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
172
- this.ws.send(JSON.stringify({ type: "language", data: this.language }));
173
- this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
174
- console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
175
- }
176
- /**
177
- * Send raw PCM audio data to the server
178
- * @param audioData Int16 PCM audio buffer
179
- */
180
- sendAudio(audioData) {
181
- if (this.ws && this.isConnected) {
182
- this.ws.send(audioData);
183
- }
184
- }
185
- /**
186
- * Handle incoming binary data (audio response)
187
- */
188
- handleBinaryMessage(data) {
189
- this.emit("audio", data);
190
- }
191
- /**
192
- * Handle incoming text messages (metadata/transcriptions)
193
- */
194
- handleTextMessage(text) {
195
- try {
196
- const msg = JSON.parse(text);
197
- switch (msg.type) {
198
- case "audio":
199
- if (msg.data) {
200
- const buffer = base64ToUint8Array(msg.data);
201
- this.handleBinaryMessage(buffer);
202
- }
203
- break;
204
- case "transcript":
205
- const role = msg.role === "user" ? "user" : "agent";
206
- this.messages.push({
207
- role,
208
- text: msg.data,
209
- timestamp: Date.now()
210
- });
211
- if (msg.role === "user") {
212
- if (this.onTranscription) this.onTranscription(msg.data);
213
- console.log(`\u{1F4AC} You: ${msg.data}`);
214
- } else {
215
- if (this.onResponse) this.onResponse(msg.data);
216
- console.log(`\u{1F916} Agent: ${msg.data}`);
217
- }
218
- break;
219
- case "status":
220
- if (this.onStatus) this.onStatus(msg.data);
221
- const icons = {
222
- "interrupted": "\u26A1",
223
- "thinking": "\u{1F9E0}",
224
- "speaking": "\u{1F50A}",
225
- "listening": "\u{1F442}"
226
- };
227
- console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
228
- break;
229
- case "visemes":
230
- if (Array.isArray(msg.data) && msg.data.length > 0) {
231
- this.emit("visemes", msg.data);
232
- }
233
- break;
234
- case "error":
235
- if (this.onError) this.onError(msg.data);
236
- console.error(`\u274C Server error: ${msg.data}`);
237
- break;
238
- }
239
- } catch (e) {
240
- }
241
- }
242
- audioListeners = [];
243
- emit(event, data) {
244
- if (event === "audio") {
245
- if (this.onAudioCallback) this.onAudioCallback(data);
246
- this.audioListeners.forEach((l) => l(data));
247
- } else if (event === "visemes") {
248
- if (this.onVisemesCallback) this.onVisemesCallback(data);
249
- this.visemeListeners.forEach((l) => l(data));
250
- }
251
- }
252
- onAudio(callback) {
253
- this.audioListeners.push(callback);
254
- }
255
- onVisemes(callback) {
256
- this.visemeListeners.push(callback);
257
- }
258
- /**
259
- * Disconnect from the server
260
- */
261
- disconnect() {
262
- if (this.ws) {
263
- this.ws.close();
264
- this.ws = null;
265
- }
266
- }
267
- /**
268
- * Update the system prompt mid-conversation
269
- */
270
- updatePrompt(newPrompt) {
271
- this.prompt = newPrompt;
272
- if (this.ws && this.isConnected) {
273
- try {
274
- this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
275
- console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
276
- } catch (error) {
277
- console.error("Error updating prompt:", error);
278
- }
279
- } else {
280
- console.warn("Not connected - prompt will be updated on next connection");
281
- }
282
- }
283
- /**
284
- * Get full conversation transcript
285
- */
286
- getTranscript() {
287
- return this.messages.slice();
288
- }
289
- /**
290
- * Get conversation as formatted text
291
- */
292
- getTranscriptText() {
293
- return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
294
- }
295
- };
296
- var TTSClient = class {
297
- apiKey;
298
- constructor(config) {
299
- this.apiKey = config.apiKey;
300
- }
301
- /**
302
- * Synthesize text to speech
303
- *
304
- * This opens a temporary WebSocket connection, sends the request,
305
- * and streams back the audio.
306
- */
307
- synthesize(options) {
308
- return new Promise((resolve, reject) => {
309
- try {
310
- let url = DEFAULT_URLS.TTS;
311
- if (this.apiKey) {
312
- const separator = url.includes("?") ? "&" : "?";
313
- url += `${separator}api_key=${this.apiKey}`;
314
- }
315
- const ws = new WebSocket(url);
316
- ws.binaryType = "arraybuffer";
317
- ws.onopen = () => {
318
- const req = {
319
- text: options.text,
320
- voice: options.voice || "F1" /* F1 */,
321
- lang: options.language || "en" /* ENGLISH */,
322
- speed: options.speed || 1.05,
323
- steps: options.steps || 24,
324
- visemes: options.visemes || false
325
- };
326
- ws.send(JSON.stringify(req));
327
- };
328
- ws.onmessage = async (event) => {
329
- if (event.data instanceof ArrayBuffer) {
330
- if (options.onAudio) options.onAudio(new Uint8Array(event.data));
331
- } else {
332
- try {
333
- const msg = JSON.parse(event.data.toString());
334
- if (Array.isArray(msg) && options.onVisemes) {
335
- options.onVisemes(msg);
336
- }
337
- } catch (e) {
338
- }
339
- }
340
- };
341
- ws.onerror = (err) => {
342
- if (options.onError) options.onError(err);
343
- reject(err);
344
- };
345
- ws.onclose = () => {
346
- resolve();
347
- };
348
- } catch (err) {
349
- if (options.onError) options.onError(err);
350
- reject(err);
351
- }
352
- });
353
- }
354
- };
355
- async function simpleConversation(config) {
356
- const client = new VoiceAgentClient(config);
357
- await client.connect();
358
- return client;
359
- }
360
- async function simpleTTS(options) {
361
- const client = new TTSClient({ apiKey: options.apiKey });
362
- return client.synthesize(options);
363
- }
364
-
365
81
  // src/audio-utils.ts
366
82
  function pcm16ToFloat32(int16Data) {
367
83
  const float32 = new Float32Array(int16Data.length);
@@ -501,11 +217,11 @@ var BrowserAudioManager = class {
501
217
  scriptProcessor = null;
502
218
  analyserNode = null;
503
219
  mediaStream = null;
220
+ resampler = null;
504
221
  // Playback scheduling
505
222
  nextPlaybackTime = 0;
506
223
  activeSources = [];
507
- // High-precision clock anchor for viseme sync
508
- audioClockOffset = null;
224
+ playbackQueue = [];
509
225
  // Configuration
510
226
  inputSampleRate;
511
227
  outputSampleRate;
@@ -518,7 +234,6 @@ var BrowserAudioManager = class {
518
234
  // Audio processing state
519
235
  isMuted = false;
520
236
  isListening = false;
521
- resampler = null;
522
237
  constructor(config = {}) {
523
238
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
524
239
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -542,6 +257,7 @@ var BrowserAudioManager = class {
542
257
  }
543
258
  if (this.audioContext.state === "suspended") {
544
259
  await this.audioContext.resume();
260
+ console.log("\u{1F442} AudioContext resumed");
545
261
  }
546
262
  if (analyserConfig?.enabled !== false) {
547
263
  this.analyserNode = this.audioContext.createAnalyser();
@@ -555,7 +271,6 @@ var BrowserAudioManager = class {
555
271
  if (!this.audioContext) {
556
272
  await this.init();
557
273
  }
558
- this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
559
274
  try {
560
275
  this.onAudioInput = onAudioInput;
561
276
  this.isListening = true;
@@ -571,13 +286,21 @@ var BrowserAudioManager = class {
571
286
  this.scriptProcessor = this.audioContext.createScriptProcessor(
572
287
  bufferSize,
573
288
  1,
289
+ // input channels
574
290
  1
291
+ // output channels
575
292
  );
576
293
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
577
294
  this.scriptProcessor.connect(this.audioContext.destination);
578
295
  if (this.analyserNode) {
579
296
  this.mediaStreamAudioSourceNode.connect(this.analyserNode);
580
297
  }
298
+ const hardwareRate = this.audioContext.sampleRate;
299
+ if (hardwareRate !== this.inputSampleRate) {
300
+ this.resampler = new StreamResampler(hardwareRate, this.inputSampleRate);
301
+ } else {
302
+ this.resampler = null;
303
+ }
581
304
  this.scriptProcessor.onaudioprocess = (event) => {
582
305
  this._processAudioInput(event);
583
306
  };
@@ -588,19 +311,36 @@ var BrowserAudioManager = class {
588
311
  throw err;
589
312
  }
590
313
  }
314
+ /**
315
+ * Internal method to process microphone audio data
316
+ */
591
317
  _processAudioInput(event) {
592
- if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
593
- const inputData = event.inputBuffer.getChannelData(0);
594
- event.outputBuffer.getChannelData(0).fill(0);
595
- const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
596
- if (resampled && resampled.length > 0) {
597
- const int16Data = float32ToPcm16(resampled);
598
- this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
318
+ if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
319
+ if (this.isMuted) return;
320
+ const inputBuffer = event.inputBuffer;
321
+ const inputData = inputBuffer.getChannelData(0);
322
+ const outputBuffer = event.outputBuffer;
323
+ for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
324
+ outputBuffer.getChannelData(0)[i] = 0;
325
+ }
326
+ let processedData = new Float32Array(inputData);
327
+ if (this.resampler) {
328
+ processedData = this.resampler.process(processedData);
599
329
  }
330
+ if (processedData.length === 0) return;
331
+ const int16Data = float32ToPcm16(processedData);
332
+ const uint8Data = new Uint8Array(
333
+ int16Data.buffer,
334
+ int16Data.byteOffset,
335
+ int16Data.byteLength
336
+ );
337
+ this.onAudioInput(uint8Data);
600
338
  }
339
+ /**
340
+ * Stop capturing microphone input
341
+ */
601
342
  stopMicrophone() {
602
343
  this.isListening = false;
603
- this.resampler = null;
604
344
  if (this.mediaStream) {
605
345
  this.mediaStream.getTracks().forEach((track) => track.stop());
606
346
  this.mediaStream = null;
@@ -613,12 +353,17 @@ var BrowserAudioManager = class {
613
353
  this.mediaStreamAudioSourceNode.disconnect();
614
354
  this.mediaStreamAudioSourceNode = null;
615
355
  }
356
+ console.log("\u{1F3A4} Microphone stopped");
616
357
  }
617
358
  /**
618
359
  * Play back audio received from the server
360
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
619
361
  */
620
362
  playAudio(pcm16Data) {
621
- if (!this.audioContext) return;
363
+ if (!this.audioContext) {
364
+ console.warn("AudioContext not initialized");
365
+ return;
366
+ }
622
367
  const int16Array = new Int16Array(
623
368
  pcm16Data.buffer,
624
369
  pcm16Data.byteOffset,
@@ -633,17 +378,18 @@ var BrowserAudioManager = class {
633
378
  audioBuffer.getChannelData(0).set(float32Data);
634
379
  this._schedulePlayback(audioBuffer);
635
380
  }
381
+ /**
382
+ * Internal method to schedule and play audio with sample-accurate timing
383
+ */
636
384
  _schedulePlayback(audioBuffer) {
637
385
  if (!this.audioContext) return;
638
386
  const currentTime = this.audioContext.currentTime;
639
387
  const duration = audioBuffer.length / this.outputSampleRate;
640
388
  const startTime = Math.max(
641
389
  currentTime + 0.01,
390
+ // Minimum 10ms delay
642
391
  this.nextPlaybackTime
643
392
  );
644
- if (this.audioClockOffset === null) {
645
- this.audioClockOffset = startTime;
646
- }
647
393
  this.nextPlaybackTime = startTime + duration;
648
394
  const source = this.audioContext.createBufferSource();
649
395
  source.buffer = audioBuffer;
@@ -658,185 +404,441 @@ var BrowserAudioManager = class {
658
404
  if (index > -1) {
659
405
  this.activeSources.splice(index, 1);
660
406
  }
661
- };
407
+ };
408
+ }
409
+ /**
410
+ * Stop all currently playing audio and clear the queue
411
+ */
412
+ stopPlayback() {
413
+ this.activeSources.forEach((source) => {
414
+ try {
415
+ source.stop();
416
+ } catch (e) {
417
+ }
418
+ });
419
+ this.activeSources = [];
420
+ this.playbackQueue = [];
421
+ this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
422
+ console.log("\u{1F507} Playback stopped");
423
+ }
424
+ /**
425
+ * Toggle mute state
426
+ */
427
+ setMuted(muted) {
428
+ this.isMuted = muted;
429
+ }
430
+ /**
431
+ * Get current mute state
432
+ */
433
+ isMicMuted() {
434
+ return this.isMuted;
435
+ }
436
+ /**
437
+ * Get current amplitude from analyser (for visualization)
438
+ * Returns value between 0 and 1
439
+ */
440
+ getAmplitude() {
441
+ if (!this.analyserNode) return 0;
442
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
443
+ this.analyserNode.getByteTimeDomainData(dataArray);
444
+ const rms = calculateRMS(dataArray);
445
+ return Math.min(rms * 10, 1);
446
+ }
447
+ /**
448
+ * Get frequency data from analyser for visualization
449
+ */
450
+ getFrequencyData() {
451
+ if (!this.analyserNode) {
452
+ return new Uint8Array(0);
453
+ }
454
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
455
+ this.analyserNode.getByteFrequencyData(dataArray);
456
+ return dataArray;
457
+ }
458
+ /**
459
+ * Get time-domain data from analyser for waveform visualization
460
+ */
461
+ getWaveformData() {
462
+ if (!this.analyserNode) {
463
+ return new Uint8Array(0);
464
+ }
465
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
466
+ this.analyserNode.getByteTimeDomainData(dataArray);
467
+ return dataArray;
468
+ }
469
+ /**
470
+ * Cleanup and close AudioContext
471
+ */
472
+ cleanup() {
473
+ this.stopMicrophone();
474
+ this.stopPlayback();
475
+ if (this.analyserNode) {
476
+ this.analyserNode.disconnect();
477
+ this.analyserNode = null;
478
+ }
479
+ }
480
+ /**
481
+ * Get current audio context state
482
+ */
483
+ getState() {
484
+ return this.audioContext?.state ?? null;
485
+ }
486
+ /**
487
+ * Check if microphone is currently listening
488
+ */
489
+ isRecording() {
490
+ return this.isListening;
491
+ }
492
+ };
493
+
494
+ // src/client.ts
495
+ function base64ToUint8Array(base64) {
496
+ const binaryString = atob(base64);
497
+ const bytes = new Uint8Array(binaryString.length);
498
+ for (let i = 0; i < binaryString.length; i++) {
499
+ bytes[i] = binaryString.charCodeAt(i);
500
+ }
501
+ return bytes;
502
+ }
503
+ var VoiceAgentClient = class {
504
+ ws = null;
505
+ apiKey;
506
+ prompt;
507
+ voice;
508
+ language;
509
+ // Callbacks
510
+ onTranscription;
511
+ onResponse;
512
+ onAudioCallback;
513
+ onVisemesCallback;
514
+ onStatus;
515
+ onError;
516
+ isConnected = false;
517
+ messages = [];
518
+ visemeListeners = [];
519
+ wantVisemes = false;
520
+ audioManager = null;
521
+ enableAudio = false;
522
+ // Connection resilience
523
+ isUserDisconnect = false;
524
+ reconnecting = false;
525
+ reconnectAttempts = 0;
526
+ maxReconnectAttempts = 5;
527
+ constructor(config) {
528
+ this.apiKey = config.apiKey;
529
+ this.prompt = config.prompt;
530
+ this.voice = config.voice || "F1" /* F1 */;
531
+ this.language = config.language || "en" /* ENGLISH */;
532
+ this.onTranscription = config.onTranscription;
533
+ this.onResponse = config.onResponse;
534
+ this.onAudioCallback = config.onAudio;
535
+ this.onVisemesCallback = config.onVisemes;
536
+ this.onStatus = config.onStatus;
537
+ this.onError = config.onError;
538
+ this.wantVisemes = config.visemes || false;
539
+ this.enableAudio = config.enableAudio ?? false;
540
+ }
541
+ /**
542
+ * Connect to the Lokutor Voice Agent server
543
+ */
544
+ async connect() {
545
+ this.isUserDisconnect = false;
546
+ if (this.enableAudio) {
547
+ if (!this.audioManager) {
548
+ this.audioManager = new BrowserAudioManager();
549
+ }
550
+ await this.audioManager.init();
551
+ }
552
+ return new Promise((resolve, reject) => {
553
+ try {
554
+ let url = DEFAULT_URLS.VOICE_AGENT;
555
+ if (this.apiKey) {
556
+ const separator = url.includes("?") ? "&" : "?";
557
+ url += `${separator}api_key=${this.apiKey}`;
558
+ }
559
+ console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
560
+ this.ws = new WebSocket(url);
561
+ this.ws.binaryType = "arraybuffer";
562
+ this.ws.onopen = async () => {
563
+ this.isConnected = true;
564
+ this.reconnectAttempts = 0;
565
+ this.reconnecting = false;
566
+ console.log("\u2705 Connected to voice agent!");
567
+ this.sendConfig();
568
+ if (this.audioManager) {
569
+ await this.audioManager.startMicrophone((data) => {
570
+ if (this.isConnected) {
571
+ this.sendAudio(data);
572
+ }
573
+ });
574
+ }
575
+ resolve(true);
576
+ };
577
+ this.ws.onmessage = async (event) => {
578
+ if (event.data instanceof ArrayBuffer) {
579
+ this.handleBinaryMessage(new Uint8Array(event.data));
580
+ } else {
581
+ this.handleTextMessage(event.data.toString());
582
+ }
583
+ };
584
+ this.ws.onerror = (err) => {
585
+ console.error("\u274C WebSocket error:", err);
586
+ if (this.onError) this.onError(err);
587
+ if (!this.isConnected) reject(err);
588
+ };
589
+ this.ws.onclose = () => {
590
+ this.isConnected = false;
591
+ if (!this.isUserDisconnect && this.reconnectAttempts < this.maxReconnectAttempts) {
592
+ this.reconnecting = true;
593
+ this.reconnectAttempts++;
594
+ const backoffDelay = Math.min(1e3 * Math.pow(2, this.reconnectAttempts), 1e4);
595
+ console.warn(`Connection lost. Reconnecting in ${backoffDelay}ms (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})`);
596
+ if (this.onStatus) this.onStatus("reconnecting");
597
+ setTimeout(() => {
598
+ this.connect().catch((e) => console.error("Reconnect failed", e));
599
+ }, backoffDelay);
600
+ } else {
601
+ console.log("Disconnected");
602
+ if (this.onStatus) this.onStatus("disconnected");
603
+ }
604
+ };
605
+ } catch (err) {
606
+ if (this.onError) this.onError(err);
607
+ reject(err);
608
+ }
609
+ });
662
610
  }
663
611
  /**
664
- * Get the current high-precision audio clock offset for viseme synchronization.
665
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
612
+ * Send initial configuration to the server
666
613
  */
667
- getAudioClockOffset() {
668
- return this.audioClockOffset;
614
+ sendConfig() {
615
+ if (!this.ws || !this.isConnected) return;
616
+ this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
617
+ this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
618
+ this.ws.send(JSON.stringify({ type: "language", data: this.language }));
619
+ this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
620
+ console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
669
621
  }
670
622
  /**
671
- * Reset the audio clock offset (call when a response is interrupted or finished)
623
+ * Send raw PCM audio data to the server
624
+ * @param audioData Int16 PCM audio buffer
672
625
  */
673
- resetAudioClock() {
674
- this.audioClockOffset = null;
675
- }
676
- stopPlayback() {
677
- this.activeSources.forEach((source) => {
678
- try {
679
- source.stop();
680
- } catch (e) {
681
- }
682
- });
683
- this.activeSources = [];
684
- this.nextPlaybackTime = 0;
685
- this.resetAudioClock();
626
+ sendAudio(audioData) {
627
+ if (this.ws && this.ws.readyState === WebSocket.OPEN && this.isConnected) {
628
+ this.ws.send(audioData);
629
+ }
686
630
  }
687
- setMuted(muted) {
688
- this.isMuted = muted;
631
+ /**
632
+ * Handle incoming binary data (audio response)
633
+ */
634
+ handleBinaryMessage(data) {
635
+ if (this.audioManager) {
636
+ this.audioManager.playAudio(data);
637
+ }
638
+ this.emit("audio", data);
689
639
  }
690
- isMicMuted() {
691
- return this.isMuted;
640
+ /**
641
+ * Handle incoming text messages (metadata/transcriptions)
642
+ */
643
+ handleTextMessage(text) {
644
+ try {
645
+ const msg = JSON.parse(text);
646
+ switch (msg.type) {
647
+ case "audio":
648
+ if (msg.data) {
649
+ const buffer = base64ToUint8Array(msg.data);
650
+ this.handleBinaryMessage(buffer);
651
+ }
652
+ break;
653
+ case "transcript":
654
+ const role = msg.role === "user" ? "user" : "agent";
655
+ this.messages.push({
656
+ role,
657
+ text: msg.data,
658
+ timestamp: Date.now()
659
+ });
660
+ if (msg.role === "user") {
661
+ if (this.onTranscription) this.onTranscription(msg.data);
662
+ console.log(`\u{1F4AC} You: ${msg.data}`);
663
+ } else {
664
+ if (this.onResponse) this.onResponse(msg.data);
665
+ console.log(`\u{1F916} Agent: ${msg.data}`);
666
+ }
667
+ break;
668
+ case "status":
669
+ if (msg.data === "interrupted" && this.audioManager) {
670
+ this.audioManager.stopPlayback();
671
+ }
672
+ if (this.onStatus) this.onStatus(msg.data);
673
+ const icons = {
674
+ "interrupted": "\u26A1",
675
+ "thinking": "\u{1F9E0}",
676
+ "speaking": "\u{1F50A}",
677
+ "listening": "\u{1F442}"
678
+ };
679
+ console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
680
+ break;
681
+ case "visemes":
682
+ if (Array.isArray(msg.data) && msg.data.length > 0) {
683
+ this.emit("visemes", msg.data);
684
+ }
685
+ break;
686
+ case "error":
687
+ if (this.onError) this.onError(msg.data);
688
+ console.error(`\u274C Server error: ${msg.data}`);
689
+ break;
690
+ }
691
+ } catch (e) {
692
+ }
692
693
  }
693
- getAmplitude() {
694
- if (!this.analyserNode) return 0;
695
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
696
- this.analyserNode.getByteTimeDomainData(dataArray);
697
- const rms = calculateRMS(dataArray);
698
- return Math.min(rms * 10, 1);
694
+ audioListeners = [];
695
+ emit(event, data) {
696
+ if (event === "audio") {
697
+ if (this.onAudioCallback) this.onAudioCallback(data);
698
+ this.audioListeners.forEach((l) => l(data));
699
+ } else if (event === "visemes") {
700
+ if (this.onVisemesCallback) this.onVisemesCallback(data);
701
+ this.visemeListeners.forEach((l) => l(data));
702
+ }
699
703
  }
700
- getFrequencyData() {
701
- if (!this.analyserNode) return new Uint8Array(0);
702
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
703
- this.analyserNode.getByteFrequencyData(dataArray);
704
- return dataArray;
704
+ onAudio(callback) {
705
+ this.audioListeners.push(callback);
705
706
  }
706
- getWaveformData() {
707
- if (!this.analyserNode) return new Uint8Array(0);
708
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
709
- this.analyserNode.getByteTimeDomainData(dataArray);
710
- return dataArray;
707
+ onVisemes(callback) {
708
+ this.visemeListeners.push(callback);
711
709
  }
712
- cleanup() {
713
- this.stopMicrophone();
714
- this.stopPlayback();
715
- if (this.analyserNode) {
716
- this.analyserNode.disconnect();
717
- this.analyserNode = null;
710
+ /**
711
+ * Disconnect from the server
712
+ */
713
+ disconnect() {
714
+ this.isUserDisconnect = true;
715
+ if (this.ws) {
716
+ this.ws.close();
717
+ this.ws = null;
718
718
  }
719
- }
720
- getAudioContext() {
721
- return this.audioContext;
722
- }
723
- };
724
-
725
- // src/voice-agent.ts
726
- var VoiceAgent = class {
727
- client;
728
- audioManager;
729
- options;
730
- isConnected = false;
731
- visemeQueue = [];
732
- constructor(options) {
733
- this.options = options;
734
- this.client = new VoiceAgentClient({
735
- apiKey: options.apiKey,
736
- prompt: options.prompt || "You are a helpful and friendly AI assistant.",
737
- voice: options.voice || "F1" /* F1 */,
738
- language: options.language || "en" /* ENGLISH */,
739
- visemes: options.visemes ?? true,
740
- serverUrl: options.serverUrl,
741
- onTranscription: (text) => {
742
- if (options.onTranscription) options.onTranscription(text, true);
743
- },
744
- onResponse: (text) => {
745
- if (options.onTranscription) options.onTranscription(text, false);
746
- },
747
- onAudio: (data) => {
748
- this.audioManager.playAudio(data);
749
- },
750
- onVisemes: (visemes) => {
751
- this.visemeQueue.push(...visemes);
752
- if (options.onVisemes) options.onVisemes(visemes);
753
- },
754
- onStatus: (status) => {
755
- if (options.onStatusChange) options.onStatusChange(status);
756
- if (status === "interrupted" || status === "thinking") {
757
- this.audioManager.stopPlayback();
758
- this.visemeQueue = [];
759
- }
760
- },
761
- onError: (err) => {
762
- if (options.onError) options.onError(err);
763
- }
764
- });
765
- this.audioManager = new BrowserAudioManager({
766
- autoGainControl: true,
767
- echoCancellation: true,
768
- noiseSuppression: true
769
- });
719
+ if (this.audioManager) {
720
+ this.audioManager.cleanup();
721
+ }
722
+ this.isConnected = false;
770
723
  }
771
724
  /**
772
- * Initialize hardware and connect to the AI server.
773
- * This must be called in response to a user guesture (like a click)
774
- * to satisfy browser AudioContext requirements.
725
+ * Toggles the microphone mute state (if managed by client)
726
+ * returns the new mute state
775
727
  */
776
- async connect() {
777
- try {
778
- await this.audioManager.init();
779
- const connected = await this.client.connect();
780
- if (!connected) return false;
781
- this.isConnected = true;
782
- await this.audioManager.startMicrophone((pcm16Data) => {
783
- if (this.isConnected) {
784
- this.client.sendAudio(pcm16Data);
785
- }
786
- });
787
- return true;
788
- } catch (err) {
789
- if (this.options.onError) this.options.onError(err);
790
- return false;
728
+ toggleMute() {
729
+ if (this.audioManager) {
730
+ const isMuted = this.audioManager.isMicMuted();
731
+ this.audioManager.setMuted(!isMuted);
732
+ return !isMuted;
791
733
  }
734
+ return false;
792
735
  }
793
736
  /**
794
- * Get the current amplitude/volume of the microphone or output audio.
795
- * Useful for voice activity visualization.
796
- * @returns value between 0 and 1
737
+ * Gets the microphone volume amplitude 0-1 (if managed by client)
797
738
  */
798
739
  getAmplitude() {
799
- return this.audioManager.getAmplitude();
740
+ if (this.audioManager) {
741
+ return this.audioManager.getAmplitude();
742
+ }
743
+ return 0;
800
744
  }
801
745
  /**
802
- * Mute or unmute the microphone.
746
+ * Update the system prompt mid-conversation
803
747
  */
804
- toggleMute() {
805
- const currentState = this.audioManager.isMicMuted();
806
- this.audioManager.setMuted(!currentState);
807
- return !currentState;
748
+ updatePrompt(newPrompt) {
749
+ this.prompt = newPrompt;
750
+ if (this.ws && this.isConnected) {
751
+ try {
752
+ this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
753
+ console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
754
+ } catch (error) {
755
+ console.error("Error updating prompt:", error);
756
+ }
757
+ } else {
758
+ console.warn("Not connected - prompt will be updated on next connection");
759
+ }
808
760
  }
809
761
  /**
810
- * High-precision method to get visemes that should be active
811
- * at the current playback frame. Use this in a requestAnimationFrame loop.
762
+ * Get full conversation transcript
812
763
  */
813
- getFrameVisemes() {
814
- const offset = this.audioManager.getAudioClockOffset();
815
- const audioCtx = this.audioManager.getAudioContext();
816
- if (offset === null || !audioCtx) return [];
817
- const streamTime = (audioCtx.currentTime - offset) * 1e3;
818
- const currentBatch = [];
819
- while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
820
- currentBatch.push(this.visemeQueue.shift());
821
- }
822
- return currentBatch;
764
+ getTranscript() {
765
+ return this.messages.slice();
823
766
  }
824
767
  /**
825
- * Change the system prompt mid-conversation.
768
+ * Get conversation as formatted text
826
769
  */
827
- updatePrompt(newPrompt) {
828
- this.client.updatePrompt(newPrompt);
770
+ getTranscriptText() {
771
+ return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
772
+ }
773
+ };
774
+ var TTSClient = class {
775
+ apiKey;
776
+ constructor(config) {
777
+ this.apiKey = config.apiKey;
829
778
  }
830
779
  /**
831
- * Disconnect and release audio resources.
780
+ * Synthesize text to speech
781
+ *
782
+ * This opens a temporary WebSocket connection, sends the request,
783
+ * and streams back the audio.
832
784
  */
833
- disconnect() {
834
- this.isConnected = false;
835
- this.client.disconnect();
836
- this.audioManager.cleanup();
837
- this.visemeQueue = [];
785
+ synthesize(options) {
786
+ return new Promise((resolve, reject) => {
787
+ try {
788
+ let url = DEFAULT_URLS.TTS;
789
+ if (this.apiKey) {
790
+ const separator = url.includes("?") ? "&" : "?";
791
+ url += `${separator}api_key=${this.apiKey}`;
792
+ }
793
+ const ws = new WebSocket(url);
794
+ ws.binaryType = "arraybuffer";
795
+ ws.onopen = () => {
796
+ const req = {
797
+ text: options.text,
798
+ voice: options.voice || "F1" /* F1 */,
799
+ lang: options.language || "en" /* ENGLISH */,
800
+ speed: options.speed || 1.05,
801
+ steps: options.steps || 24,
802
+ visemes: options.visemes || false
803
+ };
804
+ ws.send(JSON.stringify(req));
805
+ };
806
+ ws.onmessage = async (event) => {
807
+ if (event.data instanceof ArrayBuffer) {
808
+ if (options.onAudio) options.onAudio(new Uint8Array(event.data));
809
+ } else {
810
+ try {
811
+ const msg = JSON.parse(event.data.toString());
812
+ if (Array.isArray(msg) && options.onVisemes) {
813
+ options.onVisemes(msg);
814
+ }
815
+ } catch (e) {
816
+ }
817
+ }
818
+ };
819
+ ws.onerror = (err) => {
820
+ if (options.onError) options.onError(err);
821
+ reject(err);
822
+ };
823
+ ws.onclose = () => {
824
+ resolve();
825
+ };
826
+ } catch (err) {
827
+ if (options.onError) options.onError(err);
828
+ reject(err);
829
+ }
830
+ });
838
831
  }
839
832
  };
833
+ async function simpleConversation(config) {
834
+ const client = new VoiceAgentClient(config);
835
+ await client.connect();
836
+ return client;
837
+ }
838
+ async function simpleTTS(options) {
839
+ const client = new TTSClient({ apiKey: options.apiKey });
840
+ return client.synthesize(options);
841
+ }
840
842
  // Annotate the CommonJS export names for ESM import in node:
841
843
  0 && (module.exports = {
842
844
  AUDIO_CONFIG,
@@ -845,7 +847,6 @@ var VoiceAgent = class {
845
847
  Language,
846
848
  StreamResampler,
847
849
  TTSClient,
848
- VoiceAgent,
849
850
  VoiceAgentClient,
850
851
  VoiceStyle,
851
852
  applyLowPassFilter,