@lokutor/sdk 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -34,289 +34,6 @@ var DEFAULT_URLS = {
34
34
  TTS: "wss://api.lokutor.com/ws/tts"
35
35
  };
36
36
 
37
- // src/client.ts
38
- function base64ToUint8Array(base64) {
39
- const binaryString = atob(base64);
40
- const bytes = new Uint8Array(binaryString.length);
41
- for (let i = 0; i < binaryString.length; i++) {
42
- bytes[i] = binaryString.charCodeAt(i);
43
- }
44
- return bytes;
45
- }
46
- var VoiceAgentClient = class {
47
- ws = null;
48
- apiKey;
49
- prompt;
50
- voice;
51
- language;
52
- // Callbacks
53
- onTranscription;
54
- onResponse;
55
- onAudioCallback;
56
- onVisemesCallback;
57
- onStatus;
58
- onError;
59
- isConnected = false;
60
- messages = [];
61
- visemeListeners = [];
62
- wantVisemes = false;
63
- serverUrl = null;
64
- constructor(config) {
65
- this.apiKey = config.apiKey;
66
- this.prompt = config.prompt;
67
- this.voice = config.voice || "F1" /* F1 */;
68
- this.language = config.language || "en" /* ENGLISH */;
69
- this.serverUrl = config.serverUrl || null;
70
- this.onTranscription = config.onTranscription;
71
- this.onResponse = config.onResponse;
72
- this.onAudioCallback = config.onAudio;
73
- this.onVisemesCallback = config.onVisemes;
74
- this.onStatus = config.onStatus;
75
- this.onError = config.onError;
76
- this.wantVisemes = config.visemes || false;
77
- }
78
- /**
79
- * Connect to the Lokutor Voice Agent server
80
- */
81
- async connect() {
82
- return new Promise((resolve, reject) => {
83
- try {
84
- let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
85
- if (this.apiKey) {
86
- const separator = url.includes("?") ? "&" : "?";
87
- url += `${separator}api_key=${this.apiKey}`;
88
- }
89
- console.log(`\u{1F517} Connecting to ${url}...`);
90
- this.ws = new WebSocket(url);
91
- this.ws.binaryType = "arraybuffer";
92
- this.ws.onopen = () => {
93
- this.isConnected = true;
94
- console.log("\u2705 Connected to voice agent!");
95
- this.sendConfig();
96
- resolve(true);
97
- };
98
- this.ws.onmessage = async (event) => {
99
- if (event.data instanceof ArrayBuffer) {
100
- this.handleBinaryMessage(new Uint8Array(event.data));
101
- } else {
102
- this.handleTextMessage(event.data.toString());
103
- }
104
- };
105
- this.ws.onerror = (err) => {
106
- console.error("\u274C WebSocket error:", err);
107
- if (this.onError) this.onError(err);
108
- if (!this.isConnected) reject(err);
109
- };
110
- this.ws.onclose = () => {
111
- this.isConnected = false;
112
- console.log("Disconnected");
113
- };
114
- } catch (err) {
115
- if (this.onError) this.onError(err);
116
- reject(err);
117
- }
118
- });
119
- }
120
- /**
121
- * Send initial configuration to the server
122
- */
123
- sendConfig() {
124
- if (!this.ws || !this.isConnected) return;
125
- this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
126
- this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
127
- this.ws.send(JSON.stringify({ type: "language", data: this.language }));
128
- this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
129
- console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
130
- }
131
- /**
132
- * Send raw PCM audio data to the server
133
- * @param audioData Int16 PCM audio buffer
134
- */
135
- sendAudio(audioData) {
136
- if (this.ws && this.isConnected) {
137
- this.ws.send(audioData);
138
- }
139
- }
140
- /**
141
- * Handle incoming binary data (audio response)
142
- */
143
- handleBinaryMessage(data) {
144
- this.emit("audio", data);
145
- }
146
- /**
147
- * Handle incoming text messages (metadata/transcriptions)
148
- */
149
- handleTextMessage(text) {
150
- try {
151
- const msg = JSON.parse(text);
152
- switch (msg.type) {
153
- case "audio":
154
- if (msg.data) {
155
- const buffer = base64ToUint8Array(msg.data);
156
- this.handleBinaryMessage(buffer);
157
- }
158
- break;
159
- case "transcript":
160
- const role = msg.role === "user" ? "user" : "agent";
161
- this.messages.push({
162
- role,
163
- text: msg.data,
164
- timestamp: Date.now()
165
- });
166
- if (msg.role === "user") {
167
- if (this.onTranscription) this.onTranscription(msg.data);
168
- console.log(`\u{1F4AC} You: ${msg.data}`);
169
- } else {
170
- if (this.onResponse) this.onResponse(msg.data);
171
- console.log(`\u{1F916} Agent: ${msg.data}`);
172
- }
173
- break;
174
- case "status":
175
- if (this.onStatus) this.onStatus(msg.data);
176
- const icons = {
177
- "interrupted": "\u26A1",
178
- "thinking": "\u{1F9E0}",
179
- "speaking": "\u{1F50A}",
180
- "listening": "\u{1F442}"
181
- };
182
- console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
183
- break;
184
- case "visemes":
185
- if (Array.isArray(msg.data) && msg.data.length > 0) {
186
- this.emit("visemes", msg.data);
187
- }
188
- break;
189
- case "error":
190
- if (this.onError) this.onError(msg.data);
191
- console.error(`\u274C Server error: ${msg.data}`);
192
- break;
193
- }
194
- } catch (e) {
195
- }
196
- }
197
- audioListeners = [];
198
- emit(event, data) {
199
- if (event === "audio") {
200
- if (this.onAudioCallback) this.onAudioCallback(data);
201
- this.audioListeners.forEach((l) => l(data));
202
- } else if (event === "visemes") {
203
- if (this.onVisemesCallback) this.onVisemesCallback(data);
204
- this.visemeListeners.forEach((l) => l(data));
205
- }
206
- }
207
- onAudio(callback) {
208
- this.audioListeners.push(callback);
209
- }
210
- onVisemes(callback) {
211
- this.visemeListeners.push(callback);
212
- }
213
- /**
214
- * Disconnect from the server
215
- */
216
- disconnect() {
217
- if (this.ws) {
218
- this.ws.close();
219
- this.ws = null;
220
- }
221
- }
222
- /**
223
- * Update the system prompt mid-conversation
224
- */
225
- updatePrompt(newPrompt) {
226
- this.prompt = newPrompt;
227
- if (this.ws && this.isConnected) {
228
- try {
229
- this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
230
- console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
231
- } catch (error) {
232
- console.error("Error updating prompt:", error);
233
- }
234
- } else {
235
- console.warn("Not connected - prompt will be updated on next connection");
236
- }
237
- }
238
- /**
239
- * Get full conversation transcript
240
- */
241
- getTranscript() {
242
- return this.messages.slice();
243
- }
244
- /**
245
- * Get conversation as formatted text
246
- */
247
- getTranscriptText() {
248
- return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
249
- }
250
- };
251
- var TTSClient = class {
252
- apiKey;
253
- constructor(config) {
254
- this.apiKey = config.apiKey;
255
- }
256
- /**
257
- * Synthesize text to speech
258
- *
259
- * This opens a temporary WebSocket connection, sends the request,
260
- * and streams back the audio.
261
- */
262
- synthesize(options) {
263
- return new Promise((resolve, reject) => {
264
- try {
265
- let url = DEFAULT_URLS.TTS;
266
- if (this.apiKey) {
267
- const separator = url.includes("?") ? "&" : "?";
268
- url += `${separator}api_key=${this.apiKey}`;
269
- }
270
- const ws = new WebSocket(url);
271
- ws.binaryType = "arraybuffer";
272
- ws.onopen = () => {
273
- const req = {
274
- text: options.text,
275
- voice: options.voice || "F1" /* F1 */,
276
- lang: options.language || "en" /* ENGLISH */,
277
- speed: options.speed || 1.05,
278
- steps: options.steps || 24,
279
- visemes: options.visemes || false
280
- };
281
- ws.send(JSON.stringify(req));
282
- };
283
- ws.onmessage = async (event) => {
284
- if (event.data instanceof ArrayBuffer) {
285
- if (options.onAudio) options.onAudio(new Uint8Array(event.data));
286
- } else {
287
- try {
288
- const msg = JSON.parse(event.data.toString());
289
- if (Array.isArray(msg) && options.onVisemes) {
290
- options.onVisemes(msg);
291
- }
292
- } catch (e) {
293
- }
294
- }
295
- };
296
- ws.onerror = (err) => {
297
- if (options.onError) options.onError(err);
298
- reject(err);
299
- };
300
- ws.onclose = () => {
301
- resolve();
302
- };
303
- } catch (err) {
304
- if (options.onError) options.onError(err);
305
- reject(err);
306
- }
307
- });
308
- }
309
- };
310
- async function simpleConversation(config) {
311
- const client = new VoiceAgentClient(config);
312
- await client.connect();
313
- return client;
314
- }
315
- async function simpleTTS(options) {
316
- const client = new TTSClient({ apiKey: options.apiKey });
317
- return client.synthesize(options);
318
- }
319
-
320
37
  // src/audio-utils.ts
321
38
  function pcm16ToFloat32(int16Data) {
322
39
  const float32 = new Float32Array(int16Data.length);
@@ -456,11 +173,11 @@ var BrowserAudioManager = class {
456
173
  scriptProcessor = null;
457
174
  analyserNode = null;
458
175
  mediaStream = null;
176
+ resampler = null;
459
177
  // Playback scheduling
460
178
  nextPlaybackTime = 0;
461
179
  activeSources = [];
462
- // High-precision clock anchor for viseme sync
463
- audioClockOffset = null;
180
+ playbackQueue = [];
464
181
  // Configuration
465
182
  inputSampleRate;
466
183
  outputSampleRate;
@@ -473,7 +190,6 @@ var BrowserAudioManager = class {
473
190
  // Audio processing state
474
191
  isMuted = false;
475
192
  isListening = false;
476
- resampler = null;
477
193
  constructor(config = {}) {
478
194
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
479
195
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -497,6 +213,7 @@ var BrowserAudioManager = class {
497
213
  }
498
214
  if (this.audioContext.state === "suspended") {
499
215
  await this.audioContext.resume();
216
+ console.log("\u{1F442} AudioContext resumed");
500
217
  }
501
218
  if (analyserConfig?.enabled !== false) {
502
219
  this.analyserNode = this.audioContext.createAnalyser();
@@ -510,7 +227,6 @@ var BrowserAudioManager = class {
510
227
  if (!this.audioContext) {
511
228
  await this.init();
512
229
  }
513
- this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
514
230
  try {
515
231
  this.onAudioInput = onAudioInput;
516
232
  this.isListening = true;
@@ -526,13 +242,21 @@ var BrowserAudioManager = class {
526
242
  this.scriptProcessor = this.audioContext.createScriptProcessor(
527
243
  bufferSize,
528
244
  1,
245
+ // input channels
529
246
  1
247
+ // output channels
530
248
  );
531
249
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
532
250
  this.scriptProcessor.connect(this.audioContext.destination);
533
251
  if (this.analyserNode) {
534
252
  this.mediaStreamAudioSourceNode.connect(this.analyserNode);
535
253
  }
254
+ const hardwareRate = this.audioContext.sampleRate;
255
+ if (hardwareRate !== this.inputSampleRate) {
256
+ this.resampler = new StreamResampler(hardwareRate, this.inputSampleRate);
257
+ } else {
258
+ this.resampler = null;
259
+ }
536
260
  this.scriptProcessor.onaudioprocess = (event) => {
537
261
  this._processAudioInput(event);
538
262
  };
@@ -543,19 +267,36 @@ var BrowserAudioManager = class {
543
267
  throw err;
544
268
  }
545
269
  }
270
+ /**
271
+ * Internal method to process microphone audio data
272
+ */
546
273
  _processAudioInput(event) {
547
- if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
548
- const inputData = event.inputBuffer.getChannelData(0);
549
- event.outputBuffer.getChannelData(0).fill(0);
550
- const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
551
- if (resampled && resampled.length > 0) {
552
- const int16Data = float32ToPcm16(resampled);
553
- this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
274
+ if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
275
+ if (this.isMuted) return;
276
+ const inputBuffer = event.inputBuffer;
277
+ const inputData = inputBuffer.getChannelData(0);
278
+ const outputBuffer = event.outputBuffer;
279
+ for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
280
+ outputBuffer.getChannelData(0)[i] = 0;
281
+ }
282
+ let processedData = new Float32Array(inputData);
283
+ if (this.resampler) {
284
+ processedData = this.resampler.process(processedData);
554
285
  }
286
+ if (processedData.length === 0) return;
287
+ const int16Data = float32ToPcm16(processedData);
288
+ const uint8Data = new Uint8Array(
289
+ int16Data.buffer,
290
+ int16Data.byteOffset,
291
+ int16Data.byteLength
292
+ );
293
+ this.onAudioInput(uint8Data);
555
294
  }
295
+ /**
296
+ * Stop capturing microphone input
297
+ */
556
298
  stopMicrophone() {
557
299
  this.isListening = false;
558
- this.resampler = null;
559
300
  if (this.mediaStream) {
560
301
  this.mediaStream.getTracks().forEach((track) => track.stop());
561
302
  this.mediaStream = null;
@@ -568,12 +309,17 @@ var BrowserAudioManager = class {
568
309
  this.mediaStreamAudioSourceNode.disconnect();
569
310
  this.mediaStreamAudioSourceNode = null;
570
311
  }
312
+ console.log("\u{1F3A4} Microphone stopped");
571
313
  }
572
314
  /**
573
315
  * Play back audio received from the server
316
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
574
317
  */
575
318
  playAudio(pcm16Data) {
576
- if (!this.audioContext) return;
319
+ if (!this.audioContext) {
320
+ console.warn("AudioContext not initialized");
321
+ return;
322
+ }
577
323
  const int16Array = new Int16Array(
578
324
  pcm16Data.buffer,
579
325
  pcm16Data.byteOffset,
@@ -588,17 +334,18 @@ var BrowserAudioManager = class {
588
334
  audioBuffer.getChannelData(0).set(float32Data);
589
335
  this._schedulePlayback(audioBuffer);
590
336
  }
337
+ /**
338
+ * Internal method to schedule and play audio with sample-accurate timing
339
+ */
591
340
  _schedulePlayback(audioBuffer) {
592
341
  if (!this.audioContext) return;
593
342
  const currentTime = this.audioContext.currentTime;
594
343
  const duration = audioBuffer.length / this.outputSampleRate;
595
344
  const startTime = Math.max(
596
345
  currentTime + 0.01,
346
+ // Minimum 10ms delay
597
347
  this.nextPlaybackTime
598
348
  );
599
- if (this.audioClockOffset === null) {
600
- this.audioClockOffset = startTime;
601
- }
602
349
  this.nextPlaybackTime = startTime + duration;
603
350
  const source = this.audioContext.createBufferSource();
604
351
  source.buffer = audioBuffer;
@@ -613,185 +360,441 @@ var BrowserAudioManager = class {
613
360
  if (index > -1) {
614
361
  this.activeSources.splice(index, 1);
615
362
  }
616
- };
363
+ };
364
+ }
365
+ /**
366
+ * Stop all currently playing audio and clear the queue
367
+ */
368
+ stopPlayback() {
369
+ this.activeSources.forEach((source) => {
370
+ try {
371
+ source.stop();
372
+ } catch (e) {
373
+ }
374
+ });
375
+ this.activeSources = [];
376
+ this.playbackQueue = [];
377
+ this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
378
+ console.log("\u{1F507} Playback stopped");
379
+ }
380
+ /**
381
+ * Toggle mute state
382
+ */
383
+ setMuted(muted) {
384
+ this.isMuted = muted;
385
+ }
386
+ /**
387
+ * Get current mute state
388
+ */
389
+ isMicMuted() {
390
+ return this.isMuted;
391
+ }
392
+ /**
393
+ * Get current amplitude from analyser (for visualization)
394
+ * Returns value between 0 and 1
395
+ */
396
+ getAmplitude() {
397
+ if (!this.analyserNode) return 0;
398
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
399
+ this.analyserNode.getByteTimeDomainData(dataArray);
400
+ const rms = calculateRMS(dataArray);
401
+ return Math.min(rms * 10, 1);
402
+ }
403
+ /**
404
+ * Get frequency data from analyser for visualization
405
+ */
406
+ getFrequencyData() {
407
+ if (!this.analyserNode) {
408
+ return new Uint8Array(0);
409
+ }
410
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
411
+ this.analyserNode.getByteFrequencyData(dataArray);
412
+ return dataArray;
413
+ }
414
+ /**
415
+ * Get time-domain data from analyser for waveform visualization
416
+ */
417
+ getWaveformData() {
418
+ if (!this.analyserNode) {
419
+ return new Uint8Array(0);
420
+ }
421
+ const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
422
+ this.analyserNode.getByteTimeDomainData(dataArray);
423
+ return dataArray;
424
+ }
425
+ /**
426
+ * Cleanup and close AudioContext
427
+ */
428
+ cleanup() {
429
+ this.stopMicrophone();
430
+ this.stopPlayback();
431
+ if (this.analyserNode) {
432
+ this.analyserNode.disconnect();
433
+ this.analyserNode = null;
434
+ }
435
+ }
436
+ /**
437
+ * Get current audio context state
438
+ */
439
+ getState() {
440
+ return this.audioContext?.state ?? null;
441
+ }
442
+ /**
443
+ * Check if microphone is currently listening
444
+ */
445
+ isRecording() {
446
+ return this.isListening;
447
+ }
448
+ };
449
+
450
+ // src/client.ts
451
+ function base64ToUint8Array(base64) {
452
+ const binaryString = atob(base64);
453
+ const bytes = new Uint8Array(binaryString.length);
454
+ for (let i = 0; i < binaryString.length; i++) {
455
+ bytes[i] = binaryString.charCodeAt(i);
456
+ }
457
+ return bytes;
458
+ }
459
+ var VoiceAgentClient = class {
460
+ ws = null;
461
+ apiKey;
462
+ prompt;
463
+ voice;
464
+ language;
465
+ // Callbacks
466
+ onTranscription;
467
+ onResponse;
468
+ onAudioCallback;
469
+ onVisemesCallback;
470
+ onStatus;
471
+ onError;
472
+ isConnected = false;
473
+ messages = [];
474
+ visemeListeners = [];
475
+ wantVisemes = false;
476
+ audioManager = null;
477
+ enableAudio = false;
478
+ // Connection resilience
479
+ isUserDisconnect = false;
480
+ reconnecting = false;
481
+ reconnectAttempts = 0;
482
+ maxReconnectAttempts = 5;
483
+ constructor(config) {
484
+ this.apiKey = config.apiKey;
485
+ this.prompt = config.prompt;
486
+ this.voice = config.voice || "F1" /* F1 */;
487
+ this.language = config.language || "en" /* ENGLISH */;
488
+ this.onTranscription = config.onTranscription;
489
+ this.onResponse = config.onResponse;
490
+ this.onAudioCallback = config.onAudio;
491
+ this.onVisemesCallback = config.onVisemes;
492
+ this.onStatus = config.onStatus;
493
+ this.onError = config.onError;
494
+ this.wantVisemes = config.visemes || false;
495
+ this.enableAudio = config.enableAudio ?? false;
496
+ }
497
+ /**
498
+ * Connect to the Lokutor Voice Agent server
499
+ */
500
+ async connect() {
501
+ this.isUserDisconnect = false;
502
+ if (this.enableAudio) {
503
+ if (!this.audioManager) {
504
+ this.audioManager = new BrowserAudioManager();
505
+ }
506
+ await this.audioManager.init();
507
+ }
508
+ return new Promise((resolve, reject) => {
509
+ try {
510
+ let url = DEFAULT_URLS.VOICE_AGENT;
511
+ if (this.apiKey) {
512
+ const separator = url.includes("?") ? "&" : "?";
513
+ url += `${separator}api_key=${this.apiKey}`;
514
+ }
515
+ console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
516
+ this.ws = new WebSocket(url);
517
+ this.ws.binaryType = "arraybuffer";
518
+ this.ws.onopen = async () => {
519
+ this.isConnected = true;
520
+ this.reconnectAttempts = 0;
521
+ this.reconnecting = false;
522
+ console.log("\u2705 Connected to voice agent!");
523
+ this.sendConfig();
524
+ if (this.audioManager) {
525
+ await this.audioManager.startMicrophone((data) => {
526
+ if (this.isConnected) {
527
+ this.sendAudio(data);
528
+ }
529
+ });
530
+ }
531
+ resolve(true);
532
+ };
533
+ this.ws.onmessage = async (event) => {
534
+ if (event.data instanceof ArrayBuffer) {
535
+ this.handleBinaryMessage(new Uint8Array(event.data));
536
+ } else {
537
+ this.handleTextMessage(event.data.toString());
538
+ }
539
+ };
540
+ this.ws.onerror = (err) => {
541
+ console.error("\u274C WebSocket error:", err);
542
+ if (this.onError) this.onError(err);
543
+ if (!this.isConnected) reject(err);
544
+ };
545
+ this.ws.onclose = () => {
546
+ this.isConnected = false;
547
+ if (!this.isUserDisconnect && this.reconnectAttempts < this.maxReconnectAttempts) {
548
+ this.reconnecting = true;
549
+ this.reconnectAttempts++;
550
+ const backoffDelay = Math.min(1e3 * Math.pow(2, this.reconnectAttempts), 1e4);
551
+ console.warn(`Connection lost. Reconnecting in ${backoffDelay}ms (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})`);
552
+ if (this.onStatus) this.onStatus("reconnecting");
553
+ setTimeout(() => {
554
+ this.connect().catch((e) => console.error("Reconnect failed", e));
555
+ }, backoffDelay);
556
+ } else {
557
+ console.log("Disconnected");
558
+ if (this.onStatus) this.onStatus("disconnected");
559
+ }
560
+ };
561
+ } catch (err) {
562
+ if (this.onError) this.onError(err);
563
+ reject(err);
564
+ }
565
+ });
617
566
  }
618
567
  /**
619
- * Get the current high-precision audio clock offset for viseme synchronization.
620
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
568
+ * Send initial configuration to the server
621
569
  */
622
- getAudioClockOffset() {
623
- return this.audioClockOffset;
570
+ sendConfig() {
571
+ if (!this.ws || !this.isConnected) return;
572
+ this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
573
+ this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
574
+ this.ws.send(JSON.stringify({ type: "language", data: this.language }));
575
+ this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
576
+ console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
624
577
  }
625
578
  /**
626
- * Reset the audio clock offset (call when a response is interrupted or finished)
579
+ * Send raw PCM audio data to the server
580
+ * @param audioData Int16 PCM audio buffer
627
581
  */
628
- resetAudioClock() {
629
- this.audioClockOffset = null;
630
- }
631
- stopPlayback() {
632
- this.activeSources.forEach((source) => {
633
- try {
634
- source.stop();
635
- } catch (e) {
636
- }
637
- });
638
- this.activeSources = [];
639
- this.nextPlaybackTime = 0;
640
- this.resetAudioClock();
582
+ sendAudio(audioData) {
583
+ if (this.ws && this.ws.readyState === WebSocket.OPEN && this.isConnected) {
584
+ this.ws.send(audioData);
585
+ }
641
586
  }
642
- setMuted(muted) {
643
- this.isMuted = muted;
587
+ /**
588
+ * Handle incoming binary data (audio response)
589
+ */
590
+ handleBinaryMessage(data) {
591
+ if (this.audioManager) {
592
+ this.audioManager.playAudio(data);
593
+ }
594
+ this.emit("audio", data);
644
595
  }
645
- isMicMuted() {
646
- return this.isMuted;
596
+ /**
597
+ * Handle incoming text messages (metadata/transcriptions)
598
+ */
599
+ handleTextMessage(text) {
600
+ try {
601
+ const msg = JSON.parse(text);
602
+ switch (msg.type) {
603
+ case "audio":
604
+ if (msg.data) {
605
+ const buffer = base64ToUint8Array(msg.data);
606
+ this.handleBinaryMessage(buffer);
607
+ }
608
+ break;
609
+ case "transcript":
610
+ const role = msg.role === "user" ? "user" : "agent";
611
+ this.messages.push({
612
+ role,
613
+ text: msg.data,
614
+ timestamp: Date.now()
615
+ });
616
+ if (msg.role === "user") {
617
+ if (this.onTranscription) this.onTranscription(msg.data);
618
+ console.log(`\u{1F4AC} You: ${msg.data}`);
619
+ } else {
620
+ if (this.onResponse) this.onResponse(msg.data);
621
+ console.log(`\u{1F916} Agent: ${msg.data}`);
622
+ }
623
+ break;
624
+ case "status":
625
+ if (msg.data === "interrupted" && this.audioManager) {
626
+ this.audioManager.stopPlayback();
627
+ }
628
+ if (this.onStatus) this.onStatus(msg.data);
629
+ const icons = {
630
+ "interrupted": "\u26A1",
631
+ "thinking": "\u{1F9E0}",
632
+ "speaking": "\u{1F50A}",
633
+ "listening": "\u{1F442}"
634
+ };
635
+ console.log(`${icons[msg.data] || ""} Status: ${msg.data}`);
636
+ break;
637
+ case "visemes":
638
+ if (Array.isArray(msg.data) && msg.data.length > 0) {
639
+ this.emit("visemes", msg.data);
640
+ }
641
+ break;
642
+ case "error":
643
+ if (this.onError) this.onError(msg.data);
644
+ console.error(`\u274C Server error: ${msg.data}`);
645
+ break;
646
+ }
647
+ } catch (e) {
648
+ }
647
649
  }
648
- getAmplitude() {
649
- if (!this.analyserNode) return 0;
650
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
651
- this.analyserNode.getByteTimeDomainData(dataArray);
652
- const rms = calculateRMS(dataArray);
653
- return Math.min(rms * 10, 1);
650
+ audioListeners = [];
651
+ emit(event, data) {
652
+ if (event === "audio") {
653
+ if (this.onAudioCallback) this.onAudioCallback(data);
654
+ this.audioListeners.forEach((l) => l(data));
655
+ } else if (event === "visemes") {
656
+ if (this.onVisemesCallback) this.onVisemesCallback(data);
657
+ this.visemeListeners.forEach((l) => l(data));
658
+ }
654
659
  }
655
- getFrequencyData() {
656
- if (!this.analyserNode) return new Uint8Array(0);
657
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
658
- this.analyserNode.getByteFrequencyData(dataArray);
659
- return dataArray;
660
+ onAudio(callback) {
661
+ this.audioListeners.push(callback);
660
662
  }
661
- getWaveformData() {
662
- if (!this.analyserNode) return new Uint8Array(0);
663
- const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
664
- this.analyserNode.getByteTimeDomainData(dataArray);
665
- return dataArray;
663
+ onVisemes(callback) {
664
+ this.visemeListeners.push(callback);
666
665
  }
667
- cleanup() {
668
- this.stopMicrophone();
669
- this.stopPlayback();
670
- if (this.analyserNode) {
671
- this.analyserNode.disconnect();
672
- this.analyserNode = null;
666
+ /**
667
+ * Disconnect from the server
668
+ */
669
+ disconnect() {
670
+ this.isUserDisconnect = true;
671
+ if (this.ws) {
672
+ this.ws.close();
673
+ this.ws = null;
673
674
  }
674
- }
675
- getAudioContext() {
676
- return this.audioContext;
677
- }
678
- };
679
-
680
- // src/voice-agent.ts
681
- var VoiceAgent = class {
682
- client;
683
- audioManager;
684
- options;
685
- isConnected = false;
686
- visemeQueue = [];
687
- constructor(options) {
688
- this.options = options;
689
- this.client = new VoiceAgentClient({
690
- apiKey: options.apiKey,
691
- prompt: options.prompt || "You are a helpful and friendly AI assistant.",
692
- voice: options.voice || "F1" /* F1 */,
693
- language: options.language || "en" /* ENGLISH */,
694
- visemes: options.visemes ?? true,
695
- serverUrl: options.serverUrl,
696
- onTranscription: (text) => {
697
- if (options.onTranscription) options.onTranscription(text, true);
698
- },
699
- onResponse: (text) => {
700
- if (options.onTranscription) options.onTranscription(text, false);
701
- },
702
- onAudio: (data) => {
703
- this.audioManager.playAudio(data);
704
- },
705
- onVisemes: (visemes) => {
706
- this.visemeQueue.push(...visemes);
707
- if (options.onVisemes) options.onVisemes(visemes);
708
- },
709
- onStatus: (status) => {
710
- if (options.onStatusChange) options.onStatusChange(status);
711
- if (status === "interrupted" || status === "thinking") {
712
- this.audioManager.stopPlayback();
713
- this.visemeQueue = [];
714
- }
715
- },
716
- onError: (err) => {
717
- if (options.onError) options.onError(err);
718
- }
719
- });
720
- this.audioManager = new BrowserAudioManager({
721
- autoGainControl: true,
722
- echoCancellation: true,
723
- noiseSuppression: true
724
- });
675
+ if (this.audioManager) {
676
+ this.audioManager.cleanup();
677
+ }
678
+ this.isConnected = false;
725
679
  }
726
680
  /**
727
- * Initialize hardware and connect to the AI server.
728
- * This must be called in response to a user guesture (like a click)
729
- * to satisfy browser AudioContext requirements.
681
+ * Toggles the microphone mute state (if managed by client)
682
+ * returns the new mute state
730
683
  */
731
- async connect() {
732
- try {
733
- await this.audioManager.init();
734
- const connected = await this.client.connect();
735
- if (!connected) return false;
736
- this.isConnected = true;
737
- await this.audioManager.startMicrophone((pcm16Data) => {
738
- if (this.isConnected) {
739
- this.client.sendAudio(pcm16Data);
740
- }
741
- });
742
- return true;
743
- } catch (err) {
744
- if (this.options.onError) this.options.onError(err);
745
- return false;
684
+ toggleMute() {
685
+ if (this.audioManager) {
686
+ const isMuted = this.audioManager.isMicMuted();
687
+ this.audioManager.setMuted(!isMuted);
688
+ return !isMuted;
746
689
  }
690
+ return false;
747
691
  }
748
692
  /**
749
- * Get the current amplitude/volume of the microphone or output audio.
750
- * Useful for voice activity visualization.
751
- * @returns value between 0 and 1
693
+ * Gets the microphone volume amplitude 0-1 (if managed by client)
752
694
  */
753
695
  getAmplitude() {
754
- return this.audioManager.getAmplitude();
696
+ if (this.audioManager) {
697
+ return this.audioManager.getAmplitude();
698
+ }
699
+ return 0;
755
700
  }
756
701
  /**
757
- * Mute or unmute the microphone.
702
+ * Update the system prompt mid-conversation
758
703
  */
759
- toggleMute() {
760
- const currentState = this.audioManager.isMicMuted();
761
- this.audioManager.setMuted(!currentState);
762
- return !currentState;
704
+ updatePrompt(newPrompt) {
705
+ this.prompt = newPrompt;
706
+ if (this.ws && this.isConnected) {
707
+ try {
708
+ this.ws.send(JSON.stringify({ type: "prompt", data: newPrompt }));
709
+ console.log(`\u2699\uFE0F Updated prompt: ${newPrompt.substring(0, 50)}...`);
710
+ } catch (error) {
711
+ console.error("Error updating prompt:", error);
712
+ }
713
+ } else {
714
+ console.warn("Not connected - prompt will be updated on next connection");
715
+ }
763
716
  }
764
717
  /**
765
- * High-precision method to get visemes that should be active
766
- * at the current playback frame. Use this in a requestAnimationFrame loop.
718
+ * Get full conversation transcript
767
719
  */
768
- getFrameVisemes() {
769
- const offset = this.audioManager.getAudioClockOffset();
770
- const audioCtx = this.audioManager.getAudioContext();
771
- if (offset === null || !audioCtx) return [];
772
- const streamTime = (audioCtx.currentTime - offset) * 1e3;
773
- const currentBatch = [];
774
- while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
775
- currentBatch.push(this.visemeQueue.shift());
776
- }
777
- return currentBatch;
720
+ getTranscript() {
721
+ return this.messages.slice();
778
722
  }
779
723
  /**
780
- * Change the system prompt mid-conversation.
724
+ * Get conversation as formatted text
781
725
  */
782
- updatePrompt(newPrompt) {
783
- this.client.updatePrompt(newPrompt);
726
+ getTranscriptText() {
727
+ return this.messages.map((msg) => `${msg.role === "user" ? "You" : "Agent"}: ${msg.text}`).join("\n");
728
+ }
729
+ };
730
+ var TTSClient = class {
731
+ apiKey;
732
+ constructor(config) {
733
+ this.apiKey = config.apiKey;
784
734
  }
785
735
  /**
786
- * Disconnect and release audio resources.
736
+ * Synthesize text to speech
737
+ *
738
+ * This opens a temporary WebSocket connection, sends the request,
739
+ * and streams back the audio.
787
740
  */
788
- disconnect() {
789
- this.isConnected = false;
790
- this.client.disconnect();
791
- this.audioManager.cleanup();
792
- this.visemeQueue = [];
741
+ synthesize(options) {
742
+ return new Promise((resolve, reject) => {
743
+ try {
744
+ let url = DEFAULT_URLS.TTS;
745
+ if (this.apiKey) {
746
+ const separator = url.includes("?") ? "&" : "?";
747
+ url += `${separator}api_key=${this.apiKey}`;
748
+ }
749
+ const ws = new WebSocket(url);
750
+ ws.binaryType = "arraybuffer";
751
+ ws.onopen = () => {
752
+ const req = {
753
+ text: options.text,
754
+ voice: options.voice || "F1" /* F1 */,
755
+ lang: options.language || "en" /* ENGLISH */,
756
+ speed: options.speed || 1.05,
757
+ steps: options.steps || 24,
758
+ visemes: options.visemes || false
759
+ };
760
+ ws.send(JSON.stringify(req));
761
+ };
762
+ ws.onmessage = async (event) => {
763
+ if (event.data instanceof ArrayBuffer) {
764
+ if (options.onAudio) options.onAudio(new Uint8Array(event.data));
765
+ } else {
766
+ try {
767
+ const msg = JSON.parse(event.data.toString());
768
+ if (Array.isArray(msg) && options.onVisemes) {
769
+ options.onVisemes(msg);
770
+ }
771
+ } catch (e) {
772
+ }
773
+ }
774
+ };
775
+ ws.onerror = (err) => {
776
+ if (options.onError) options.onError(err);
777
+ reject(err);
778
+ };
779
+ ws.onclose = () => {
780
+ resolve();
781
+ };
782
+ } catch (err) {
783
+ if (options.onError) options.onError(err);
784
+ reject(err);
785
+ }
786
+ });
793
787
  }
794
788
  };
789
+ async function simpleConversation(config) {
790
+ const client = new VoiceAgentClient(config);
791
+ await client.connect();
792
+ return client;
793
+ }
794
+ async function simpleTTS(options) {
795
+ const client = new TTSClient({ apiKey: options.apiKey });
796
+ return client.synthesize(options);
797
+ }
795
798
  export {
796
799
  AUDIO_CONFIG,
797
800
  BrowserAudioManager,
@@ -799,7 +802,6 @@ export {
799
802
  Language,
800
803
  StreamResampler,
801
804
  TTSClient,
802
- VoiceAgent,
803
805
  VoiceAgentClient,
804
806
  VoiceStyle,
805
807
  applyLowPassFilter,