@lokutor/sdk 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -90,11 +90,12 @@ interface VoiceAgentOptions {
90
90
  }
91
91
  /**
92
92
  * Viseme data for lip-sync animation
93
+ * Format: {"v": index, "c": character, "t": timestamp}
93
94
  */
94
95
  interface Viseme {
95
- id: number;
96
- char: string;
97
- timestamp: number;
96
+ v: number;
97
+ c: string;
98
+ t: number;
98
99
  }
99
100
 
100
101
  /**
@@ -117,10 +118,14 @@ declare class VoiceAgentClient {
117
118
  private isConnected;
118
119
  private messages;
119
120
  private visemeListeners;
121
+ private wantVisemes;
122
+ private serverUrl;
120
123
  constructor(config: LokutorConfig & {
121
124
  prompt: string;
122
125
  voice?: VoiceStyle;
123
126
  language?: Language;
127
+ visemes?: boolean;
128
+ serverUrl?: string;
124
129
  onVisemes?: (visemes: Viseme[]) => void;
125
130
  });
126
131
  /**
@@ -323,7 +328,7 @@ declare class BrowserAudioManager {
323
328
  private mediaStream;
324
329
  private nextPlaybackTime;
325
330
  private activeSources;
326
- private playbackQueue;
331
+ private audioClockOffset;
327
332
  private inputSampleRate;
328
333
  private outputSampleRate;
329
334
  private autoGainControl;
@@ -333,6 +338,7 @@ declare class BrowserAudioManager {
333
338
  private onInputError?;
334
339
  private isMuted;
335
340
  private isListening;
341
+ private resampler;
336
342
  constructor(config?: BrowserAudioConfig);
337
343
  /**
338
344
  * Initialize the AudioContext and analyser
@@ -342,60 +348,77 @@ declare class BrowserAudioManager {
342
348
  * Start capturing audio from the microphone
343
349
  */
344
350
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
345
- /**
346
- * Internal method to process microphone audio data
347
- */
348
351
  private _processAudioInput;
349
- /**
350
- * Stop capturing microphone input
351
- */
352
352
  stopMicrophone(): void;
353
353
  /**
354
354
  * Play back audio received from the server
355
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
356
355
  */
357
356
  playAudio(pcm16Data: Uint8Array): void;
358
- /**
359
- * Internal method to schedule and play audio with sample-accurate timing
360
- */
361
357
  private _schedulePlayback;
362
358
  /**
363
- * Stop all currently playing audio and clear the queue
359
+ * Get the current high-precision audio clock offset for viseme synchronization.
360
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
364
361
  */
365
- stopPlayback(): void;
362
+ getAudioClockOffset(): number | null;
366
363
  /**
367
- * Toggle mute state
364
+ * Reset the audio clock offset (call when a response is interrupted or finished)
368
365
  */
366
+ resetAudioClock(): void;
367
+ stopPlayback(): void;
369
368
  setMuted(muted: boolean): void;
370
- /**
371
- * Get current mute state
372
- */
373
369
  isMicMuted(): boolean;
370
+ getAmplitude(): number;
371
+ getFrequencyData(): Uint8Array;
372
+ getWaveformData(): Uint8Array;
373
+ cleanup(): void;
374
+ getAudioContext(): AudioContext | null;
375
+ }
376
+
377
+ /**
378
+ * High-level AI Voice Agent for browser-based conversations.
379
+ *
380
+ * This class orchestrates microphone input, AI processing, and
381
+ * speaker output, providing a simple interface for building
382
+ * voice assistants with lip-sync support.
383
+ */
384
+ declare class VoiceAgent {
385
+ private client;
386
+ private audioManager;
387
+ private options;
388
+ private isConnected;
389
+ private visemeQueue;
390
+ constructor(options: VoiceAgentOptions & {
391
+ apiKey: string;
392
+ });
374
393
  /**
375
- * Get current amplitude from analyser (for visualization)
376
- * Returns value between 0 and 1
394
+ * Initialize hardware and connect to the AI server.
395
+ * This must be called in response to a user guesture (like a click)
396
+ * to satisfy browser AudioContext requirements.
377
397
  */
378
- getAmplitude(): number;
398
+ connect(): Promise<boolean>;
379
399
  /**
380
- * Get frequency data from analyser for visualization
400
+ * Get the current amplitude/volume of the microphone or output audio.
401
+ * Useful for voice activity visualization.
402
+ * @returns value between 0 and 1
381
403
  */
382
- getFrequencyData(): Uint8Array;
404
+ getAmplitude(): number;
383
405
  /**
384
- * Get time-domain data from analyser for waveform visualization
406
+ * Mute or unmute the microphone.
385
407
  */
386
- getWaveformData(): Uint8Array;
408
+ toggleMute(): boolean;
387
409
  /**
388
- * Cleanup and close AudioContext
410
+ * High-precision method to get visemes that should be active
411
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
389
412
  */
390
- cleanup(): void;
413
+ getFrameVisemes(): Viseme[];
391
414
  /**
392
- * Get current audio context state
415
+ * Change the system prompt mid-conversation.
393
416
  */
394
- getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
417
+ updatePrompt(newPrompt: string): void;
395
418
  /**
396
- * Check if microphone is currently listening
419
+ * Disconnect and release audio resources.
397
420
  */
398
- isRecording(): boolean;
421
+ disconnect(): void;
399
422
  }
400
423
 
401
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
424
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.d.ts CHANGED
@@ -90,11 +90,12 @@ interface VoiceAgentOptions {
90
90
  }
91
91
  /**
92
92
  * Viseme data for lip-sync animation
93
+ * Format: {"v": index, "c": character, "t": timestamp}
93
94
  */
94
95
  interface Viseme {
95
- id: number;
96
- char: string;
97
- timestamp: number;
96
+ v: number;
97
+ c: string;
98
+ t: number;
98
99
  }
99
100
 
100
101
  /**
@@ -117,10 +118,14 @@ declare class VoiceAgentClient {
117
118
  private isConnected;
118
119
  private messages;
119
120
  private visemeListeners;
121
+ private wantVisemes;
122
+ private serverUrl;
120
123
  constructor(config: LokutorConfig & {
121
124
  prompt: string;
122
125
  voice?: VoiceStyle;
123
126
  language?: Language;
127
+ visemes?: boolean;
128
+ serverUrl?: string;
124
129
  onVisemes?: (visemes: Viseme[]) => void;
125
130
  });
126
131
  /**
@@ -323,7 +328,7 @@ declare class BrowserAudioManager {
323
328
  private mediaStream;
324
329
  private nextPlaybackTime;
325
330
  private activeSources;
326
- private playbackQueue;
331
+ private audioClockOffset;
327
332
  private inputSampleRate;
328
333
  private outputSampleRate;
329
334
  private autoGainControl;
@@ -333,6 +338,7 @@ declare class BrowserAudioManager {
333
338
  private onInputError?;
334
339
  private isMuted;
335
340
  private isListening;
341
+ private resampler;
336
342
  constructor(config?: BrowserAudioConfig);
337
343
  /**
338
344
  * Initialize the AudioContext and analyser
@@ -342,60 +348,77 @@ declare class BrowserAudioManager {
342
348
  * Start capturing audio from the microphone
343
349
  */
344
350
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
345
- /**
346
- * Internal method to process microphone audio data
347
- */
348
351
  private _processAudioInput;
349
- /**
350
- * Stop capturing microphone input
351
- */
352
352
  stopMicrophone(): void;
353
353
  /**
354
354
  * Play back audio received from the server
355
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
356
355
  */
357
356
  playAudio(pcm16Data: Uint8Array): void;
358
- /**
359
- * Internal method to schedule and play audio with sample-accurate timing
360
- */
361
357
  private _schedulePlayback;
362
358
  /**
363
- * Stop all currently playing audio and clear the queue
359
+ * Get the current high-precision audio clock offset for viseme synchronization.
360
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
364
361
  */
365
- stopPlayback(): void;
362
+ getAudioClockOffset(): number | null;
366
363
  /**
367
- * Toggle mute state
364
+ * Reset the audio clock offset (call when a response is interrupted or finished)
368
365
  */
366
+ resetAudioClock(): void;
367
+ stopPlayback(): void;
369
368
  setMuted(muted: boolean): void;
370
- /**
371
- * Get current mute state
372
- */
373
369
  isMicMuted(): boolean;
370
+ getAmplitude(): number;
371
+ getFrequencyData(): Uint8Array;
372
+ getWaveformData(): Uint8Array;
373
+ cleanup(): void;
374
+ getAudioContext(): AudioContext | null;
375
+ }
376
+
377
+ /**
378
+ * High-level AI Voice Agent for browser-based conversations.
379
+ *
380
+ * This class orchestrates microphone input, AI processing, and
381
+ * speaker output, providing a simple interface for building
382
+ * voice assistants with lip-sync support.
383
+ */
384
+ declare class VoiceAgent {
385
+ private client;
386
+ private audioManager;
387
+ private options;
388
+ private isConnected;
389
+ private visemeQueue;
390
+ constructor(options: VoiceAgentOptions & {
391
+ apiKey: string;
392
+ });
374
393
  /**
375
- * Get current amplitude from analyser (for visualization)
376
- * Returns value between 0 and 1
394
+ * Initialize hardware and connect to the AI server.
395
+ * This must be called in response to a user guesture (like a click)
396
+ * to satisfy browser AudioContext requirements.
377
397
  */
378
- getAmplitude(): number;
398
+ connect(): Promise<boolean>;
379
399
  /**
380
- * Get frequency data from analyser for visualization
400
+ * Get the current amplitude/volume of the microphone or output audio.
401
+ * Useful for voice activity visualization.
402
+ * @returns value between 0 and 1
381
403
  */
382
- getFrequencyData(): Uint8Array;
404
+ getAmplitude(): number;
383
405
  /**
384
- * Get time-domain data from analyser for waveform visualization
406
+ * Mute or unmute the microphone.
385
407
  */
386
- getWaveformData(): Uint8Array;
408
+ toggleMute(): boolean;
387
409
  /**
388
- * Cleanup and close AudioContext
410
+ * High-precision method to get visemes that should be active
411
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
389
412
  */
390
- cleanup(): void;
413
+ getFrameVisemes(): Viseme[];
391
414
  /**
392
- * Get current audio context state
415
+ * Change the system prompt mid-conversation.
393
416
  */
394
- getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
417
+ updatePrompt(newPrompt: string): void;
395
418
  /**
396
- * Check if microphone is currently listening
419
+ * Disconnect and release audio resources.
397
420
  */
398
- isRecording(): boolean;
421
+ disconnect(): void;
399
422
  }
400
423
 
401
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
424
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.js CHANGED
@@ -26,6 +26,7 @@ __export(index_exports, {
26
26
  Language: () => Language,
27
27
  StreamResampler: () => StreamResampler,
28
28
  TTSClient: () => TTSClient,
29
+ VoiceAgent: () => VoiceAgent,
29
30
  VoiceAgentClient: () => VoiceAgentClient,
30
31
  VoiceStyle: () => VoiceStyle,
31
32
  applyLowPassFilter: () => applyLowPassFilter,
@@ -65,8 +66,8 @@ var Language = /* @__PURE__ */ ((Language2) => {
65
66
  return Language2;
66
67
  })(Language || {});
67
68
  var AUDIO_CONFIG = {
68
- SAMPLE_RATE: 44100,
69
- SPEAKER_SAMPLE_RATE: 16e3,
69
+ SAMPLE_RATE: 16e3,
70
+ SPEAKER_SAMPLE_RATE: 44100,
70
71
  CHANNELS: 1,
71
72
  CHUNK_DURATION_MS: 20,
72
73
  get CHUNK_SIZE() {
@@ -103,17 +104,21 @@ var VoiceAgentClient = class {
103
104
  isConnected = false;
104
105
  messages = [];
105
106
  visemeListeners = [];
107
+ wantVisemes = false;
108
+ serverUrl = null;
106
109
  constructor(config) {
107
110
  this.apiKey = config.apiKey;
108
111
  this.prompt = config.prompt;
109
112
  this.voice = config.voice || "F1" /* F1 */;
110
113
  this.language = config.language || "en" /* ENGLISH */;
114
+ this.serverUrl = config.serverUrl || null;
111
115
  this.onTranscription = config.onTranscription;
112
116
  this.onResponse = config.onResponse;
113
117
  this.onAudioCallback = config.onAudio;
114
118
  this.onVisemesCallback = config.onVisemes;
115
119
  this.onStatus = config.onStatus;
116
120
  this.onError = config.onError;
121
+ this.wantVisemes = config.visemes || false;
117
122
  }
118
123
  /**
119
124
  * Connect to the Lokutor Voice Agent server
@@ -121,12 +126,12 @@ var VoiceAgentClient = class {
121
126
  async connect() {
122
127
  return new Promise((resolve, reject) => {
123
128
  try {
124
- let url = DEFAULT_URLS.VOICE_AGENT;
129
+ let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
125
130
  if (this.apiKey) {
126
131
  const separator = url.includes("?") ? "&" : "?";
127
132
  url += `${separator}api_key=${this.apiKey}`;
128
133
  }
129
- console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
134
+ console.log(`\u{1F517} Connecting to ${url}...`);
130
135
  this.ws = new WebSocket(url);
131
136
  this.ws.binaryType = "arraybuffer";
132
137
  this.ws.onopen = () => {
@@ -165,7 +170,8 @@ var VoiceAgentClient = class {
165
170
  this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
166
171
  this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
167
172
  this.ws.send(JSON.stringify({ type: "language", data: this.language }));
168
- console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}`);
173
+ this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
174
+ console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
169
175
  }
170
176
  /**
171
177
  * Send raw PCM audio data to the server
@@ -498,7 +504,8 @@ var BrowserAudioManager = class {
498
504
  // Playback scheduling
499
505
  nextPlaybackTime = 0;
500
506
  activeSources = [];
501
- playbackQueue = [];
507
+ // High-precision clock anchor for viseme sync
508
+ audioClockOffset = null;
502
509
  // Configuration
503
510
  inputSampleRate;
504
511
  outputSampleRate;
@@ -511,6 +518,7 @@ var BrowserAudioManager = class {
511
518
  // Audio processing state
512
519
  isMuted = false;
513
520
  isListening = false;
521
+ resampler = null;
514
522
  constructor(config = {}) {
515
523
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
516
524
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -534,7 +542,6 @@ var BrowserAudioManager = class {
534
542
  }
535
543
  if (this.audioContext.state === "suspended") {
536
544
  await this.audioContext.resume();
537
- console.log("\u{1F442} AudioContext resumed");
538
545
  }
539
546
  if (analyserConfig?.enabled !== false) {
540
547
  this.analyserNode = this.audioContext.createAnalyser();
@@ -548,6 +555,7 @@ var BrowserAudioManager = class {
548
555
  if (!this.audioContext) {
549
556
  await this.init();
550
557
  }
558
+ this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
551
559
  try {
552
560
  this.onAudioInput = onAudioInput;
553
561
  this.isListening = true;
@@ -563,9 +571,7 @@ var BrowserAudioManager = class {
563
571
  this.scriptProcessor = this.audioContext.createScriptProcessor(
564
572
  bufferSize,
565
573
  1,
566
- // input channels
567
574
  1
568
- // output channels
569
575
  );
570
576
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
571
577
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -582,40 +588,19 @@ var BrowserAudioManager = class {
582
588
  throw err;
583
589
  }
584
590
  }
585
- /**
586
- * Internal method to process microphone audio data
587
- */
588
591
  _processAudioInput(event) {
589
- if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
590
- if (this.isMuted) return;
591
- const inputBuffer = event.inputBuffer;
592
- const inputData = inputBuffer.getChannelData(0);
593
- const outputBuffer = event.outputBuffer;
594
- for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
595
- outputBuffer.getChannelData(0)[i] = 0;
596
- }
597
- const hardwareRate = this.audioContext.sampleRate;
598
- let processedData = new Float32Array(inputData);
599
- if (hardwareRate !== this.inputSampleRate) {
600
- processedData = resampleWithAntiAliasing(
601
- processedData,
602
- hardwareRate,
603
- this.inputSampleRate
604
- );
592
+ if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
593
+ const inputData = event.inputBuffer.getChannelData(0);
594
+ event.outputBuffer.getChannelData(0).fill(0);
595
+ const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
596
+ if (resampled && resampled.length > 0) {
597
+ const int16Data = float32ToPcm16(resampled);
598
+ this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
605
599
  }
606
- const int16Data = float32ToPcm16(processedData);
607
- const uint8Data = new Uint8Array(
608
- int16Data.buffer,
609
- int16Data.byteOffset,
610
- int16Data.byteLength
611
- );
612
- this.onAudioInput(uint8Data);
613
600
  }
614
- /**
615
- * Stop capturing microphone input
616
- */
617
601
  stopMicrophone() {
618
602
  this.isListening = false;
603
+ this.resampler = null;
619
604
  if (this.mediaStream) {
620
605
  this.mediaStream.getTracks().forEach((track) => track.stop());
621
606
  this.mediaStream = null;
@@ -628,17 +613,12 @@ var BrowserAudioManager = class {
628
613
  this.mediaStreamAudioSourceNode.disconnect();
629
614
  this.mediaStreamAudioSourceNode = null;
630
615
  }
631
- console.log("\u{1F3A4} Microphone stopped");
632
616
  }
633
617
  /**
634
618
  * Play back audio received from the server
635
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
636
619
  */
637
620
  playAudio(pcm16Data) {
638
- if (!this.audioContext) {
639
- console.warn("AudioContext not initialized");
640
- return;
641
- }
621
+ if (!this.audioContext) return;
642
622
  const int16Array = new Int16Array(
643
623
  pcm16Data.buffer,
644
624
  pcm16Data.byteOffset,
@@ -653,18 +633,17 @@ var BrowserAudioManager = class {
653
633
  audioBuffer.getChannelData(0).set(float32Data);
654
634
  this._schedulePlayback(audioBuffer);
655
635
  }
656
- /**
657
- * Internal method to schedule and play audio with sample-accurate timing
658
- */
659
636
  _schedulePlayback(audioBuffer) {
660
637
  if (!this.audioContext) return;
661
638
  const currentTime = this.audioContext.currentTime;
662
639
  const duration = audioBuffer.length / this.outputSampleRate;
663
640
  const startTime = Math.max(
664
641
  currentTime + 0.01,
665
- // Minimum 10ms delay
666
642
  this.nextPlaybackTime
667
643
  );
644
+ if (this.audioClockOffset === null) {
645
+ this.audioClockOffset = startTime;
646
+ }
668
647
  this.nextPlaybackTime = startTime + duration;
669
648
  const source = this.audioContext.createBufferSource();
670
649
  source.buffer = audioBuffer;
@@ -682,8 +661,18 @@ var BrowserAudioManager = class {
682
661
  };
683
662
  }
684
663
  /**
685
- * Stop all currently playing audio and clear the queue
664
+ * Get the current high-precision audio clock offset for viseme synchronization.
665
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
666
+ */
667
+ getAudioClockOffset() {
668
+ return this.audioClockOffset;
669
+ }
670
+ /**
671
+ * Reset the audio clock offset (call when a response is interrupted or finished)
686
672
  */
673
+ resetAudioClock() {
674
+ this.audioClockOffset = null;
675
+ }
687
676
  stopPlayback() {
688
677
  this.activeSources.forEach((source) => {
689
678
  try {
@@ -692,26 +681,15 @@ var BrowserAudioManager = class {
692
681
  }
693
682
  });
694
683
  this.activeSources = [];
695
- this.playbackQueue = [];
696
- this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
697
- console.log("\u{1F507} Playback stopped");
684
+ this.nextPlaybackTime = 0;
685
+ this.resetAudioClock();
698
686
  }
699
- /**
700
- * Toggle mute state
701
- */
702
687
  setMuted(muted) {
703
688
  this.isMuted = muted;
704
689
  }
705
- /**
706
- * Get current mute state
707
- */
708
690
  isMicMuted() {
709
691
  return this.isMuted;
710
692
  }
711
- /**
712
- * Get current amplitude from analyser (for visualization)
713
- * Returns value between 0 and 1
714
- */
715
693
  getAmplitude() {
716
694
  if (!this.analyserNode) return 0;
717
695
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -719,31 +697,18 @@ var BrowserAudioManager = class {
719
697
  const rms = calculateRMS(dataArray);
720
698
  return Math.min(rms * 10, 1);
721
699
  }
722
- /**
723
- * Get frequency data from analyser for visualization
724
- */
725
700
  getFrequencyData() {
726
- if (!this.analyserNode) {
727
- return new Uint8Array(0);
728
- }
701
+ if (!this.analyserNode) return new Uint8Array(0);
729
702
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
730
703
  this.analyserNode.getByteFrequencyData(dataArray);
731
704
  return dataArray;
732
705
  }
733
- /**
734
- * Get time-domain data from analyser for waveform visualization
735
- */
736
706
  getWaveformData() {
737
- if (!this.analyserNode) {
738
- return new Uint8Array(0);
739
- }
707
+ if (!this.analyserNode) return new Uint8Array(0);
740
708
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
741
709
  this.analyserNode.getByteTimeDomainData(dataArray);
742
710
  return dataArray;
743
711
  }
744
- /**
745
- * Cleanup and close AudioContext
746
- */
747
712
  cleanup() {
748
713
  this.stopMicrophone();
749
714
  this.stopPlayback();
@@ -752,17 +717,124 @@ var BrowserAudioManager = class {
752
717
  this.analyserNode = null;
753
718
  }
754
719
  }
720
+ getAudioContext() {
721
+ return this.audioContext;
722
+ }
723
+ };
724
+
725
+ // src/voice-agent.ts
726
+ var VoiceAgent = class {
727
+ client;
728
+ audioManager;
729
+ options;
730
+ isConnected = false;
731
+ visemeQueue = [];
732
+ constructor(options) {
733
+ this.options = options;
734
+ this.client = new VoiceAgentClient({
735
+ apiKey: options.apiKey,
736
+ prompt: options.prompt || "You are a helpful and friendly AI assistant.",
737
+ voice: options.voice || "F1" /* F1 */,
738
+ language: options.language || "en" /* ENGLISH */,
739
+ visemes: options.visemes ?? true,
740
+ serverUrl: options.serverUrl,
741
+ onTranscription: (text) => {
742
+ if (options.onTranscription) options.onTranscription(text, true);
743
+ },
744
+ onResponse: (text) => {
745
+ if (options.onTranscription) options.onTranscription(text, false);
746
+ },
747
+ onAudio: (data) => {
748
+ this.audioManager.playAudio(data);
749
+ },
750
+ onVisemes: (visemes) => {
751
+ this.visemeQueue.push(...visemes);
752
+ if (options.onVisemes) options.onVisemes(visemes);
753
+ },
754
+ onStatus: (status) => {
755
+ if (options.onStatusChange) options.onStatusChange(status);
756
+ if (status === "interrupted" || status === "thinking") {
757
+ this.audioManager.stopPlayback();
758
+ this.visemeQueue = [];
759
+ }
760
+ },
761
+ onError: (err) => {
762
+ if (options.onError) options.onError(err);
763
+ }
764
+ });
765
+ this.audioManager = new BrowserAudioManager({
766
+ autoGainControl: true,
767
+ echoCancellation: true,
768
+ noiseSuppression: true
769
+ });
770
+ }
755
771
  /**
756
- * Get current audio context state
772
+ * Initialize hardware and connect to the AI server.
773
+ * This must be called in response to a user guesture (like a click)
774
+ * to satisfy browser AudioContext requirements.
757
775
  */
758
- getState() {
759
- return this.audioContext?.state ?? null;
776
+ async connect() {
777
+ try {
778
+ await this.audioManager.init();
779
+ const connected = await this.client.connect();
780
+ if (!connected) return false;
781
+ this.isConnected = true;
782
+ await this.audioManager.startMicrophone((pcm16Data) => {
783
+ if (this.isConnected) {
784
+ this.client.sendAudio(pcm16Data);
785
+ }
786
+ });
787
+ return true;
788
+ } catch (err) {
789
+ if (this.options.onError) this.options.onError(err);
790
+ return false;
791
+ }
760
792
  }
761
793
  /**
762
- * Check if microphone is currently listening
794
+ * Get the current amplitude/volume of the microphone or output audio.
795
+ * Useful for voice activity visualization.
796
+ * @returns value between 0 and 1
763
797
  */
764
- isRecording() {
765
- return this.isListening;
798
+ getAmplitude() {
799
+ return this.audioManager.getAmplitude();
800
+ }
801
+ /**
802
+ * Mute or unmute the microphone.
803
+ */
804
+ toggleMute() {
805
+ const currentState = this.audioManager.isMicMuted();
806
+ this.audioManager.setMuted(!currentState);
807
+ return !currentState;
808
+ }
809
+ /**
810
+ * High-precision method to get visemes that should be active
811
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
812
+ */
813
+ getFrameVisemes() {
814
+ const offset = this.audioManager.getAudioClockOffset();
815
+ const audioCtx = this.audioManager.getAudioContext();
816
+ if (offset === null || !audioCtx) return [];
817
+ const streamTime = (audioCtx.currentTime - offset) * 1e3;
818
+ const currentBatch = [];
819
+ while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
820
+ currentBatch.push(this.visemeQueue.shift());
821
+ }
822
+ return currentBatch;
823
+ }
824
+ /**
825
+ * Change the system prompt mid-conversation.
826
+ */
827
+ updatePrompt(newPrompt) {
828
+ this.client.updatePrompt(newPrompt);
829
+ }
830
+ /**
831
+ * Disconnect and release audio resources.
832
+ */
833
+ disconnect() {
834
+ this.isConnected = false;
835
+ this.client.disconnect();
836
+ this.audioManager.cleanup();
837
+ this.visemeQueue = [];
766
838
  }
767
839
  };
768
840
  // Annotate the CommonJS export names for ESM import in node:
@@ -773,6 +845,7 @@ var BrowserAudioManager = class {
773
845
  Language,
774
846
  StreamResampler,
775
847
  TTSClient,
848
+ VoiceAgent,
776
849
  VoiceAgentClient,
777
850
  VoiceStyle,
778
851
  applyLowPassFilter,
package/dist/index.mjs CHANGED
@@ -21,8 +21,8 @@ var Language = /* @__PURE__ */ ((Language2) => {
21
21
  return Language2;
22
22
  })(Language || {});
23
23
  var AUDIO_CONFIG = {
24
- SAMPLE_RATE: 44100,
25
- SPEAKER_SAMPLE_RATE: 16e3,
24
+ SAMPLE_RATE: 16e3,
25
+ SPEAKER_SAMPLE_RATE: 44100,
26
26
  CHANNELS: 1,
27
27
  CHUNK_DURATION_MS: 20,
28
28
  get CHUNK_SIZE() {
@@ -59,17 +59,21 @@ var VoiceAgentClient = class {
59
59
  isConnected = false;
60
60
  messages = [];
61
61
  visemeListeners = [];
62
+ wantVisemes = false;
63
+ serverUrl = null;
62
64
  constructor(config) {
63
65
  this.apiKey = config.apiKey;
64
66
  this.prompt = config.prompt;
65
67
  this.voice = config.voice || "F1" /* F1 */;
66
68
  this.language = config.language || "en" /* ENGLISH */;
69
+ this.serverUrl = config.serverUrl || null;
67
70
  this.onTranscription = config.onTranscription;
68
71
  this.onResponse = config.onResponse;
69
72
  this.onAudioCallback = config.onAudio;
70
73
  this.onVisemesCallback = config.onVisemes;
71
74
  this.onStatus = config.onStatus;
72
75
  this.onError = config.onError;
76
+ this.wantVisemes = config.visemes || false;
73
77
  }
74
78
  /**
75
79
  * Connect to the Lokutor Voice Agent server
@@ -77,12 +81,12 @@ var VoiceAgentClient = class {
77
81
  async connect() {
78
82
  return new Promise((resolve, reject) => {
79
83
  try {
80
- let url = DEFAULT_URLS.VOICE_AGENT;
84
+ let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
81
85
  if (this.apiKey) {
82
86
  const separator = url.includes("?") ? "&" : "?";
83
87
  url += `${separator}api_key=${this.apiKey}`;
84
88
  }
85
- console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
89
+ console.log(`\u{1F517} Connecting to ${url}...`);
86
90
  this.ws = new WebSocket(url);
87
91
  this.ws.binaryType = "arraybuffer";
88
92
  this.ws.onopen = () => {
@@ -121,7 +125,8 @@ var VoiceAgentClient = class {
121
125
  this.ws.send(JSON.stringify({ type: "prompt", data: this.prompt }));
122
126
  this.ws.send(JSON.stringify({ type: "voice", data: this.voice }));
123
127
  this.ws.send(JSON.stringify({ type: "language", data: this.language }));
124
- console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}`);
128
+ this.ws.send(JSON.stringify({ type: "visemes", data: this.wantVisemes }));
129
+ console.log(`\u2699\uFE0F Configured: voice=${this.voice}, language=${this.language}, visemes=${this.wantVisemes}`);
125
130
  }
126
131
  /**
127
132
  * Send raw PCM audio data to the server
@@ -454,7 +459,8 @@ var BrowserAudioManager = class {
454
459
  // Playback scheduling
455
460
  nextPlaybackTime = 0;
456
461
  activeSources = [];
457
- playbackQueue = [];
462
+ // High-precision clock anchor for viseme sync
463
+ audioClockOffset = null;
458
464
  // Configuration
459
465
  inputSampleRate;
460
466
  outputSampleRate;
@@ -467,6 +473,7 @@ var BrowserAudioManager = class {
467
473
  // Audio processing state
468
474
  isMuted = false;
469
475
  isListening = false;
476
+ resampler = null;
470
477
  constructor(config = {}) {
471
478
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
472
479
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -490,7 +497,6 @@ var BrowserAudioManager = class {
490
497
  }
491
498
  if (this.audioContext.state === "suspended") {
492
499
  await this.audioContext.resume();
493
- console.log("\u{1F442} AudioContext resumed");
494
500
  }
495
501
  if (analyserConfig?.enabled !== false) {
496
502
  this.analyserNode = this.audioContext.createAnalyser();
@@ -504,6 +510,7 @@ var BrowserAudioManager = class {
504
510
  if (!this.audioContext) {
505
511
  await this.init();
506
512
  }
513
+ this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
507
514
  try {
508
515
  this.onAudioInput = onAudioInput;
509
516
  this.isListening = true;
@@ -519,9 +526,7 @@ var BrowserAudioManager = class {
519
526
  this.scriptProcessor = this.audioContext.createScriptProcessor(
520
527
  bufferSize,
521
528
  1,
522
- // input channels
523
529
  1
524
- // output channels
525
530
  );
526
531
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
527
532
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -538,40 +543,19 @@ var BrowserAudioManager = class {
538
543
  throw err;
539
544
  }
540
545
  }
541
- /**
542
- * Internal method to process microphone audio data
543
- */
544
546
  _processAudioInput(event) {
545
- if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
546
- if (this.isMuted) return;
547
- const inputBuffer = event.inputBuffer;
548
- const inputData = inputBuffer.getChannelData(0);
549
- const outputBuffer = event.outputBuffer;
550
- for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
551
- outputBuffer.getChannelData(0)[i] = 0;
552
- }
553
- const hardwareRate = this.audioContext.sampleRate;
554
- let processedData = new Float32Array(inputData);
555
- if (hardwareRate !== this.inputSampleRate) {
556
- processedData = resampleWithAntiAliasing(
557
- processedData,
558
- hardwareRate,
559
- this.inputSampleRate
560
- );
547
+ if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
548
+ const inputData = event.inputBuffer.getChannelData(0);
549
+ event.outputBuffer.getChannelData(0).fill(0);
550
+ const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
551
+ if (resampled && resampled.length > 0) {
552
+ const int16Data = float32ToPcm16(resampled);
553
+ this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
561
554
  }
562
- const int16Data = float32ToPcm16(processedData);
563
- const uint8Data = new Uint8Array(
564
- int16Data.buffer,
565
- int16Data.byteOffset,
566
- int16Data.byteLength
567
- );
568
- this.onAudioInput(uint8Data);
569
555
  }
570
- /**
571
- * Stop capturing microphone input
572
- */
573
556
  stopMicrophone() {
574
557
  this.isListening = false;
558
+ this.resampler = null;
575
559
  if (this.mediaStream) {
576
560
  this.mediaStream.getTracks().forEach((track) => track.stop());
577
561
  this.mediaStream = null;
@@ -584,17 +568,12 @@ var BrowserAudioManager = class {
584
568
  this.mediaStreamAudioSourceNode.disconnect();
585
569
  this.mediaStreamAudioSourceNode = null;
586
570
  }
587
- console.log("\u{1F3A4} Microphone stopped");
588
571
  }
589
572
  /**
590
573
  * Play back audio received from the server
591
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
592
574
  */
593
575
  playAudio(pcm16Data) {
594
- if (!this.audioContext) {
595
- console.warn("AudioContext not initialized");
596
- return;
597
- }
576
+ if (!this.audioContext) return;
598
577
  const int16Array = new Int16Array(
599
578
  pcm16Data.buffer,
600
579
  pcm16Data.byteOffset,
@@ -609,18 +588,17 @@ var BrowserAudioManager = class {
609
588
  audioBuffer.getChannelData(0).set(float32Data);
610
589
  this._schedulePlayback(audioBuffer);
611
590
  }
612
- /**
613
- * Internal method to schedule and play audio with sample-accurate timing
614
- */
615
591
  _schedulePlayback(audioBuffer) {
616
592
  if (!this.audioContext) return;
617
593
  const currentTime = this.audioContext.currentTime;
618
594
  const duration = audioBuffer.length / this.outputSampleRate;
619
595
  const startTime = Math.max(
620
596
  currentTime + 0.01,
621
- // Minimum 10ms delay
622
597
  this.nextPlaybackTime
623
598
  );
599
+ if (this.audioClockOffset === null) {
600
+ this.audioClockOffset = startTime;
601
+ }
624
602
  this.nextPlaybackTime = startTime + duration;
625
603
  const source = this.audioContext.createBufferSource();
626
604
  source.buffer = audioBuffer;
@@ -638,8 +616,18 @@ var BrowserAudioManager = class {
638
616
  };
639
617
  }
640
618
  /**
641
- * Stop all currently playing audio and clear the queue
619
+ * Get the current high-precision audio clock offset for viseme synchronization.
620
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
621
+ */
622
+ getAudioClockOffset() {
623
+ return this.audioClockOffset;
624
+ }
625
+ /**
626
+ * Reset the audio clock offset (call when a response is interrupted or finished)
642
627
  */
628
+ resetAudioClock() {
629
+ this.audioClockOffset = null;
630
+ }
643
631
  stopPlayback() {
644
632
  this.activeSources.forEach((source) => {
645
633
  try {
@@ -648,26 +636,15 @@ var BrowserAudioManager = class {
648
636
  }
649
637
  });
650
638
  this.activeSources = [];
651
- this.playbackQueue = [];
652
- this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
653
- console.log("\u{1F507} Playback stopped");
639
+ this.nextPlaybackTime = 0;
640
+ this.resetAudioClock();
654
641
  }
655
- /**
656
- * Toggle mute state
657
- */
658
642
  setMuted(muted) {
659
643
  this.isMuted = muted;
660
644
  }
661
- /**
662
- * Get current mute state
663
- */
664
645
  isMicMuted() {
665
646
  return this.isMuted;
666
647
  }
667
- /**
668
- * Get current amplitude from analyser (for visualization)
669
- * Returns value between 0 and 1
670
- */
671
648
  getAmplitude() {
672
649
  if (!this.analyserNode) return 0;
673
650
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -675,31 +652,18 @@ var BrowserAudioManager = class {
675
652
  const rms = calculateRMS(dataArray);
676
653
  return Math.min(rms * 10, 1);
677
654
  }
678
- /**
679
- * Get frequency data from analyser for visualization
680
- */
681
655
  getFrequencyData() {
682
- if (!this.analyserNode) {
683
- return new Uint8Array(0);
684
- }
656
+ if (!this.analyserNode) return new Uint8Array(0);
685
657
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
686
658
  this.analyserNode.getByteFrequencyData(dataArray);
687
659
  return dataArray;
688
660
  }
689
- /**
690
- * Get time-domain data from analyser for waveform visualization
691
- */
692
661
  getWaveformData() {
693
- if (!this.analyserNode) {
694
- return new Uint8Array(0);
695
- }
662
+ if (!this.analyserNode) return new Uint8Array(0);
696
663
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
697
664
  this.analyserNode.getByteTimeDomainData(dataArray);
698
665
  return dataArray;
699
666
  }
700
- /**
701
- * Cleanup and close AudioContext
702
- */
703
667
  cleanup() {
704
668
  this.stopMicrophone();
705
669
  this.stopPlayback();
@@ -708,17 +672,124 @@ var BrowserAudioManager = class {
708
672
  this.analyserNode = null;
709
673
  }
710
674
  }
675
+ getAudioContext() {
676
+ return this.audioContext;
677
+ }
678
+ };
679
+
680
+ // src/voice-agent.ts
681
+ var VoiceAgent = class {
682
+ client;
683
+ audioManager;
684
+ options;
685
+ isConnected = false;
686
+ visemeQueue = [];
687
+ constructor(options) {
688
+ this.options = options;
689
+ this.client = new VoiceAgentClient({
690
+ apiKey: options.apiKey,
691
+ prompt: options.prompt || "You are a helpful and friendly AI assistant.",
692
+ voice: options.voice || "F1" /* F1 */,
693
+ language: options.language || "en" /* ENGLISH */,
694
+ visemes: options.visemes ?? true,
695
+ serverUrl: options.serverUrl,
696
+ onTranscription: (text) => {
697
+ if (options.onTranscription) options.onTranscription(text, true);
698
+ },
699
+ onResponse: (text) => {
700
+ if (options.onTranscription) options.onTranscription(text, false);
701
+ },
702
+ onAudio: (data) => {
703
+ this.audioManager.playAudio(data);
704
+ },
705
+ onVisemes: (visemes) => {
706
+ this.visemeQueue.push(...visemes);
707
+ if (options.onVisemes) options.onVisemes(visemes);
708
+ },
709
+ onStatus: (status) => {
710
+ if (options.onStatusChange) options.onStatusChange(status);
711
+ if (status === "interrupted" || status === "thinking") {
712
+ this.audioManager.stopPlayback();
713
+ this.visemeQueue = [];
714
+ }
715
+ },
716
+ onError: (err) => {
717
+ if (options.onError) options.onError(err);
718
+ }
719
+ });
720
+ this.audioManager = new BrowserAudioManager({
721
+ autoGainControl: true,
722
+ echoCancellation: true,
723
+ noiseSuppression: true
724
+ });
725
+ }
711
726
  /**
712
- * Get current audio context state
727
+ * Initialize hardware and connect to the AI server.
728
+ * This must be called in response to a user gesture (like a click)
729
+ * to satisfy browser AudioContext requirements.
713
730
  */
714
- getState() {
715
- return this.audioContext?.state ?? null;
731
+ async connect() {
732
+ try {
733
+ await this.audioManager.init();
734
+ const connected = await this.client.connect();
735
+ if (!connected) return false;
736
+ this.isConnected = true;
737
+ await this.audioManager.startMicrophone((pcm16Data) => {
738
+ if (this.isConnected) {
739
+ this.client.sendAudio(pcm16Data);
740
+ }
741
+ });
742
+ return true;
743
+ } catch (err) {
744
+ if (this.options.onError) this.options.onError(err);
745
+ return false;
746
+ }
716
747
  }
717
748
  /**
718
- * Check if microphone is currently listening
749
+ * Get the current amplitude/volume of the microphone or output audio.
750
+ * Useful for voice activity visualization.
751
+ * @returns value between 0 and 1
719
752
  */
720
- isRecording() {
721
- return this.isListening;
753
+ getAmplitude() {
754
+ return this.audioManager.getAmplitude();
755
+ }
756
+ /**
757
+ * Mute or unmute the microphone.
758
+ */
759
+ toggleMute() {
760
+ const currentState = this.audioManager.isMicMuted();
761
+ this.audioManager.setMuted(!currentState);
762
+ return !currentState;
763
+ }
764
+ /**
765
+ * High-precision method to get visemes that should be active
766
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
767
+ */
768
+ getFrameVisemes() {
769
+ const offset = this.audioManager.getAudioClockOffset();
770
+ const audioCtx = this.audioManager.getAudioContext();
771
+ if (offset === null || !audioCtx) return [];
772
+ const streamTime = (audioCtx.currentTime - offset) * 1e3;
773
+ const currentBatch = [];
774
+ while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
775
+ currentBatch.push(this.visemeQueue.shift());
776
+ }
777
+ return currentBatch;
778
+ }
779
+ /**
780
+ * Change the system prompt mid-conversation.
781
+ */
782
+ updatePrompt(newPrompt) {
783
+ this.client.updatePrompt(newPrompt);
784
+ }
785
+ /**
786
+ * Disconnect and release audio resources.
787
+ */
788
+ disconnect() {
789
+ this.isConnected = false;
790
+ this.client.disconnect();
791
+ this.audioManager.cleanup();
792
+ this.visemeQueue = [];
722
793
  }
723
794
  };
724
795
  export {
@@ -728,6 +799,7 @@ export {
728
799
  Language,
729
800
  StreamResampler,
730
801
  TTSClient,
802
+ VoiceAgent,
731
803
  VoiceAgentClient,
732
804
  VoiceStyle,
733
805
  applyLowPassFilter,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lokutor/sdk",
3
- "version": "1.1.7",
3
+ "version": "1.1.9",
4
4
  "description": "JavaScript/TypeScript SDK for Lokutor Real-time Voice AI",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",