@lokutor/sdk 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -119,11 +119,13 @@ declare class VoiceAgentClient {
119
119
  private messages;
120
120
  private visemeListeners;
121
121
  private wantVisemes;
122
+ private serverUrl;
122
123
  constructor(config: LokutorConfig & {
123
124
  prompt: string;
124
125
  voice?: VoiceStyle;
125
126
  language?: Language;
126
127
  visemes?: boolean;
128
+ serverUrl?: string;
127
129
  onVisemes?: (visemes: Viseme[]) => void;
128
130
  });
129
131
  /**
@@ -326,7 +328,7 @@ declare class BrowserAudioManager {
326
328
  private mediaStream;
327
329
  private nextPlaybackTime;
328
330
  private activeSources;
329
- private playbackQueue;
331
+ private audioClockOffset;
330
332
  private inputSampleRate;
331
333
  private outputSampleRate;
332
334
  private autoGainControl;
@@ -336,6 +338,7 @@ declare class BrowserAudioManager {
336
338
  private onInputError?;
337
339
  private isMuted;
338
340
  private isListening;
341
+ private resampler;
339
342
  constructor(config?: BrowserAudioConfig);
340
343
  /**
341
344
  * Initialize the AudioContext and analyser
@@ -345,60 +348,77 @@ declare class BrowserAudioManager {
345
348
  * Start capturing audio from the microphone
346
349
  */
347
350
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
348
- /**
349
- * Internal method to process microphone audio data
350
- */
351
351
  private _processAudioInput;
352
- /**
353
- * Stop capturing microphone input
354
- */
355
352
  stopMicrophone(): void;
356
353
  /**
357
354
  * Play back audio received from the server
358
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
359
355
  */
360
356
  playAudio(pcm16Data: Uint8Array): void;
361
- /**
362
- * Internal method to schedule and play audio with sample-accurate timing
363
- */
364
357
  private _schedulePlayback;
365
358
  /**
366
- * Stop all currently playing audio and clear the queue
359
+ * Get the current high-precision audio clock offset for viseme synchronization.
360
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
367
361
  */
368
- stopPlayback(): void;
362
+ getAudioClockOffset(): number | null;
369
363
  /**
370
- * Toggle mute state
364
+ * Reset the audio clock offset (call when a response is interrupted or finished)
371
365
  */
366
+ resetAudioClock(): void;
367
+ stopPlayback(): void;
372
368
  setMuted(muted: boolean): void;
373
- /**
374
- * Get current mute state
375
- */
376
369
  isMicMuted(): boolean;
370
+ getAmplitude(): number;
371
+ getFrequencyData(): Uint8Array;
372
+ getWaveformData(): Uint8Array;
373
+ cleanup(): void;
374
+ getAudioContext(): AudioContext | null;
375
+ }
376
+
377
+ /**
378
+ * High-level AI Voice Agent for browser-based conversations.
379
+ *
380
+ * This class orchestrates microphone input, AI processing, and
381
+ * speaker output, providing a simple interface for building
382
+ * voice assistants with lip-sync support.
383
+ */
384
+ declare class VoiceAgent {
385
+ private client;
386
+ private audioManager;
387
+ private options;
388
+ private isConnected;
389
+ private visemeQueue;
390
+ constructor(options: VoiceAgentOptions & {
391
+ apiKey: string;
392
+ });
377
393
  /**
378
- * Get current amplitude from analyser (for visualization)
379
- * Returns value between 0 and 1
394
+ * Initialize hardware and connect to the AI server.
395
+ * This must be called in response to a user gesture (like a click)
396
+ * to satisfy browser AudioContext requirements.
380
397
  */
381
- getAmplitude(): number;
398
+ connect(): Promise<boolean>;
382
399
  /**
383
- * Get frequency data from analyser for visualization
400
+ * Get the current amplitude/volume of the microphone or output audio.
401
+ * Useful for voice activity visualization.
402
+ * @returns value between 0 and 1
384
403
  */
385
- getFrequencyData(): Uint8Array;
404
+ getAmplitude(): number;
386
405
  /**
387
- * Get time-domain data from analyser for waveform visualization
406
+ * Toggle the microphone mute state and return the new state (true when muted).
388
407
  */
389
- getWaveformData(): Uint8Array;
408
+ toggleMute(): boolean;
390
409
  /**
391
- * Cleanup and close AudioContext
410
+ * High-precision method to get visemes that should be active
411
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
392
412
  */
393
- cleanup(): void;
413
+ getFrameVisemes(): Viseme[];
394
414
  /**
395
- * Get current audio context state
415
+ * Change the system prompt mid-conversation.
396
416
  */
397
- getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
417
+ updatePrompt(newPrompt: string): void;
398
418
  /**
399
- * Check if microphone is currently listening
419
+ * Disconnect and release audio resources.
400
420
  */
401
- isRecording(): boolean;
421
+ disconnect(): void;
402
422
  }
403
423
 
404
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
424
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.d.ts CHANGED
@@ -119,11 +119,13 @@ declare class VoiceAgentClient {
119
119
  private messages;
120
120
  private visemeListeners;
121
121
  private wantVisemes;
122
+ private serverUrl;
122
123
  constructor(config: LokutorConfig & {
123
124
  prompt: string;
124
125
  voice?: VoiceStyle;
125
126
  language?: Language;
126
127
  visemes?: boolean;
128
+ serverUrl?: string;
127
129
  onVisemes?: (visemes: Viseme[]) => void;
128
130
  });
129
131
  /**
@@ -326,7 +328,7 @@ declare class BrowserAudioManager {
326
328
  private mediaStream;
327
329
  private nextPlaybackTime;
328
330
  private activeSources;
329
- private playbackQueue;
331
+ private audioClockOffset;
330
332
  private inputSampleRate;
331
333
  private outputSampleRate;
332
334
  private autoGainControl;
@@ -336,6 +338,7 @@ declare class BrowserAudioManager {
336
338
  private onInputError?;
337
339
  private isMuted;
338
340
  private isListening;
341
+ private resampler;
339
342
  constructor(config?: BrowserAudioConfig);
340
343
  /**
341
344
  * Initialize the AudioContext and analyser
@@ -345,60 +348,77 @@ declare class BrowserAudioManager {
345
348
  * Start capturing audio from the microphone
346
349
  */
347
350
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
348
- /**
349
- * Internal method to process microphone audio data
350
- */
351
351
  private _processAudioInput;
352
- /**
353
- * Stop capturing microphone input
354
- */
355
352
  stopMicrophone(): void;
356
353
  /**
357
354
  * Play back audio received from the server
358
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
359
355
  */
360
356
  playAudio(pcm16Data: Uint8Array): void;
361
- /**
362
- * Internal method to schedule and play audio with sample-accurate timing
363
- */
364
357
  private _schedulePlayback;
365
358
  /**
366
- * Stop all currently playing audio and clear the queue
359
+ * Get the current high-precision audio clock offset for viseme synchronization.
360
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
367
361
  */
368
- stopPlayback(): void;
362
+ getAudioClockOffset(): number | null;
369
363
  /**
370
- * Toggle mute state
364
+ * Reset the audio clock offset (call when a response is interrupted or finished)
371
365
  */
366
+ resetAudioClock(): void;
367
+ stopPlayback(): void;
372
368
  setMuted(muted: boolean): void;
373
- /**
374
- * Get current mute state
375
- */
376
369
  isMicMuted(): boolean;
370
+ getAmplitude(): number;
371
+ getFrequencyData(): Uint8Array;
372
+ getWaveformData(): Uint8Array;
373
+ cleanup(): void;
374
+ getAudioContext(): AudioContext | null;
375
+ }
376
+
377
+ /**
378
+ * High-level AI Voice Agent for browser-based conversations.
379
+ *
380
+ * This class orchestrates microphone input, AI processing, and
381
+ * speaker output, providing a simple interface for building
382
+ * voice assistants with lip-sync support.
383
+ */
384
+ declare class VoiceAgent {
385
+ private client;
386
+ private audioManager;
387
+ private options;
388
+ private isConnected;
389
+ private visemeQueue;
390
+ constructor(options: VoiceAgentOptions & {
391
+ apiKey: string;
392
+ });
377
393
  /**
378
- * Get current amplitude from analyser (for visualization)
379
- * Returns value between 0 and 1
394
+ * Initialize hardware and connect to the AI server.
395
+ * This must be called in response to a user gesture (like a click)
396
+ * to satisfy browser AudioContext requirements.
380
397
  */
381
- getAmplitude(): number;
398
+ connect(): Promise<boolean>;
382
399
  /**
383
- * Get frequency data from analyser for visualization
400
+ * Get the current amplitude/volume of the microphone or output audio.
401
+ * Useful for voice activity visualization.
402
+ * @returns value between 0 and 1
384
403
  */
385
- getFrequencyData(): Uint8Array;
404
+ getAmplitude(): number;
386
405
  /**
387
- * Get time-domain data from analyser for waveform visualization
406
+ * Toggle the microphone mute state and return the new state (true when muted).
388
407
  */
389
- getWaveformData(): Uint8Array;
408
+ toggleMute(): boolean;
390
409
  /**
391
- * Cleanup and close AudioContext
410
+ * High-precision method to get visemes that should be active
411
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
392
412
  */
393
- cleanup(): void;
413
+ getFrameVisemes(): Viseme[];
394
414
  /**
395
- * Get current audio context state
415
+ * Change the system prompt mid-conversation.
396
416
  */
397
- getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
417
+ updatePrompt(newPrompt: string): void;
398
418
  /**
399
- * Check if microphone is currently listening
419
+ * Disconnect and release audio resources.
400
420
  */
401
- isRecording(): boolean;
421
+ disconnect(): void;
402
422
  }
403
423
 
404
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
424
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.js CHANGED
@@ -26,6 +26,7 @@ __export(index_exports, {
26
26
  Language: () => Language,
27
27
  StreamResampler: () => StreamResampler,
28
28
  TTSClient: () => TTSClient,
29
+ VoiceAgent: () => VoiceAgent,
29
30
  VoiceAgentClient: () => VoiceAgentClient,
30
31
  VoiceStyle: () => VoiceStyle,
31
32
  applyLowPassFilter: () => applyLowPassFilter,
@@ -104,11 +105,13 @@ var VoiceAgentClient = class {
104
105
  messages = [];
105
106
  visemeListeners = [];
106
107
  wantVisemes = false;
108
+ serverUrl = null;
107
109
  constructor(config) {
108
110
  this.apiKey = config.apiKey;
109
111
  this.prompt = config.prompt;
110
112
  this.voice = config.voice || "F1" /* F1 */;
111
113
  this.language = config.language || "en" /* ENGLISH */;
114
+ this.serverUrl = config.serverUrl || null;
112
115
  this.onTranscription = config.onTranscription;
113
116
  this.onResponse = config.onResponse;
114
117
  this.onAudioCallback = config.onAudio;
@@ -123,12 +126,12 @@ var VoiceAgentClient = class {
123
126
  async connect() {
124
127
  return new Promise((resolve, reject) => {
125
128
  try {
126
- let url = DEFAULT_URLS.VOICE_AGENT;
129
+ let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
127
130
  if (this.apiKey) {
128
131
  const separator = url.includes("?") ? "&" : "?";
129
132
  url += `${separator}api_key=${this.apiKey}`;
130
133
  }
131
- console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
134
+ console.log(`\u{1F517} Connecting to ${url}...`);
132
135
  this.ws = new WebSocket(url);
133
136
  this.ws.binaryType = "arraybuffer";
134
137
  this.ws.onopen = () => {
@@ -501,7 +504,8 @@ var BrowserAudioManager = class {
501
504
  // Playback scheduling
502
505
  nextPlaybackTime = 0;
503
506
  activeSources = [];
504
- playbackQueue = [];
507
+ // High-precision clock anchor for viseme sync
508
+ audioClockOffset = null;
505
509
  // Configuration
506
510
  inputSampleRate;
507
511
  outputSampleRate;
@@ -514,6 +518,7 @@ var BrowserAudioManager = class {
514
518
  // Audio processing state
515
519
  isMuted = false;
516
520
  isListening = false;
521
+ resampler = null;
517
522
  constructor(config = {}) {
518
523
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
519
524
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -537,7 +542,6 @@ var BrowserAudioManager = class {
537
542
  }
538
543
  if (this.audioContext.state === "suspended") {
539
544
  await this.audioContext.resume();
540
- console.log("\u{1F442} AudioContext resumed");
541
545
  }
542
546
  if (analyserConfig?.enabled !== false) {
543
547
  this.analyserNode = this.audioContext.createAnalyser();
@@ -551,6 +555,7 @@ var BrowserAudioManager = class {
551
555
  if (!this.audioContext) {
552
556
  await this.init();
553
557
  }
558
+ this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
554
559
  try {
555
560
  this.onAudioInput = onAudioInput;
556
561
  this.isListening = true;
@@ -566,9 +571,7 @@ var BrowserAudioManager = class {
566
571
  this.scriptProcessor = this.audioContext.createScriptProcessor(
567
572
  bufferSize,
568
573
  1,
569
- // input channels
570
574
  1
571
- // output channels
572
575
  );
573
576
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
574
577
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -585,40 +588,19 @@ var BrowserAudioManager = class {
585
588
  throw err;
586
589
  }
587
590
  }
588
- /**
589
- * Internal method to process microphone audio data
590
- */
591
591
  _processAudioInput(event) {
592
- if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
593
- if (this.isMuted) return;
594
- const inputBuffer = event.inputBuffer;
595
- const inputData = inputBuffer.getChannelData(0);
596
- const outputBuffer = event.outputBuffer;
597
- for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
598
- outputBuffer.getChannelData(0)[i] = 0;
599
- }
600
- const hardwareRate = this.audioContext.sampleRate;
601
- let processedData = new Float32Array(inputData);
602
- if (hardwareRate !== this.inputSampleRate) {
603
- processedData = resampleWithAntiAliasing(
604
- processedData,
605
- hardwareRate,
606
- this.inputSampleRate
607
- );
592
+ if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
593
+ const inputData = event.inputBuffer.getChannelData(0);
594
+ event.outputBuffer.getChannelData(0).fill(0);
595
+ const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
596
+ if (resampled && resampled.length > 0) {
597
+ const int16Data = float32ToPcm16(resampled);
598
+ this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
608
599
  }
609
- const int16Data = float32ToPcm16(processedData);
610
- const uint8Data = new Uint8Array(
611
- int16Data.buffer,
612
- int16Data.byteOffset,
613
- int16Data.byteLength
614
- );
615
- this.onAudioInput(uint8Data);
616
600
  }
617
- /**
618
- * Stop capturing microphone input
619
- */
620
601
  stopMicrophone() {
621
602
  this.isListening = false;
603
+ this.resampler = null;
622
604
  if (this.mediaStream) {
623
605
  this.mediaStream.getTracks().forEach((track) => track.stop());
624
606
  this.mediaStream = null;
@@ -631,17 +613,12 @@ var BrowserAudioManager = class {
631
613
  this.mediaStreamAudioSourceNode.disconnect();
632
614
  this.mediaStreamAudioSourceNode = null;
633
615
  }
634
- console.log("\u{1F3A4} Microphone stopped");
635
616
  }
636
617
  /**
637
618
  * Play back audio received from the server
638
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
639
619
  */
640
620
  playAudio(pcm16Data) {
641
- if (!this.audioContext) {
642
- console.warn("AudioContext not initialized");
643
- return;
644
- }
621
+ if (!this.audioContext) return;
645
622
  const int16Array = new Int16Array(
646
623
  pcm16Data.buffer,
647
624
  pcm16Data.byteOffset,
@@ -656,18 +633,17 @@ var BrowserAudioManager = class {
656
633
  audioBuffer.getChannelData(0).set(float32Data);
657
634
  this._schedulePlayback(audioBuffer);
658
635
  }
659
- /**
660
- * Internal method to schedule and play audio with sample-accurate timing
661
- */
662
636
  _schedulePlayback(audioBuffer) {
663
637
  if (!this.audioContext) return;
664
638
  const currentTime = this.audioContext.currentTime;
665
639
  const duration = audioBuffer.length / this.outputSampleRate;
666
640
  const startTime = Math.max(
667
641
  currentTime + 0.01,
668
- // Minimum 10ms delay
669
642
  this.nextPlaybackTime
670
643
  );
644
+ if (this.audioClockOffset === null) {
645
+ this.audioClockOffset = startTime;
646
+ }
671
647
  this.nextPlaybackTime = startTime + duration;
672
648
  const source = this.audioContext.createBufferSource();
673
649
  source.buffer = audioBuffer;
@@ -685,8 +661,18 @@ var BrowserAudioManager = class {
685
661
  };
686
662
  }
687
663
  /**
688
- * Stop all currently playing audio and clear the queue
664
+ * Get the current high-precision audio clock offset for viseme synchronization.
665
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
666
+ */
667
+ getAudioClockOffset() {
668
+ return this.audioClockOffset;
669
+ }
670
+ /**
671
+ * Reset the audio clock offset (call when a response is interrupted or finished)
689
672
  */
673
+ resetAudioClock() {
674
+ this.audioClockOffset = null;
675
+ }
690
676
  stopPlayback() {
691
677
  this.activeSources.forEach((source) => {
692
678
  try {
@@ -695,26 +681,15 @@ var BrowserAudioManager = class {
695
681
  }
696
682
  });
697
683
  this.activeSources = [];
698
- this.playbackQueue = [];
699
- this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
700
- console.log("\u{1F507} Playback stopped");
684
+ this.nextPlaybackTime = 0;
685
+ this.resetAudioClock();
701
686
  }
702
- /**
703
- * Toggle mute state
704
- */
705
687
  setMuted(muted) {
706
688
  this.isMuted = muted;
707
689
  }
708
- /**
709
- * Get current mute state
710
- */
711
690
  isMicMuted() {
712
691
  return this.isMuted;
713
692
  }
714
- /**
715
- * Get current amplitude from analyser (for visualization)
716
- * Returns value between 0 and 1
717
- */
718
693
  getAmplitude() {
719
694
  if (!this.analyserNode) return 0;
720
695
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -722,31 +697,18 @@ var BrowserAudioManager = class {
722
697
  const rms = calculateRMS(dataArray);
723
698
  return Math.min(rms * 10, 1);
724
699
  }
725
- /**
726
- * Get frequency data from analyser for visualization
727
- */
728
700
  getFrequencyData() {
729
- if (!this.analyserNode) {
730
- return new Uint8Array(0);
731
- }
701
+ if (!this.analyserNode) return new Uint8Array(0);
732
702
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
733
703
  this.analyserNode.getByteFrequencyData(dataArray);
734
704
  return dataArray;
735
705
  }
736
- /**
737
- * Get time-domain data from analyser for waveform visualization
738
- */
739
706
  getWaveformData() {
740
- if (!this.analyserNode) {
741
- return new Uint8Array(0);
742
- }
707
+ if (!this.analyserNode) return new Uint8Array(0);
743
708
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
744
709
  this.analyserNode.getByteTimeDomainData(dataArray);
745
710
  return dataArray;
746
711
  }
747
- /**
748
- * Cleanup and close AudioContext
749
- */
750
712
  cleanup() {
751
713
  this.stopMicrophone();
752
714
  this.stopPlayback();
@@ -755,17 +717,124 @@ var BrowserAudioManager = class {
755
717
  this.analyserNode = null;
756
718
  }
757
719
  }
720
+ getAudioContext() {
721
+ return this.audioContext;
722
+ }
723
+ };
724
+
725
+ // src/voice-agent.ts
726
+ var VoiceAgent = class {
727
+ client;
728
+ audioManager;
729
+ options;
730
+ isConnected = false;
731
+ visemeQueue = [];
732
+ constructor(options) {
733
+ this.options = options;
734
+ this.client = new VoiceAgentClient({
735
+ apiKey: options.apiKey,
736
+ prompt: options.prompt || "You are a helpful and friendly AI assistant.",
737
+ voice: options.voice || "F1" /* F1 */,
738
+ language: options.language || "en" /* ENGLISH */,
739
+ visemes: options.visemes ?? true,
740
+ serverUrl: options.serverUrl,
741
+ onTranscription: (text) => {
742
+ if (options.onTranscription) options.onTranscription(text, true);
743
+ },
744
+ onResponse: (text) => {
745
+ if (options.onTranscription) options.onTranscription(text, false);
746
+ },
747
+ onAudio: (data) => {
748
+ this.audioManager.playAudio(data);
749
+ },
750
+ onVisemes: (visemes) => {
751
+ this.visemeQueue.push(...visemes);
752
+ if (options.onVisemes) options.onVisemes(visemes);
753
+ },
754
+ onStatus: (status) => {
755
+ if (options.onStatusChange) options.onStatusChange(status);
756
+ if (status === "interrupted" || status === "thinking") {
757
+ this.audioManager.stopPlayback();
758
+ this.visemeQueue = [];
759
+ }
760
+ },
761
+ onError: (err) => {
762
+ if (options.onError) options.onError(err);
763
+ }
764
+ });
765
+ this.audioManager = new BrowserAudioManager({
766
+ autoGainControl: true,
767
+ echoCancellation: true,
768
+ noiseSuppression: true
769
+ });
770
+ }
758
771
  /**
759
- * Get current audio context state
772
+ * Initialize hardware and connect to the AI server.
773
+ * This must be called in response to a user gesture (like a click)
774
+ * to satisfy browser AudioContext requirements.
760
775
  */
761
- getState() {
762
- return this.audioContext?.state ?? null;
776
+ async connect() {
777
+ try {
778
+ await this.audioManager.init();
779
+ const connected = await this.client.connect();
780
+ if (!connected) return false;
781
+ this.isConnected = true;
782
+ await this.audioManager.startMicrophone((pcm16Data) => {
783
+ if (this.isConnected) {
784
+ this.client.sendAudio(pcm16Data);
785
+ }
786
+ });
787
+ return true;
788
+ } catch (err) {
789
+ if (this.options.onError) this.options.onError(err);
790
+ return false;
791
+ }
763
792
  }
764
793
  /**
765
- * Check if microphone is currently listening
794
+ * Get the current amplitude/volume of the microphone or output audio.
795
+ * Useful for voice activity visualization.
796
+ * @returns value between 0 and 1
766
797
  */
767
- isRecording() {
768
- return this.isListening;
798
+ getAmplitude() {
799
+ return this.audioManager.getAmplitude();
800
+ }
801
+ /**
802
+ * Toggle the microphone mute state and return the new state (true when muted).
803
+ */
804
+ toggleMute() {
805
+ const currentState = this.audioManager.isMicMuted();
806
+ this.audioManager.setMuted(!currentState);
807
+ return !currentState;
808
+ }
809
+ /**
810
+ * High-precision method to get visemes that should be active
811
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
812
+ */
813
+ getFrameVisemes() {
814
+ const offset = this.audioManager.getAudioClockOffset();
815
+ const audioCtx = this.audioManager.getAudioContext();
816
+ if (offset === null || !audioCtx) return [];
817
+ const streamTime = (audioCtx.currentTime - offset) * 1e3;
818
+ const currentBatch = [];
819
+ while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
820
+ currentBatch.push(this.visemeQueue.shift());
821
+ }
822
+ return currentBatch;
823
+ }
824
+ /**
825
+ * Change the system prompt mid-conversation.
826
+ */
827
+ updatePrompt(newPrompt) {
828
+ this.client.updatePrompt(newPrompt);
829
+ }
830
+ /**
831
+ * Disconnect and release audio resources.
832
+ */
833
+ disconnect() {
834
+ this.isConnected = false;
835
+ this.client.disconnect();
836
+ this.audioManager.cleanup();
837
+ this.visemeQueue = [];
769
838
  }
770
839
  };
771
840
  // Annotate the CommonJS export names for ESM import in node:
@@ -776,6 +845,7 @@ var BrowserAudioManager = class {
776
845
  Language,
777
846
  StreamResampler,
778
847
  TTSClient,
848
+ VoiceAgent,
779
849
  VoiceAgentClient,
780
850
  VoiceStyle,
781
851
  applyLowPassFilter,
package/dist/index.mjs CHANGED
@@ -60,11 +60,13 @@ var VoiceAgentClient = class {
60
60
  messages = [];
61
61
  visemeListeners = [];
62
62
  wantVisemes = false;
63
+ serverUrl = null;
63
64
  constructor(config) {
64
65
  this.apiKey = config.apiKey;
65
66
  this.prompt = config.prompt;
66
67
  this.voice = config.voice || "F1" /* F1 */;
67
68
  this.language = config.language || "en" /* ENGLISH */;
69
+ this.serverUrl = config.serverUrl || null;
68
70
  this.onTranscription = config.onTranscription;
69
71
  this.onResponse = config.onResponse;
70
72
  this.onAudioCallback = config.onAudio;
@@ -79,12 +81,12 @@ var VoiceAgentClient = class {
79
81
  async connect() {
80
82
  return new Promise((resolve, reject) => {
81
83
  try {
82
- let url = DEFAULT_URLS.VOICE_AGENT;
84
+ let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
83
85
  if (this.apiKey) {
84
86
  const separator = url.includes("?") ? "&" : "?";
85
87
  url += `${separator}api_key=${this.apiKey}`;
86
88
  }
87
- console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
89
+ console.log(`\u{1F517} Connecting to ${url}...`);
88
90
  this.ws = new WebSocket(url);
89
91
  this.ws.binaryType = "arraybuffer";
90
92
  this.ws.onopen = () => {
@@ -457,7 +459,8 @@ var BrowserAudioManager = class {
457
459
  // Playback scheduling
458
460
  nextPlaybackTime = 0;
459
461
  activeSources = [];
460
- playbackQueue = [];
462
+ // High-precision clock anchor for viseme sync
463
+ audioClockOffset = null;
461
464
  // Configuration
462
465
  inputSampleRate;
463
466
  outputSampleRate;
@@ -470,6 +473,7 @@ var BrowserAudioManager = class {
470
473
  // Audio processing state
471
474
  isMuted = false;
472
475
  isListening = false;
476
+ resampler = null;
473
477
  constructor(config = {}) {
474
478
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
475
479
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -493,7 +497,6 @@ var BrowserAudioManager = class {
493
497
  }
494
498
  if (this.audioContext.state === "suspended") {
495
499
  await this.audioContext.resume();
496
- console.log("\u{1F442} AudioContext resumed");
497
500
  }
498
501
  if (analyserConfig?.enabled !== false) {
499
502
  this.analyserNode = this.audioContext.createAnalyser();
@@ -507,6 +510,7 @@ var BrowserAudioManager = class {
507
510
  if (!this.audioContext) {
508
511
  await this.init();
509
512
  }
513
+ this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
510
514
  try {
511
515
  this.onAudioInput = onAudioInput;
512
516
  this.isListening = true;
@@ -522,9 +526,7 @@ var BrowserAudioManager = class {
522
526
  this.scriptProcessor = this.audioContext.createScriptProcessor(
523
527
  bufferSize,
524
528
  1,
525
- // input channels
526
529
  1
527
- // output channels
528
530
  );
529
531
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
530
532
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -541,40 +543,19 @@ var BrowserAudioManager = class {
541
543
  throw err;
542
544
  }
543
545
  }
544
- /**
545
- * Internal method to process microphone audio data
546
- */
547
546
  _processAudioInput(event) {
548
- if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
549
- if (this.isMuted) return;
550
- const inputBuffer = event.inputBuffer;
551
- const inputData = inputBuffer.getChannelData(0);
552
- const outputBuffer = event.outputBuffer;
553
- for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
554
- outputBuffer.getChannelData(0)[i] = 0;
555
- }
556
- const hardwareRate = this.audioContext.sampleRate;
557
- let processedData = new Float32Array(inputData);
558
- if (hardwareRate !== this.inputSampleRate) {
559
- processedData = resampleWithAntiAliasing(
560
- processedData,
561
- hardwareRate,
562
- this.inputSampleRate
563
- );
547
+ if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
548
+ const inputData = event.inputBuffer.getChannelData(0);
549
+ event.outputBuffer.getChannelData(0).fill(0);
550
+ const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
551
+ if (resampled && resampled.length > 0) {
552
+ const int16Data = float32ToPcm16(resampled);
553
+ this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
564
554
  }
565
- const int16Data = float32ToPcm16(processedData);
566
- const uint8Data = new Uint8Array(
567
- int16Data.buffer,
568
- int16Data.byteOffset,
569
- int16Data.byteLength
570
- );
571
- this.onAudioInput(uint8Data);
572
555
  }
573
- /**
574
- * Stop capturing microphone input
575
- */
576
556
  stopMicrophone() {
577
557
  this.isListening = false;
558
+ this.resampler = null;
578
559
  if (this.mediaStream) {
579
560
  this.mediaStream.getTracks().forEach((track) => track.stop());
580
561
  this.mediaStream = null;
@@ -587,17 +568,12 @@ var BrowserAudioManager = class {
587
568
  this.mediaStreamAudioSourceNode.disconnect();
588
569
  this.mediaStreamAudioSourceNode = null;
589
570
  }
590
- console.log("\u{1F3A4} Microphone stopped");
591
571
  }
592
572
  /**
593
573
  * Play back audio received from the server
594
- * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
595
574
  */
596
575
  playAudio(pcm16Data) {
597
- if (!this.audioContext) {
598
- console.warn("AudioContext not initialized");
599
- return;
600
- }
576
+ if (!this.audioContext) return;
601
577
  const int16Array = new Int16Array(
602
578
  pcm16Data.buffer,
603
579
  pcm16Data.byteOffset,
@@ -612,18 +588,17 @@ var BrowserAudioManager = class {
612
588
  audioBuffer.getChannelData(0).set(float32Data);
613
589
  this._schedulePlayback(audioBuffer);
614
590
  }
615
- /**
616
- * Internal method to schedule and play audio with sample-accurate timing
617
- */
618
591
  _schedulePlayback(audioBuffer) {
619
592
  if (!this.audioContext) return;
620
593
  const currentTime = this.audioContext.currentTime;
621
594
  const duration = audioBuffer.length / this.outputSampleRate;
622
595
  const startTime = Math.max(
623
596
  currentTime + 0.01,
624
- // Minimum 10ms delay
625
597
  this.nextPlaybackTime
626
598
  );
599
+ if (this.audioClockOffset === null) {
600
+ this.audioClockOffset = startTime;
601
+ }
627
602
  this.nextPlaybackTime = startTime + duration;
628
603
  const source = this.audioContext.createBufferSource();
629
604
  source.buffer = audioBuffer;
@@ -641,8 +616,18 @@ var BrowserAudioManager = class {
641
616
  };
642
617
  }
643
618
  /**
644
- * Stop all currently playing audio and clear the queue
619
+ * Get the current high-precision audio clock offset for viseme synchronization.
620
+ * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
621
+ */
622
+ getAudioClockOffset() {
623
+ return this.audioClockOffset;
624
+ }
625
+ /**
626
+ * Reset the audio clock offset (call when a response is interrupted or finished)
645
627
  */
628
+ resetAudioClock() {
629
+ this.audioClockOffset = null;
630
+ }
646
631
  stopPlayback() {
647
632
  this.activeSources.forEach((source) => {
648
633
  try {
@@ -651,26 +636,15 @@ var BrowserAudioManager = class {
651
636
  }
652
637
  });
653
638
  this.activeSources = [];
654
- this.playbackQueue = [];
655
- this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
656
- console.log("\u{1F507} Playback stopped");
639
+ this.nextPlaybackTime = 0;
640
+ this.resetAudioClock();
657
641
  }
658
- /**
659
- * Toggle mute state
660
- */
661
642
  setMuted(muted) {
662
643
  this.isMuted = muted;
663
644
  }
664
- /**
665
- * Get current mute state
666
- */
667
645
  isMicMuted() {
668
646
  return this.isMuted;
669
647
  }
670
- /**
671
- * Get current amplitude from analyser (for visualization)
672
- * Returns value between 0 and 1
673
- */
674
648
  getAmplitude() {
675
649
  if (!this.analyserNode) return 0;
676
650
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -678,31 +652,18 @@ var BrowserAudioManager = class {
678
652
  const rms = calculateRMS(dataArray);
679
653
  return Math.min(rms * 10, 1);
680
654
  }
681
- /**
682
- * Get frequency data from analyser for visualization
683
- */
684
655
  getFrequencyData() {
685
- if (!this.analyserNode) {
686
- return new Uint8Array(0);
687
- }
656
+ if (!this.analyserNode) return new Uint8Array(0);
688
657
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
689
658
  this.analyserNode.getByteFrequencyData(dataArray);
690
659
  return dataArray;
691
660
  }
692
- /**
693
- * Get time-domain data from analyser for waveform visualization
694
- */
695
661
  getWaveformData() {
696
- if (!this.analyserNode) {
697
- return new Uint8Array(0);
698
- }
662
+ if (!this.analyserNode) return new Uint8Array(0);
699
663
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
700
664
  this.analyserNode.getByteTimeDomainData(dataArray);
701
665
  return dataArray;
702
666
  }
703
- /**
704
- * Cleanup and close AudioContext
705
- */
706
667
  cleanup() {
707
668
  this.stopMicrophone();
708
669
  this.stopPlayback();
@@ -711,17 +672,124 @@ var BrowserAudioManager = class {
711
672
  this.analyserNode = null;
712
673
  }
713
674
  }
675
+ getAudioContext() {
676
+ return this.audioContext;
677
+ }
678
+ };
679
+
680
+ // src/voice-agent.ts
681
+ var VoiceAgent = class {
682
+ client;
683
+ audioManager;
684
+ options;
685
+ isConnected = false;
686
+ visemeQueue = [];
687
+ constructor(options) {
688
+ this.options = options;
689
+ this.client = new VoiceAgentClient({
690
+ apiKey: options.apiKey,
691
+ prompt: options.prompt || "You are a helpful and friendly AI assistant.",
692
+ voice: options.voice || "F1" /* F1 */,
693
+ language: options.language || "en" /* ENGLISH */,
694
+ visemes: options.visemes ?? true,
695
+ serverUrl: options.serverUrl,
696
+ onTranscription: (text) => {
697
+ if (options.onTranscription) options.onTranscription(text, true);
698
+ },
699
+ onResponse: (text) => {
700
+ if (options.onTranscription) options.onTranscription(text, false);
701
+ },
702
+ onAudio: (data) => {
703
+ this.audioManager.playAudio(data);
704
+ },
705
+ onVisemes: (visemes) => {
706
+ this.visemeQueue.push(...visemes);
707
+ if (options.onVisemes) options.onVisemes(visemes);
708
+ },
709
+ onStatus: (status) => {
710
+ if (options.onStatusChange) options.onStatusChange(status);
711
+ if (status === "interrupted" || status === "thinking") {
712
+ this.audioManager.stopPlayback();
713
+ this.visemeQueue = [];
714
+ }
715
+ },
716
+ onError: (err) => {
717
+ if (options.onError) options.onError(err);
718
+ }
719
+ });
720
+ this.audioManager = new BrowserAudioManager({
721
+ autoGainControl: true,
722
+ echoCancellation: true,
723
+ noiseSuppression: true
724
+ });
725
+ }
714
726
  /**
715
- * Get current audio context state
727
+ * Initialize hardware and connect to the AI server.
728
+ * This must be called in response to a user gesture (like a click)
729
+ * to satisfy browser AudioContext requirements.
716
730
  */
717
- getState() {
718
- return this.audioContext?.state ?? null;
731
+ async connect() {
732
+ try {
733
+ await this.audioManager.init();
734
+ const connected = await this.client.connect();
735
+ if (!connected) return false;
736
+ this.isConnected = true;
737
+ await this.audioManager.startMicrophone((pcm16Data) => {
738
+ if (this.isConnected) {
739
+ this.client.sendAudio(pcm16Data);
740
+ }
741
+ });
742
+ return true;
743
+ } catch (err) {
744
+ if (this.options.onError) this.options.onError(err);
745
+ return false;
746
+ }
719
747
  }
720
748
  /**
721
- * Check if microphone is currently listening
749
+ * Get the current amplitude/volume of the microphone or output audio.
750
+ * Useful for voice activity visualization.
751
+ * @returns value between 0 and 1
722
752
  */
723
- isRecording() {
724
- return this.isListening;
753
+ getAmplitude() {
754
+ return this.audioManager.getAmplitude();
755
+ }
756
+ /**
757
+ * Toggle the microphone mute state and return the new state (true when muted).
758
+ */
759
+ toggleMute() {
760
+ const currentState = this.audioManager.isMicMuted();
761
+ this.audioManager.setMuted(!currentState);
762
+ return !currentState;
763
+ }
764
+ /**
765
+ * High-precision method to get visemes that should be active
766
+ * at the current playback frame. Use this in a requestAnimationFrame loop.
767
+ */
768
+ getFrameVisemes() {
769
+ const offset = this.audioManager.getAudioClockOffset();
770
+ const audioCtx = this.audioManager.getAudioContext();
771
+ if (offset === null || !audioCtx) return [];
772
+ const streamTime = (audioCtx.currentTime - offset) * 1e3;
773
+ const currentBatch = [];
774
+ while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
775
+ currentBatch.push(this.visemeQueue.shift());
776
+ }
777
+ return currentBatch;
778
+ }
779
+ /**
780
+ * Change the system prompt mid-conversation.
781
+ */
782
+ updatePrompt(newPrompt) {
783
+ this.client.updatePrompt(newPrompt);
784
+ }
785
+ /**
786
+ * Disconnect and release audio resources.
787
+ */
788
+ disconnect() {
789
+ this.isConnected = false;
790
+ this.client.disconnect();
791
+ this.audioManager.cleanup();
792
+ this.visemeQueue = [];
725
793
  }
726
794
  };
727
795
  export {
@@ -731,6 +799,7 @@ export {
731
799
  Language,
732
800
  StreamResampler,
733
801
  TTSClient,
802
+ VoiceAgent,
734
803
  VoiceAgentClient,
735
804
  VoiceStyle,
736
805
  applyLowPassFilter,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lokutor/sdk",
3
- "version": "1.1.8",
3
+ "version": "1.1.9",
4
4
  "description": "JavaScript/TypeScript SDK for Lokutor Real-time Voice AI",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",