@lokutor/sdk 1.1.9 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -119,13 +119,11 @@ declare class VoiceAgentClient {
119
119
  private messages;
120
120
  private visemeListeners;
121
121
  private wantVisemes;
122
- private serverUrl;
123
122
  constructor(config: LokutorConfig & {
124
123
  prompt: string;
125
124
  voice?: VoiceStyle;
126
125
  language?: Language;
127
126
  visemes?: boolean;
128
- serverUrl?: string;
129
127
  onVisemes?: (visemes: Viseme[]) => void;
130
128
  });
131
129
  /**
@@ -328,7 +326,7 @@ declare class BrowserAudioManager {
328
326
  private mediaStream;
329
327
  private nextPlaybackTime;
330
328
  private activeSources;
331
- private audioClockOffset;
329
+ private playbackQueue;
332
330
  private inputSampleRate;
333
331
  private outputSampleRate;
334
332
  private autoGainControl;
@@ -338,7 +336,6 @@ declare class BrowserAudioManager {
338
336
  private onInputError?;
339
337
  private isMuted;
340
338
  private isListening;
341
- private resampler;
342
339
  constructor(config?: BrowserAudioConfig);
343
340
  /**
344
341
  * Initialize the AudioContext and analyser
@@ -348,77 +345,60 @@ declare class BrowserAudioManager {
348
345
  * Start capturing audio from the microphone
349
346
  */
350
347
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
348
+ /**
349
+ * Internal method to process microphone audio data
350
+ */
351
351
  private _processAudioInput;
352
+ /**
353
+ * Stop capturing microphone input
354
+ */
352
355
  stopMicrophone(): void;
353
356
  /**
354
357
  * Play back audio received from the server
358
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
355
359
  */
356
360
  playAudio(pcm16Data: Uint8Array): void;
357
- private _schedulePlayback;
358
361
  /**
359
- * Get the current high-precision audio clock offset for viseme synchronization.
360
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
362
+ * Internal method to schedule and play audio with sample-accurate timing
361
363
  */
362
- getAudioClockOffset(): number | null;
364
+ private _schedulePlayback;
363
365
  /**
364
- * Reset the audio clock offset (call when a response is interrupted or finished)
366
+ * Stop all currently playing audio and clear the queue
365
367
  */
366
- resetAudioClock(): void;
367
368
  stopPlayback(): void;
369
+ /**
370
+ * Toggle mute state
371
+ */
368
372
  setMuted(muted: boolean): void;
369
- isMicMuted(): boolean;
370
- getAmplitude(): number;
371
- getFrequencyData(): Uint8Array;
372
- getWaveformData(): Uint8Array;
373
- cleanup(): void;
374
- getAudioContext(): AudioContext | null;
375
- }
376
-
377
- /**
378
- * High-level AI Voice Agent for browser-based conversations.
379
- *
380
- * This class orchestrates microphone input, AI processing, and
381
- * speaker output, providing a simple interface for building
382
- * voice assistants with lip-sync support.
383
- */
384
- declare class VoiceAgent {
385
- private client;
386
- private audioManager;
387
- private options;
388
- private isConnected;
389
- private visemeQueue;
390
- constructor(options: VoiceAgentOptions & {
391
- apiKey: string;
392
- });
393
373
  /**
394
- * Initialize hardware and connect to the AI server.
395
- * This must be called in response to a user guesture (like a click)
396
- * to satisfy browser AudioContext requirements.
374
+ * Get current mute state
397
375
  */
398
- connect(): Promise<boolean>;
376
+ isMicMuted(): boolean;
399
377
  /**
400
- * Get the current amplitude/volume of the microphone or output audio.
401
- * Useful for voice activity visualization.
402
- * @returns value between 0 and 1
378
+ * Get current amplitude from analyser (for visualization)
379
+ * Returns value between 0 and 1
403
380
  */
404
381
  getAmplitude(): number;
405
382
  /**
406
- * Mute or unmute the microphone.
383
+ * Get frequency data from analyser for visualization
407
384
  */
408
- toggleMute(): boolean;
385
+ getFrequencyData(): Uint8Array;
409
386
  /**
410
- * High-precision method to get visemes that should be active
411
- * at the current playback frame. Use this in a requestAnimationFrame loop.
387
+ * Get time-domain data from analyser for waveform visualization
412
388
  */
413
- getFrameVisemes(): Viseme[];
389
+ getWaveformData(): Uint8Array;
414
390
  /**
415
- * Change the system prompt mid-conversation.
391
+ * Cleanup and close AudioContext
416
392
  */
417
- updatePrompt(newPrompt: string): void;
393
+ cleanup(): void;
418
394
  /**
419
- * Disconnect and release audio resources.
395
+ * Get current audio context state
420
396
  */
421
- disconnect(): void;
397
+ getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
398
+ /**
399
+ * Check if microphone is currently listening
400
+ */
401
+ isRecording(): boolean;
422
402
  }
423
403
 
424
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
404
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.d.ts CHANGED
@@ -119,13 +119,11 @@ declare class VoiceAgentClient {
119
119
  private messages;
120
120
  private visemeListeners;
121
121
  private wantVisemes;
122
- private serverUrl;
123
122
  constructor(config: LokutorConfig & {
124
123
  prompt: string;
125
124
  voice?: VoiceStyle;
126
125
  language?: Language;
127
126
  visemes?: boolean;
128
- serverUrl?: string;
129
127
  onVisemes?: (visemes: Viseme[]) => void;
130
128
  });
131
129
  /**
@@ -328,7 +326,7 @@ declare class BrowserAudioManager {
328
326
  private mediaStream;
329
327
  private nextPlaybackTime;
330
328
  private activeSources;
331
- private audioClockOffset;
329
+ private playbackQueue;
332
330
  private inputSampleRate;
333
331
  private outputSampleRate;
334
332
  private autoGainControl;
@@ -338,7 +336,6 @@ declare class BrowserAudioManager {
338
336
  private onInputError?;
339
337
  private isMuted;
340
338
  private isListening;
341
- private resampler;
342
339
  constructor(config?: BrowserAudioConfig);
343
340
  /**
344
341
  * Initialize the AudioContext and analyser
@@ -348,77 +345,60 @@ declare class BrowserAudioManager {
348
345
  * Start capturing audio from the microphone
349
346
  */
350
347
  startMicrophone(onAudioInput: (pcm16Data: Uint8Array) => void): Promise<void>;
348
+ /**
349
+ * Internal method to process microphone audio data
350
+ */
351
351
  private _processAudioInput;
352
+ /**
353
+ * Stop capturing microphone input
354
+ */
352
355
  stopMicrophone(): void;
353
356
  /**
354
357
  * Play back audio received from the server
358
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
355
359
  */
356
360
  playAudio(pcm16Data: Uint8Array): void;
357
- private _schedulePlayback;
358
361
  /**
359
- * Get the current high-precision audio clock offset for viseme synchronization.
360
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
362
+ * Internal method to schedule and play audio with sample-accurate timing
361
363
  */
362
- getAudioClockOffset(): number | null;
364
+ private _schedulePlayback;
363
365
  /**
364
- * Reset the audio clock offset (call when a response is interrupted or finished)
366
+ * Stop all currently playing audio and clear the queue
365
367
  */
366
- resetAudioClock(): void;
367
368
  stopPlayback(): void;
369
+ /**
370
+ * Toggle mute state
371
+ */
368
372
  setMuted(muted: boolean): void;
369
- isMicMuted(): boolean;
370
- getAmplitude(): number;
371
- getFrequencyData(): Uint8Array;
372
- getWaveformData(): Uint8Array;
373
- cleanup(): void;
374
- getAudioContext(): AudioContext | null;
375
- }
376
-
377
- /**
378
- * High-level AI Voice Agent for browser-based conversations.
379
- *
380
- * This class orchestrates microphone input, AI processing, and
381
- * speaker output, providing a simple interface for building
382
- * voice assistants with lip-sync support.
383
- */
384
- declare class VoiceAgent {
385
- private client;
386
- private audioManager;
387
- private options;
388
- private isConnected;
389
- private visemeQueue;
390
- constructor(options: VoiceAgentOptions & {
391
- apiKey: string;
392
- });
393
373
  /**
394
- * Initialize hardware and connect to the AI server.
395
- * This must be called in response to a user guesture (like a click)
396
- * to satisfy browser AudioContext requirements.
374
+ * Get current mute state
397
375
  */
398
- connect(): Promise<boolean>;
376
+ isMicMuted(): boolean;
399
377
  /**
400
- * Get the current amplitude/volume of the microphone or output audio.
401
- * Useful for voice activity visualization.
402
- * @returns value between 0 and 1
378
+ * Get current amplitude from analyser (for visualization)
379
+ * Returns value between 0 and 1
403
380
  */
404
381
  getAmplitude(): number;
405
382
  /**
406
- * Mute or unmute the microphone.
383
+ * Get frequency data from analyser for visualization
407
384
  */
408
- toggleMute(): boolean;
385
+ getFrequencyData(): Uint8Array;
409
386
  /**
410
- * High-precision method to get visemes that should be active
411
- * at the current playback frame. Use this in a requestAnimationFrame loop.
387
+ * Get time-domain data from analyser for waveform visualization
412
388
  */
413
- getFrameVisemes(): Viseme[];
389
+ getWaveformData(): Uint8Array;
414
390
  /**
415
- * Change the system prompt mid-conversation.
391
+ * Cleanup and close AudioContext
416
392
  */
417
- updatePrompt(newPrompt: string): void;
393
+ cleanup(): void;
418
394
  /**
419
- * Disconnect and release audio resources.
395
+ * Get current audio context state
420
396
  */
421
- disconnect(): void;
397
+ getState(): 'running' | 'suspended' | 'closed' | 'interrupted' | null;
398
+ /**
399
+ * Check if microphone is currently listening
400
+ */
401
+ isRecording(): boolean;
422
402
  }
423
403
 
424
- export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgent, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
404
+ export { AUDIO_CONFIG, type AnalyserConfig, type BrowserAudioConfig, BrowserAudioManager, type BrowserAudioOptions, DEFAULT_URLS, Language, type LokutorConfig, StreamResampler, type SynthesizeOptions, TTSClient, type Viseme, VoiceAgentClient, type VoiceAgentOptions, VoiceStyle, applyLowPassFilter, bytesToPcm16, calculateRMS, float32ToPcm16, normalizeAudio, pcm16ToBytes, pcm16ToFloat32, resample, resampleWithAntiAliasing, simpleConversation, simpleTTS };
package/dist/index.js CHANGED
@@ -26,7 +26,6 @@ __export(index_exports, {
26
26
  Language: () => Language,
27
27
  StreamResampler: () => StreamResampler,
28
28
  TTSClient: () => TTSClient,
29
- VoiceAgent: () => VoiceAgent,
30
29
  VoiceAgentClient: () => VoiceAgentClient,
31
30
  VoiceStyle: () => VoiceStyle,
32
31
  applyLowPassFilter: () => applyLowPassFilter,
@@ -105,13 +104,11 @@ var VoiceAgentClient = class {
105
104
  messages = [];
106
105
  visemeListeners = [];
107
106
  wantVisemes = false;
108
- serverUrl = null;
109
107
  constructor(config) {
110
108
  this.apiKey = config.apiKey;
111
109
  this.prompt = config.prompt;
112
110
  this.voice = config.voice || "F1" /* F1 */;
113
111
  this.language = config.language || "en" /* ENGLISH */;
114
- this.serverUrl = config.serverUrl || null;
115
112
  this.onTranscription = config.onTranscription;
116
113
  this.onResponse = config.onResponse;
117
114
  this.onAudioCallback = config.onAudio;
@@ -126,12 +123,12 @@ var VoiceAgentClient = class {
126
123
  async connect() {
127
124
  return new Promise((resolve, reject) => {
128
125
  try {
129
- let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
126
+ let url = DEFAULT_URLS.VOICE_AGENT;
130
127
  if (this.apiKey) {
131
128
  const separator = url.includes("?") ? "&" : "?";
132
129
  url += `${separator}api_key=${this.apiKey}`;
133
130
  }
134
- console.log(`\u{1F517} Connecting to ${url}...`);
131
+ console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
135
132
  this.ws = new WebSocket(url);
136
133
  this.ws.binaryType = "arraybuffer";
137
134
  this.ws.onopen = () => {
@@ -504,8 +501,7 @@ var BrowserAudioManager = class {
504
501
  // Playback scheduling
505
502
  nextPlaybackTime = 0;
506
503
  activeSources = [];
507
- // High-precision clock anchor for viseme sync
508
- audioClockOffset = null;
504
+ playbackQueue = [];
509
505
  // Configuration
510
506
  inputSampleRate;
511
507
  outputSampleRate;
@@ -518,7 +514,6 @@ var BrowserAudioManager = class {
518
514
  // Audio processing state
519
515
  isMuted = false;
520
516
  isListening = false;
521
- resampler = null;
522
517
  constructor(config = {}) {
523
518
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
524
519
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -542,6 +537,7 @@ var BrowserAudioManager = class {
542
537
  }
543
538
  if (this.audioContext.state === "suspended") {
544
539
  await this.audioContext.resume();
540
+ console.log("\u{1F442} AudioContext resumed");
545
541
  }
546
542
  if (analyserConfig?.enabled !== false) {
547
543
  this.analyserNode = this.audioContext.createAnalyser();
@@ -555,7 +551,6 @@ var BrowserAudioManager = class {
555
551
  if (!this.audioContext) {
556
552
  await this.init();
557
553
  }
558
- this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
559
554
  try {
560
555
  this.onAudioInput = onAudioInput;
561
556
  this.isListening = true;
@@ -571,7 +566,9 @@ var BrowserAudioManager = class {
571
566
  this.scriptProcessor = this.audioContext.createScriptProcessor(
572
567
  bufferSize,
573
568
  1,
569
+ // input channels
574
570
  1
571
+ // output channels
575
572
  );
576
573
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
577
574
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -588,19 +585,40 @@ var BrowserAudioManager = class {
588
585
  throw err;
589
586
  }
590
587
  }
588
+ /**
589
+ * Internal method to process microphone audio data
590
+ */
591
591
  _processAudioInput(event) {
592
- if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
593
- const inputData = event.inputBuffer.getChannelData(0);
594
- event.outputBuffer.getChannelData(0).fill(0);
595
- const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
596
- if (resampled && resampled.length > 0) {
597
- const int16Data = float32ToPcm16(resampled);
598
- this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
592
+ if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
593
+ if (this.isMuted) return;
594
+ const inputBuffer = event.inputBuffer;
595
+ const inputData = inputBuffer.getChannelData(0);
596
+ const outputBuffer = event.outputBuffer;
597
+ for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
598
+ outputBuffer.getChannelData(0)[i] = 0;
599
+ }
600
+ const hardwareRate = this.audioContext.sampleRate;
601
+ let processedData = new Float32Array(inputData);
602
+ if (hardwareRate !== this.inputSampleRate) {
603
+ processedData = resampleWithAntiAliasing(
604
+ processedData,
605
+ hardwareRate,
606
+ this.inputSampleRate
607
+ );
599
608
  }
609
+ const int16Data = float32ToPcm16(processedData);
610
+ const uint8Data = new Uint8Array(
611
+ int16Data.buffer,
612
+ int16Data.byteOffset,
613
+ int16Data.byteLength
614
+ );
615
+ this.onAudioInput(uint8Data);
600
616
  }
617
+ /**
618
+ * Stop capturing microphone input
619
+ */
601
620
  stopMicrophone() {
602
621
  this.isListening = false;
603
- this.resampler = null;
604
622
  if (this.mediaStream) {
605
623
  this.mediaStream.getTracks().forEach((track) => track.stop());
606
624
  this.mediaStream = null;
@@ -613,12 +631,17 @@ var BrowserAudioManager = class {
613
631
  this.mediaStreamAudioSourceNode.disconnect();
614
632
  this.mediaStreamAudioSourceNode = null;
615
633
  }
634
+ console.log("\u{1F3A4} Microphone stopped");
616
635
  }
617
636
  /**
618
637
  * Play back audio received from the server
638
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
619
639
  */
620
640
  playAudio(pcm16Data) {
621
- if (!this.audioContext) return;
641
+ if (!this.audioContext) {
642
+ console.warn("AudioContext not initialized");
643
+ return;
644
+ }
622
645
  const int16Array = new Int16Array(
623
646
  pcm16Data.buffer,
624
647
  pcm16Data.byteOffset,
@@ -633,17 +656,18 @@ var BrowserAudioManager = class {
633
656
  audioBuffer.getChannelData(0).set(float32Data);
634
657
  this._schedulePlayback(audioBuffer);
635
658
  }
659
+ /**
660
+ * Internal method to schedule and play audio with sample-accurate timing
661
+ */
636
662
  _schedulePlayback(audioBuffer) {
637
663
  if (!this.audioContext) return;
638
664
  const currentTime = this.audioContext.currentTime;
639
665
  const duration = audioBuffer.length / this.outputSampleRate;
640
666
  const startTime = Math.max(
641
667
  currentTime + 0.01,
668
+ // Minimum 10ms delay
642
669
  this.nextPlaybackTime
643
670
  );
644
- if (this.audioClockOffset === null) {
645
- this.audioClockOffset = startTime;
646
- }
647
671
  this.nextPlaybackTime = startTime + duration;
648
672
  const source = this.audioContext.createBufferSource();
649
673
  source.buffer = audioBuffer;
@@ -661,18 +685,8 @@ var BrowserAudioManager = class {
661
685
  };
662
686
  }
663
687
  /**
664
- * Get the current high-precision audio clock offset for viseme synchronization.
665
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
666
- */
667
- getAudioClockOffset() {
668
- return this.audioClockOffset;
669
- }
670
- /**
671
- * Reset the audio clock offset (call when a response is interrupted or finished)
688
+ * Stop all currently playing audio and clear the queue
672
689
  */
673
- resetAudioClock() {
674
- this.audioClockOffset = null;
675
- }
676
690
  stopPlayback() {
677
691
  this.activeSources.forEach((source) => {
678
692
  try {
@@ -681,15 +695,26 @@ var BrowserAudioManager = class {
681
695
  }
682
696
  });
683
697
  this.activeSources = [];
684
- this.nextPlaybackTime = 0;
685
- this.resetAudioClock();
698
+ this.playbackQueue = [];
699
+ this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
700
+ console.log("\u{1F507} Playback stopped");
686
701
  }
702
+ /**
703
+ * Toggle mute state
704
+ */
687
705
  setMuted(muted) {
688
706
  this.isMuted = muted;
689
707
  }
708
+ /**
709
+ * Get current mute state
710
+ */
690
711
  isMicMuted() {
691
712
  return this.isMuted;
692
713
  }
714
+ /**
715
+ * Get current amplitude from analyser (for visualization)
716
+ * Returns value between 0 and 1
717
+ */
693
718
  getAmplitude() {
694
719
  if (!this.analyserNode) return 0;
695
720
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -697,18 +722,31 @@ var BrowserAudioManager = class {
697
722
  const rms = calculateRMS(dataArray);
698
723
  return Math.min(rms * 10, 1);
699
724
  }
725
+ /**
726
+ * Get frequency data from analyser for visualization
727
+ */
700
728
  getFrequencyData() {
701
- if (!this.analyserNode) return new Uint8Array(0);
729
+ if (!this.analyserNode) {
730
+ return new Uint8Array(0);
731
+ }
702
732
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
703
733
  this.analyserNode.getByteFrequencyData(dataArray);
704
734
  return dataArray;
705
735
  }
736
+ /**
737
+ * Get time-domain data from analyser for waveform visualization
738
+ */
706
739
  getWaveformData() {
707
- if (!this.analyserNode) return new Uint8Array(0);
740
+ if (!this.analyserNode) {
741
+ return new Uint8Array(0);
742
+ }
708
743
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
709
744
  this.analyserNode.getByteTimeDomainData(dataArray);
710
745
  return dataArray;
711
746
  }
747
+ /**
748
+ * Cleanup and close AudioContext
749
+ */
712
750
  cleanup() {
713
751
  this.stopMicrophone();
714
752
  this.stopPlayback();
@@ -717,124 +755,17 @@ var BrowserAudioManager = class {
717
755
  this.analyserNode = null;
718
756
  }
719
757
  }
720
- getAudioContext() {
721
- return this.audioContext;
722
- }
723
- };
724
-
725
- // src/voice-agent.ts
726
- var VoiceAgent = class {
727
- client;
728
- audioManager;
729
- options;
730
- isConnected = false;
731
- visemeQueue = [];
732
- constructor(options) {
733
- this.options = options;
734
- this.client = new VoiceAgentClient({
735
- apiKey: options.apiKey,
736
- prompt: options.prompt || "You are a helpful and friendly AI assistant.",
737
- voice: options.voice || "F1" /* F1 */,
738
- language: options.language || "en" /* ENGLISH */,
739
- visemes: options.visemes ?? true,
740
- serverUrl: options.serverUrl,
741
- onTranscription: (text) => {
742
- if (options.onTranscription) options.onTranscription(text, true);
743
- },
744
- onResponse: (text) => {
745
- if (options.onTranscription) options.onTranscription(text, false);
746
- },
747
- onAudio: (data) => {
748
- this.audioManager.playAudio(data);
749
- },
750
- onVisemes: (visemes) => {
751
- this.visemeQueue.push(...visemes);
752
- if (options.onVisemes) options.onVisemes(visemes);
753
- },
754
- onStatus: (status) => {
755
- if (options.onStatusChange) options.onStatusChange(status);
756
- if (status === "interrupted" || status === "thinking") {
757
- this.audioManager.stopPlayback();
758
- this.visemeQueue = [];
759
- }
760
- },
761
- onError: (err) => {
762
- if (options.onError) options.onError(err);
763
- }
764
- });
765
- this.audioManager = new BrowserAudioManager({
766
- autoGainControl: true,
767
- echoCancellation: true,
768
- noiseSuppression: true
769
- });
770
- }
771
758
  /**
772
- * Initialize hardware and connect to the AI server.
773
- * This must be called in response to a user guesture (like a click)
774
- * to satisfy browser AudioContext requirements.
759
+ * Get current audio context state
775
760
  */
776
- async connect() {
777
- try {
778
- await this.audioManager.init();
779
- const connected = await this.client.connect();
780
- if (!connected) return false;
781
- this.isConnected = true;
782
- await this.audioManager.startMicrophone((pcm16Data) => {
783
- if (this.isConnected) {
784
- this.client.sendAudio(pcm16Data);
785
- }
786
- });
787
- return true;
788
- } catch (err) {
789
- if (this.options.onError) this.options.onError(err);
790
- return false;
791
- }
761
+ getState() {
762
+ return this.audioContext?.state ?? null;
792
763
  }
793
764
  /**
794
- * Get the current amplitude/volume of the microphone or output audio.
795
- * Useful for voice activity visualization.
796
- * @returns value between 0 and 1
765
+ * Check if microphone is currently listening
797
766
  */
798
- getAmplitude() {
799
- return this.audioManager.getAmplitude();
800
- }
801
- /**
802
- * Mute or unmute the microphone.
803
- */
804
- toggleMute() {
805
- const currentState = this.audioManager.isMicMuted();
806
- this.audioManager.setMuted(!currentState);
807
- return !currentState;
808
- }
809
- /**
810
- * High-precision method to get visemes that should be active
811
- * at the current playback frame. Use this in a requestAnimationFrame loop.
812
- */
813
- getFrameVisemes() {
814
- const offset = this.audioManager.getAudioClockOffset();
815
- const audioCtx = this.audioManager.getAudioContext();
816
- if (offset === null || !audioCtx) return [];
817
- const streamTime = (audioCtx.currentTime - offset) * 1e3;
818
- const currentBatch = [];
819
- while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
820
- currentBatch.push(this.visemeQueue.shift());
821
- }
822
- return currentBatch;
823
- }
824
- /**
825
- * Change the system prompt mid-conversation.
826
- */
827
- updatePrompt(newPrompt) {
828
- this.client.updatePrompt(newPrompt);
829
- }
830
- /**
831
- * Disconnect and release audio resources.
832
- */
833
- disconnect() {
834
- this.isConnected = false;
835
- this.client.disconnect();
836
- this.audioManager.cleanup();
837
- this.visemeQueue = [];
767
+ isRecording() {
768
+ return this.isListening;
838
769
  }
839
770
  };
840
771
  // Annotate the CommonJS export names for ESM import in node:
@@ -845,7 +776,6 @@ var VoiceAgent = class {
845
776
  Language,
846
777
  StreamResampler,
847
778
  TTSClient,
848
- VoiceAgent,
849
779
  VoiceAgentClient,
850
780
  VoiceStyle,
851
781
  applyLowPassFilter,
package/dist/index.mjs CHANGED
@@ -60,13 +60,11 @@ var VoiceAgentClient = class {
60
60
  messages = [];
61
61
  visemeListeners = [];
62
62
  wantVisemes = false;
63
- serverUrl = null;
64
63
  constructor(config) {
65
64
  this.apiKey = config.apiKey;
66
65
  this.prompt = config.prompt;
67
66
  this.voice = config.voice || "F1" /* F1 */;
68
67
  this.language = config.language || "en" /* ENGLISH */;
69
- this.serverUrl = config.serverUrl || null;
70
68
  this.onTranscription = config.onTranscription;
71
69
  this.onResponse = config.onResponse;
72
70
  this.onAudioCallback = config.onAudio;
@@ -81,12 +79,12 @@ var VoiceAgentClient = class {
81
79
  async connect() {
82
80
  return new Promise((resolve, reject) => {
83
81
  try {
84
- let url = this.serverUrl || DEFAULT_URLS.VOICE_AGENT;
82
+ let url = DEFAULT_URLS.VOICE_AGENT;
85
83
  if (this.apiKey) {
86
84
  const separator = url.includes("?") ? "&" : "?";
87
85
  url += `${separator}api_key=${this.apiKey}`;
88
86
  }
89
- console.log(`\u{1F517} Connecting to ${url}...`);
87
+ console.log(`\u{1F517} Connecting to ${DEFAULT_URLS.VOICE_AGENT}...`);
90
88
  this.ws = new WebSocket(url);
91
89
  this.ws.binaryType = "arraybuffer";
92
90
  this.ws.onopen = () => {
@@ -459,8 +457,7 @@ var BrowserAudioManager = class {
459
457
  // Playback scheduling
460
458
  nextPlaybackTime = 0;
461
459
  activeSources = [];
462
- // High-precision clock anchor for viseme sync
463
- audioClockOffset = null;
460
+ playbackQueue = [];
464
461
  // Configuration
465
462
  inputSampleRate;
466
463
  outputSampleRate;
@@ -473,7 +470,6 @@ var BrowserAudioManager = class {
473
470
  // Audio processing state
474
471
  isMuted = false;
475
472
  isListening = false;
476
- resampler = null;
477
473
  constructor(config = {}) {
478
474
  this.inputSampleRate = config.inputSampleRate ?? AUDIO_CONFIG.SAMPLE_RATE;
479
475
  this.outputSampleRate = config.outputSampleRate ?? AUDIO_CONFIG.SPEAKER_SAMPLE_RATE;
@@ -497,6 +493,7 @@ var BrowserAudioManager = class {
497
493
  }
498
494
  if (this.audioContext.state === "suspended") {
499
495
  await this.audioContext.resume();
496
+ console.log("\u{1F442} AudioContext resumed");
500
497
  }
501
498
  if (analyserConfig?.enabled !== false) {
502
499
  this.analyserNode = this.audioContext.createAnalyser();
@@ -510,7 +507,6 @@ var BrowserAudioManager = class {
510
507
  if (!this.audioContext) {
511
508
  await this.init();
512
509
  }
513
- this.resampler = new StreamResampler(this.audioContext.sampleRate, this.inputSampleRate);
514
510
  try {
515
511
  this.onAudioInput = onAudioInput;
516
512
  this.isListening = true;
@@ -526,7 +522,9 @@ var BrowserAudioManager = class {
526
522
  this.scriptProcessor = this.audioContext.createScriptProcessor(
527
523
  bufferSize,
528
524
  1,
525
+ // input channels
529
526
  1
527
+ // output channels
530
528
  );
531
529
  this.mediaStreamAudioSourceNode.connect(this.scriptProcessor);
532
530
  this.scriptProcessor.connect(this.audioContext.destination);
@@ -543,19 +541,40 @@ var BrowserAudioManager = class {
543
541
  throw err;
544
542
  }
545
543
  }
544
+ /**
545
+ * Internal method to process microphone audio data
546
+ */
546
547
  _processAudioInput(event) {
547
- if (!this.onAudioInput || !this.audioContext || !this.isListening || this.isMuted) return;
548
- const inputData = event.inputBuffer.getChannelData(0);
549
- event.outputBuffer.getChannelData(0).fill(0);
550
- const resampled = this.resampler ? this.resampler.process(inputData) : inputData;
551
- if (resampled && resampled.length > 0) {
552
- const int16Data = float32ToPcm16(resampled);
553
- this.onAudioInput(new Uint8Array(int16Data.buffer, int16Data.byteOffset, int16Data.byteLength));
548
+ if (!this.onAudioInput || !this.audioContext || !this.isListening) return;
549
+ if (this.isMuted) return;
550
+ const inputBuffer = event.inputBuffer;
551
+ const inputData = inputBuffer.getChannelData(0);
552
+ const outputBuffer = event.outputBuffer;
553
+ for (let i = 0; i < outputBuffer.getChannelData(0).length; i++) {
554
+ outputBuffer.getChannelData(0)[i] = 0;
555
+ }
556
+ const hardwareRate = this.audioContext.sampleRate;
557
+ let processedData = new Float32Array(inputData);
558
+ if (hardwareRate !== this.inputSampleRate) {
559
+ processedData = resampleWithAntiAliasing(
560
+ processedData,
561
+ hardwareRate,
562
+ this.inputSampleRate
563
+ );
554
564
  }
565
+ const int16Data = float32ToPcm16(processedData);
566
+ const uint8Data = new Uint8Array(
567
+ int16Data.buffer,
568
+ int16Data.byteOffset,
569
+ int16Data.byteLength
570
+ );
571
+ this.onAudioInput(uint8Data);
555
572
  }
573
+ /**
574
+ * Stop capturing microphone input
575
+ */
556
576
  stopMicrophone() {
557
577
  this.isListening = false;
558
- this.resampler = null;
559
578
  if (this.mediaStream) {
560
579
  this.mediaStream.getTracks().forEach((track) => track.stop());
561
580
  this.mediaStream = null;
@@ -568,12 +587,17 @@ var BrowserAudioManager = class {
568
587
  this.mediaStreamAudioSourceNode.disconnect();
569
588
  this.mediaStreamAudioSourceNode = null;
570
589
  }
590
+ console.log("\u{1F3A4} Microphone stopped");
571
591
  }
572
592
  /**
573
593
  * Play back audio received from the server
594
+ * @param pcm16Data Int16 PCM audio data at SPEAKER_SAMPLE_RATE
574
595
  */
575
596
  playAudio(pcm16Data) {
576
- if (!this.audioContext) return;
597
+ if (!this.audioContext) {
598
+ console.warn("AudioContext not initialized");
599
+ return;
600
+ }
577
601
  const int16Array = new Int16Array(
578
602
  pcm16Data.buffer,
579
603
  pcm16Data.byteOffset,
@@ -588,17 +612,18 @@ var BrowserAudioManager = class {
588
612
  audioBuffer.getChannelData(0).set(float32Data);
589
613
  this._schedulePlayback(audioBuffer);
590
614
  }
615
+ /**
616
+ * Internal method to schedule and play audio with sample-accurate timing
617
+ */
591
618
  _schedulePlayback(audioBuffer) {
592
619
  if (!this.audioContext) return;
593
620
  const currentTime = this.audioContext.currentTime;
594
621
  const duration = audioBuffer.length / this.outputSampleRate;
595
622
  const startTime = Math.max(
596
623
  currentTime + 0.01,
624
+ // Minimum 10ms delay
597
625
  this.nextPlaybackTime
598
626
  );
599
- if (this.audioClockOffset === null) {
600
- this.audioClockOffset = startTime;
601
- }
602
627
  this.nextPlaybackTime = startTime + duration;
603
628
  const source = this.audioContext.createBufferSource();
604
629
  source.buffer = audioBuffer;
@@ -616,18 +641,8 @@ var BrowserAudioManager = class {
616
641
  };
617
642
  }
618
643
  /**
619
- * Get the current high-precision audio clock offset for viseme synchronization.
620
- * Total stream time (in ms) = (audioContext.currentTime - audioClockOffset) * 1000
621
- */
622
- getAudioClockOffset() {
623
- return this.audioClockOffset;
624
- }
625
- /**
626
- * Reset the audio clock offset (call when a response is interrupted or finished)
644
+ * Stop all currently playing audio and clear the queue
627
645
  */
628
- resetAudioClock() {
629
- this.audioClockOffset = null;
630
- }
631
646
  stopPlayback() {
632
647
  this.activeSources.forEach((source) => {
633
648
  try {
@@ -636,15 +651,26 @@ var BrowserAudioManager = class {
636
651
  }
637
652
  });
638
653
  this.activeSources = [];
639
- this.nextPlaybackTime = 0;
640
- this.resetAudioClock();
654
+ this.playbackQueue = [];
655
+ this.nextPlaybackTime = this.audioContext?.currentTime ?? 0;
656
+ console.log("\u{1F507} Playback stopped");
641
657
  }
658
+ /**
659
+ * Toggle mute state
660
+ */
642
661
  setMuted(muted) {
643
662
  this.isMuted = muted;
644
663
  }
664
+ /**
665
+ * Get current mute state
666
+ */
645
667
  isMicMuted() {
646
668
  return this.isMuted;
647
669
  }
670
+ /**
671
+ * Get current amplitude from analyser (for visualization)
672
+ * Returns value between 0 and 1
673
+ */
648
674
  getAmplitude() {
649
675
  if (!this.analyserNode) return 0;
650
676
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
@@ -652,18 +678,31 @@ var BrowserAudioManager = class {
652
678
  const rms = calculateRMS(dataArray);
653
679
  return Math.min(rms * 10, 1);
654
680
  }
681
+ /**
682
+ * Get frequency data from analyser for visualization
683
+ */
655
684
  getFrequencyData() {
656
- if (!this.analyserNode) return new Uint8Array(0);
685
+ if (!this.analyserNode) {
686
+ return new Uint8Array(0);
687
+ }
657
688
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
658
689
  this.analyserNode.getByteFrequencyData(dataArray);
659
690
  return dataArray;
660
691
  }
692
+ /**
693
+ * Get time-domain data from analyser for waveform visualization
694
+ */
661
695
  getWaveformData() {
662
- if (!this.analyserNode) return new Uint8Array(0);
696
+ if (!this.analyserNode) {
697
+ return new Uint8Array(0);
698
+ }
663
699
  const dataArray = new Uint8Array(this.analyserNode.frequencyBinCount);
664
700
  this.analyserNode.getByteTimeDomainData(dataArray);
665
701
  return dataArray;
666
702
  }
703
+ /**
704
+ * Cleanup and close AudioContext
705
+ */
667
706
  cleanup() {
668
707
  this.stopMicrophone();
669
708
  this.stopPlayback();
@@ -672,124 +711,17 @@ var BrowserAudioManager = class {
672
711
  this.analyserNode = null;
673
712
  }
674
713
  }
675
- getAudioContext() {
676
- return this.audioContext;
677
- }
678
- };
679
-
680
- // src/voice-agent.ts
681
- var VoiceAgent = class {
682
- client;
683
- audioManager;
684
- options;
685
- isConnected = false;
686
- visemeQueue = [];
687
- constructor(options) {
688
- this.options = options;
689
- this.client = new VoiceAgentClient({
690
- apiKey: options.apiKey,
691
- prompt: options.prompt || "You are a helpful and friendly AI assistant.",
692
- voice: options.voice || "F1" /* F1 */,
693
- language: options.language || "en" /* ENGLISH */,
694
- visemes: options.visemes ?? true,
695
- serverUrl: options.serverUrl,
696
- onTranscription: (text) => {
697
- if (options.onTranscription) options.onTranscription(text, true);
698
- },
699
- onResponse: (text) => {
700
- if (options.onTranscription) options.onTranscription(text, false);
701
- },
702
- onAudio: (data) => {
703
- this.audioManager.playAudio(data);
704
- },
705
- onVisemes: (visemes) => {
706
- this.visemeQueue.push(...visemes);
707
- if (options.onVisemes) options.onVisemes(visemes);
708
- },
709
- onStatus: (status) => {
710
- if (options.onStatusChange) options.onStatusChange(status);
711
- if (status === "interrupted" || status === "thinking") {
712
- this.audioManager.stopPlayback();
713
- this.visemeQueue = [];
714
- }
715
- },
716
- onError: (err) => {
717
- if (options.onError) options.onError(err);
718
- }
719
- });
720
- this.audioManager = new BrowserAudioManager({
721
- autoGainControl: true,
722
- echoCancellation: true,
723
- noiseSuppression: true
724
- });
725
- }
726
714
  /**
727
- * Initialize hardware and connect to the AI server.
728
- * This must be called in response to a user guesture (like a click)
729
- * to satisfy browser AudioContext requirements.
715
+ * Get current audio context state
730
716
  */
731
- async connect() {
732
- try {
733
- await this.audioManager.init();
734
- const connected = await this.client.connect();
735
- if (!connected) return false;
736
- this.isConnected = true;
737
- await this.audioManager.startMicrophone((pcm16Data) => {
738
- if (this.isConnected) {
739
- this.client.sendAudio(pcm16Data);
740
- }
741
- });
742
- return true;
743
- } catch (err) {
744
- if (this.options.onError) this.options.onError(err);
745
- return false;
746
- }
717
+ getState() {
718
+ return this.audioContext?.state ?? null;
747
719
  }
748
720
  /**
749
- * Get the current amplitude/volume of the microphone or output audio.
750
- * Useful for voice activity visualization.
751
- * @returns value between 0 and 1
721
+ * Check if microphone is currently listening
752
722
  */
753
- getAmplitude() {
754
- return this.audioManager.getAmplitude();
755
- }
756
- /**
757
- * Mute or unmute the microphone.
758
- */
759
- toggleMute() {
760
- const currentState = this.audioManager.isMicMuted();
761
- this.audioManager.setMuted(!currentState);
762
- return !currentState;
763
- }
764
- /**
765
- * High-precision method to get visemes that should be active
766
- * at the current playback frame. Use this in a requestAnimationFrame loop.
767
- */
768
- getFrameVisemes() {
769
- const offset = this.audioManager.getAudioClockOffset();
770
- const audioCtx = this.audioManager.getAudioContext();
771
- if (offset === null || !audioCtx) return [];
772
- const streamTime = (audioCtx.currentTime - offset) * 1e3;
773
- const currentBatch = [];
774
- while (this.visemeQueue.length > 0 && this.visemeQueue[0].t * 1e3 <= streamTime) {
775
- currentBatch.push(this.visemeQueue.shift());
776
- }
777
- return currentBatch;
778
- }
779
- /**
780
- * Change the system prompt mid-conversation.
781
- */
782
- updatePrompt(newPrompt) {
783
- this.client.updatePrompt(newPrompt);
784
- }
785
- /**
786
- * Disconnect and release audio resources.
787
- */
788
- disconnect() {
789
- this.isConnected = false;
790
- this.client.disconnect();
791
- this.audioManager.cleanup();
792
- this.visemeQueue = [];
723
+ isRecording() {
724
+ return this.isListening;
793
725
  }
794
726
  };
795
727
  export {
@@ -799,7 +731,6 @@ export {
799
731
  Language,
800
732
  StreamResampler,
801
733
  TTSClient,
802
- VoiceAgent,
803
734
  VoiceAgentClient,
804
735
  VoiceStyle,
805
736
  applyLowPassFilter,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lokutor/sdk",
3
- "version": "1.1.9",
3
+ "version": "1.1.10",
4
4
  "description": "JavaScript/TypeScript SDK for Lokutor Real-time Voice AI",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",