@layercode/js-sdk 1.0.22 → 1.0.23

This diff shows the changes between publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
@@ -3528,13 +3528,53 @@ registerProcessor('audio_processor', AudioProcessor);
  this.endUserTurn = false;
  this.recorderStarted = false;
  this.readySent = false;
- this.currentTurnText = '';
  this.currentTurnId = null;
  this.audioBuffer = [];
+ this.audioPauseTime = null;
  // Bind event handlers
  this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
  this._handleDataAvailable = this._handleDataAvailable.bind(this);
  }
+ _setupAmplitudeBasedVAD() {
+ let isSpeakingByAmplitude = false;
+ let silenceFrames = 0;
+ const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+ const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+ // Monitor amplitude changes
+ this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+ const wasSpeaking = isSpeakingByAmplitude;
+ if (amplitude > AMPLITUDE_THRESHOLD) {
+ silenceFrames = 0;
+ if (!wasSpeaking) {
+ // Speech started - pause audio if playing and track timing for interruption calculation
+ if (this.canInterrupt && this.wavPlayer.isPlaying) {
+ this.audioPauseTime = Date.now();
+ this.wavPlayer.pause();
+ }
+ isSpeakingByAmplitude = true;
+ this.userIsSpeaking = true;
+ this.options.onUserIsSpeakingChange(true);
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_start',
+ });
+ }
+ }
+ else {
+ silenceFrames++;
+ if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+ // Speech ended
+ isSpeakingByAmplitude = false;
+ this.userIsSpeaking = false;
+ this.options.onUserIsSpeakingChange(false);
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_end',
+ });
+ }
+ }
+ });
+ }
  _initializeVAD() {
  console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
  // If we're in push to talk mode, we don't need to use the VAD model
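
Note: the new `_setupAmplitudeBasedVAD` is a plain energy gate with hysteresis: speech starts on the first audio chunk whose amplitude clears a fixed floor, and ends only after 30 consecutive sub-threshold chunks (~600 ms at 20 ms per chunk), so brief dips in volume do not end the turn. A minimal standalone sketch of the same state machine, decoupled from the SDK (the function and callback names are hypothetical; the thresholds are copied from the diff):

```js
// Hypothetical helper mirroring the hysteresis in _setupAmplitudeBasedVAD.
function createAmplitudeVAD({ onSpeechStart, onSpeechEnd }) {
  const AMPLITUDE_THRESHOLD = 0.01;    // normalized amplitude floor (0..1)
  const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms of silence at 20ms chunks
  let speaking = false;
  let silenceFrames = 0;
  // Feed one amplitude sample per audio chunk.
  return function onAmplitude(amplitude) {
    if (amplitude > AMPLITUDE_THRESHOLD) {
      silenceFrames = 0;            // any loud chunk resets the silence counter
      if (!speaking) {
        speaking = true;
        onSpeechStart();            // corresponds to the vad_start event above
      }
    } else if (speaking && ++silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
      speaking = false;
      silenceFrames = 0;
      onSpeechEnd();                // corresponds to the vad_end event above
    }
  };
}
```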
@@ -3543,9 +3583,17 @@ registerProcessor('audio_processor', AudioProcessor);
  }
  const timeout = setTimeout(() => {
  console.log('silero vad model timeout');
- // TODO: send message to server to indicate that the vad model timed out
- this.userIsSpeaking = true; // allow audio to be sent to the server
- this.options.onUserIsSpeakingChange(true);
+ console.warn('VAD model failed to load - falling back to amplitude-based detection');
+ // Send a message to server indicating VAD failure
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_model_failed',
+ });
+ // In automatic mode without VAD, allow the bot to speak initially
+ this.userIsSpeaking = false;
+ this.options.onUserIsSpeakingChange(false);
+ // Set up amplitude-based fallback detection
+ this._setupAmplitudeBasedVAD();
  }, 2000);
  if (!this.canInterrupt) {
  dist.MicVAD.new({
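
Note: 1.0.22 handled a Silero model timeout by permanently marking the user as speaking so microphone audio would still reach the server; 1.0.23 instead reports the failure (`vad_model_failed`) and degrades to the amplitude gate above. The general shape of that fallback, sketched with `Promise.race` (the diff itself uses a `setTimeout` guard; the helper names here are hypothetical, the 2000 ms budget is from the diff):

```js
// Sketch only: race the model load against a fixed budget, then degrade.
async function initVADWithFallback(loadSileroVAD, startAmplitudeFallback, notifyServer) {
  const timedOut = new Promise((resolve) => setTimeout(() => resolve(null), 2000));
  const vad = await Promise.race([loadSileroVAD(), timedOut]);
  if (vad) return vad; // model arrived within budget; use the real VAD
  notifyServer({ type: 'vad_events', event: 'vad_model_failed' });
  startAmplitudeFallback(); // degrade to the amplitude gate sketched above
  return null;
}
```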
@@ -3554,7 +3602,7 @@ registerProcessor('audio_processor', AudioProcessor);
  positiveSpeechThreshold: 0.3,
  negativeSpeechThreshold: 0.2,
  redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
- minSpeechFrames: 15,
+ minSpeechFrames: 0,
  preSpeechPadFrames: 0,
  onSpeechStart: () => {
  this.userIsSpeaking = true;
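
Note: these option names match @ricky0123/vad-web's `MicVAD` (imported as `dist.MicVAD` here), where a speech segment that ends with fewer than `minSpeechFrames` positive frames fires `onVADMisfire` instead of `onSpeechEnd`. Dropping the minimum from 15 to 0 means no segment can be "too short", which would explain why the `onVADMisfire` handler is deleted as dead code in the next hunk. The configuration, annotated for reference (interpretive comments, assuming vad-web's documented semantics):

```js
// Options object as configured in 1.0.23 for the non-interruptible branch.
const vadOptions = {
  positiveSpeechThreshold: 0.3, // model score needed to enter "speaking"
  negativeSpeechThreshold: 0.2, // score below which a frame counts as silence
  redemptionFrames: 25,         // silence frames tolerated before speech ends
  minSpeechFrames: 0,           // 1.0.23: no minimum, so segments never misfire
  preSpeechPadFrames: 0,        // no leading audio padding
};
```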
@@ -3565,27 +3613,6 @@ registerProcessor('audio_processor', AudioProcessor);
  event: 'vad_start',
  });
  },
- onVADMisfire: () => {
- console.log('onVADMisfire: Short utterance detected, resuming bot');
- this.audioBuffer = []; // Clear buffer on misfire
- this.userIsSpeaking = false;
- this.options.onUserIsSpeakingChange(false);
- // Send vad_end to indicate the short utterance is over
- this._wsSend({
- type: 'vad_events',
- event: 'vad_end',
- });
- // End the user's turn
- this._wsSend({
- type: 'trigger.turn.end',
- role: 'user',
- });
- // Resume bot audio if it was playing
- if (!this.wavPlayer.isPlaying) {
- console.log('onVADMisfire: Resuming bot audio');
- this.wavPlayer.play();
- }
- },
  onSpeechEnd: () => {
  console.log('onSpeechEnd: sending vad_end');
  this.endUserTurn = true; // Set flag to indicate that the user turn has ended
@@ -3626,6 +3653,7 @@ registerProcessor('audio_processor', AudioProcessor);
  // Only pause agent audio if it's currently playing
  if (this.wavPlayer.isPlaying) {
  console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+ this.audioPauseTime = Date.now(); // Track when we paused
  this.wavPlayer.pause();
  }
  else {
@@ -3650,9 +3678,8 @@ registerProcessor('audio_processor', AudioProcessor);
  setTimeout(() => {
  if (!this.wavPlayer.isPlaying) {
  console.log('onVADMisfire: Resuming after delay');
+ this.audioPauseTime = null; // Clear pause time since we're resuming
  this.wavPlayer.play();
- this.userIsSpeaking = true;
- this.options.onUserIsSpeakingChange(true);
  }
  else {
  console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
@@ -3706,33 +3733,37 @@ registerProcessor('audio_processor', AudioProcessor);
  reason: 'completed',
  });
  }
- _estimateWordsHeard(text, playbackOffsetSeconds) {
- const words = text.split(/\s+/).filter((word) => word.length > 0);
- const totalWords = words.length;
- // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
- const estimatedWordsPerSecond = 2.5;
- const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
- const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
- return { wordsHeard: estimatedWordsHeard, textHeard };
- }
  async _clientInterruptAssistantReplay() {
  const offsetData = await this.wavPlayer.interrupt();
- if (offsetData && this.currentTurnText && this.currentTurnId) {
- const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
- const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
- console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
- // Send interruption event with context
+ if (offsetData && this.currentTurnId) {
+ let offsetMs = offsetData.currentTime * 1000;
+ // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+ if (this.audioPauseTime) {
+ const pauseDurationMs = Date.now() - this.audioPauseTime;
+ const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+ console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+ offsetMs = adjustedOffsetMs;
+ this.audioPauseTime = null; // Clear the pause time
+ }
+ else {
+ console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+ }
+ // Send interruption event with accurate playback offset in milliseconds
  this._wsSend({
  type: 'trigger.response.audio.interrupted',
- playback_offset: offsetData.currentTime,
+ playback_offset: offsetMs,
  interruption_context: {
  turn_id: this.currentTurnId,
- estimated_words_heard: wordsHeard,
- total_words: totalWords,
- text_heard: textHeard,
+ playback_offset_ms: offsetMs,
  },
  });
  }
+ else {
+ console.warn('Interruption requested but missing required data:', {
+ hasOffsetData: !!offsetData,
+ hasTurnId: !!this.currentTurnId,
+ });
+ }
  }
  async triggerUserTurnStarted() {
  if (!this.pushToTalkActive) {
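
Note: the interruption payload changes from a word-level estimate to a raw playback offset in milliseconds. Per the patch's accounting, the offset reported by `wavPlayer.interrupt()` includes time spent paused while the user was speaking, so `audioPauseTime` is recorded at pause and the wall-clock pause duration is subtracted back out. The correction, isolated (hypothetical helper name; the arithmetic mirrors the diff):

```js
// Sketch of the offset correction in _clientInterruptAssistantReplay.
function adjustedPlaybackOffsetMs(offsetSeconds, pausedAtMs, nowMs = Date.now()) {
  const rawMs = offsetSeconds * 1000;
  if (pausedAtMs == null) return rawMs;       // playback was never paused
  const pauseDurationMs = nowMs - pausedAtMs; // time the player sat paused
  return Math.max(0, rawMs - pauseDurationMs); // clamp so it never goes negative
}

// Example: interrupt() reports 4.2s but playback was paused 1500ms ago:
// adjustedPlaybackOffsetMs(4.2, Date.now() - 1500) === 2700
```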
@@ -3781,30 +3812,20 @@ registerProcessor('audio_processor', AudioProcessor);
  // Set current turn ID from first audio message, or update if different turn
  if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
  console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
- const oldTurnId = this.currentTurnId;
  this.currentTurnId = message.turn_id;
- this.currentTurnText = ''; // Reset text for new turn
  // Clean up interrupted tracks, keeping only the current turn
  this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
  }
  break;
- case 'response.text':
- // Set turn ID from first text message if not set, or accumulate if matches current turn
- if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
- if (!this.currentTurnId) {
- console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
- this.currentTurnId = message.turn_id;
- this.currentTurnText = '';
- }
- this.currentTurnText += message.content;
- }
- else {
- console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+ case 'response.text': {
+ // Set turn ID from first text message if not set
+ if (!this.currentTurnId) {
+ this.currentTurnId = message.turn_id;
+ console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
  }
+ // Note: We no longer track text content in the client - the pipeline handles interruption estimation
  break;
- // case 'response.end':
- // console.log('received response.end');
- // break;
+ }
  case 'response.data':
  console.log('received response.data', message);
  this.options.onDataMessage(message);
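
Note: with `currentTurnText` gone, the client no longer accumulates transcript text at all; per the comment in this hunk, the pipeline now owns interruption estimation. A hypothetical server-side counterpart of the removed `_estimateWordsHeard`, driven by the `playback_offset_ms` the client now sends (same ~150 wpm heuristic as the deleted client code; this is not part of the SDK):

```js
// Hypothetical pipeline-side helper: estimate how much of a turn's text the
// user heard before interrupting, given the playback offset in milliseconds.
function estimateTextHeard(turnText, playbackOffsetMs) {
  const words = turnText.split(/\s+/).filter((w) => w.length > 0);
  const wordsPerSecond = 2.5; // ~150 words per minute speaking rate
  const wordsHeard = Math.min(
    Math.floor((playbackOffsetMs / 1000) * wordsPerSecond),
    words.length
  );
  return {
    wordsHeard,
    totalWords: words.length,
    textHeard: words.slice(0, wordsHeard).join(' '),
  };
}
```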
@@ -3908,6 +3929,8 @@ registerProcessor('audio_processor', AudioProcessor);
  async connect() {
  try {
  this._setStatus('connecting');
+ // Reset turn tracking for clean start
+ this._resetTurnTracking();
  // Get session key from server
  let authorizeSessionRequestBody = {
  pipeline_id: this.options.pipelineId,
@@ -3985,6 +4008,10 @@ registerProcessor('audio_processor', AudioProcessor);
  throw error;
  }
  }
+ _resetTurnTracking() {
+ this.currentTurnId = null;
+ console.log('Reset turn tracking state');
+ }
  async disconnect() {
  // Clean up VAD if it exists
  if (this.vad) {
@@ -3994,6 +4021,8 @@ registerProcessor('audio_processor', AudioProcessor);
  }
  this.wavRecorder.quit();
  this.wavPlayer.disconnect();
+ // Reset turn tracking
+ this._resetTurnTracking();
  // Close websocket and ensure status is updated
  if (this.ws) {
  this.ws.close();
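
Note: `_resetTurnTracking` now runs on both `connect()` and `disconnect()`, so a reconnect cannot attribute an interruption to a `turn_id` left over from an earlier session. A lifecycle sketch (the `LayercodeClient` constructor name and its options are assumed; this diff only shows the class internals):

```js
// Assumed client class; the diff shows pipeline_id coming from options.pipelineId.
const client = new LayercodeClient({ /* pipelineId, callbacks, ... */ });

await client.connect();    // 1.0.23: _resetTurnTracking() clears currentTurnId
// ...conversation runs; currentTurnId follows response.audio / response.text
await client.disconnect(); // reset again on teardown
await client.connect();    // fresh session starts with no stale turn state
```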