@layercode/js-sdk 1.0.24 → 1.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3519,12 +3519,11 @@ class LayercodeClient {
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
-        this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
         this.currentTurnId = null;
         this.audioBuffer = [];
-        this.audioPauseTime = null;
+        // this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3533,18 +3532,13 @@ class LayercodeClient {
         let isSpeakingByAmplitude = false;
         let silenceFrames = 0;
         const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
-        const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+        const SILENCE_FRAMES_THRESHOLD = 6.4; // 6.4 * 20ms chunks = 128ms silence. Same as Silero ((frame samples: 512 / sampleRate: 16000) * 1000 * redemptionFrames: 4) = 128 ms silence
         // Monitor amplitude changes
         this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
             const wasSpeaking = isSpeakingByAmplitude;
             if (amplitude > AMPLITUDE_THRESHOLD) {
                 silenceFrames = 0;
                 if (!wasSpeaking) {
-                    // Speech started - pause audio if playing and track timing for interruption calculation
-                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
-                        this.audioPauseTime = Date.now();
-                        this.wavPlayer.pause();
-                    }
                     isSpeakingByAmplitude = true;
                     this.userIsSpeaking = true;
                     this.options.onUserIsSpeakingChange(true);
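
The new fallback threshold is deliberately aligned with the Silero configuration introduced further down. A quick sanity check of the arithmetic in that comment (all constants taken from the diff: 20 ms amplitude chunks, frameSamples 512, a 16 kHz sample rate, redemptionFrames 4):

    // Amplitude fallback: frames are 20 ms audio chunks
    const fallbackSilenceMs = 6.4 * 20; // = 128 ms

    // Silero v5: each frame spans frameSamples / sampleRate seconds
    const sileroSilenceMs = (512 / 16000) * 1000 * 4; // 32 ms per frame * 4 frames = 128 ms

Both detectors therefore declare end-of-speech after the same 128 ms of silence.
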
@@ -3557,7 +3551,6 @@ class LayercodeClient {
             else {
                 silenceFrames++;
                 if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
-                    // Speech ended
                     isSpeakingByAmplitude = false;
                     this.userIsSpeaking = false;
                     this.options.onUserIsSpeakingChange(false);
@@ -3575,7 +3568,7 @@ class LayercodeClient {
         if (this.pushToTalkEnabled) {
             return;
         }
-        const timeout = setTimeout(() => {
+        const vadLoadTimeout = setTimeout(() => {
             console.log('silero vad model timeout');
             console.warn('VAD model failed to load - falling back to amplitude-based detection');
             // Send a message to server indicating VAD failure
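
The renamed vadLoadTimeout is a load watchdog: if the Silero model is not ready within 2 seconds, the client reports vad_model_failed and installs the amplitude-based fallback; on a successful load the timer is cleared in the .then handler shown in the next hunk. The pattern in isolation (an illustrative sketch with placeholder names, not the SDK's exact code):

    const vadLoadTimeout = setTimeout(() => {
        console.warn('VAD model failed to load - falling back to amplitude-based detection');
        setupAmplitudeFallback(); // placeholder for this._setupAmplitudeBasedVAD()
    }, 2000);

    loadVadModel() // placeholder for dist.MicVAD.new({ ... })
        .then((vad) => {
            clearTimeout(vadLoadTimeout); // model arrived in time; cancel the fallback
            vad.start();
        })
        .catch((error) => console.error('Error initializing VAD:', error));

Note that the .catch in the next hunk only logs: if the model load rejects, the watchdog is never cleared, so the amplitude fallback still takes over at the 2-second mark.
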
@@ -3583,134 +3576,54 @@ class LayercodeClient {
                 type: 'vad_events',
                 event: 'vad_model_failed',
             });
-            // In automatic mode without VAD, allow the bot to speak initially
-            this.userIsSpeaking = false;
-            this.options.onUserIsSpeakingChange(false);
             // Set up amplitude-based fallback detection
             this._setupAmplitudeBasedVAD();
         }, 2000);
-        if (!this.canInterrupt) {
-            dist.MicVAD.new({
-                stream: this.wavRecorder.getStream() || undefined,
-                model: 'v5',
-                positiveSpeechThreshold: 0.3,
-                negativeSpeechThreshold: 0.2,
-                redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 0,
-                preSpeechPadFrames: 0,
-                onSpeechStart: () => {
-                    this.userIsSpeaking = true;
-                    this.options.onUserIsSpeakingChange(true);
-                    console.log('onSpeechStart: sending vad_start');
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_start',
-                    });
-                },
-                onSpeechEnd: () => {
-                    console.log('onSpeechEnd: sending vad_end');
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                    this.audioBuffer = []; // Clear buffer on speech end
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                    // Send vad_end immediately instead of waiting for next audio chunk
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    this.endUserTurn = false; // Reset the flag after sending vad_end
-                },
-            })
-                .then((vad) => {
-                clearTimeout(timeout);
-                this.vad = vad;
-                this.vad.start();
-                console.log('VAD started');
-            })
-                .catch((error) => {
-                console.error('Error initializing VAD:', error);
-            });
-        }
-        else {
-            dist.MicVAD.new({
-                stream: this.wavRecorder.getStream() || undefined,
-                model: 'v5',
-                // baseAssetPath: '/', // Use if bundling model locally
-                // onnxWASMBasePath: '/', // Use if bundling model locally
-                positiveSpeechThreshold: 0.5,
-                negativeSpeechThreshold: 0.3,
-                redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 25,
-                preSpeechPadFrames: 0,
-                onSpeechStart: () => {
-                    // Only pause agent audio if it's currently playing
-                    if (this.wavPlayer.isPlaying) {
-                        console.log('onSpeechStart: WavPlayer is playing, pausing it.');
-                        this.audioPauseTime = Date.now(); // Track when we paused
-                        this.wavPlayer.pause();
-                    }
-                    else {
-                        console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
-                    }
-                    console.log('onSpeechStart: sending vad_start');
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_start',
-                    });
-                    this.userIsSpeaking = true;
-                    this.options.onUserIsSpeakingChange(true);
-                    this.endUserTurn = false; // Reset endUserTurn when speech starts
-                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                },
-                onVADMisfire: () => {
-                    // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
-                    this.userIsSpeaking = false;
-                    this.audioBuffer = []; // Clear buffer on misfire
-                    this.options.onUserIsSpeakingChange(false);
-                    // Add the missing delay before resuming to prevent race conditions
-                    setTimeout(() => {
-                        if (!this.wavPlayer.isPlaying) {
-                            console.log('onVADMisfire: Resuming after delay');
-                            this.audioPauseTime = null; // Clear pause time since we're resuming
-                            this.wavPlayer.play();
-                        }
-                        else {
-                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
-                            this.endUserTurn = true;
-                        }
-                    }, this.options.vadResumeDelay);
-                },
-                onSpeechEnd: () => {
-                    console.log('onSpeechEnd: sending vad_end');
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                    this.audioBuffer = []; // Clear buffer on speech end
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                    // Send vad_end immediately instead of waiting for next audio chunk
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    this.endUserTurn = false; // Reset the flag after sending vad_end
-                },
-            })
-                .then((vad) => {
-                clearTimeout(timeout);
-                this.vad = vad;
-                this.vad.start();
-                console.log('VAD started');
-            })
-                .catch((error) => {
-                console.error('Error initializing VAD:', error);
-            });
-        }
+        dist.MicVAD.new({
+            stream: this.wavRecorder.getStream() || undefined,
+            model: 'v5',
+            positiveSpeechThreshold: 0.15,
+            negativeSpeechThreshold: 0.05,
+            redemptionFrames: 4,
+            minSpeechFrames: 2,
+            preSpeechPadFrames: 0,
+            frameSamples: 512, // Required for v5 as per https://docs.vad.ricky0123.com/user-guide/algorithm/#configuration
+            onSpeechStart: () => {
+                console.log('onSpeechStart: sending vad_start');
+                this.userIsSpeaking = true;
+                this.options.onUserIsSpeakingChange(true);
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_start',
+                });
+            },
+            onSpeechEnd: () => {
+                console.log('onSpeechEnd: sending vad_end');
+                this.userIsSpeaking = false;
+                this.options.onUserIsSpeakingChange(false);
+                this.audioBuffer = []; // Clear buffer on speech end
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_end',
+                });
+            },
+            // onVADMisfire: () => {
+            //     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd.
+            // },
+        })
+            .then((vad) => {
+            clearTimeout(vadLoadTimeout);
+            this.vad = vad;
+            this.vad.start();
+            console.log('VAD started');
+        })
+            .catch((error) => {
+            console.error('Error initializing VAD:', error);
+        });
     }
     /**
      * Updates the connection status and triggers the callback
      * @param {string} status - New status value
-     * @private
      */
     _setStatus(status) {
         this.status = status;
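
The two canInterrupt-gated MicVAD configurations collapse into one, and the detector becomes far more responsive: positiveSpeechThreshold drops from 0.3/0.5 to 0.15, negativeSpeechThreshold from 0.2/0.3 to 0.05, redemptionFrames from 25 to 4, and minSpeechFrames from 0/25 to 2. With frameSamples: 512 at Silero's 16 kHz input rate (the figure used in the threshold comment above), each VAD frame covers 32 ms, so the new frame counts translate to (a sketch of the conversion):

    const frameMs = (512 / 16000) * 1000; // frameSamples / sampleRate = 32 ms per frame
    const redemptionMs = frameMs * 4;     // redemptionFrames: 4 -> 128 ms of trailing silence before onSpeechEnd
    const minSpeechMs = frameMs * 2;      // minSpeechFrames: 2 -> bursts under 64 ms count as misfires (the onVADMisfire callback is left commented out)
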
@@ -3718,7 +3631,6 @@ class LayercodeClient {
     }
     /**
      * Handles when agent audio finishes playing
-     * @private
      */
     _clientResponseAudioReplayFinished() {
         console.log('clientResponseAudioReplayFinished');
@@ -3731,17 +3643,6 @@ class LayercodeClient {
         const offsetData = await this.wavPlayer.interrupt();
         if (offsetData && this.currentTurnId) {
             let offsetMs = offsetData.currentTime * 1000;
-            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
-            if (this.audioPauseTime) {
-                const pauseDurationMs = Date.now() - this.audioPauseTime;
-                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
-                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
-                offsetMs = adjustedOffsetMs;
-                this.audioPauseTime = null; // Clear the pause time
-            }
-            else {
-                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
-            }
             // Send interruption event with accurate playback offset in milliseconds
             this._wsSend({
                 type: 'trigger.response.audio.interrupted',
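
With the player no longer paused on speech start (both the amplitude fallback above and the old onSpeechStart handler dropped their wavPlayer.pause() calls), the pause-duration compensation became dead code, and the interruption offset is now simply the playback position at the moment of interruption. What remains, in outline (only what is visible in this hunk):

    const offsetData = await this.wavPlayer.interrupt(); // stops playback and reports the position
    const offsetMs = offsetData ? offsetData.currentTime * 1000 : 0; // seconds -> milliseconds, sent unadjusted
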
@@ -3775,7 +3676,6 @@ class LayercodeClient {
     /**
      * Handles incoming WebSocket messages
      * @param {MessageEvent} event - The WebSocket message event
-     * @private
      */
     async _handleWebSocketMessage(event) {
         try {
@@ -3790,12 +3690,10 @@ class LayercodeClient {
                 console.log(message);
                 if (message.role === 'assistant') {
                     // Start tracking new assistant turn
-                    // Note: Don't reset currentTurnId here - let response.audio set it
-                    // This prevents race conditions where text arrives before audio
                     console.log('Assistant turn started, will track new turn ID from audio/text');
                 }
-                else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
-                    // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
+                else if (message.role === 'user' && !this.pushToTalkEnabled) {
+                    // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
                     console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                     await this._clientInterruptAssistantReplay();
                 }
@@ -3817,7 +3715,6 @@ class LayercodeClient {
                     this.currentTurnId = message.turn_id;
                     console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
                 }
-                // Note: We no longer track text content in the client - the pipeline handles interruption estimation
                 break;
             }
             case 'response.data':
@@ -3837,7 +3734,6 @@ class LayercodeClient {
     /**
      * Handles available client browser microphone audio data and sends it over the WebSocket
      * @param {ArrayBuffer} data - The audio data buffer
-     * @private
      */
     _handleDataAvailable(data) {
         try {
@@ -3897,7 +3793,6 @@ class LayercodeClient {
      * @param {WavRecorder | WavStreamPlayer} source - The audio source (recorder or player).
      * @param {(amplitude: number) => void} callback - The callback function to invoke on amplitude change.
      * @param {(amplitude: number) => void} updateInternalState - Function to update the internal amplitude state.
-     * @private
      */
     _setupAmplitudeMonitoring(source, callback, updateInternalState) {
         // Set up amplitude monitoring only if a callback is provided