@layercode/js-sdk 1.0.24 → 1.0.26

This diff compares the published contents of two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -3525,12 +3525,11 @@ registerProcessor('audio_processor', AudioProcessor);
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
-    this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
     this.currentTurnId = null;
     this.audioBuffer = [];
-    this.audioPauseTime = null;
+    // this.audioPauseTime = null;
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3539,18 +3538,13 @@ registerProcessor('audio_processor', AudioProcessor);
     let isSpeakingByAmplitude = false;
     let silenceFrames = 0;
     const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
-    const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+    const SILENCE_FRAMES_THRESHOLD = 6.4; // 6.4 * 20ms chunks = 128ms silence. Same as Silero ((frame samples: 512 / sampleRate: 16000) * 1000 * redemptionFrames: 4) = 128 ms silence
     // Monitor amplitude changes
     this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
         const wasSpeaking = isSpeakingByAmplitude;
         if (amplitude > AMPLITUDE_THRESHOLD) {
             silenceFrames = 0;
             if (!wasSpeaking) {
-                // Speech started - pause audio if playing and track timing for interruption calculation
-                if (this.canInterrupt && this.wavPlayer.isPlaying) {
-                    this.audioPauseTime = Date.now();
-                    this.wavPlayer.pause();
-                }
                 isSpeakingByAmplitude = true;
                 this.userIsSpeaking = true;
                 this.options.onUserIsSpeakingChange(true);
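
The arithmetic in the new threshold comment checks out; a quick sanity check in plain JavaScript, with the constants copied from the diff:

    // Amplitude fallback: 6.4 frames at 20 ms per chunk
    const chunkMs = 20;
    const fallbackSilenceMs = 6.4 * chunkMs; // 128 ms

    // Silero v5 equivalent: (frameSamples / sampleRate) * 1000 * redemptionFrames
    const frameSamples = 512;
    const sampleRate = 16000;
    const redemptionFrames = 4;
    const sileroSilenceMs = (frameSamples / sampleRate) * 1000 * redemptionFrames; // 32 ms/frame * 4 = 128 ms

    console.log(fallbackSilenceMs === sileroSilenceMs); // true

One practical nuance: silenceFrames is an integer counter, so the silenceFrames >= 6.4 comparison first passes on the 7th silent frame, i.e. about 140 ms in practice.
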
@@ -3563,7 +3557,6 @@ registerProcessor('audio_processor', AudioProcessor);
         else {
             silenceFrames++;
             if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
-                // Speech ended
                 isSpeakingByAmplitude = false;
                 this.userIsSpeaking = false;
                 this.options.onUserIsSpeakingChange(false);
@@ -3581,7 +3574,7 @@ registerProcessor('audio_processor', AudioProcessor);
     if (this.pushToTalkEnabled) {
         return;
     }
-    const timeout = setTimeout(() => {
+    const vadLoadTimeout = setTimeout(() => {
         console.log('silero vad model timeout');
         console.warn('VAD model failed to load - falling back to amplitude-based detection');
         // Send a message to server indicating VAD failure
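
The rename from timeout to vadLoadTimeout clarifies that this is a load watchdog: if the Silero model has not initialized within 2 seconds, the SDK falls back to amplitude-based detection, and a successful load cancels the watchdog via clearTimeout further down. The pattern in isolation, with hypothetical initModel/useFallback stand-ins:

    // Watchdog: arm a fallback, cancel it if the async init settles in time.
    const watchdog = setTimeout(() => useFallback(), 2000); // useFallback: hypothetical stand-in
    initModel() // hypothetical stand-in for dist.MicVAD.new(...)
        .then((model) => {
            clearTimeout(watchdog); // loaded in time, no fallback needed
            model.start();
        })
        .catch((error) => console.error('init failed:', error));

Note that clearTimeout only helps when the promise settles before the deadline; if the model resolves after 2 seconds, the fallback has already been armed and both paths end up running.
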
@@ -3589,134 +3582,54 @@ registerProcessor('audio_processor', AudioProcessor);
             type: 'vad_events',
             event: 'vad_model_failed',
         });
-        // In automatic mode without VAD, allow the bot to speak initially
-        this.userIsSpeaking = false;
-        this.options.onUserIsSpeakingChange(false);
         // Set up amplitude-based fallback detection
         this._setupAmplitudeBasedVAD();
     }, 2000);
-    if (!this.canInterrupt) {
-        dist.MicVAD.new({
-            stream: this.wavRecorder.getStream() || undefined,
-            model: 'v5',
-            positiveSpeechThreshold: 0.3,
-            negativeSpeechThreshold: 0.2,
-            redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-            minSpeechFrames: 0,
-            preSpeechPadFrames: 0,
-            onSpeechStart: () => {
-                this.userIsSpeaking = true;
-                this.options.onUserIsSpeakingChange(true);
-                console.log('onSpeechStart: sending vad_start');
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_start',
-                });
-            },
-            onSpeechEnd: () => {
-                console.log('onSpeechEnd: sending vad_end');
-                this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                this.audioBuffer = []; // Clear buffer on speech end
-                this.userIsSpeaking = false;
-                this.options.onUserIsSpeakingChange(false);
-                console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                // Send vad_end immediately instead of waiting for next audio chunk
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_end',
-                });
-                this.endUserTurn = false; // Reset the flag after sending vad_end
-            },
-        })
-            .then((vad) => {
-                clearTimeout(timeout);
-                this.vad = vad;
-                this.vad.start();
-                console.log('VAD started');
-            })
-            .catch((error) => {
-                console.error('Error initializing VAD:', error);
-            });
-    }
-    else {
-        dist.MicVAD.new({
-            stream: this.wavRecorder.getStream() || undefined,
-            model: 'v5',
-            // baseAssetPath: '/', // Use if bundling model locally
-            // onnxWASMBasePath: '/', // Use if bundling model locally
-            positiveSpeechThreshold: 0.5,
-            negativeSpeechThreshold: 0.3,
-            redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-            minSpeechFrames: 25,
-            preSpeechPadFrames: 0,
-            onSpeechStart: () => {
-                // Only pause agent audio if it's currently playing
-                if (this.wavPlayer.isPlaying) {
-                    console.log('onSpeechStart: WavPlayer is playing, pausing it.');
-                    this.audioPauseTime = Date.now(); // Track when we paused
-                    this.wavPlayer.pause();
-                }
-                else {
-                    console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
-                }
-                console.log('onSpeechStart: sending vad_start');
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_start',
-                });
-                this.userIsSpeaking = true;
-                this.options.onUserIsSpeakingChange(true);
-                this.endUserTurn = false; // Reset endUserTurn when speech starts
-                console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-            },
-            onVADMisfire: () => {
-                // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
-                this.userIsSpeaking = false;
-                this.audioBuffer = []; // Clear buffer on misfire
-                this.options.onUserIsSpeakingChange(false);
-                // Add the missing delay before resuming to prevent race conditions
-                setTimeout(() => {
-                    if (!this.wavPlayer.isPlaying) {
-                        console.log('onVADMisfire: Resuming after delay');
-                        this.audioPauseTime = null; // Clear pause time since we're resuming
-                        this.wavPlayer.play();
-                    }
-                    else {
-                        console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
-                        this.endUserTurn = true;
-                    }
-                }, this.options.vadResumeDelay);
-            },
-            onSpeechEnd: () => {
-                console.log('onSpeechEnd: sending vad_end');
-                this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                this.audioBuffer = []; // Clear buffer on speech end
-                this.userIsSpeaking = false;
-                this.options.onUserIsSpeakingChange(false);
-                console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                // Send vad_end immediately instead of waiting for next audio chunk
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_end',
-                });
-                this.endUserTurn = false; // Reset the flag after sending vad_end
-            },
-        })
-            .then((vad) => {
-                clearTimeout(timeout);
-                this.vad = vad;
-                this.vad.start();
-                console.log('VAD started');
-            })
-            .catch((error) => {
-                console.error('Error initializing VAD:', error);
-            });
-    }
+    dist.MicVAD.new({
+        stream: this.wavRecorder.getStream() || undefined,
+        model: 'v5',
+        positiveSpeechThreshold: 0.15,
+        negativeSpeechThreshold: 0.05,
+        redemptionFrames: 4,
+        minSpeechFrames: 2,
+        preSpeechPadFrames: 0,
+        frameSamples: 512, // Required for v5 as per https://docs.vad.ricky0123.com/user-guide/algorithm/#configuration
+        onSpeechStart: () => {
+            console.log('onSpeechStart: sending vad_start');
+            this.userIsSpeaking = true;
+            this.options.onUserIsSpeakingChange(true);
+            this._wsSend({
+                type: 'vad_events',
+                event: 'vad_start',
+            });
+        },
+        onSpeechEnd: () => {
+            console.log('onSpeechEnd: sending vad_end');
+            this.userIsSpeaking = false;
+            this.options.onUserIsSpeakingChange(false);
+            this.audioBuffer = []; // Clear buffer on speech end
+            this._wsSend({
+                type: 'vad_events',
+                event: 'vad_end',
+            });
+        },
+        // onVADMisfire: () => {
+        //     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd.
+        // },
+    })
+        .then((vad) => {
+            clearTimeout(vadLoadTimeout);
+            this.vad = vad;
+            this.vad.start();
+            console.log('VAD started');
+        })
+        .catch((error) => {
+            console.error('Error initializing VAD:', error);
+        });
 }
 /**
  * Updates the connection status and triggers the callback
  * @param {string} status - New status value
- * @private
  */
 _setStatus(status) {
     this.status = status;
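
Taken together, the single MicVAD configuration that replaces both canInterrupt branches is tuned for much faster turn-taking. A rough reading of what the new numbers mean at Silero v5's 512-sample / 16 kHz framing (derived from the config above, not from package documentation):

    // Each Silero v5 frame spans 512 samples at 16 kHz:
    const frameMs = (512 / 16000) * 1000; // 32 ms
    const minSpeechMs = 2 * frameMs;      // 64 ms of speech before a segment counts (minSpeechFrames: 2)
    const redemptionMs = 4 * frameMs;     // 128 ms of silence before onSpeechEnd fires (redemptionFrames: 4)

The removed configurations waited 25 redemption frames before ending a turn, so end-of-turn detection should now land noticeably sooner.
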
@@ -3724,7 +3637,6 @@ registerProcessor('audio_processor', AudioProcessor);
 }
 /**
  * Handles when agent audio finishes playing
- * @private
  */
 _clientResponseAudioReplayFinished() {
     console.log('clientResponseAudioReplayFinished');
@@ -3737,17 +3649,6 @@ registerProcessor('audio_processor', AudioProcessor);
     const offsetData = await this.wavPlayer.interrupt();
     if (offsetData && this.currentTurnId) {
         let offsetMs = offsetData.currentTime * 1000;
-        // Calculate accurate offset by subtracting pause time if audio was paused for VAD
-        if (this.audioPauseTime) {
-            const pauseDurationMs = Date.now() - this.audioPauseTime;
-            const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
-            console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
-            offsetMs = adjustedOffsetMs;
-            this.audioPauseTime = null; // Clear the pause time
-        }
-        else {
-            console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
-        }
         // Send interruption event with accurate playback offset in milliseconds
         this._wsSend({
             type: 'trigger.response.audio.interrupted',
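
With audioPauseTime gone, the interruption offset reduces to the raw playback position at the moment of interrupt. A minimal sketch of the simplified flow as a standalone function (hypothetical wrapper; the payload fields after type are elided in the hunk above, so they are elided here too):

    // Hypothetical distillation of the simplified interruption path.
    async function reportInterruption(wavPlayer, wsSend, currentTurnId) {
        const offsetData = await wavPlayer.interrupt(); // stops playback, returns the current position
        if (offsetData && currentTurnId) {
            const offsetMs = offsetData.currentTime * 1000; // no pause-duration adjustment needed anymore
            wsSend({
                type: 'trigger.response.audio.interrupted',
                // ...remaining fields as in the SDK (elided in the diff)
            });
        }
    }
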
@@ -3781,7 +3682,6 @@ registerProcessor('audio_processor', AudioProcessor);
 /**
  * Handles incoming WebSocket messages
  * @param {MessageEvent} event - The WebSocket message event
- * @private
  */
 async _handleWebSocketMessage(event) {
     try {
@@ -3796,12 +3696,10 @@ registerProcessor('audio_processor', AudioProcessor);
             console.log(message);
             if (message.role === 'assistant') {
                 // Start tracking new assistant turn
-                // Note: Don't reset currentTurnId here - let response.audio set it
-                // This prevents race conditions where text arrives before audio
                 console.log('Assistant turn started, will track new turn ID from audio/text');
             }
-            else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
-                // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
+            else if (message.role === 'user' && !this.pushToTalkEnabled) {
+                // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
                 console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                 await this._clientInterruptAssistantReplay();
             }
@@ -3823,7 +3721,6 @@ registerProcessor('audio_processor', AudioProcessor);
                 this.currentTurnId = message.turn_id;
                 console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
             }
-            // Note: We no longer track text content in the client - the pipeline handles interruption estimation
             break;
         }
         case 'response.data':
@@ -3843,7 +3740,6 @@ registerProcessor('audio_processor', AudioProcessor);
 /**
  * Handles available client browser microphone audio data and sends it over the WebSocket
  * @param {ArrayBuffer} data - The audio data buffer
- * @private
  */
 _handleDataAvailable(data) {
     try {
@@ -3903,7 +3799,6 @@ registerProcessor('audio_processor', AudioProcessor);
  * @param {WavRecorder | WavStreamPlayer} source - The audio source (recorder or player).
  * @param {(amplitude: number) => void} callback - The callback function to invoke on amplitude change.
  * @param {(amplitude: number) => void} updateInternalState - Function to update the internal amplitude state.
- * @private
  */
 _setupAmplitudeMonitoring(source, callback, updateInternalState) {
     // Set up amplitude monitoring only if a callback is provided