@layercode/js-sdk 1.0.21 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -510,6 +510,24 @@ class WavStreamPlayer {
     this.isPlaying = false;
   }
 
+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
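For orientation, a minimal sketch of how a caller might use the new method; `player` and `activeTurnId` are hypothetical names, but the pattern mirrors the call the SDK itself makes in its `response.audio` handler later in this diff:

// Keep only the active turn's entry so the interrupted-track map
// cannot grow without bound over a long session.
player.clearInterruptedTracks(activeTurnId ? [activeTurnId] : []);
// Calling it with no argument (or an empty array) clears the map entirely:
player.clearInterruptedTracks();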
@@ -743,7 +761,7 @@ class WavStreamPlayer {
       this.analyser.disconnect();
     }
 
-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }
 
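This guard matters because `AudioContext.close()` rejects with an `InvalidStateError` once the context is already closed, so a repeated `disconnect()` would previously have logged a spurious "Error closing audio context" message; checking `state !== 'closed'` makes the teardown idempotent.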
@@ -3498,17 +3516,59 @@ class LayercodeClient {
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
     this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
+    this.currentTurnId = null;
+    this.audioBuffer = [];
+    this.audioPauseTime = null;
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
   }
+  _setupAmplitudeBasedVAD() {
+    let isSpeakingByAmplitude = false;
+    let silenceFrames = 0;
+    const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+    const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+    // Monitor amplitude changes
+    this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+      const wasSpeaking = isSpeakingByAmplitude;
+      if (amplitude > AMPLITUDE_THRESHOLD) {
+        silenceFrames = 0;
+        if (!wasSpeaking) {
+          // Speech started - pause audio if playing and track timing for interruption calculation
+          if (this.canInterrupt && this.wavPlayer.isPlaying) {
+            this.audioPauseTime = Date.now();
+            this.wavPlayer.pause();
+          }
+          isSpeakingByAmplitude = true;
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
+        }
+      }
+      else {
+        silenceFrames++;
+        if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+          // Speech ended
+          isSpeakingByAmplitude = false;
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        }
+      }
+    });
+  }
   _initializeVAD() {
     console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
     // If we're in push to talk mode, we don't need to use the VAD model
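To make the fallback's control flow easier to follow, here is the same hysteresis scheme isolated from the SDK as a minimal sketch. The ~20 ms frame interval is an assumption taken from the diff's own `SILENCE_FRAMES_THRESHOLD` comment, and the `makeAmplitudeVAD` name is invented for illustration:

// Returns a per-frame callback: fires onStart on the first loud frame,
// and onEnd only after 30 consecutive quiet frames (~600 ms at 20 ms/frame).
function makeAmplitudeVAD(onStart, onEnd) {
  const AMPLITUDE_THRESHOLD = 0.01;    // same threshold as the diff
  const SILENCE_FRAMES_THRESHOLD = 30; // same hysteresis as the diff
  let speaking = false;
  let silenceFrames = 0;
  return (amplitude) => {
    if (amplitude > AMPLITUDE_THRESHOLD) {
      silenceFrames = 0;
      if (!speaking) {
        speaking = true;
        onStart();
      }
    }
    else if (speaking && ++silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
      speaking = false;
      onEnd();
    }
  };
}

The asymmetry (instant start, delayed end) is what keeps short pauses inside a sentence from being reported as turn boundaries.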
@@ -3517,9 +3577,17 @@ class LayercodeClient {
     }
     const timeout = setTimeout(() => {
       console.log('silero vad model timeout');
-      // TODO: send message to server to indicate that the vad model timed out
-      this.userIsSpeaking = true; // allow audio to be sent to the server
-      this.options.onUserIsSpeakingChange(true);
+      console.warn('VAD model failed to load - falling back to amplitude-based detection');
+      // Send a message to server indicating VAD failure
+      this._wsSend({
+        type: 'vad_events',
+        event: 'vad_model_failed',
+      });
+      // In automatic mode without VAD, allow the bot to speak initially
+      this.userIsSpeaking = false;
+      this.options.onUserIsSpeakingChange(false);
+      // Set up amplitude-based fallback detection
+      this._setupAmplitudeBasedVAD();
     }, 2000);
     if (!this.canInterrupt) {
       dist.MicVAD.new({
@@ -3528,20 +3596,30 @@ class LayercodeClient {
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames: 15,
+        minSpeechFrames: 0,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-          if (!this.wavPlayer.isPlaying) {
-            this.userIsSpeaking = true;
-            this.options.onUserIsSpeakingChange(true);
-          }
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
-        onVADMisfire: () => {
+        onSpeechEnd: () => {
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
           this.userIsSpeaking = false;
           this.options.onUserIsSpeakingChange(false);
-        },
-        onSpeechEnd: () => {
-          this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
         .then((vad) => {
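Worth noting: in @ricky0123/vad-web, which `dist.MicVAD` appears to be, `onVADMisfire` fires when a detected speech segment is shorter than `minSpeechFrames` (the diff's own comment in the next hunk says as much). Dropping the threshold to 0 in this non-interruptible branch means no detection can be classified as a misfire, which is consistent with this hunk also replacing the `onVADMisfire` handler with `onSpeechEnd`.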
@@ -3563,41 +3641,59 @@ class LayercodeClient {
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames: 15,
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+            this.audioPauseTime = Date.now(); // Track when we paused
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
-          this.options.onUserIsSpeakingChange(true);
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
+          this.audioBuffer = []; // Clear buffer on misfire
           this.options.onUserIsSpeakingChange(false);
-          if (this.vadPausedPlayer) {
-            console.log('onSpeechEnd: VAD paused the player, resuming');
-            this.wavPlayer.play();
-            this.vadPausedPlayer = false; // Reset flag
-          }
-          else {
-            console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
-          }
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.audioPauseTime = null; // Clear pause time since we're resuming
+              this.wavPlayer.play();
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
-          this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
        .then((vad) => {
@@ -3632,13 +3728,36 @@ class LayercodeClient {
     });
   }
   async _clientInterruptAssistantReplay() {
-    await this.wavPlayer.interrupt();
-    // TODO: Use in voice pipeline to know how much of the audio has been played and how much to truncate transcript
-    // this._wsSend({
-    //   type: 'trigger.response.audio.replay_finished',
-    //   reason: 'interrupted',
-    //   delta_id: 'TODO'
-    // });
+    const offsetData = await this.wavPlayer.interrupt();
+    if (offsetData && this.currentTurnId) {
+      let offsetMs = offsetData.currentTime * 1000;
+      // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+      if (this.audioPauseTime) {
+        const pauseDurationMs = Date.now() - this.audioPauseTime;
+        const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+        console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+        offsetMs = adjustedOffsetMs;
+        this.audioPauseTime = null; // Clear the pause time
+      }
+      else {
+        console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+      }
+      // Send interruption event with accurate playback offset in milliseconds
+      this._wsSend({
+        type: 'trigger.response.audio.interrupted',
+        playback_offset: offsetMs,
+        interruption_context: {
+          turn_id: this.currentTurnId,
+          playback_offset_ms: offsetMs,
+        },
+      });
+    }
+    else {
+      console.warn('Interruption requested but missing required data:', {
+        hasOffsetData: !!offsetData,
+        hasTurnId: !!this.currentTurnId,
+      });
+    }
   }
   async triggerUserTurnStarted() {
     if (!this.pushToTalkActive) {
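A worked example of the offset arithmetic above, with made-up numbers, under the assumption (implied by the subtraction) that the reported `currentTime` keeps advancing while the player is paused:

// Player reports 4.2 s of playback, but VAD paused it 1.5 s ago:
const rawOffsetMs = 4.2 * 1000;                              // 4200
const pauseDurationMs = 1500;                                // Date.now() - audioPauseTime
const adjusted = Math.max(0, rawOffsetMs - pauseDurationMs); // 2700 ms actually heard

The `Math.max(0, ...)` clamp guards against a pause that outlasts the reported playback position, which would otherwise yield a negative offset.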
@@ -3669,24 +3788,38 @@ class LayercodeClient {
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          this.currentTurnId = message.turn_id;
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
         break;
-      // case 'response.end':
-      //   console.log('received response.end');
-      //   break;
+      case 'response.text': {
+        // Set turn ID from first text message if not set
+        if (!this.currentTurnId) {
+          this.currentTurnId = message.turn_id;
+          console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+        }
+        // Note: We no longer track text content in the client - the pipeline handles interruption estimation
+        break;
+      }
       case 'response.data':
         console.log('received response.data', message);
         this.options.onDataMessage(message);
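One small observation on the `response.audio` branch: `!this.currentTurnId || this.currentTurnId !== message.turn_id` is equivalent in practice to plain `this.currentTurnId !== message.turn_id` (assuming `turn_id` is always a non-empty string); the longer form just spells out the "first audio message" case named in the comment.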
@@ -3711,18 +3844,29 @@ class LayercodeClient {
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
        content: base64,
       });
-      if (this.endUserTurn) {
-        this.endUserTurn = false;
-        this.userIsSpeaking = false; // Reset userIsSpeaking to false so we don't send any more audio to the server
-        this.options.onUserIsSpeakingChange(false);
-        this._wsSend({
-          type: 'vad_events',
-          event: 'vad_end',
-        });
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
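The `else` branch implements a small look-behind ring buffer. Isolated, under the same ~20 ms-per-chunk assumption as the inline comment, the pattern is:

const MAX_CHUNKS = 10; // ≈ 200 ms of look-behind at 20 ms per chunk
const preSpeech = [];
function onChunk(chunk, sending, send) {
  if (sending) {
    preSpeech.splice(0).forEach(send); // flush buffered chunks, oldest first
    send(chunk);                       // then the live chunk
  }
  else {
    preSpeech.push(chunk);
    if (preSpeech.length > MAX_CHUNKS) {
      preSpeech.shift();               // drop the oldest chunk
    }
  }
}

The point is that the chunks captured just before the VAD fires are replayed to the server ahead of the live stream, so the onset of speech isn't lost.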
@@ -3779,6 +3923,8 @@ class LayercodeClient {
   async connect() {
     try {
       this._setStatus('connecting');
+      // Reset turn tracking for clean start
+      this._resetTurnTracking();
       // Get session key from server
       let authorizeSessionRequestBody = {
         pipeline_id: this.options.pipelineId,
@@ -3856,11 +4002,27 @@ class LayercodeClient {
       throw error;
     }
   }
+  _resetTurnTracking() {
+    this.currentTurnId = null;
+    console.log('Reset turn tracking state');
+  }
   async disconnect() {
-    var _a;
+    // Clean up VAD if it exists
+    if (this.vad) {
+      this.vad.pause();
+      this.vad.destroy();
+      this.vad = null;
+    }
     this.wavRecorder.quit();
     this.wavPlayer.disconnect();
-    (_a = this.ws) === null || _a === void 0 ? void 0 : _a.close();
+    // Reset turn tracking
+    this._resetTurnTracking();
+    // Close websocket and ensure status is updated
+    if (this.ws) {
+      this.ws.close();
+      this._setStatus('disconnected');
+      this.options.onDisconnect();
+    }
   }
   /**
    * Gets the microphone MediaStream used by this client
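Two behavioral changes in `disconnect()` are easy to miss: the compiled optional-chaining helper (`var _a; (_a = this.ws) ... _a.close()`) becomes a plain `if (this.ws)` block that now also drives `_setStatus('disconnected')` and `onDisconnect()` explicitly rather than relying on the socket's close event, and the VAD instance (when present) is paused and destroyed so its microphone and model resources can be released.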