@layercode/js-sdk 1.0.21 → 1.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -510,6 +510,24 @@ class WavStreamPlayer {
510
510
  this.isPlaying = false;
511
511
  }
512
512
 
513
+ /**
514
+ * Clears interrupted track IDs to prevent memory leaks
515
+ * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
516
+ */
517
+ clearInterruptedTracks(keepTrackIds = []) {
518
+ if (keepTrackIds.length === 0) {
519
+ this.interruptedTrackIds = {};
520
+ } else {
521
+ const newInterruptedTracks = {};
522
+ for (const trackId of keepTrackIds) {
523
+ if (this.interruptedTrackIds[trackId]) {
524
+ newInterruptedTracks[trackId] = true;
525
+ }
526
+ }
527
+ this.interruptedTrackIds = newInterruptedTracks;
528
+ }
529
+ }
530
+
513
531
  /**
514
532
  * Connects the audio context and enables output to speakers
515
533
  * @returns {Promise<true>}
@@ -743,7 +761,7 @@ class WavStreamPlayer {
743
761
  this.analyser.disconnect();
744
762
  }
745
763
 
746
- if (this.context) {
764
+ if (this.context && this.context.state !== 'closed') {
747
765
  this.context.close().catch((err) => console.error("Error closing audio context:", err));
748
766
  }
749
767
 
@@ -3498,13 +3516,15 @@ class LayercodeClient {
3498
3516
  this.agentAudioAmplitude = 0;
3499
3517
  this.sessionId = options.sessionId || null;
3500
3518
  this.pushToTalkActive = false;
3501
- this.vadPausedPlayer = false;
3502
3519
  this.pushToTalkEnabled = false;
3503
3520
  this.canInterrupt = false;
3504
3521
  this.userIsSpeaking = false;
3505
3522
  this.endUserTurn = false;
3506
3523
  this.recorderStarted = false;
3507
3524
  this.readySent = false;
3525
+ this.currentTurnText = '';
3526
+ this.currentTurnId = null;
3527
+ this.audioBuffer = [];
3508
3528
  // Bind event handlers
3509
3529
  this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
3510
3530
  this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3531,17 +3551,48 @@ class LayercodeClient {
3531
3551
  minSpeechFrames: 15,
3532
3552
  preSpeechPadFrames: 0,
3533
3553
  onSpeechStart: () => {
3534
- if (!this.wavPlayer.isPlaying) {
3535
- this.userIsSpeaking = true;
3536
- this.options.onUserIsSpeakingChange(true);
3537
- }
3554
+ this.userIsSpeaking = true;
3555
+ this.options.onUserIsSpeakingChange(true);
3556
+ console.log('onSpeechStart: sending vad_start');
3557
+ this._wsSend({
3558
+ type: 'vad_events',
3559
+ event: 'vad_start',
3560
+ });
3538
3561
  },
3539
3562
  onVADMisfire: () => {
3563
+ console.log('onVADMisfire: Short utterance detected, resuming bot');
3564
+ this.audioBuffer = []; // Clear buffer on misfire
3540
3565
  this.userIsSpeaking = false;
3541
3566
  this.options.onUserIsSpeakingChange(false);
3567
+ // Send vad_end to indicate the short utterance is over
3568
+ this._wsSend({
3569
+ type: 'vad_events',
3570
+ event: 'vad_end',
3571
+ });
3572
+ // End the user's turn
3573
+ this._wsSend({
3574
+ type: 'trigger.turn.end',
3575
+ role: 'user',
3576
+ });
3577
+ // Resume bot audio if it was playing
3578
+ if (!this.wavPlayer.isPlaying) {
3579
+ console.log('onVADMisfire: Resuming bot audio');
3580
+ this.wavPlayer.play();
3581
+ }
3542
3582
  },
3543
3583
  onSpeechEnd: () => {
3544
- this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
3584
+ console.log('onSpeechEnd: sending vad_end');
3585
+ this.endUserTurn = true; // Set flag to indicate that the user turn has ended
3586
+ this.audioBuffer = []; // Clear buffer on speech end
3587
+ this.userIsSpeaking = false;
3588
+ this.options.onUserIsSpeakingChange(false);
3589
+ console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
3590
+ // Send vad_end immediately instead of waiting for next audio chunk
3591
+ this._wsSend({
3592
+ type: 'vad_events',
3593
+ event: 'vad_end',
3594
+ });
3595
+ this.endUserTurn = false; // Reset the flag after sending vad_end
3545
3596
  },
3546
3597
  })
3547
3598
  .then((vad) => {
@@ -3563,41 +3614,59 @@ class LayercodeClient {
3563
3614
  positiveSpeechThreshold: 0.3,
3564
3615
  negativeSpeechThreshold: 0.2,
3565
3616
  redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
3566
- minSpeechFrames: 15,
3617
+ minSpeechFrames: 5,
3567
3618
  preSpeechPadFrames: 0,
3568
3619
  onSpeechStart: () => {
3569
3620
  // Only pause agent audio if it's currently playing
3570
3621
  if (this.wavPlayer.isPlaying) {
3571
3622
  console.log('onSpeechStart: WavPlayer is playing, pausing it.');
3572
3623
  this.wavPlayer.pause();
3573
- this.vadPausedPlayer = true; // VAD is responsible for this pause
3574
3624
  }
3575
3625
  else {
3576
3626
  console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
3577
3627
  }
3578
- this.userIsSpeaking = true;
3579
- this.options.onUserIsSpeakingChange(true);
3580
3628
  console.log('onSpeechStart: sending vad_start');
3581
3629
  this._wsSend({
3582
3630
  type: 'vad_events',
3583
3631
  event: 'vad_start',
3584
3632
  });
3633
+ this.userIsSpeaking = true;
3634
+ this.options.onUserIsSpeakingChange(true);
3635
+ this.endUserTurn = false; // Reset endUserTurn when speech starts
3636
+ console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
3585
3637
  },
3586
3638
  onVADMisfire: () => {
3587
3639
  // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
3588
3640
  this.userIsSpeaking = false;
3641
+ this.audioBuffer = []; // Clear buffer on misfire
3589
3642
  this.options.onUserIsSpeakingChange(false);
3590
- if (this.vadPausedPlayer) {
3591
- console.log('onSpeechEnd: VAD paused the player, resuming');
3592
- this.wavPlayer.play();
3593
- this.vadPausedPlayer = false; // Reset flag
3594
- }
3595
- else {
3596
- console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
3597
- }
3643
+ // Add the missing delay before resuming to prevent race conditions
3644
+ setTimeout(() => {
3645
+ if (!this.wavPlayer.isPlaying) {
3646
+ console.log('onVADMisfire: Resuming after delay');
3647
+ this.wavPlayer.play();
3648
+ this.userIsSpeaking = true;
3649
+ this.options.onUserIsSpeakingChange(true);
3650
+ }
3651
+ else {
3652
+ console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
3653
+ this.endUserTurn = true;
3654
+ }
3655
+ }, this.options.vadResumeDelay);
3598
3656
  },
3599
3657
  onSpeechEnd: () => {
3600
- this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
3658
+ console.log('onSpeechEnd: sending vad_end');
3659
+ this.endUserTurn = true; // Set flag to indicate that the user turn has ended
3660
+ this.audioBuffer = []; // Clear buffer on speech end
3661
+ this.userIsSpeaking = false;
3662
+ this.options.onUserIsSpeakingChange(false);
3663
+ console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
3664
+ // Send vad_end immediately instead of waiting for next audio chunk
3665
+ this._wsSend({
3666
+ type: 'vad_events',
3667
+ event: 'vad_end',
3668
+ });
3669
+ this.endUserTurn = false; // Reset the flag after sending vad_end
3601
3670
  },
3602
3671
  })
3603
3672
  .then((vad) => {
@@ -3631,14 +3700,33 @@ class LayercodeClient {
3631
3700
  reason: 'completed',
3632
3701
  });
3633
3702
  }
3703
+ _estimateWordsHeard(text, playbackOffsetSeconds) {
3704
+ const words = text.split(/\s+/).filter((word) => word.length > 0);
3705
+ const totalWords = words.length;
3706
+ // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
3707
+ const estimatedWordsPerSecond = 2.5;
3708
+ const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
3709
+ const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
3710
+ return { wordsHeard: estimatedWordsHeard, textHeard };
3711
+ }
3634
3712
  async _clientInterruptAssistantReplay() {
3635
- await this.wavPlayer.interrupt();
3636
- // TODO: Use in voice pipeline to know how much of the audio has been played and how much to truncate transcript
3637
- // this._wsSend({
3638
- // type: 'trigger.response.audio.replay_finished',
3639
- // reason: 'interrupted',
3640
- // delta_id: 'TODO'
3641
- // });
3713
+ const offsetData = await this.wavPlayer.interrupt();
3714
+ if (offsetData && this.currentTurnText && this.currentTurnId) {
3715
+ const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
3716
+ const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
3717
+ console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
3718
+ // Send interruption event with context
3719
+ this._wsSend({
3720
+ type: 'trigger.response.audio.interrupted',
3721
+ playback_offset: offsetData.currentTime,
3722
+ interruption_context: {
3723
+ turn_id: this.currentTurnId,
3724
+ estimated_words_heard: wordsHeard,
3725
+ total_words: totalWords,
3726
+ text_heard: textHeard,
3727
+ },
3728
+ });
3729
+ }
3642
3730
  }
3643
3731
  async triggerUserTurnStarted() {
3644
3732
  if (!this.pushToTalkActive) {
@@ -3669,20 +3757,44 @@ class LayercodeClient {
3669
3757
  // Sent from the server to this client when a new user turn is detected
3670
3758
  console.log('received turn.start from server');
3671
3759
  console.log(message);
3672
- if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
3760
+ if (message.role === 'assistant') {
3761
+ // Start tracking new assistant turn
3762
+ // Note: Don't reset currentTurnId here - let response.audio set it
3763
+ // This prevents race conditions where text arrives before audio
3764
+ console.log('Assistant turn started, will track new turn ID from audio/text');
3765
+ }
3766
+ else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
3673
3767
  // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
3674
3768
  console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
3675
3769
  await this._clientInterruptAssistantReplay();
3676
3770
  }
3677
- // if (message.role === 'assistant') {
3678
- // // Clear the buffer of audio when the assisatnt starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
3679
- // console.log('Clearing audio buffer as assistant turn has started');
3680
- // await this._clientInterruptAssistantReplay();
3681
- // }
3682
3771
  break;
3683
3772
  case 'response.audio':
3684
3773
  const audioBuffer = base64ToArrayBuffer(message.content);
3685
3774
  this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
3775
+ // Set current turn ID from first audio message, or update if different turn
3776
+ if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
3777
+ console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
3778
+ const oldTurnId = this.currentTurnId;
3779
+ this.currentTurnId = message.turn_id;
3780
+ this.currentTurnText = ''; // Reset text for new turn
3781
+ // Clean up interrupted tracks, keeping only the current turn
3782
+ this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
3783
+ }
3784
+ break;
3785
+ case 'response.text':
3786
+ // Set turn ID from first text message if not set, or accumulate if matches current turn
3787
+ if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
3788
+ if (!this.currentTurnId) {
3789
+ console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
3790
+ this.currentTurnId = message.turn_id;
3791
+ this.currentTurnText = '';
3792
+ }
3793
+ this.currentTurnText += message.content;
3794
+ }
3795
+ else {
3796
+ console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
3797
+ }
3686
3798
  break;
3687
3799
  // case 'response.end':
3688
3800
  // console.log('received response.end');
@@ -3711,18 +3823,29 @@ class LayercodeClient {
3711
3823
  const base64 = arrayBufferToBase64(data.mono);
3712
3824
  const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
3713
3825
  if (sendAudio) {
3826
+ // If we have buffered audio, send it first
3827
+ if (this.audioBuffer.length > 0) {
3828
+ console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
3829
+ for (const bufferedAudio of this.audioBuffer) {
3830
+ this._wsSend({
3831
+ type: 'client.audio',
3832
+ content: bufferedAudio,
3833
+ });
3834
+ }
3835
+ this.audioBuffer = []; // Clear the buffer after sending
3836
+ }
3837
+ // Send the current audio
3714
3838
  this._wsSend({
3715
3839
  type: 'client.audio',
3716
3840
  content: base64,
3717
3841
  });
3718
- if (this.endUserTurn) {
3719
- this.endUserTurn = false;
3720
- this.userIsSpeaking = false; // Reset userIsSpeaking to false so we don't send any more audio to the server
3721
- this.options.onUserIsSpeakingChange(false);
3722
- this._wsSend({
3723
- type: 'vad_events',
3724
- event: 'vad_end',
3725
- });
3842
+ }
3843
+ else {
3844
+ // Buffer audio when not sending (to catch audio just before VAD triggers)
3845
+ this.audioBuffer.push(base64);
3846
+ // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
3847
+ if (this.audioBuffer.length > 10) {
3848
+ this.audioBuffer.shift(); // Remove oldest chunk
3726
3849
  }
3727
3850
  }
3728
3851
  }
@@ -3857,10 +3980,20 @@ class LayercodeClient {
3857
3980
  }
3858
3981
  }
3859
3982
  async disconnect() {
3860
- var _a;
3983
+ // Clean up VAD if it exists
3984
+ if (this.vad) {
3985
+ this.vad.pause();
3986
+ this.vad.destroy();
3987
+ this.vad = null;
3988
+ }
3861
3989
  this.wavRecorder.quit();
3862
3990
  this.wavPlayer.disconnect();
3863
- (_a = this.ws) === null || _a === void 0 ? void 0 : _a.close();
3991
+ // Close websocket and ensure status is updated
3992
+ if (this.ws) {
3993
+ this.ws.close();
3994
+ this._setStatus('disconnected');
3995
+ this.options.onDisconnect();
3996
+ }
3864
3997
  }
3865
3998
  /**
3866
3999
  * Gets the microphone MediaStream used by this client