@layercode/js-sdk 1.0.19 → 1.0.22

@@ -510,6 +510,24 @@ class WavStreamPlayer {
         this.isPlaying = false;
     }
 
+    /**
+     * Clears interrupted track IDs to prevent memory leaks
+     * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+     */
+    clearInterruptedTracks(keepTrackIds = []) {
+        if (keepTrackIds.length === 0) {
+            this.interruptedTrackIds = {};
+        } else {
+            const newInterruptedTracks = {};
+            for (const trackId of keepTrackIds) {
+                if (this.interruptedTrackIds[trackId]) {
+                    newInterruptedTracks[trackId] = true;
+                }
+            }
+            this.interruptedTrackIds = newInterruptedTracks;
+        }
+    }
+
     /**
      * Connects the audio context and enables output to speakers
      * @returns {Promise<true>}
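Note: clearInterruptedTracks bounds the interruptedTrackIds map, which previously grew for the lifetime of the player. A minimal usage sketch, assuming an existing WavStreamPlayer instance named player and a caller-tracked currentTurnId (both illustrative, not part of the SDK):

    // Drop every stale interrupted-track entry except the active turn,
    // so the interruptedTrackIds map cannot grow without bound.
    player.clearInterruptedTracks(currentTurnId ? [currentTurnId] : []);

The SDK itself calls this from the response.audio handler further down whenever the turn ID changes.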
@@ -743,7 +761,7 @@ class WavStreamPlayer {
             this.analyser.disconnect();
         }
 
-        if (this.context) {
+        if (this.context && this.context.state !== 'closed') {
             this.context.close().catch((err) => console.error("Error closing audio context:", err));
         }
 
@@ -3482,6 +3500,7 @@ class LayercodeClient {
             onUserAmplitudeChange: options.onUserAmplitudeChange || (() => { }),
             onAgentAmplitudeChange: options.onAgentAmplitudeChange || (() => { }),
             onStatusChange: options.onStatusChange || (() => { }),
+            onUserIsSpeakingChange: options.onUserIsSpeakingChange || (() => { }),
         };
         this.AMPLITUDE_MONITORING_SAMPLE_RATE = 10;
         this._websocketUrl = 'wss://api.layercode.com/v1/pipelines/websocket';
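Note: 1.0.22 adds an onUserIsSpeakingChange callback alongside the existing amplitude and status callbacks, defaulting to a no-op. A wiring sketch, assuming the remaining constructor options (pipeline/session configuration, held in connectionOptions here) are unchanged from 1.0.19:

    const client = new LayercodeClient({
      ...connectionOptions, // pipeline/session config as in earlier versions (assumed)
      onStatusChange: (status) => console.log('status:', status),
      onUserIsSpeakingChange: (speaking) => {
        // e.g. drive a hypothetical mic indicator element in the UI
        micIndicatorEl.classList.toggle('active', speaking);
      },
    });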
@@ -3497,13 +3516,15 @@ class LayercodeClient {
         this.agentAudioAmplitude = 0;
         this.sessionId = options.sessionId || null;
         this.pushToTalkActive = false;
-        this.vadPausedPlayer = false;
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
         this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
+        this.currentTurnText = '';
+        this.currentTurnId = null;
+        this.audioBuffer = [];
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3518,6 +3539,7 @@ class LayercodeClient {
             console.log('silero vad model timeout');
             // TODO: send message to server to indicate that the vad model timed out
             this.userIsSpeaking = true; // allow audio to be sent to the server
+            this.options.onUserIsSpeakingChange(true);
         }, 2000);
         if (!this.canInterrupt) {
             dist.MicVAD.new({
@@ -3529,15 +3551,48 @@ class LayercodeClient {
                 minSpeechFrames: 15,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
-                    if (!this.wavPlayer.isPlaying) {
-                        this.userIsSpeaking = true;
-                    }
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    console.log('onSpeechStart: sending vad_start');
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
                 },
                 onVADMisfire: () => {
+                    console.log('onVADMisfire: Short utterance detected, resuming bot');
+                    this.audioBuffer = []; // Clear buffer on misfire
                     this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    // Send vad_end to indicate the short utterance is over
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    // End the user's turn
+                    this._wsSend({
+                        type: 'trigger.turn.end',
+                        role: 'user',
+                    });
+                    // Resume bot audio if it was playing
+                    if (!this.wavPlayer.isPlaying) {
+                        console.log('onVADMisfire: Resuming bot audio');
+                        this.wavPlayer.play();
+                    }
                 },
                 onSpeechEnd: () => {
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
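Note: in the non-interruptible path the VAD callbacks now report speech boundaries to the server instead of only gating local state. The wire messages, with shapes taken directly from the _wsSend calls above (any envelope fields added elsewhere are not shown):

    { "type": "vad_events", "event": "vad_start" }   // onSpeechStart
    { "type": "vad_events", "event": "vad_end" }     // onVADMisfire and onSpeechEnd
    { "type": "trigger.turn.end", "role": "user" }   // onVADMisfire only, ends the user turn early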
@@ -3559,43 +3614,59 @@ class LayercodeClient {
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 15,
+                minSpeechFrames: 5,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
                     // Only pause agent audio if it's currently playing
                     if (this.wavPlayer.isPlaying) {
                         console.log('onSpeechStart: WavPlayer is playing, pausing it.');
                         this.wavPlayer.pause();
-                        this.vadPausedPlayer = true; // VAD is responsible for this pause
                     }
                     else {
                         console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
                     }
-                    this.userIsSpeaking = true;
                     console.log('onSpeechStart: sending vad_start');
                     this._wsSend({
                         type: 'vad_events',
                         event: 'vad_start',
                     });
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this.endUserTurn = false; // Reset endUserTurn when speech starts
+                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
                 },
                 onVADMisfire: () => {
                     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
                     this.userIsSpeaking = false;
-                    if (this.vadPausedPlayer) {
-                        console.log('onSpeechEnd: VAD paused the player, resuming');
-                        this.wavPlayer.play();
-                        this.vadPausedPlayer = false; // Reset flag
-                    }
-                    else {
-                        console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
-                    }
+                    this.audioBuffer = []; // Clear buffer on misfire
+                    this.options.onUserIsSpeakingChange(false);
+                    // Add the missing delay before resuming to prevent race conditions
+                    setTimeout(() => {
+                        if (!this.wavPlayer.isPlaying) {
+                            console.log('onVADMisfire: Resuming after delay');
+                            this.wavPlayer.play();
+                            this.userIsSpeaking = true;
+                            this.options.onUserIsSpeakingChange(true);
+                        }
+                        else {
+                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+                            this.endUserTurn = true;
+                        }
+                    }, this.options.vadResumeDelay);
                 },
                 onSpeechEnd: () => {
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
                     this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
                     this._wsSend({
                         type: 'vad_events',
                         event: 'vad_end',
                     });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
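Note: the interruptible path now defers the resume decision by this.options.vadResumeDelay milliseconds, and the lowered minSpeechFrames (15 → 5) makes this VAD quicker to trigger, so the misfire path matters more in practice. The diff does not show a default for vadResumeDelay, so a sketch that sets it explicitly (other options elided):

    const client = new LayercodeClient({
      ...connectionOptions, // assumed connection config
      vadResumeDelay: 500,  // wait 500 ms after a misfire before resuming agent audio
    });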
@@ -3629,14 +3700,33 @@ class LayercodeClient {
            reason: 'completed',
        });
    }
+    _estimateWordsHeard(text, playbackOffsetSeconds) {
+        const words = text.split(/\s+/).filter((word) => word.length > 0);
+        const totalWords = words.length;
+        // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
+        const estimatedWordsPerSecond = 2.5;
+        const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
+        const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
+        return { wordsHeard: estimatedWordsHeard, textHeard };
+    }
    async _clientInterruptAssistantReplay() {
-        await this.wavPlayer.interrupt();
-        // TODO: Use in voice pipeline to know how much of the audio has been played and how much to truncate transcript
-        // this._wsSend({
-        //     type: 'trigger.response.audio.replay_finished',
-        //     reason: 'interrupted',
-        //     delta_id: 'TODO'
-        // });
+        const offsetData = await this.wavPlayer.interrupt();
+        if (offsetData && this.currentTurnText && this.currentTurnId) {
+            const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
+            const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
+            console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
+            // Send interruption event with context
+            this._wsSend({
+                type: 'trigger.response.audio.interrupted',
+                playback_offset: offsetData.currentTime,
+                interruption_context: {
+                    turn_id: this.currentTurnId,
+                    estimated_words_heard: wordsHeard,
+                    total_words: totalWords,
+                    text_heard: textHeard,
+                },
+            });
+        }
    }
    async triggerUserTurnStarted() {
        if (!this.pushToTalkActive) {
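Note: _estimateWordsHeard converts a playback offset into a word-count estimate at an assumed average speaking rate of 2.5 words per second. A worked example (calling the private helper directly, for illustration only):

    // A 12-word reply, interrupted 3.2 s into playback:
    const text = 'the quick brown fox jumps over the lazy dog and then naps';
    const { wordsHeard, textHeard } = client._estimateWordsHeard(text, 3.2);
    console.log(wordsHeard); // floor(3.2 * 2.5) = 8
    console.log(textHeard);  // "the quick brown fox jumps over the lazy"

The estimate never exceeds the total word count, and the server receives both the raw playback offset and the estimate, so it can refine the truncation server-side if needed.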
@@ -3667,20 +3757,44 @@ class LayercodeClient {
                // Sent from the server to this client when a new user turn is detected
                console.log('received turn.start from server');
                console.log(message);
-                if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
+                if (message.role === 'assistant') {
+                    // Start tracking new assistant turn
+                    // Note: Don't reset currentTurnId here - let response.audio set it
+                    // This prevents race conditions where text arrives before audio
+                    console.log('Assistant turn started, will track new turn ID from audio/text');
+                }
+                else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
                    // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
                    console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                    await this._clientInterruptAssistantReplay();
                }
-                // if (message.role === 'assistant') {
-                //     // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-                //     console.log('Clearing audio buffer as assistant turn has started');
-                //     await this._clientInterruptAssistantReplay();
-                // }
                break;
            case 'response.audio':
                const audioBuffer = base64ToArrayBuffer(message.content);
                this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+                // Set current turn ID from first audio message, or update if different turn
+                if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+                    console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+                    const oldTurnId = this.currentTurnId;
+                    this.currentTurnId = message.turn_id;
+                    this.currentTurnText = ''; // Reset text for new turn
+                    // Clean up interrupted tracks, keeping only the current turn
+                    this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+                }
+                break;
+            case 'response.text':
+                // Set turn ID from first text message if not set, or accumulate if matches current turn
+                if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
+                    if (!this.currentTurnId) {
+                        console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+                        this.currentTurnId = message.turn_id;
+                        this.currentTurnText = '';
+                    }
+                    this.currentTurnText += message.content;
+                }
+                else {
+                    console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+                }
                break;
            // case 'response.end':
            //     console.log('received response.end');
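Note: the new response.text case accumulates the assistant transcript per turn so that interruptions can report what was actually heard. Message shapes as inferred from the fields the handler reads (other fields may exist):

    { "type": "response.audio", "turn_id": "turn_abc", "content": "<base64 16-bit PCM>" }
    { "type": "response.text",  "turn_id": "turn_abc", "content": "Hello, how " }
    { "type": "response.text",  "turn_id": "turn_abc", "content": "can I help?" }

Here currentTurnText for turn_abc accumulates to "Hello, how can I help?"; text arriving for any other turn_id is logged and ignored.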
@@ -3709,17 +3823,29 @@ class LayercodeClient {
        const base64 = arrayBufferToBase64(data.mono);
        const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
        if (sendAudio) {
+            // If we have buffered audio, send it first
+            if (this.audioBuffer.length > 0) {
+                console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+                for (const bufferedAudio of this.audioBuffer) {
+                    this._wsSend({
+                        type: 'client.audio',
+                        content: bufferedAudio,
+                    });
+                }
+                this.audioBuffer = []; // Clear the buffer after sending
+            }
+            // Send the current audio
            this._wsSend({
                type: 'client.audio',
                content: base64,
            });
-            if (this.endUserTurn) {
-                this.endUserTurn = false;
-                this.userIsSpeaking = false; // Reset userIsSpeaking to false so we don't send any more audio to the server
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_end',
-                });
+        }
+        else {
+            // Buffer audio when not sending (to catch audio just before VAD triggers)
+            this.audioBuffer.push(base64);
+            // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+            if (this.audioBuffer.length > 10) {
+                this.audioBuffer.shift(); // Remove oldest chunk
            }
        }
    }
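Note: the removed endUserTurn branch is obsolete now that vad_end is sent directly from the VAD callbacks. The new pre-speech buffer is a simple bounded FIFO; a standalone sketch of the behavior (the constant mirrors the hard-coded 10 above):

    const MAX_BUFFERED_CHUNKS = 10;
    const audioBuffer = [];
    function bufferChunk(base64Chunk) {
      audioBuffer.push(base64Chunk);
      if (audioBuffer.length > MAX_BUFFERED_CHUNKS) audioBuffer.shift(); // drop oldest
    }
    // Once VAD flips userIsSpeaking on, the buffered chunks are flushed to the
    // server ahead of the live chunk, so the onset of speech is not clipped.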
@@ -3854,10 +3980,20 @@ class LayercodeClient {
        }
    }
    async disconnect() {
-        var _a;
+        // Clean up VAD if it exists
+        if (this.vad) {
+            this.vad.pause();
+            this.vad.destroy();
+            this.vad = null;
+        }
        this.wavRecorder.quit();
        this.wavPlayer.disconnect();
-        (_a = this.ws) === null || _a === void 0 ? void 0 : _a.close();
+        // Close websocket and ensure status is updated
+        if (this.ws) {
+            this.ws.close();
+            this._setStatus('disconnected');
+            this.options.onDisconnect();
+        }
    }
    /**
     * Gets the microphone MediaStream used by this client
@@ -3866,6 +4002,25 @@ class LayercodeClient {
    getStream() {
        return this.wavRecorder.getStream();
    }
+    /**
+     * Switches the input device for the microphone and restarts recording
+     * @param {string} deviceId - The deviceId of the new microphone
+     */
+    async setInputDevice(deviceId) {
+        if (this.wavRecorder) {
+            try {
+                await this.wavRecorder.end();
+            }
+            catch (e) { }
+            try {
+                await this.wavRecorder.quit();
+            }
+            catch (e) { }
+        }
+        await this.wavRecorder.begin(deviceId);
+        await this.wavRecorder.record(this._handleDataAvailable, 1638);
+        this._setupAmplitudeMonitoring(this.wavRecorder, this.options.onUserAmplitudeChange, (amp) => (this.userAudioAmplitude = amp));
+    }
 }
 
 export { LayercodeClient as default };
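Note: a usage sketch for the new setInputDevice, selecting a microphone via the standard MediaDevices API (the enumeration code is not part of this SDK):

    const devices = await navigator.mediaDevices.enumerateDevices();
    const mics = devices.filter((d) => d.kind === 'audioinput');
    if (mics.length > 1) {
      // Restart capture on the second available microphone.
      await client.setInputDevice(mics[1].deviceId);
    }

The method tears down any existing recorder (swallowing errors if it was never started), re-opens it on the requested device, and re-attaches the data handler and amplitude monitoring.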