@layercode/js-sdk 1.0.19 → 1.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -516,6 +516,24 @@ registerProcessor('stream_processor', StreamProcessor);
  this.isPlaying = false;
  }
 
+ /**
+ * Clears interrupted track IDs to prevent memory leaks
+ * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+ */
+ clearInterruptedTracks(keepTrackIds = []) {
+ if (keepTrackIds.length === 0) {
+ this.interruptedTrackIds = {};
+ } else {
+ const newInterruptedTracks = {};
+ for (const trackId of keepTrackIds) {
+ if (this.interruptedTrackIds[trackId]) {
+ newInterruptedTracks[trackId] = true;
+ }
+ }
+ this.interruptedTrackIds = newInterruptedTracks;
+ }
+ }
+
  /**
  * Connects the audio context and enables output to speakers
  * @returns {Promise<true>}
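
The new clearInterruptedTracks method bounds the wavPlayer's interruptedTrackIds map, which the JSDoc notes would otherwise leak memory as interrupted turns accumulate. A minimal usage sketch (the wavPlayer variable and track ID are illustrative placeholders, not names from the package):

    wavPlayer.clearInterruptedTracks(['turn_abc']); // keep only this turn's entry
    wavPlayer.clearInterruptedTracks();             // no argument: drop every entry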
@@ -749,7 +767,7 @@ registerProcessor('stream_processor', StreamProcessor);
  this.analyser.disconnect();
  }
 
- if (this.context) {
+ if (this.context && this.context.state !== 'closed') {
  this.context.close().catch((err) => console.error("Error closing audio context:", err));
  }
 
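The added state check guards against double-closing: calling close() on an AudioContext whose state is already 'closed' returns a rejected promise (InvalidStateError). A sketch of the failure mode the guard avoids:

    const ctx = new AudioContext();
    await ctx.close();
    await ctx.close(); // rejects with InvalidStateError; the state check above skips this call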
@@ -3488,6 +3506,7 @@ registerProcessor('audio_processor', AudioProcessor);
  onUserAmplitudeChange: options.onUserAmplitudeChange || (() => { }),
  onAgentAmplitudeChange: options.onAgentAmplitudeChange || (() => { }),
  onStatusChange: options.onStatusChange || (() => { }),
+ onUserIsSpeakingChange: options.onUserIsSpeakingChange || (() => { }),
  };
  this.AMPLITUDE_MONITORING_SAMPLE_RATE = 10;
  this._websocketUrl = 'wss://api.layercode.com/v1/pipelines/websocket';
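
The new onUserIsSpeakingChange callback surfaces VAD speaking state to consumers. A hedged wiring sketch (only the new callback is shown; the other constructor options an app already passes are elided, and micIndicator is a hypothetical UI element):

    const client = new LayercodeClient({
      // ...existing options such as sessionId...
      onUserIsSpeakingChange: (isSpeaking) => {
        micIndicator.classList.toggle('speaking', isSpeaking);
      },
    });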
@@ -3503,13 +3522,15 @@ registerProcessor('audio_processor', AudioProcessor);
  this.agentAudioAmplitude = 0;
  this.sessionId = options.sessionId || null;
  this.pushToTalkActive = false;
- this.vadPausedPlayer = false;
  this.pushToTalkEnabled = false;
  this.canInterrupt = false;
  this.userIsSpeaking = false;
  this.endUserTurn = false;
  this.recorderStarted = false;
  this.readySent = false;
+ this.currentTurnText = '';
+ this.currentTurnId = null;
+ this.audioBuffer = [];
  // Bind event handlers
  this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
  this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3524,6 +3545,7 @@ registerProcessor('audio_processor', AudioProcessor);
  console.log('silero vad model timeout');
  // TODO: send message to server to indicate that the vad model timed out
  this.userIsSpeaking = true; // allow audio to be sent to the server
+ this.options.onUserIsSpeakingChange(true);
  }, 2000);
  if (!this.canInterrupt) {
  dist.MicVAD.new({
@@ -3535,15 +3557,48 @@ registerProcessor('audio_processor', AudioProcessor);
  minSpeechFrames: 15,
  preSpeechPadFrames: 0,
  onSpeechStart: () => {
- if (!this.wavPlayer.isPlaying) {
- this.userIsSpeaking = true;
- }
+ this.userIsSpeaking = true;
+ this.options.onUserIsSpeakingChange(true);
+ console.log('onSpeechStart: sending vad_start');
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_start',
+ });
  },
  onVADMisfire: () => {
+ console.log('onVADMisfire: Short utterance detected, resuming bot');
+ this.audioBuffer = []; // Clear buffer on misfire
  this.userIsSpeaking = false;
+ this.options.onUserIsSpeakingChange(false);
+ // Send vad_end to indicate the short utterance is over
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_end',
+ });
+ // End the user's turn
+ this._wsSend({
+ type: 'trigger.turn.end',
+ role: 'user',
+ });
+ // Resume bot audio if it was playing
+ if (!this.wavPlayer.isPlaying) {
+ console.log('onVADMisfire: Resuming bot audio');
+ this.wavPlayer.play();
+ }
  },
  onSpeechEnd: () => {
- this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+ console.log('onSpeechEnd: sending vad_end');
+ this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+ this.audioBuffer = []; // Clear buffer on speech end
+ this.userIsSpeaking = false;
+ this.options.onUserIsSpeakingChange(false);
+ console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+ // Send vad_end immediately instead of waiting for next audio chunk
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_end',
+ });
+ this.endUserTurn = false; // Reset the flag after sending vad_end
  },
  })
  .then((vad) => {
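
For reference, the wire messages this non-interruptible VAD path now emits, with shapes copied verbatim from the hunk above:

    { type: 'vad_events', event: 'vad_start' }  // speech onset
    { type: 'vad_events', event: 'vad_end' }    // speech end (also sent on misfire)
    { type: 'trigger.turn.end', role: 'user' }  // misfire only: closes the short user turn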
@@ -3565,43 +3620,59 @@ registerProcessor('audio_processor', AudioProcessor);
  positiveSpeechThreshold: 0.3,
  negativeSpeechThreshold: 0.2,
  redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
- minSpeechFrames: 15,
+ minSpeechFrames: 5,
  preSpeechPadFrames: 0,
  onSpeechStart: () => {
  // Only pause agent audio if it's currently playing
  if (this.wavPlayer.isPlaying) {
  console.log('onSpeechStart: WavPlayer is playing, pausing it.');
  this.wavPlayer.pause();
- this.vadPausedPlayer = true; // VAD is responsible for this pause
  }
  else {
  console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
  }
- this.userIsSpeaking = true;
  console.log('onSpeechStart: sending vad_start');
  this._wsSend({
  type: 'vad_events',
  event: 'vad_start',
  });
+ this.userIsSpeaking = true;
+ this.options.onUserIsSpeakingChange(true);
+ this.endUserTurn = false; // Reset endUserTurn when speech starts
+ console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
  },
  onVADMisfire: () => {
  // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
  this.userIsSpeaking = false;
- if (this.vadPausedPlayer) {
- console.log('onSpeechEnd: VAD paused the player, resuming');
- this.wavPlayer.play();
- this.vadPausedPlayer = false; // Reset flag
- }
- else {
- console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
- }
+ this.audioBuffer = []; // Clear buffer on misfire
+ this.options.onUserIsSpeakingChange(false);
+ // Add the missing delay before resuming to prevent race conditions
+ setTimeout(() => {
+ if (!this.wavPlayer.isPlaying) {
+ console.log('onVADMisfire: Resuming after delay');
+ this.wavPlayer.play();
+ this.userIsSpeaking = true;
+ this.options.onUserIsSpeakingChange(true);
+ }
+ else {
+ console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+ this.endUserTurn = true;
+ }
+ }, this.options.vadResumeDelay);
  },
  onSpeechEnd: () => {
+ console.log('onSpeechEnd: sending vad_end');
+ this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+ this.audioBuffer = []; // Clear buffer on speech end
  this.userIsSpeaking = false;
+ this.options.onUserIsSpeakingChange(false);
+ console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+ // Send vad_end immediately instead of waiting for next audio chunk
  this._wsSend({
  type: 'vad_events',
  event: 'vad_end',
  });
+ this.endUserTurn = false; // Reset the flag after sending vad_end
  },
  })
  .then((vad) => {
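
On this interruptible path, minSpeechFrames drops from 15 to 5, so shorter utterances now count as real interruptions. Assuming vad-web's legacy default of 1536 samples per frame at 16 kHz (about 96 ms per frame, an assumption this diff does not state), that lowers the threshold from roughly 1.4 s of speech to roughly 0.5 s. The misfire handler's new shape, distilled for readability (illustrative; the released code also flips userIsSpeaking back to true when resuming, and vadResumeDelay is read from options with no default visible in this diff):

    setTimeout(() => {
      if (!wavPlayer.isPlaying) wavPlayer.play(); // false interruption: resume the agent
      else endUserTurn = true;                    // playback already resumed; treat the turn as over
    }, options.vadResumeDelay);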
@@ -3635,14 +3706,33 @@ registerProcessor('audio_processor', AudioProcessor);
  reason: 'completed',
  });
  }
+ _estimateWordsHeard(text, playbackOffsetSeconds) {
+ const words = text.split(/\s+/).filter((word) => word.length > 0);
+ const totalWords = words.length;
+ // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
+ const estimatedWordsPerSecond = 2.5;
+ const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
+ const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
+ return { wordsHeard: estimatedWordsHeard, textHeard };
+ }
  async _clientInterruptAssistantReplay() {
- await this.wavPlayer.interrupt();
- // TODO: Use in voice pipeline to know how much of the audio has been played and how much to truncate transcript
- // this._wsSend({
- // type: 'trigger.response.audio.replay_finished',
- // reason: 'interrupted',
- // delta_id: 'TODO'
- // });
+ const offsetData = await this.wavPlayer.interrupt();
+ if (offsetData && this.currentTurnText && this.currentTurnId) {
+ const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
+ const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
+ console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
+ // Send interruption event with context
+ this._wsSend({
+ type: 'trigger.response.audio.interrupted',
+ playback_offset: offsetData.currentTime,
+ interruption_context: {
+ turn_id: this.currentTurnId,
+ estimated_words_heard: wordsHeard,
+ total_words: totalWords,
+ text_heard: textHeard,
+ },
+ });
+ }
  }
  async triggerUserTurnStarted() {
  if (!this.pushToTalkActive) {
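
_estimateWordsHeard is linear in playback time at an assumed rate of 2.5 words per second: a reply interrupted 3.2 s into playback is truncated to floor(3.2 × 2.5) = 8 words, capped at the reply's total word count. The resulting wire message, sketched with placeholder values:

    {
      type: 'trigger.response.audio.interrupted',
      playback_offset: 3.2,       // seconds of agent audio actually played
      interruption_context: {
        turn_id: 'turn_abc',      // placeholder
        estimated_words_heard: 8, // floor(3.2 * 2.5)
        total_words: 42,          // placeholder
        text_heard: '...',        // the first 8 words of currentTurnText
      },
    }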
@@ -3673,20 +3763,44 @@ registerProcessor('audio_processor', AudioProcessor);
  // Sent from the server to this client when a new user turn is detected
  console.log('received turn.start from server');
  console.log(message);
- if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
+ if (message.role === 'assistant') {
+ // Start tracking new assistant turn
+ // Note: Don't reset currentTurnId here - let response.audio set it
+ // This prevents race conditions where text arrives before audio
+ console.log('Assistant turn started, will track new turn ID from audio/text');
+ }
+ else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
  // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
  console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
  await this._clientInterruptAssistantReplay();
  }
- // if (message.role === 'assistant') {
- // // Clear the buffer of audio when the assisatnt starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
- // console.log('Clearing audio buffer as assistant turn has started');
- // await this._clientInterruptAssistantReplay();
- // }
  break;
  case 'response.audio':
  const audioBuffer = base64ToArrayBuffer(message.content);
  this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+ // Set current turn ID from first audio message, or update if different turn
+ if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+ console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+ const oldTurnId = this.currentTurnId;
+ this.currentTurnId = message.turn_id;
+ this.currentTurnText = ''; // Reset text for new turn
+ // Clean up interrupted tracks, keeping only the current turn
+ this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+ }
+ break;
+ case 'response.text':
+ // Set turn ID from first text message if not set, or accumulate if matches current turn
+ if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
+ if (!this.currentTurnId) {
+ console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+ this.currentTurnId = message.turn_id;
+ this.currentTurnText = '';
+ }
+ this.currentTurnText += message.content;
+ }
+ else {
+ console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+ }
  break;
  // case 'response.end':
  // console.log('received response.end');
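
Turn tracking is deliberately asymmetric: response.audio may claim or switch the current turn, while response.text only claims it when none is set and otherwise accumulates or ignores. An illustrative sequence (turn IDs are placeholders):

    // turn.start     { role: 'assistant' }             -> tracker unchanged
    // response.audio { turn_id: 't1', ... }            -> currentTurnId = 't1', text reset
    // response.text  { turn_id: 't1', content: 'Hel' } -> currentTurnText = 'Hel'
    // response.text  { turn_id: 't1', content: 'lo' }  -> currentTurnText = 'Hello'
    // response.text  { turn_id: 't2', ... }            -> ignored until t2's audio arrives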
@@ -3715,17 +3829,29 @@ registerProcessor('audio_processor', AudioProcessor);
  const base64 = arrayBufferToBase64(data.mono);
  const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
  if (sendAudio) {
+ // If we have buffered audio, send it first
+ if (this.audioBuffer.length > 0) {
+ console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+ for (const bufferedAudio of this.audioBuffer) {
+ this._wsSend({
+ type: 'client.audio',
+ content: bufferedAudio,
+ });
+ }
+ this.audioBuffer = []; // Clear the buffer after sending
+ }
+ // Send the current audio
  this._wsSend({
  type: 'client.audio',
  content: base64,
  });
- if (this.endUserTurn) {
- this.endUserTurn = false;
- this.userIsSpeaking = false; // Reset userIsSpeaking to false so we don't send any more audio to the server
- this._wsSend({
- type: 'vad_events',
- event: 'vad_end',
- });
+ }
+ else {
+ // Buffer audio when not sending (to catch audio just before VAD triggers)
+ this.audioBuffer.push(base64);
+ // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+ if (this.audioBuffer.length > 10) {
+ this.audioBuffer.shift(); // Remove oldest chunk
  }
  }
  }
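
The buffer exists because VAD only confirms speech after several frames, so the first syllables would otherwise never reach the server. While userIsSpeaking is false, chunks accumulate in a ten-entry ring; once speech is confirmed, the backlog is flushed oldest-first before the live chunk. The in-code estimate (10 chunks ≈ 200 ms) assumes 20 ms chunks, and the true span depends on the recorder's chunk size. Distilled (illustrative only):

    audioBuffer.push(base64Chunk);                    // while silent: accumulate
    if (audioBuffer.length > 10) audioBuffer.shift(); // keep only the 10 newest chunks
    // once speech is confirmed: send each buffered chunk as a client.audio message, then the live one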
@@ -3860,10 +3986,20 @@ registerProcessor('audio_processor', AudioProcessor);
  }
  }
  async disconnect() {
- var _a;
+ // Clean up VAD if it exists
+ if (this.vad) {
+ this.vad.pause();
+ this.vad.destroy();
+ this.vad = null;
+ }
  this.wavRecorder.quit();
  this.wavPlayer.disconnect();
- (_a = this.ws) === null || _a === void 0 ? void 0 : _a.close();
+ // Close websocket and ensure status is updated
+ if (this.ws) {
+ this.ws.close();
+ this._setStatus('disconnected');
+ this.options.onDisconnect();
+ }
  }
  /**
  * Gets the microphone MediaStream used by this client
@@ -3872,6 +4008,25 @@ registerProcessor('audio_processor', AudioProcessor);
  getStream() {
  return this.wavRecorder.getStream();
  }
+ /**
+ * Switches the input device for the microphone and restarts recording
+ * @param {string} deviceId - The deviceId of the new microphone
+ */
+ async setInputDevice(deviceId) {
+ if (this.wavRecorder) {
+ try {
+ await this.wavRecorder.end();
+ }
+ catch (e) { }
+ try {
+ await this.wavRecorder.quit();
+ }
+ catch (e) { }
+ }
+ await this.wavRecorder.begin(deviceId);
+ await this.wavRecorder.record(this._handleDataAvailable, 1638);
+ this._setupAmplitudeMonitoring(this.wavRecorder, this.options.onUserAmplitudeChange, (amp) => (this.userAudioAmplitude = amp));
+ }
  }
 
  return LayercodeClient;
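
A hedged usage sketch for the new setInputDevice API; enumerateDevices is the standard MediaDevices call, and the USB label match is purely illustrative:

    const devices = await navigator.mediaDevices.enumerateDevices();
    const mic = devices.find((d) => d.kind === 'audioinput' && d.label.includes('USB'));
    if (mic) {
      // Tears down the current recorder (errors swallowed), then restarts on the new device.
      await client.setInputDevice(mic.deviceId);
    }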