@layercode/js-sdk 1.0.21 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -516,6 +516,24 @@ registerProcessor('stream_processor', StreamProcessor);
         this.isPlaying = false;
     }
 
+    /**
+     * Clears interrupted track IDs to prevent memory leaks
+     * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+     */
+    clearInterruptedTracks(keepTrackIds = []) {
+        if (keepTrackIds.length === 0) {
+            this.interruptedTrackIds = {};
+        } else {
+            const newInterruptedTracks = {};
+            for (const trackId of keepTrackIds) {
+                if (this.interruptedTrackIds[trackId]) {
+                    newInterruptedTracks[trackId] = true;
+                }
+            }
+            this.interruptedTrackIds = newInterruptedTracks;
+        }
+    }
+
     /**
      * Connects the audio context and enables output to speakers
      * @returns {Promise<true>}
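
The new clearInterruptedTracks method lets the player drop stale entries from interruptedTrackIds instead of letting the map grow for the whole session. A minimal usage sketch (the player variable is hypothetical; in this release the SDK itself calls the method from the response.audio handler further down):

    // Keep only the turn that is still playing; drop interruption state for all others.
    player.clearInterruptedTracks(['turn_42']);
    // Or wipe the map entirely, e.g. when a session ends.
    player.clearInterruptedTracks();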
@@ -749,7 +767,7 @@ registerProcessor('stream_processor', StreamProcessor);
             this.analyser.disconnect();
         }
 
-        if (this.context) {
+        if (this.context && this.context.state !== 'closed') {
             this.context.close().catch((err) => console.error("Error closing audio context:", err));
         }
 
@@ -3504,17 +3522,59 @@ registerProcessor('audio_processor', AudioProcessor);
         this.agentAudioAmplitude = 0;
         this.sessionId = options.sessionId || null;
         this.pushToTalkActive = false;
-        this.vadPausedPlayer = false;
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
         this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
+        this.currentTurnId = null;
+        this.audioBuffer = [];
+        this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
     }
+    _setupAmplitudeBasedVAD() {
+        let isSpeakingByAmplitude = false;
+        let silenceFrames = 0;
+        const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+        const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+        // Monitor amplitude changes
+        this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+            const wasSpeaking = isSpeakingByAmplitude;
+            if (amplitude > AMPLITUDE_THRESHOLD) {
+                silenceFrames = 0;
+                if (!wasSpeaking) {
+                    // Speech started - pause audio if playing and track timing for interruption calculation
+                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
+                        this.audioPauseTime = Date.now();
+                        this.wavPlayer.pause();
+                    }
+                    isSpeakingByAmplitude = true;
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
+                }
+            }
+            else {
+                silenceFrames++;
+                if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+                    // Speech ended
+                    isSpeakingByAmplitude = false;
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                }
+            }
+        });
+    }
     _initializeVAD() {
         console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
         // If we're in push to talk mode, we don't need to use the VAD model
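
The amplitude fallback above assumes the recorder exposes a startAmplitudeMonitoring(callback) hook that delivers one amplitude reading per audio frame. The detection itself is plain hysteresis: speech starts on the first frame above the threshold and ends only after 30 consecutive quiet frames (30 × 20ms ≈ 600ms). The same scheme in isolation, with illustrative names:

    // Returns a per-frame callback; onStart/onEnd fire on speech boundaries.
    function makeAmplitudeVAD(onStart, onEnd, threshold = 0.01, maxQuietFrames = 30) {
        let speaking = false;
        let quietFrames = 0;
        return (amplitude) => {
            if (amplitude > threshold) {
                quietFrames = 0;                      // any loud frame resets the silence counter
                if (!speaking) { speaking = true; onStart(); }
            } else if (speaking && ++quietFrames >= maxQuietFrames) {
                speaking = false;                     // sustained silence ends the speech segment
                onEnd();
            }
        };
    }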
@@ -3523,9 +3583,17 @@ registerProcessor('audio_processor', AudioProcessor);
         }
         const timeout = setTimeout(() => {
             console.log('silero vad model timeout');
-            // TODO: send message to server to indicate that the vad model timed out
-            this.userIsSpeaking = true; // allow audio to be sent to the server
-            this.options.onUserIsSpeakingChange(true);
+            console.warn('VAD model failed to load - falling back to amplitude-based detection');
+            // Send a message to server indicating VAD failure
+            this._wsSend({
+                type: 'vad_events',
+                event: 'vad_model_failed',
+            });
+            // In automatic mode without VAD, allow the bot to speak initially
+            this.userIsSpeaking = false;
+            this.options.onUserIsSpeakingChange(false);
+            // Set up amplitude-based fallback detection
+            this._setupAmplitudeBasedVAD();
         }, 2000);
         if (!this.canInterrupt) {
             dist.MicVAD.new({
@@ -3534,20 +3602,30 @@ registerProcessor('audio_processor', AudioProcessor);
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 15,
+                minSpeechFrames: 0,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
-                    if (!this.wavPlayer.isPlaying) {
-                        this.userIsSpeaking = true;
-                        this.options.onUserIsSpeakingChange(true);
-                    }
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    console.log('onSpeechStart: sending vad_start');
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
                 },
-                onVADMisfire: () => {
+                onSpeechEnd: () => {
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
                     this.userIsSpeaking = false;
                     this.options.onUserIsSpeakingChange(false);
-                },
-                onSpeechEnd: () => {
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
@@ -3569,41 +3647,59 @@ registerProcessor('audio_processor', AudioProcessor);
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 15,
+                minSpeechFrames: 5,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
                     // Only pause agent audio if it's currently playing
                     if (this.wavPlayer.isPlaying) {
                         console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+                        this.audioPauseTime = Date.now(); // Track when we paused
                         this.wavPlayer.pause();
-                        this.vadPausedPlayer = true; // VAD is responsible for this pause
                     }
                     else {
                         console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
                     }
-                    this.userIsSpeaking = true;
-                    this.options.onUserIsSpeakingChange(true);
                     console.log('onSpeechStart: sending vad_start');
                     this._wsSend({
                         type: 'vad_events',
                         event: 'vad_start',
                     });
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this.endUserTurn = false; // Reset endUserTurn when speech starts
+                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
                 },
                 onVADMisfire: () => {
                     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
                     this.userIsSpeaking = false;
+                    this.audioBuffer = []; // Clear buffer on misfire
                     this.options.onUserIsSpeakingChange(false);
-                    if (this.vadPausedPlayer) {
-                        console.log('onSpeechEnd: VAD paused the player, resuming');
-                        this.wavPlayer.play();
-                        this.vadPausedPlayer = false; // Reset flag
-                    }
-                    else {
-                        console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
-                    }
+                    // Add the missing delay before resuming to prevent race conditions
+                    setTimeout(() => {
+                        if (!this.wavPlayer.isPlaying) {
+                            console.log('onVADMisfire: Resuming after delay');
+                            this.audioPauseTime = null; // Clear pause time since we're resuming
+                            this.wavPlayer.play();
+                        }
+                        else {
+                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+                            this.endUserTurn = true;
+                        }
+                    }, this.options.vadResumeDelay);
                 },
                 onSpeechEnd: () => {
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended, so we can send a vad_end event to the server
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
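
Note that onVADMisfire now reads this.options.vadResumeDelay, an option this diff does not otherwise define, so callers presumably supply it at construction time. A hedged configuration sketch (the LayercodeClient name is assumed; the option and callback names appear elsewhere in this diff):

    const client = new LayercodeClient({
        pipelineId: 'pl_example',      // sent as pipeline_id during session authorization
        vadResumeDelay: 500,           // ms to wait before resuming paused agent audio after a misfire
        onUserIsSpeakingChange: (speaking) => console.log('user speaking:', speaking),
        onDataMessage: (msg) => console.log('data message:', msg),
        onDisconnect: () => console.log('disconnected'),
    });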
@@ -3638,13 +3734,36 @@ registerProcessor('audio_processor', AudioProcessor);
        });
    }
    async _clientInterruptAssistantReplay() {
-        await this.wavPlayer.interrupt();
-        // TODO: Use in voice pipeline to know how much of the audio has been played and how much to truncate transcript
-        // this._wsSend({
-        //     type: 'trigger.response.audio.replay_finished',
-        //     reason: 'interrupted',
-        //     delta_id: 'TODO'
-        // });
+        const offsetData = await this.wavPlayer.interrupt();
+        if (offsetData && this.currentTurnId) {
+            let offsetMs = offsetData.currentTime * 1000;
+            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+            if (this.audioPauseTime) {
+                const pauseDurationMs = Date.now() - this.audioPauseTime;
+                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+                offsetMs = adjustedOffsetMs;
+                this.audioPauseTime = null; // Clear the pause time
+            }
+            else {
+                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+            }
+            // Send interruption event with accurate playback offset in milliseconds
+            this._wsSend({
+                type: 'trigger.response.audio.interrupted',
+                playback_offset: offsetMs,
+                interruption_context: {
+                    turn_id: this.currentTurnId,
+                    playback_offset_ms: offsetMs,
+                },
+            });
+        }
+        else {
+            console.warn('Interruption requested but missing required data:', {
+                hasOffsetData: !!offsetData,
+                hasTurnId: !!this.currentTurnId,
+            });
+        }
    }
    async triggerUserTurnStarted() {
        if (!this.pushToTalkActive) {
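
For reference, the interruption event that _clientInterruptAssistantReplay now sends has the following shape, as constructed in the hunk above (values illustrative):

    {
        type: 'trigger.response.audio.interrupted',
        playback_offset: 1840,         // ms of agent audio actually played, pause-adjusted
        interruption_context: {
            turn_id: 'turn_42',        // the assistant turn being cut off
            playback_offset_ms: 1840,  // the same offset, nested for the pipeline
        },
    }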
@@ -3675,24 +3794,38 @@ registerProcessor('audio_processor', AudioProcessor);
                 // Sent from the server to this client when a new user turn is detected
                 console.log('received turn.start from server');
                 console.log(message);
-                if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
+                if (message.role === 'assistant') {
+                    // Start tracking new assistant turn
+                    // Note: Don't reset currentTurnId here - let response.audio set it
+                    // This prevents race conditions where text arrives before audio
+                    console.log('Assistant turn started, will track new turn ID from audio/text');
+                }
+                else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
                     // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
                     console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                     await this._clientInterruptAssistantReplay();
                 }
-                // if (message.role === 'assistant') {
-                //     // Clear the buffer of audio when the assisatnt starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-                //     console.log('Clearing audio buffer as assistant turn has started');
-                //     await this._clientInterruptAssistantReplay();
-                // }
                 break;
             case 'response.audio':
                 const audioBuffer = base64ToArrayBuffer(message.content);
                 this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+                // Set current turn ID from first audio message, or update if different turn
+                if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+                    console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+                    this.currentTurnId = message.turn_id;
+                    // Clean up interrupted tracks, keeping only the current turn
+                    this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+                }
                 break;
-            // case 'response.end':
-            //     console.log('received response.end');
-            //     break;
+            case 'response.text': {
+                // Set turn ID from first text message if not set
+                if (!this.currentTurnId) {
+                    this.currentTurnId = message.turn_id;
+                    console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+                }
+                // Note: We no longer track text content in the client - the pipeline handles interruption estimation
+                break;
+            }
             case 'response.data':
                 console.log('received response.data', message);
                 this.options.onDataMessage(message);
@@ -3717,18 +3850,29 @@ registerProcessor('audio_processor', AudioProcessor);
         const base64 = arrayBufferToBase64(data.mono);
         const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
         if (sendAudio) {
+            // If we have buffered audio, send it first
+            if (this.audioBuffer.length > 0) {
+                console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+                for (const bufferedAudio of this.audioBuffer) {
+                    this._wsSend({
+                        type: 'client.audio',
+                        content: bufferedAudio,
+                    });
+                }
+                this.audioBuffer = []; // Clear the buffer after sending
+            }
+            // Send the current audio
             this._wsSend({
                 type: 'client.audio',
                 content: base64,
             });
-            if (this.endUserTurn) {
-                this.endUserTurn = false;
-                this.userIsSpeaking = false; // Reset userIsSpeaking to false so we don't send any more audio to the server
-                this.options.onUserIsSpeakingChange(false);
-                this._wsSend({
-                    type: 'vad_events',
-                    event: 'vad_end',
-                });
+        }
+        else {
+            // Buffer audio when not sending (to catch audio just before VAD triggers)
+            this.audioBuffer.push(base64);
+            // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+            if (this.audioBuffer.length > 10) {
+                this.audioBuffer.shift(); // Remove oldest chunk
             }
         }
     }
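
The else branch above is a small bounded FIFO: while the user is judged silent, the most recent chunks are retained (10 chunks ≈ 200ms at 20ms per chunk) and flushed ahead of live audio once speech starts, so word onsets clipped by VAD latency still reach the server. The pattern in isolation:

    const MAX_BUFFERED_CHUNKS = 10;    // ≈ 200ms of audio at 20ms per chunk
    function bufferChunk(buffer, chunk) {
        buffer.push(chunk);            // newest chunk at the tail
        if (buffer.length > MAX_BUFFERED_CHUNKS) {
            buffer.shift();            // drop the oldest chunk to cap memory use
        }
    }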
@@ -3785,6 +3929,8 @@ registerProcessor('audio_processor', AudioProcessor);
     async connect() {
         try {
             this._setStatus('connecting');
+            // Reset turn tracking for clean start
+            this._resetTurnTracking();
             // Get session key from server
             let authorizeSessionRequestBody = {
                 pipeline_id: this.options.pipelineId,
@@ -3862,11 +4008,27 @@ registerProcessor('audio_processor', AudioProcessor);
             throw error;
         }
     }
+    _resetTurnTracking() {
+        this.currentTurnId = null;
+        console.log('Reset turn tracking state');
+    }
     async disconnect() {
-        var _a;
+        // Clean up VAD if it exists
+        if (this.vad) {
+            this.vad.pause();
+            this.vad.destroy();
+            this.vad = null;
+        }
         this.wavRecorder.quit();
         this.wavPlayer.disconnect();
-        (_a = this.ws) === null || _a === void 0 ? void 0 : _a.close();
+        // Reset turn tracking
+        this._resetTurnTracking();
+        // Close websocket and ensure status is updated
+        if (this.ws) {
+            this.ws.close();
+            this._setStatus('disconnected');
+            this.options.onDisconnect();
+        }
     }
     /**
      * Gets the microphone MediaStream used by this client
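
Taken together, connect() now resets turn tracking on the way in, and disconnect() tears down the VAD, resets tracking, and deterministically reports the disconnect instead of only optionally closing the socket. A minimal lifecycle sketch under the same assumed client class as above:

    const client = new LayercodeClient({ pipelineId: 'pl_example' /* , callbacks */ });
    await client.connect();      // resets turn tracking, authorizes the session, opens the websocket
    // ... conversation runs: vad_events and client.audio flow to the server ...
    await client.disconnect();   // destroys the VAD, resets tracking, closes the socket,
                                 // sets status to 'disconnected', and fires onDisconnect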