@layercode/js-sdk 1.0.22 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3522,13 +3522,53 @@ class LayercodeClient {
3522
3522
  this.endUserTurn = false;
3523
3523
  this.recorderStarted = false;
3524
3524
  this.readySent = false;
3525
- this.currentTurnText = '';
3526
3525
  this.currentTurnId = null;
3527
3526
  this.audioBuffer = [];
3527
+ this.audioPauseTime = null;
3528
3528
  // Bind event handlers
3529
3529
  this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
3530
3530
  this._handleDataAvailable = this._handleDataAvailable.bind(this);
3531
3531
  }
3532
+ _setupAmplitudeBasedVAD() {
3533
+ let isSpeakingByAmplitude = false;
3534
+ let silenceFrames = 0;
3535
+ const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
3536
+ const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
3537
+ // Monitor amplitude changes
3538
+ this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
3539
+ const wasSpeaking = isSpeakingByAmplitude;
3540
+ if (amplitude > AMPLITUDE_THRESHOLD) {
3541
+ silenceFrames = 0;
3542
+ if (!wasSpeaking) {
3543
+ // Speech started - pause audio if playing and track timing for interruption calculation
3544
+ if (this.canInterrupt && this.wavPlayer.isPlaying) {
3545
+ this.audioPauseTime = Date.now();
3546
+ this.wavPlayer.pause();
3547
+ }
3548
+ isSpeakingByAmplitude = true;
3549
+ this.userIsSpeaking = true;
3550
+ this.options.onUserIsSpeakingChange(true);
3551
+ this._wsSend({
3552
+ type: 'vad_events',
3553
+ event: 'vad_start',
3554
+ });
3555
+ }
3556
+ }
3557
+ else {
3558
+ silenceFrames++;
3559
+ if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
3560
+ // Speech ended
3561
+ isSpeakingByAmplitude = false;
3562
+ this.userIsSpeaking = false;
3563
+ this.options.onUserIsSpeakingChange(false);
3564
+ this._wsSend({
3565
+ type: 'vad_events',
3566
+ event: 'vad_end',
3567
+ });
3568
+ }
3569
+ }
3570
+ });
3571
+ }
3532
3572
  _initializeVAD() {
3533
3573
  console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
3534
3574
  // If we're in push to talk mode, we don't need to use the VAD model
@@ -3537,9 +3577,17 @@ class LayercodeClient {
3537
3577
  }
3538
3578
  const timeout = setTimeout(() => {
3539
3579
  console.log('silero vad model timeout');
3540
- // TODO: send message to server to indicate that the vad model timed out
3541
- this.userIsSpeaking = true; // allow audio to be sent to the server
3542
- this.options.onUserIsSpeakingChange(true);
3580
+ console.warn('VAD model failed to load - falling back to amplitude-based detection');
3581
+ // Send a message to server indicating VAD failure
3582
+ this._wsSend({
3583
+ type: 'vad_events',
3584
+ event: 'vad_model_failed',
3585
+ });
3586
+ // In automatic mode without VAD, allow the bot to speak initially
3587
+ this.userIsSpeaking = false;
3588
+ this.options.onUserIsSpeakingChange(false);
3589
+ // Set up amplitude-based fallback detection
3590
+ this._setupAmplitudeBasedVAD();
3543
3591
  }, 2000);
3544
3592
  if (!this.canInterrupt) {
3545
3593
  dist.MicVAD.new({
@@ -3548,7 +3596,7 @@ class LayercodeClient {
3548
3596
  positiveSpeechThreshold: 0.3,
3549
3597
  negativeSpeechThreshold: 0.2,
3550
3598
  redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
3551
- minSpeechFrames: 15,
3599
+ minSpeechFrames: 0,
3552
3600
  preSpeechPadFrames: 0,
3553
3601
  onSpeechStart: () => {
3554
3602
  this.userIsSpeaking = true;
@@ -3559,27 +3607,6 @@ class LayercodeClient {
3559
3607
  event: 'vad_start',
3560
3608
  });
3561
3609
  },
3562
- onVADMisfire: () => {
3563
- console.log('onVADMisfire: Short utterance detected, resuming bot');
3564
- this.audioBuffer = []; // Clear buffer on misfire
3565
- this.userIsSpeaking = false;
3566
- this.options.onUserIsSpeakingChange(false);
3567
- // Send vad_end to indicate the short utterance is over
3568
- this._wsSend({
3569
- type: 'vad_events',
3570
- event: 'vad_end',
3571
- });
3572
- // End the user's turn
3573
- this._wsSend({
3574
- type: 'trigger.turn.end',
3575
- role: 'user',
3576
- });
3577
- // Resume bot audio if it was playing
3578
- if (!this.wavPlayer.isPlaying) {
3579
- console.log('onVADMisfire: Resuming bot audio');
3580
- this.wavPlayer.play();
3581
- }
3582
- },
3583
3610
  onSpeechEnd: () => {
3584
3611
  console.log('onSpeechEnd: sending vad_end');
3585
3612
  this.endUserTurn = true; // Set flag to indicate that the user turn has ended
@@ -3620,6 +3647,7 @@ class LayercodeClient {
3620
3647
  // Only pause agent audio if it's currently playing
3621
3648
  if (this.wavPlayer.isPlaying) {
3622
3649
  console.log('onSpeechStart: WavPlayer is playing, pausing it.');
3650
+ this.audioPauseTime = Date.now(); // Track when we paused
3623
3651
  this.wavPlayer.pause();
3624
3652
  }
3625
3653
  else {
@@ -3644,9 +3672,8 @@ class LayercodeClient {
3644
3672
  setTimeout(() => {
3645
3673
  if (!this.wavPlayer.isPlaying) {
3646
3674
  console.log('onVADMisfire: Resuming after delay');
3675
+ this.audioPauseTime = null; // Clear pause time since we're resuming
3647
3676
  this.wavPlayer.play();
3648
- this.userIsSpeaking = true;
3649
- this.options.onUserIsSpeakingChange(true);
3650
3677
  }
3651
3678
  else {
3652
3679
  console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
@@ -3700,33 +3727,37 @@ class LayercodeClient {
3700
3727
  reason: 'completed',
3701
3728
  });
3702
3729
  }
3703
- _estimateWordsHeard(text, playbackOffsetSeconds) {
3704
- const words = text.split(/\s+/).filter((word) => word.length > 0);
3705
- const totalWords = words.length;
3706
- // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
3707
- const estimatedWordsPerSecond = 2.5;
3708
- const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
3709
- const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
3710
- return { wordsHeard: estimatedWordsHeard, textHeard };
3711
- }
3712
3730
  async _clientInterruptAssistantReplay() {
3713
3731
  const offsetData = await this.wavPlayer.interrupt();
3714
- if (offsetData && this.currentTurnText && this.currentTurnId) {
3715
- const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
3716
- const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
3717
- console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
3718
- // Send interruption event with context
3732
+ if (offsetData && this.currentTurnId) {
3733
+ let offsetMs = offsetData.currentTime * 1000;
3734
+ // Calculate accurate offset by subtracting pause time if audio was paused for VAD
3735
+ if (this.audioPauseTime) {
3736
+ const pauseDurationMs = Date.now() - this.audioPauseTime;
3737
+ const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
3738
+ console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
3739
+ offsetMs = adjustedOffsetMs;
3740
+ this.audioPauseTime = null; // Clear the pause time
3741
+ }
3742
+ else {
3743
+ console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
3744
+ }
3745
+ // Send interruption event with accurate playback offset in milliseconds
3719
3746
  this._wsSend({
3720
3747
  type: 'trigger.response.audio.interrupted',
3721
- playback_offset: offsetData.currentTime,
3748
+ playback_offset: offsetMs,
3722
3749
  interruption_context: {
3723
3750
  turn_id: this.currentTurnId,
3724
- estimated_words_heard: wordsHeard,
3725
- total_words: totalWords,
3726
- text_heard: textHeard,
3751
+ playback_offset_ms: offsetMs,
3727
3752
  },
3728
3753
  });
3729
3754
  }
3755
+ else {
3756
+ console.warn('Interruption requested but missing required data:', {
3757
+ hasOffsetData: !!offsetData,
3758
+ hasTurnId: !!this.currentTurnId,
3759
+ });
3760
+ }
3730
3761
  }
3731
3762
  async triggerUserTurnStarted() {
3732
3763
  if (!this.pushToTalkActive) {
@@ -3775,30 +3806,20 @@ class LayercodeClient {
3775
3806
  // Set current turn ID from first audio message, or update if different turn
3776
3807
  if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
3777
3808
  console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
3778
- const oldTurnId = this.currentTurnId;
3779
3809
  this.currentTurnId = message.turn_id;
3780
- this.currentTurnText = ''; // Reset text for new turn
3781
3810
  // Clean up interrupted tracks, keeping only the current turn
3782
3811
  this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
3783
3812
  }
3784
3813
  break;
3785
- case 'response.text':
3786
- // Set turn ID from first text message if not set, or accumulate if matches current turn
3787
- if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
3788
- if (!this.currentTurnId) {
3789
- console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
3790
- this.currentTurnId = message.turn_id;
3791
- this.currentTurnText = '';
3792
- }
3793
- this.currentTurnText += message.content;
3794
- }
3795
- else {
3796
- console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
3814
+ case 'response.text': {
3815
+ // Set turn ID from first text message if not set
3816
+ if (!this.currentTurnId) {
3817
+ this.currentTurnId = message.turn_id;
3818
+ console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
3797
3819
  }
3820
+ // Note: We no longer track text content in the client - the pipeline handles interruption estimation
3798
3821
  break;
3799
- // case 'response.end':
3800
- // console.log('received response.end');
3801
- // break;
3822
+ }
3802
3823
  case 'response.data':
3803
3824
  console.log('received response.data', message);
3804
3825
  this.options.onDataMessage(message);
@@ -3902,6 +3923,8 @@ class LayercodeClient {
3902
3923
  async connect() {
3903
3924
  try {
3904
3925
  this._setStatus('connecting');
3926
+ // Reset turn tracking for clean start
3927
+ this._resetTurnTracking();
3905
3928
  // Get session key from server
3906
3929
  let authorizeSessionRequestBody = {
3907
3930
  pipeline_id: this.options.pipelineId,
@@ -3979,6 +4002,10 @@ class LayercodeClient {
3979
4002
  throw error;
3980
4003
  }
3981
4004
  }
4005
+ _resetTurnTracking() {
4006
+ this.currentTurnId = null;
4007
+ console.log('Reset turn tracking state');
4008
+ }
3982
4009
  async disconnect() {
3983
4010
  // Clean up VAD if it exists
3984
4011
  if (this.vad) {
@@ -3988,6 +4015,8 @@ class LayercodeClient {
3988
4015
  }
3989
4016
  this.wavRecorder.quit();
3990
4017
  this.wavPlayer.disconnect();
4018
+ // Reset turn tracking
4019
+ this._resetTurnTracking();
3991
4020
  // Close websocket and ensure status is updated
3992
4021
  if (this.ws) {
3993
4022
  this.ws.close();