@layercode/js-sdk 1.0.21 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -516,6 +516,24 @@ registerProcessor('stream_processor', StreamProcessor);
     this.isPlaying = false;
   }

+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
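Note: `clearInterruptedTracks` exists to keep `interruptedTrackIds` bounded; without it the map gains one key per interrupted turn for the lifetime of the player. A minimal usage sketch (the `player` variable and the track ID are illustrative, not from the diff):

    player.clearInterruptedTracks(['turn_abc123']); // keep only the turn still in flight
    player.clearInterruptedTracks();                // or wipe the map entirely

The diff's own caller appears in the `response.audio` handler further down, which prunes the map to the current turn ID whenever a new turn's audio arrives.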
@@ -749,7 +767,7 @@ registerProcessor('stream_processor', StreamProcessor);
       this.analyser.disconnect();
     }

-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }

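Note: per the Web Audio spec, `AudioContext.close()` rejects with an `InvalidStateError` when the context is already in the `'closed'` state, so the added `state !== 'closed'` guard makes repeated disconnects idempotent. Roughly:

    const ctx = new AudioContext();
    await ctx.close(); // state becomes 'closed'
    ctx.close();       // without the guard: rejected promise, logged as an error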
@@ -3504,17 +3522,59 @@ registerProcessor('audio_processor', AudioProcessor);
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
     this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
+    this.currentTurnId = null;
+    this.audioBuffer = [];
+    this.audioPauseTime = null;
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
   }
+  _setupAmplitudeBasedVAD() {
+    let isSpeakingByAmplitude = false;
+    let silenceFrames = 0;
+    const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+    const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+    // Monitor amplitude changes
+    this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+      const wasSpeaking = isSpeakingByAmplitude;
+      if (amplitude > AMPLITUDE_THRESHOLD) {
+        silenceFrames = 0;
+        if (!wasSpeaking) {
+          // Speech started - pause audio if playing and track timing for interruption calculation
+          if (this.canInterrupt && this.wavPlayer.isPlaying) {
+            this.audioPauseTime = Date.now();
+            this.wavPlayer.pause();
+          }
+          isSpeakingByAmplitude = true;
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
+        }
+      }
+      else {
+        silenceFrames++;
+        if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+          // Speech ended
+          isSpeakingByAmplitude = false;
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        }
+      }
+    });
+  }
   _initializeVAD() {
     console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
     // If we're in push to talk mode, we don't need to use the VAD model
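Note: the amplitude fallback is plain hysteresis: a single frame above 0.01 opens the user's turn, and only 30 consecutive quiet frames (≈600ms at the 20ms chunk size the comment assumes) close it. The same logic as a standalone sketch, decoupled from the recorder:

    // Hysteresis sketch; thresholds match the diff, the function itself is illustrative.
    function makeAmplitudeVad(onChange, threshold = 0.01, endAfterFrames = 30) {
      let speaking = false;
      let silent = 0;
      return (amplitude) => {
        if (amplitude > threshold) {
          silent = 0;
          if (!speaking) { speaking = true; onChange(true); } // first loud frame starts the turn
        } else if (speaking && ++silent >= endAfterFrames) {  // ~600ms of quiet ends it
          speaking = false; onChange(false);
        }
      };
    }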
@@ -3523,9 +3583,17 @@ registerProcessor('audio_processor', AudioProcessor);
     }
     const timeout = setTimeout(() => {
       console.log('silero vad model timeout');
-
-
-      this.
+      console.warn('VAD model failed to load - falling back to amplitude-based detection');
+      // Send a message to server indicating VAD failure
+      this._wsSend({
+        type: 'vad_events',
+        event: 'vad_model_failed',
+      });
+      // In automatic mode without VAD, allow the bot to speak initially
+      this.userIsSpeaking = false;
+      this.options.onUserIsSpeakingChange(false);
+      // Set up amplitude-based fallback detection
+      this._setupAmplitudeBasedVAD();
     }, 2000);
     if (!this.canInterrupt) {
       dist.MicVAD.new({
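Note: this 2-second timer is a one-shot fallback: if the Silero model hasn't loaded in time, the client reports `vad_model_failed` upstream and degrades to the amplitude detector above. The success path shown in the hunks below is only `.then((vad) => {`; presumably it cancels the fallback, along these lines (the handler body is an assumption, not visible in the diff):

    dist.MicVAD.new({ /* options */ })
      .then((vad) => {
        clearTimeout(timeout); // assumed: stop the fallback once the model is live
        this.vad = vad;        // `this.vad` itself is real - disconnect() tears it down below
      });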
@@ -3534,20 +3602,30 @@ registerProcessor('audio_processor', AudioProcessor);
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 0,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-
-
-
-
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
-
+        onSpeechEnd: () => {
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
           this.userIsSpeaking = false;
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
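Note: both MicVAD configurations now pin `minSpeechFrames` explicitly (the old values are truncated in this diff's rendering). Here, in the non-interruptible branch, it is 0, so any detected speech fires `onSpeechStart` immediately; in the interruptible branch below it is 5, so a burst shorter than five frames routes to `onVADMisfire` and resumes the agent instead of cutting it off.

    // The contrast, other options elided:
    // non-interruptible: { minSpeechFrames: 0, ... } // any speech counts
    // interruptible:     { minSpeechFrames: 5, ... } // short blips -> onVADMisfire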
@@ -3569,41 +3647,59 @@ registerProcessor('audio_processor', AudioProcessor);
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+            this.audioPauseTime = Date.now(); // Track when we paused
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
-          this.options.onUserIsSpeakingChange(true);
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
+          this.audioBuffer = []; // Clear buffer on misfire
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.wavPlayer.
-
-
-
-
-
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.audioPauseTime = null; // Clear pause time since we're resuming
+              this.wavPlayer.play();
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
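Note: `onVADMisfire` deliberately waits `this.options.vadResumeDelay` before touching playback, so a genuine `onSpeechStart` firing during the window wins the race; by the time the timer runs, either playback has already restarted elsewhere (skip) or the pause really was a false alarm (resume). The pattern in isolation, assuming a player with `isPlaying`/`play()` as in the diff:

    function resumeAfterMisfire(player, delayMs) {
      setTimeout(() => {
        if (!player.isPlaying) {
          player.play(); // false alarm: hand the audio back to the agent
        }
        // else: something else already took over playback; leave it alone
      }, delayMs);
    }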
@@ -3638,13 +3734,36 @@ registerProcessor('audio_processor', AudioProcessor);
     });
   }
   async _clientInterruptAssistantReplay() {
-    await this.wavPlayer.interrupt();
-
-
-
-
-
-
+    const offsetData = await this.wavPlayer.interrupt();
+    if (offsetData && this.currentTurnId) {
+      let offsetMs = offsetData.currentTime * 1000;
+      // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+      if (this.audioPauseTime) {
+        const pauseDurationMs = Date.now() - this.audioPauseTime;
+        const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+        console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+        offsetMs = adjustedOffsetMs;
+        this.audioPauseTime = null; // Clear the pause time
+      }
+      else {
+        console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+      }
+      // Send interruption event with accurate playback offset in milliseconds
+      this._wsSend({
+        type: 'trigger.response.audio.interrupted',
+        playback_offset: offsetMs,
+        interruption_context: {
+          turn_id: this.currentTurnId,
+          playback_offset_ms: offsetMs,
+        },
+      });
+    }
+    else {
+      console.warn('Interruption requested but missing required data:', {
+        hasOffsetData: !!offsetData,
+        hasTurnId: !!this.currentTurnId,
+      });
+    }
   }
   async triggerUserTurnStarted() {
     if (!this.pushToTalkActive) {
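Note: `playback_offset` answers "how much of this turn did the user actually hear". `wavPlayer.interrupt()` reports a playback position, but when VAD paused the player before the interrupt was finalized, the wall-clock pause duration has to come off that figure. Worked numbers (illustrative):

    const offsetMs = 4200;        // position reported at interrupt (ms)
    const pauseDurationMs = 1500; // Date.now() - this.audioPauseTime
    Math.max(0, offsetMs - pauseDurationMs); // 2700ms is sent as playback_offset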
@@ -3675,24 +3794,38 @@ registerProcessor('audio_processor', AudioProcessor);
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === '
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          this.currentTurnId = message.turn_id;
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
         break;
-
-
-
+      case 'response.text': {
+        // Set turn ID from first text message if not set
+        if (!this.currentTurnId) {
+          this.currentTurnId = message.turn_id;
+          console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+        }
+        // Note: We no longer track text content in the client - the pipeline handles interruption estimation
+        break;
+      }
       case 'response.data':
         console.log('received response.data', message);
         this.options.onDataMessage(message);
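Note: turn-ID ownership follows the media, not the turn envelope: `turn.start` for the assistant deliberately leaves `currentTurnId` untouched, `response.audio` always claims it (and prunes `interruptedTrackIds` to the live turn), and `response.text` only fills it in while it is still null. The precedence, condensed from the handler above as a sketch:

    switch (message.type) {
      case 'response.audio':
        if (this.currentTurnId !== message.turn_id) {
          this.currentTurnId = message.turn_id;            // audio always wins
          this.wavPlayer.clearInterruptedTracks([message.turn_id]);
        }
        break;
      case 'response.text':
        this.currentTurnId ??= message.turn_id;            // text only fills a gap
        break;
    }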
@@ -3717,18 +3850,29 @@ registerProcessor('audio_processor', AudioProcessor);
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
         content: base64,
       });
-
-
-
-
-
-
-
-      });
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
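Note: the new `else` branch is a small ring buffer: while the user is deemed silent, the last 10 encoded chunks (≈200ms at 20ms per chunk, per the comment) are retained, and on the next speech onset they are flushed ahead of the live chunk, so the audio that triggered the VAD in the first place is not lost. The buffer discipline in isolation:

    const MAX_BUFFERED_CHUNKS = 10; // ~200ms of 20ms chunks
    function bufferChunk(buffer, chunk) {
      buffer.push(chunk);
      if (buffer.length > MAX_BUFFERED_CHUNKS) buffer.shift(); // drop the oldest
    }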
@@ -3785,6 +3929,8 @@ registerProcessor('audio_processor', AudioProcessor);
   async connect() {
     try {
       this._setStatus('connecting');
+      // Reset turn tracking for clean start
+      this._resetTurnTracking();
       // Get session key from server
       let authorizeSessionRequestBody = {
         pipeline_id: this.options.pipelineId,
@@ -3862,11 +4008,27 @@ registerProcessor('audio_processor', AudioProcessor);
       throw error;
     }
   }
+  _resetTurnTracking() {
+    this.currentTurnId = null;
+    console.log('Reset turn tracking state');
+  }
   async disconnect() {
-
+    // Clean up VAD if it exists
+    if (this.vad) {
+      this.vad.pause();
+      this.vad.destroy();
+      this.vad = null;
+    }
     this.wavRecorder.quit();
     this.wavPlayer.disconnect();
-
+    // Reset turn tracking
+    this._resetTurnTracking();
+    // Close websocket and ensure status is updated
+    if (this.ws) {
+      this.ws.close();
+      this._setStatus('disconnected');
+      this.options.onDisconnect();
+    }
   }
   /**
    * Gets the microphone MediaStream used by this client
|