@layercode/js-sdk 1.0.24 → 1.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3519,12 +3519,11 @@ class LayercodeClient {
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
-        this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
         this.currentTurnId = null;
         this.audioBuffer = [];
-        this.audioPauseTime = null;
+        // this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3533,18 +3532,13 @@ class LayercodeClient {
         let isSpeakingByAmplitude = false;
         let silenceFrames = 0;
         const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
-        const SILENCE_FRAMES_THRESHOLD =
+        const SILENCE_FRAMES_THRESHOLD = 6.4; // 6.4 * 20ms chunks = 128ms silence. Same as Silero ((frame samples: 512 / sampleRate: 16000) * 1000 * redemptionFrames: 4) = 128 ms silence
         // Monitor amplitude changes
         this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
             const wasSpeaking = isSpeakingByAmplitude;
             if (amplitude > AMPLITUDE_THRESHOLD) {
                 silenceFrames = 0;
                 if (!wasSpeaking) {
-                    // Speech started - pause audio if playing and track timing for interruption calculation
-                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
-                        this.audioPauseTime = Date.now();
-                        this.wavPlayer.pause();
-                    }
                     isSpeakingByAmplitude = true;
                     this.userIsSpeaking = true;
                     this.options.onUserIsSpeakingChange(true);
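
The 6.4-frame threshold is derived so the amplitude fallback uses the same 128 ms hang-over as the Silero VAD configured further down in this diff. A minimal sketch of the arithmetic in the comment above (the 20 ms chunk size and 16 kHz sample rate are taken from that comment):

    // Silero v5 consumes 512-sample frames at 16 kHz, i.e. 32 ms per frame.
    // Four redemption frames therefore equal 128 ms of trailing silence.
    const sileroSilenceMs = (512 / 16000) * 1000 * 4; // 128 ms

    // The amplitude monitor fires once per 20 ms audio chunk, so express
    // the same window in chunks:
    const SILENCE_FRAMES_THRESHOLD = sileroSilenceMs / 20; // 6.4 frames
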
@@ -3557,7 +3551,6 @@ class LayercodeClient {
             else {
                 silenceFrames++;
                 if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
-                    // Speech ended
                     isSpeakingByAmplitude = false;
                     this.userIsSpeaking = false;
                     this.options.onUserIsSpeakingChange(false);
@@ -3575,7 +3568,7 @@ class LayercodeClient {
         if (this.pushToTalkEnabled) {
             return;
         }
-        const timeout = setTimeout(() => {
+        const vadLoadTimeout = setTimeout(() => {
             console.log('silero vad model timeout');
             console.warn('VAD model failed to load - falling back to amplitude-based detection');
             // Send a message to server indicating VAD failure
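
The renamed vadLoadTimeout is a load-deadline fallback: if the Silero model has not initialized within 2 s, the client reports vad_model_failed and switches to amplitude-based detection, while a successful load cancels the timer (see clearTimeout(vadLoadTimeout) in the next hunk). A minimal sketch of the pattern, with loadModel and useFallback as hypothetical stand-ins:

    // Race the model load against a fixed deadline.
    const vadLoadTimeout = setTimeout(() => useFallback(), 2000); // deadline hit: fall back
    loadModel()
        .then((model) => {
            clearTimeout(vadLoadTimeout); // loaded in time: cancel the fallback path
            model.start();
        })
        .catch((error) => console.error('Error initializing VAD:', error));
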
@@ -3583,134 +3576,54 @@ class LayercodeClient {
                 type: 'vad_events',
                 event: 'vad_model_failed',
             });
-            // In automatic mode without VAD, allow the bot to speak initially
-            this.userIsSpeaking = false;
-            this.options.onUserIsSpeakingChange(false);
             // Set up amplitude-based fallback detection
             this._setupAmplitudeBasedVAD();
         }, 2000);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        });
-        }
-        else {
-            dist.MicVAD.new({
-                stream: this.wavRecorder.getStream() || undefined,
-                model: 'v5',
-                // baseAssetPath: '/', // Use if bundling model locally
-                // onnxWASMBasePath: '/', // Use if bundling model locally
-                positiveSpeechThreshold: 0.5,
-                negativeSpeechThreshold: 0.3,
-                redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 25,
-                preSpeechPadFrames: 0,
-                onSpeechStart: () => {
-                    // Only pause agent audio if it's currently playing
-                    if (this.wavPlayer.isPlaying) {
-                        console.log('onSpeechStart: WavPlayer is playing, pausing it.');
-                        this.audioPauseTime = Date.now(); // Track when we paused
-                        this.wavPlayer.pause();
-                    }
-                    else {
-                        console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
-                    }
-                    console.log('onSpeechStart: sending vad_start');
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_start',
-                    });
-                    this.userIsSpeaking = true;
-                    this.options.onUserIsSpeakingChange(true);
-                    this.endUserTurn = false; // Reset endUserTurn when speech starts
-                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                },
-                onVADMisfire: () => {
-                    // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
-                    this.userIsSpeaking = false;
-                    this.audioBuffer = []; // Clear buffer on misfire
-                    this.options.onUserIsSpeakingChange(false);
-                    // Add the missing delay before resuming to prevent race conditions
-                    setTimeout(() => {
-                        if (!this.wavPlayer.isPlaying) {
-                            console.log('onVADMisfire: Resuming after delay');
-                            this.audioPauseTime = null; // Clear pause time since we're resuming
-                            this.wavPlayer.play();
-                        }
-                        else {
-                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
-                            this.endUserTurn = true;
-                        }
-                    }, this.options.vadResumeDelay);
-                },
-                onSpeechEnd: () => {
-                    console.log('onSpeechEnd: sending vad_end');
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                    this.audioBuffer = []; // Clear buffer on speech end
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                    // Send vad_end immediately instead of waiting for next audio chunk
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    this.endUserTurn = false; // Reset the flag after sending vad_end
-                },
-            })
-                .then((vad) => {
-                    clearTimeout(timeout);
-                    this.vad = vad;
-                    this.vad.start();
-                    console.log('VAD started');
-                })
-                .catch((error) => {
-                    console.error('Error initializing VAD:', error);
-                });
-        }
+        dist.MicVAD.new({
+            stream: this.wavRecorder.getStream() || undefined,
+            model: 'v5',
+            positiveSpeechThreshold: 0.15,
+            negativeSpeechThreshold: 0.05,
+            redemptionFrames: 4,
+            minSpeechFrames: 2,
+            preSpeechPadFrames: 0,
+            frameSamples: 512, // Required for v5 as per https://docs.vad.ricky0123.com/user-guide/algorithm/#configuration
+            onSpeechStart: () => {
+                console.log('onSpeechStart: sending vad_start');
+                this.userIsSpeaking = true;
+                this.options.onUserIsSpeakingChange(true);
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_start',
+                });
+            },
+            onSpeechEnd: () => {
+                console.log('onSpeechEnd: sending vad_end');
+                this.userIsSpeaking = false;
+                this.options.onUserIsSpeakingChange(false);
+                this.audioBuffer = []; // Clear buffer on speech end
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_end',
+                });
+            },
+            // onVADMisfire: () => {
+            //     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd.
+            // },
+        })
+            .then((vad) => {
+                clearTimeout(vadLoadTimeout);
+                this.vad = vad;
+                this.vad.start();
+                console.log('VAD started');
+            })
+            .catch((error) => {
+                console.error('Error initializing VAD:', error);
+            });
     }
     /**
      * Updates the connection status and triggers the callback
      * @param {string} status - New status value
-     * @private
      */
     _setStatus(status) {
         this.status = status;
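
At Silero's 16 kHz input rate, frameSamples: 512 means each VAD frame spans 32 ms, so the retuned settings above replace the old 25-frame windows (roughly 800 ms at the same frame size) with much faster turn detection. A worked conversion of the new values (frame length per the linked vad.ricky0123.com docs):

    const frameMs = (512 / 16000) * 1000; // 32 ms per frame
    const endOfTurnMs = 4 * frameMs;      // redemptionFrames: 4 -> 128 ms of silence ends speech
    const minSpeechMs = 2 * frameMs;      // minSpeechFrames: 2  -> 64 ms of audio counts as speech
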
@@ -3718,7 +3631,6 @@ class LayercodeClient {
     }
     /**
      * Handles when agent audio finishes playing
-     * @private
      */
     _clientResponseAudioReplayFinished() {
         console.log('clientResponseAudioReplayFinished');
@@ -3731,17 +3643,6 @@ class LayercodeClient {
         const offsetData = await this.wavPlayer.interrupt();
         if (offsetData && this.currentTurnId) {
             let offsetMs = offsetData.currentTime * 1000;
-            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
-            if (this.audioPauseTime) {
-                const pauseDurationMs = Date.now() - this.audioPauseTime;
-                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
-                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
-                offsetMs = adjustedOffsetMs;
-                this.audioPauseTime = null; // Clear the pause time
-            }
-            else {
-                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
-            }
             // Send interruption event with accurate playback offset in milliseconds
             this._wsSend({
                 type: 'trigger.response.audio.interrupted',
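
Because 1.0.26 no longer pauses assistant playback when user speech starts (audioPauseTime is commented out in the constructor), the pause compensation above is dead code in the new version, and the interrupted event now reports the raw playback offset. For reference, a sketch of what the removed branch computed:

    // Removed in 1.0.26: compensate the reported offset for time spent paused.
    const pauseDurationMs = Date.now() - this.audioPauseTime; // wall-clock pause length
    offsetMs = Math.max(0, offsetMs - pauseDurationMs);       // clamp so it never goes negative
    this.audioPauseTime = null;                               // reset for the next interruption
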
@@ -3775,7 +3676,6 @@ class LayercodeClient {
     /**
      * Handles incoming WebSocket messages
      * @param {MessageEvent} event - The WebSocket message event
-     * @private
      */
     async _handleWebSocketMessage(event) {
         try {
@@ -3790,12 +3690,10 @@ class LayercodeClient {
                     console.log(message);
                     if (message.role === 'assistant') {
                         // Start tracking new assistant turn
-                        // Note: Don't reset currentTurnId here - let response.audio set it
-                        // This prevents race conditions where text arrives before audio
                         console.log('Assistant turn started, will track new turn ID from audio/text');
                     }
-                    else if (message.role === 'user' && !this.pushToTalkEnabled
-                        // Interrupt any playing assistant audio if this is a turn
+                    else if (message.role === 'user' && !this.pushToTalkEnabled) {
+                        // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
                         console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                         await this._clientInterruptAssistantReplay();
                     }
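
In the rewritten branch, a server-detected user turn interrupts any assistant audio that is still playing, while push-to-talk clients will already have interrupted before this message arrives. A minimal sketch of the resulting routing (assuming, as the surrounding hunks suggest, these branches live in the message-type switch inside _handleWebSocketMessage):

    if (message.role === 'assistant') {
        // New assistant turn: the turn ID is set later by response.audio / response.text.
    } else if (message.role === 'user' && !this.pushToTalkEnabled) {
        // Server-detected user turn: stop any assistant audio still playing.
        // Push-to-talk clients have already interrupted on key press.
        await this._clientInterruptAssistantReplay();
    }
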
@@ -3817,7 +3715,6 @@ class LayercodeClient {
                         this.currentTurnId = message.turn_id;
                         console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
                     }
-                    // Note: We no longer track text content in the client - the pipeline handles interruption estimation
                     break;
                 }
                 case 'response.data':
@@ -3837,7 +3734,6 @@ class LayercodeClient {
     /**
      * Handles available client browser microphone audio data and sends it over the WebSocket
      * @param {ArrayBuffer} data - The audio data buffer
-     * @private
      */
     _handleDataAvailable(data) {
         try {
@@ -3897,7 +3793,6 @@ class LayercodeClient {
      * @param {WavRecorder | WavStreamPlayer} source - The audio source (recorder or player).
      * @param {(amplitude: number) => void} callback - The callback function to invoke on amplitude change.
      * @param {(amplitude: number) => void} updateInternalState - Function to update the internal amplitude state.
-     * @private
      */
     _setupAmplitudeMonitoring(source, callback, updateInternalState) {
         // Set up amplitude monitoring only if a callback is provided