npm - @layercode/js-sdk - Versions diffs - 1.0.22 → 1.0.23 - Mend

@layercode/js-sdk 1.0.22 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/layercode-js-sdk.esm.js +92 -63
package/dist/layercode-js-sdk.esm.js.map +1 -1
package/dist/layercode-js-sdk.min.js +92 -63
package/dist/layercode-js-sdk.min.js.map +1 -1
package/dist/types/index.d.ts +3 -2
package/dist/types/interfaces.d.ts +2 -4
package/package.json +1 -1

package/dist/layercode-js-sdk.esm.js CHANGED

@@ -3522,13 +3522,53 @@ class LayercodeClient {
         this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
-        this.currentTurnText = '';
         this.currentTurnId = null;
         this.audioBuffer = [];
+        this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
     }
+    _setupAmplitudeBasedVAD() {
+        let isSpeakingByAmplitude = false;
+        let silenceFrames = 0;
+        const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+        const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+        // Monitor amplitude changes
+        this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+            const wasSpeaking = isSpeakingByAmplitude;
+            if (amplitude > AMPLITUDE_THRESHOLD) {
+                silenceFrames = 0;
+                if (!wasSpeaking) {
+                    // Speech started - pause audio if playing and track timing for interruption calculation
+                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
+                        this.audioPauseTime = Date.now();
+                        this.wavPlayer.pause();
+                    }
+                    isSpeakingByAmplitude = true;
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
+                }
+            }
+            else {
+                silenceFrames++;
+                if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+                    // Speech ended
+                    isSpeakingByAmplitude = false;
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                }
+            }
+        });
+    }
     _initializeVAD() {
         console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
         // If we're in push to talk mode, we don't need to use the VAD model
@@ -3537,9 +3577,17 @@ class LayercodeClient {
         }
         const timeout = setTimeout(() => {
             console.log('silero vad model timeout');
-            // TODO: send message to server to indicate that the vad model timed out
-            this.userIsSpeaking = true; // allow audio to be sent to the server
-            this.options.onUserIsSpeakingChange(true);
+            console.warn('VAD model failed to load - falling back to amplitude-based detection');
+            // Send a message to server indicating VAD failure
+            this._wsSend({
+                type: 'vad_events',
+                event: 'vad_model_failed',
+            });
+            // In automatic mode without VAD, allow the bot to speak initially
+            this.userIsSpeaking = false;
+            this.options.onUserIsSpeakingChange(false);
+            // Set up amplitude-based fallback detection
+            this._setupAmplitudeBasedVAD();
         }, 2000);
         if (!this.canInterrupt) {
             dist.MicVAD.new({
@@ -3548,7 +3596,7 @@ class LayercodeClient {
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 15,
+                minSpeechFrames: 0,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
                     this.userIsSpeaking = true;
@@ -3559,27 +3607,6 @@ class LayercodeClient {
                         event: 'vad_start',
                     });
                 },
-                onVADMisfire: () => {
-                    console.log('onVADMisfire: Short utterance detected, resuming bot');
-                    this.audioBuffer = []; // Clear buffer on misfire
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    // Send vad_end to indicate the short utterance is over
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    // End the user's turn
-                    this._wsSend({
-                        type: 'trigger.turn.end',
-                        role: 'user',
-                    });
-                    // Resume bot audio if it was playing
-                    if (!this.wavPlayer.isPlaying) {
-                        console.log('onVADMisfire: Resuming bot audio');
-                        this.wavPlayer.play();
-                    }
-                },
                 onSpeechEnd: () => {
                     console.log('onSpeechEnd: sending vad_end');
                     this.endUserTurn = true; // Set flag to indicate that the user turn has ended
@@ -3620,6 +3647,7 @@ class LayercodeClient {
                     // Only pause agent audio if it's currently playing
                     if (this.wavPlayer.isPlaying) {
                         console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+                        this.audioPauseTime = Date.now(); // Track when we paused
                         this.wavPlayer.pause();
                     }
                     else {
@@ -3644,9 +3672,8 @@ class LayercodeClient {
                     setTimeout(() => {
                         if (!this.wavPlayer.isPlaying) {
                             console.log('onVADMisfire: Resuming after delay');
+                            this.audioPauseTime = null; // Clear pause time since we're resuming
                             this.wavPlayer.play();
-                            this.userIsSpeaking = true;
-                            this.options.onUserIsSpeakingChange(true);
                         }
                         else {
                             console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
@@ -3700,33 +3727,37 @@ class LayercodeClient {
             reason: 'completed',
         });
     }
-    _estimateWordsHeard(text, playbackOffsetSeconds) {
-        const words = text.split(/\s+/).filter((word) => word.length > 0);
-        const totalWords = words.length;
-        // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
-        const estimatedWordsPerSecond = 2.5;
-        const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
-        const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
-        return { wordsHeard: estimatedWordsHeard, textHeard };
-    }
     async _clientInterruptAssistantReplay() {
         const offsetData = await this.wavPlayer.interrupt();
-        if (offsetData && this.currentTurnText && this.currentTurnId) {
-            const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
-            const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
-            console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
-            // Send interruption event with context
+        if (offsetData && this.currentTurnId) {
+            let offsetMs = offsetData.currentTime * 1000;
+            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+            if (this.audioPauseTime) {
+                const pauseDurationMs = Date.now() - this.audioPauseTime;
+                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+                offsetMs = adjustedOffsetMs;
+                this.audioPauseTime = null; // Clear the pause time
+            }
+            else {
+                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+            }
+            // Send interruption event with accurate playback offset in milliseconds
             this._wsSend({
                 type: 'trigger.response.audio.interrupted',
-                playback_offset: offsetData.currentTime,
+                playback_offset: offsetMs,
                 interruption_context: {
                     turn_id: this.currentTurnId,
-                    estimated_words_heard: wordsHeard,
-                    total_words: totalWords,
-                    text_heard: textHeard,
+                    playback_offset_ms: offsetMs,
                 },
             });
         }
+        else {
+            console.warn('Interruption requested but missing required data:', {
+                hasOffsetData: !!offsetData,
+                hasTurnId: !!this.currentTurnId,
+            });
+        }
     }
     async triggerUserTurnStarted() {
         if (!this.pushToTalkActive) {
@@ -3775,30 +3806,20 @@ class LayercodeClient {
                     // Set current turn ID from first audio message, or update if different turn
                     if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
                         console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
-                        const oldTurnId = this.currentTurnId;
                         this.currentTurnId = message.turn_id;
-                        this.currentTurnText = ''; // Reset text for new turn
                         // Clean up interrupted tracks, keeping only the current turn
                         this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
                     }
                     break;
-                case 'response.text':
-                    // Set turn ID from first text message if not set, or accumulate if matches current turn
-                    if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
-                        if (!this.currentTurnId) {
-                            console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
-                            this.currentTurnId = message.turn_id;
-                            this.currentTurnText = '';
-                        }
-                        this.currentTurnText += message.content;
-                    }
-                    else {
-                        console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+                case 'response.text': {
+                    // Set turn ID from first text message if not set
+                    if (!this.currentTurnId) {
+                        this.currentTurnId = message.turn_id;
+                        console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
                     }
+                    // Note: We no longer track text content in the client - the pipeline handles interruption estimation
                     break;
-                // case 'response.end':
-                //   console.log('received response.end');
-                //   break;
+                }
                 case 'response.data':
                     console.log('received response.data', message);
                     this.options.onDataMessage(message);
@@ -3902,6 +3923,8 @@ class LayercodeClient {
     async connect() {
         try {
             this._setStatus('connecting');
+            // Reset turn tracking for clean start
+            this._resetTurnTracking();
             // Get session key from server
             let authorizeSessionRequestBody = {
                 pipeline_id: this.options.pipelineId,
@@ -3979,6 +4002,10 @@ class LayercodeClient {
             throw error;
         }
     }
+    _resetTurnTracking() {
+        this.currentTurnId = null;
+        console.log('Reset turn tracking state');
+    }
     async disconnect() {
         // Clean up VAD if it exists
         if (this.vad) {
@@ -3988,6 +4015,8 @@ class LayercodeClient {
         }
         this.wavRecorder.quit();
         this.wavPlayer.disconnect();
+        // Reset turn tracking
+        this._resetTurnTracking();
         // Close websocket and ensure status is updated
         if (this.ws) {
             this.ws.close();