npm - osborn - Versions diffs - 0.9.52 → 0.9.53 - Mend

osborn 0.9.52 → 0.9.53

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/index.js +62 -6
package/package.json +9 -9

package/dist/index.js CHANGED

@@ -1,7 +1,7 @@
 // Load environment variables FIRST before any other imports
 import 'dotenv/config';
 import { voice, initializeLogger } from '@livekit/agents';
-import { Room, RoomEvent, } from '@livekit/rtc-node';
+import { Room, RoomEvent, RemoteParticipant, } from '@livekit/rtc-node';
 import { AccessToken } from 'livekit-server-sdk';
 // Initialize logger before anything else
 initializeLogger({ pretty: true, level: 'info' });
@@ -1205,6 +1205,26 @@ async function main() {
     // Session-level always-allow list: paths the user has approved for this session without prompting
     let sessionAlwaysAllowPaths = new Set();
     let userState = 'listening'; // Track user speech state for queue safety
+    // Leading-edge debounce for the TTS interrupt below — restores the same
+    // anti-flap protection the removed ActiveSpeakersChanged handler had pre-0.9.39
+    // (May 21 / c345c98). Wall-clock timestamp + ms compare; no setTimeout, no
+    // promise, no new API. Suppresses repeat interrupts within the window so a
+    // single user-input transition fires at most one interrupt() call per second.
+    // Without it, TTS echo bleeding through the mic causes user_state to oscillate
+    // speaking ↔ listening across rapid Deepgram frames, each transition firing a
+    // fresh interrupt — and even after 1.4.x's stricter error classification, the
+    // first one survives but the cascade kills the session.
+    let lastInterruptAt = 0;
+    // Self-echo guard for the TTS interrupt below. Updated by the
+    // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
+    // user_state_changed carries NO speaker identity (verified against the SDK type
+    // — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
+    // remote-speaker timestamp is the only way to distinguish "real user spoke" from
+    // "agent's own TTS echoed through the mic". Independent producer: rtc-node
+    // emits activeSpeakersChanged from server WebRTC audio-level reports
+    // (room.js:213), with NO reference to AgentSession or STT — so there's no
+    // dependency loop with user_state_changed's STT-driven producer.
+    let lastRemoteSpeakerAt = 0;
     let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
     let currentProvider = realtimeConfig.provider; // Track active realtime provider
     // Authenticated Supabase userId from participant metadata. Used to scope
@@ -2589,6 +2609,20 @@ async function main() {
         // rather than hold it indefinitely. Cancelled in ParticipantConnected.
         armAloneTimer();
     });
+    // Self-echo guard producer. Server WebRTC audio-level reports drive this
+    // (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
+    // Deepgram STT classification, so by the time user_state_changed fires
+    // lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
+    // — LocalParticipant is the agent itself and including it would defeat the
+    // whole point (the echo we're guarding against IS the agent's local audio).
+    // This is the speaker-identity filter the removed ActiveSpeakersChanged
+    // handler had (May 21 / c345c98) — minus the interrupt() call, since the
+    // user_state_changed handler now owns interrupt firing.
+    room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
+        if (speakers.some((s) => s instanceof RemoteParticipant)) {
+            lastRemoteSpeakerAt = Date.now();
+        }
+    });
     // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
     // handler that interrupted TTS on any sustained audio activity (~50ms after
     // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
@@ -2954,12 +2988,34 @@ async function main() {
                 userState = ev.newState;
                 console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
                 if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
-                    try {
-                        console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
-                        currentSession?.interrupt();
+                    const now = Date.now();
+                    // Self-echo guard FIRST. Reject this trigger entirely if no remote
+                    // participant has been heard speaking in the last 500ms — at that
+                    // point user_state=speaking is almost certainly TTS bleeding through
+                    // the mic (Deepgram correctly identifies it as "speech", we add the
+                    // identity filter the high-level event lacks). 500ms is wider than
+                    // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
+                    // firing, so a real user is comfortably inside the window.
+                    if (now - lastRemoteSpeakerAt > 500) {
+                        console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
+                        return;
                     }
-                    catch (err) {
-                        console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
+                    // Leading-edge 1s debounce — verbatim shape of the removed
+                    // ActiveSpeakersChanged handler's anti-flap (see lastInterruptAt
+                    // declaration). Belt + suspenders with the self-echo guard above.
+                    const debounced = now - lastInterruptAt < 1000;
+                    lastInterruptAt = now;
+                    if (debounced) {
+                        console.log('🔇 user-state interrupt debounced (< 1s since last)');
+                    }
+                    else {
+                        try {
+                            console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
+                            currentSession?.interrupt();
+                        }
+                        catch (err) {
+                            console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
+                        }
                     }
                 }
                 // When user stops speaking, retry voice queue — items may be waiting

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.52",
+  "version": "0.9.53",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {
@@ -33,14 +33,14 @@
     "@anthropic-ai/claude-agent-sdk": "^0.2.91",
     "@anthropic-ai/sdk": "^0.80.0",
     "@google/genai": "^1.0.0",
-    "@livekit/agents": "^1.2.1",
-    "@livekit/agents-plugin-deepgram": "^1.2.1",
-    "@livekit/agents-plugin-elevenlabs": "^1.2.1",
-    "@livekit/agents-plugin-google": "^1.2.1",
-    "@livekit/agents-plugin-livekit": "^1.2.1",
-    "@livekit/agents-plugin-openai": "^1.2.1",
-    "@livekit/agents-plugin-silero": "^1.2.1",
-    "@livekit/rtc-node": "^0.13.24",
+    "@livekit/agents": "1.2.1",
+    "@livekit/agents-plugin-deepgram": "1.2.1",
+    "@livekit/agents-plugin-elevenlabs": "1.2.1",
+    "@livekit/agents-plugin-google": "1.2.1",
+    "@livekit/agents-plugin-livekit": "1.2.1",
+    "@livekit/agents-plugin-openai": "1.2.1",
+    "@livekit/agents-plugin-silero": "1.2.1",
+    "@livekit/rtc-node": "0.13.24",
     "@modelcontextprotocol/sdk": "^1.29.0",
     "@openai/codex-sdk": "^0.77.0",
     "@smithery/api": "^0.48.0",