osborn 0.9.52 → 0.9.54

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +56 -2
  2. package/package.json +9 -9
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, RemoteParticipant, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -1205,6 +1205,16 @@ async function main() {
1205
1205
  // Session-level always-allow list: paths the user has approved for this session without prompting
1206
1206
  let sessionAlwaysAllowPaths = new Set();
1207
1207
  let userState = 'listening'; // Track user speech state for queue safety
1208
+ // Self-echo guard for the TTS interrupt below. Updated by the
1209
+ // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
1210
+ // user_state_changed carries NO speaker identity (verified against the SDK type
1211
+ // — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
1212
+ // remote-speaker timestamp is the only way to distinguish "real user spoke" from
1213
+ // "agent's own TTS echoed through the mic". Independent producer: rtc-node
1214
+ // emits activeSpeakersChanged from server WebRTC audio-level reports
1215
+ // (room.js:213), with NO reference to AgentSession or STT — so there's no
1216
+ // dependency loop with user_state_changed's STT-driven producer.
1217
+ let lastRemoteSpeakerAt = 0;
1208
1218
  let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
1209
1219
  let currentProvider = realtimeConfig.provider; // Track active realtime provider
1210
1220
  // Authenticated Supabase userId from participant metadata. Used to scope
@@ -2052,6 +2062,19 @@ async function main() {
2052
2062
  minDelay: 500, // Wait 500ms after STT commits before generating reply
2053
2063
  maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
2054
2064
  },
2065
+ // Echo-driven false-interrupt protection at the SDK level (1.2.x has these knobs,
2066
+ // we just never set them — defaults are minDuration:500ms / minWords:0 which let
2067
+ // through every short echo blip). Both knobs gate the SDK's internal
2068
+ // interruptByAudioActivity() (agent_activity.js — runs on Deepgram interim
2069
+ // transcripts AND speechDuration updates), which is the path that was firing
2070
+ // even after our user_state_changed handler skipped the trigger.
2071
+ interruption: {
2072
+ minDuration: 750, // 500 → 750: require 750ms of sustained audio activity
2073
+ minWords: 2, // 0 → 2: require ≥2 transcript words (filters single-word echo blips)
2074
+ // SDK defaults kept: enabled=true, resumeFalseInterruption=true,
2075
+ // falseInterruptionTimeout=2000ms (that 2s timer is what resumed your audio
2076
+ // exactly where it stopped — confirmed working as designed).
2077
+ },
2055
2078
  },
2056
2079
  });
2057
2080
  return { session, agent };
@@ -2589,6 +2612,20 @@ async function main() {
2589
2612
  // rather than hold it indefinitely. Cancelled in ParticipantConnected.
2590
2613
  armAloneTimer();
2591
2614
  });
2615
+ // Self-echo guard producer. Server WebRTC audio-level reports drive this
2616
+ // (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
2617
+ // Deepgram STT classification, so by the time user_state_changed fires
2618
+ // lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
2619
+ // — LocalParticipant is the agent itself and including it would defeat the
2620
+ // whole point (the echo we're guarding against IS the agent's local audio).
2621
+ // This is the speaker-identity filter the removed ActiveSpeakersChanged
2622
+ // handler had (May 21 / c345c98) — minus the interrupt() call, since the
2623
+ // user_state_changed handler now owns interrupt firing.
2624
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2625
+ if (speakers.some((s) => s instanceof RemoteParticipant)) {
2626
+ lastRemoteSpeakerAt = Date.now();
2627
+ }
2628
+ });
2592
2629
  // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2593
2630
  // handler that interrupted TTS on any sustained audio activity (~50ms after
2594
2631
  // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
@@ -2954,8 +2991,25 @@ async function main() {
2954
2991
  userState = ev.newState;
2955
2992
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2956
2993
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2994
+ const now = Date.now();
2995
+ // Self-echo guard. Reject this trigger entirely if no remote
2996
+ // participant has been heard speaking in the last 500ms — at that
2997
+ // point user_state=speaking is almost certainly TTS bleeding through
2998
+ // the mic (Deepgram correctly identifies it as "speech", we add the
2999
+ // identity filter the high-level event lacks). 500ms is wider than
3000
+ // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
3001
+ // firing, so a real user is comfortably inside the window.
3002
+ //
3003
+ // The 1s leading-edge debounce that used to live here was removed in
3004
+ // 0.9.54 — the SDK-side `turnHandling.interruption.minDuration:750` +
3005
+ // `minWords:2` now do the heavy lifting on echo filtering, and stacking
3006
+ // an extra cooldown on top risked masking the SDK's own resume timing.
3007
+ if (now - lastRemoteSpeakerAt > 500) {
3008
+ console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
3009
+ return;
3010
+ }
2957
3011
  try {
2958
- console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
3012
+ console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
2959
3013
  currentSession?.interrupt();
2960
3014
  }
2961
3015
  catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.52",
3
+ "version": "0.9.54",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,14 +33,14 @@
33
33
  "@anthropic-ai/claude-agent-sdk": "^0.2.91",
34
34
  "@anthropic-ai/sdk": "^0.80.0",
35
35
  "@google/genai": "^1.0.0",
36
- "@livekit/agents": "^1.2.1",
37
- "@livekit/agents-plugin-deepgram": "^1.2.1",
38
- "@livekit/agents-plugin-elevenlabs": "^1.2.1",
39
- "@livekit/agents-plugin-google": "^1.2.1",
40
- "@livekit/agents-plugin-livekit": "^1.2.1",
41
- "@livekit/agents-plugin-openai": "^1.2.1",
42
- "@livekit/agents-plugin-silero": "^1.2.1",
43
- "@livekit/rtc-node": "^0.13.24",
36
+ "@livekit/agents": "1.2.1",
37
+ "@livekit/agents-plugin-deepgram": "1.2.1",
38
+ "@livekit/agents-plugin-elevenlabs": "1.2.1",
39
+ "@livekit/agents-plugin-google": "1.2.1",
40
+ "@livekit/agents-plugin-livekit": "1.2.1",
41
+ "@livekit/agents-plugin-openai": "1.2.1",
42
+ "@livekit/agents-plugin-silero": "1.2.1",
43
+ "@livekit/rtc-node": "0.13.24",
44
44
  "@modelcontextprotocol/sdk": "^1.29.0",
45
45
  "@openai/codex-sdk": "^0.77.0",
46
46
  "@smithery/api": "^0.48.0",