osborn 0.9.52 → 0.9.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +62 -6
  2. package/package.json +9 -9
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, RemoteParticipant, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -1205,6 +1205,26 @@ async function main() {
1205
1205
  // Session-level always-allow list: paths the user has approved for this session without prompting
1206
1206
  let sessionAlwaysAllowPaths = new Set();
1207
1207
  let userState = 'listening'; // Track user speech state for queue safety
1208
+ // Leading-edge debounce for the TTS interrupt below — restores the same
1209
+ // anti-flap protection the removed ActiveSpeakersChanged handler had pre-0.9.39
1210
+ // (May 21 / c345c98). Wall-clock timestamp + ms compare; no setTimeout, no
1211
+ // promise, no new API. Suppresses repeat interrupts within the window so a
1212
+ // single user-input transition fires at most one interrupt() call per second.
1213
+ // Without it, TTS echo bleeding through the mic causes user_state to oscillate
1214
+ // speaking ↔ listening across rapid Deepgram frames, each transition firing a
1215
+ // fresh interrupt — and even after 1.4.x's stricter error classification, the
1216
+ // first one survives but the cascade kills the session.
1217
+ let lastInterruptAt = 0;
1218
+ // Self-echo guard for the TTS interrupt below. Updated by the
1219
+ // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
1220
+ // user_state_changed carries NO speaker identity (verified against the SDK type
1221
+ // — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
1222
+ // remote-speaker timestamp is the only way to distinguish "real user spoke" from
1223
+ // "agent's own TTS echoed through the mic". Independent producer: rtc-node
1224
+ // emits activeSpeakersChanged from server WebRTC audio-level reports
1225
+ // (room.js:213), with NO reference to AgentSession or STT — so there's no
1226
+ // dependency loop with user_state_changed's STT-driven producer.
1227
+ let lastRemoteSpeakerAt = 0;
1208
1228
  let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
1209
1229
  let currentProvider = realtimeConfig.provider; // Track active realtime provider
1210
1230
  // Authenticated Supabase userId from participant metadata. Used to scope
@@ -2589,6 +2609,20 @@ async function main() {
2589
2609
  // rather than hold it indefinitely. Cancelled in ParticipantConnected.
2590
2610
  armAloneTimer();
2591
2611
  });
2612
+ // Self-echo guard producer. Server WebRTC audio-level reports drive this
2613
+ // (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
2614
+ // Deepgram STT classification, so by the time user_state_changed fires
2615
+ // lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
2616
+ // — LocalParticipant is the agent itself and including it would defeat the
2617
+ // whole point (the echo we're guarding against IS the agent's local audio).
2618
+ // This is the speaker-identity filter the removed ActiveSpeakersChanged
2619
+ // handler had (May 21 / c345c98) — minus the interrupt() call, since the
2620
+ // user_state_changed handler now owns interrupt firing.
2621
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2622
+ if (speakers.some((s) => s instanceof RemoteParticipant)) {
2623
+ lastRemoteSpeakerAt = Date.now();
2624
+ }
2625
+ });
2592
2626
  // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2593
2627
  // handler that interrupted TTS on any sustained audio activity (~50ms after
2594
2628
  // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
@@ -2954,12 +2988,34 @@ async function main() {
2954
2988
  userState = ev.newState;
2955
2989
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2956
2990
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2957
- try {
2958
- console.log('🎤 user_state_changed=speaking + agent speaking interrupting TTS');
2959
- currentSession?.interrupt();
2991
+ const now = Date.now();
2992
+ // Self-echo guard FIRST. Reject this trigger entirely if no remote
2993
+ // participant has been heard speaking in the last 500ms — at that
2994
+ // point user_state=speaking is almost certainly TTS bleeding through
2995
+ // the mic (Deepgram correctly identifies it as "speech", we add the
2996
+ // identity filter the high-level event lacks). 500ms is wider than
2997
+ // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
2998
+ // firing, so a real user is comfortably inside the window.
2999
+ if (now - lastRemoteSpeakerAt > 500) {
3000
+ console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
3001
+ return;
2960
3002
  }
2961
- catch (err) {
2962
- console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3003
+ // Leading-edge 1s debounce — verbatim shape of the removed
3004
+ // ActiveSpeakersChanged handler's anti-flap (see lastInterruptAt
3005
+ // declaration). Belt + suspenders with the self-echo guard above.
3006
+ const debounced = now - lastInterruptAt < 1000;
3007
+ lastInterruptAt = now;
3008
+ if (debounced) {
3009
+ console.log('🔇 user-state interrupt debounced (< 1s since last)');
3010
+ }
3011
+ else {
3012
+ try {
3013
+ console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
3014
+ currentSession?.interrupt();
3015
+ }
3016
+ catch (err) {
3017
+ console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3018
+ }
2963
3019
  }
2964
3020
  }
2965
3021
  // When user stops speaking, retry voice queue — items may be waiting
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.52",
3
+ "version": "0.9.53",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,14 +33,14 @@
33
33
  "@anthropic-ai/claude-agent-sdk": "^0.2.91",
34
34
  "@anthropic-ai/sdk": "^0.80.0",
35
35
  "@google/genai": "^1.0.0",
36
- "@livekit/agents": "^1.2.1",
37
- "@livekit/agents-plugin-deepgram": "^1.2.1",
38
- "@livekit/agents-plugin-elevenlabs": "^1.2.1",
39
- "@livekit/agents-plugin-google": "^1.2.1",
40
- "@livekit/agents-plugin-livekit": "^1.2.1",
41
- "@livekit/agents-plugin-openai": "^1.2.1",
42
- "@livekit/agents-plugin-silero": "^1.2.1",
43
- "@livekit/rtc-node": "^0.13.24",
36
+ "@livekit/agents": "1.2.1",
37
+ "@livekit/agents-plugin-deepgram": "1.2.1",
38
+ "@livekit/agents-plugin-elevenlabs": "1.2.1",
39
+ "@livekit/agents-plugin-google": "1.2.1",
40
+ "@livekit/agents-plugin-livekit": "1.2.1",
41
+ "@livekit/agents-plugin-openai": "1.2.1",
42
+ "@livekit/agents-plugin-silero": "1.2.1",
43
+ "@livekit/rtc-node": "0.13.24",
44
44
  "@modelcontextprotocol/sdk": "^1.29.0",
45
45
  "@openai/codex-sdk": "^0.77.0",
46
46
  "@smithery/api": "^0.48.0",