osborn 0.9.55 → 0.9.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +25 -74
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, RemoteParticipant, } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -1275,16 +1275,6 @@ async function main() {
1275
1275
  // Session-level always-allow list: paths the user has approved for this session without prompting
1276
1276
  let sessionAlwaysAllowPaths = new Set();
1277
1277
  let userState = 'listening'; // Track user speech state for queue safety
1278
- // Self-echo guard for the TTS interrupt below. Updated by the
1279
- // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
1280
- // user_state_changed carries NO speaker identity (verified against the SDK type
1281
- // — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
1282
- // remote-speaker timestamp is the only way to distinguish "real user spoke" from
1283
- // "agent's own TTS echoed through the mic". Independent producer: rtc-node
1284
- // emits activeSpeakersChanged from server WebRTC audio-level reports
1285
- // (room.js:213), with NO reference to AgentSession or STT — so there's no
1286
- // dependency loop with user_state_changed's STT-driven producer.
1287
- let lastRemoteSpeakerAt = 0;
1288
1278
  let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
1289
1279
  let currentProvider = realtimeConfig.provider; // Track active realtime provider
1290
1280
  // Authenticated Supabase userId from participant metadata. Used to scope
@@ -2132,29 +2122,22 @@ async function main() {
2132
2122
  minDelay: 500, // Wait 500ms after STT commits before generating reply
2133
2123
  maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
2134
2124
  },
2135
- // Echo-driven false-interrupt protection at the SDK level (1.2.x has these knobs,
2136
- // we just never set them — defaults are minDuration:500ms / minWords:0 which let
2137
- // through every short echo blip). Both knobs gate the SDK's internal
2138
- // interruptByAudioActivity() (agent_activity.js runs on Deepgram interim
2139
- // transcripts AND speechDuration updates), which is the path that was firing
2140
- // even after our user_state_changed handler skipped the trigger.
2125
+ // 0.9.57: bump falseInterruptionTimeout from default 2000ms → 3000ms.
2126
+ // This is the silence-after-interrupt window the SDK waits before
2127
+ // emitting agentFalseInterruption + resuming. Extending it gives the
2128
+ // user a fuller breath between low-level audio activity moments to
2129
+ // accumulate a clean silence, which helps when echo or ambient noise
2130
+ // keeps resetting the 2s window. Other tunables in this same block
2131
+ // (defaults shown; minDuration and minWords are also raised in the block below):
2132
+ // - minDuration (default 500ms) — minimum sustained speech to count
2133
+ // - minWords (default 0) — minimum word count in interim transcript
2134
+ // - enabled (default true) — kept ON (auto-interrupt path active)
2135
+ // - resumeFalseInterruption (default true) — auto-resume kept ON
2136
+ // - discardAudioIfUninterruptible (default true)
2141
2137
  interruption: {
2142
- // SDK auto-interrupt fully DISABLED (0.9.55). Even with minDuration:750
2143
- // and minWords:2 in 0.9.54, the SDK's onInterimTranscript path bypasses
2144
- // duration gating (it fires on first interim text) and minWords gates
2145
- // against accumulated transcript wordcount — so once a real user utters
2146
- // ≥2 words, every subsequent echo passes. Worse: double-fires within
2147
- // 200ms corrupt SegmentSynchronizerImpl state (pushAudio called after
2148
- // close → markPlaybackFinished before input done → playback hangs).
2149
- // With enabled:false the SDK won't fire interruptByAudioActivity at all;
2150
- // our user_state_changed handler at index.ts:3162 with the self-echo
2151
- // guard (lastRemoteSpeakerAt + ActiveSpeakersChanged) becomes the SOLE
2152
- // interrupt path. We control timing, deduplication, and identity.
2153
- enabled: false,
2154
- // The values below have no effect with enabled:false but kept for
2155
- // documentation in case enabled is flipped back on for testing.
2156
- minDuration: 750,
2157
- minWords: 2,
2138
+ falseInterruptionTimeout: 3000, // 2000 → 3000 (extra second of silence before resume)
2139
+ minDuration: 1000, // 500 → 1000 (need 1s sustained speech to count)
2140
+ minWords: 3, // 0 → 3 (interim transcript needs ≥3 words)
2158
2141
  },
2159
2142
  },
2160
2143
  });
@@ -2693,20 +2676,6 @@ async function main() {
2693
2676
  // rather than hold it indefinitely. Cancelled in ParticipantConnected.
2694
2677
  armAloneTimer();
2695
2678
  });
2696
- // Self-echo guard producer. Server WebRTC audio-level reports drive this
2697
- // (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
2698
- // Deepgram STT classification, so by the time user_state_changed fires
2699
- // lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
2700
- // — LocalParticipant is the agent itself and including it would defeat the
2701
- // whole point (the echo we're guarding against IS the agent's local audio).
2702
- // This is the speaker-identity filter the removed ActiveSpeakersChanged
2703
- // handler had (May 21 / c345c98) — minus the interrupt() call, since the
2704
- // user_state_changed handler now owns interrupt firing.
2705
- room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2706
- if (speakers.some((s) => s instanceof RemoteParticipant)) {
2707
- lastRemoteSpeakerAt = Date.now();
2708
- }
2709
- });
2710
2679
  // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2711
2680
  // handler that interrupted TTS on any sustained audio activity (~50ms after
2712
2681
  // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
@@ -3072,34 +3041,16 @@ async function main() {
3072
3041
  userState = ev.newState;
3073
3042
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
3074
3043
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
3075
- const now = Date.now();
3076
- // Self-echo guard. Reject this trigger entirely if no remote
3077
- // participant has been heard speaking in the last 500ms — at that
3078
- // point user_state=speaking is almost certainly TTS bleeding through
3079
- // the mic (Deepgram correctly identifies it as "speech", we add the
3080
- // identity filter the high-level event lacks). 500ms is wider than
3081
- // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
3082
- // firing, so a real user is comfortably inside the window.
3083
- //
3084
- // The 1s leading-edge debounce that used to live here was removed in
3085
- // 0.9.54 — the SDK-side `turnHandling.interruption.minDuration:750` +
3086
- // `minWords:2` now do the heavy lifting on echo filtering, and stacking
3087
- // an extra cooldown on top risked masking the SDK's own resume timing.
3088
- if (now - lastRemoteSpeakerAt > 500) {
3089
- console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
3090
- return;
3091
- }
3044
+ // Reverted to the simple post-May-22 (c345c98 / 0.9.39) shape in 0.9.56.
3045
+ // The self-echo guard via lastRemoteSpeakerAt was defeated by the same
3046
+ // physics it was trying to filter: TTS bleeds into the user's mic →
3047
+ // LiveKit registers their participant as a remote speaker → the guard
3048
+ // passes → we interrupt anyway. Verified in osbornojure logs 2026-06-16
3049
+ // (2 of 3 interrupts that session were from this handler firing on echo).
3050
+ // Echo prevention moved to browser AEC on the publisher side.
3092
3051
  try {
3093
- // force:true bypasses the SpeechHandle's allowInterruptions check
3094
- // (speech_handle.js:93-99). Required because turnHandling.interruption.enabled=false
3095
- // sets allowInterruptions=false on every SpeechHandle (agent_activity.js:329-331),
3096
- // which is what blocks the SDK's auto-interrupt path — but without
3097
- // force:true, this manual call from our handler would also throw
3098
- // "This generation handle does not allow interruptions". Combined,
3099
- // they let US interrupt (with self-echo guard already verified above)
3100
- // while keeping the SDK's auto-trigger off.
3101
- console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS (force)');
3102
- currentSession?.interrupt({ force: true });
3052
+ console.log('🎤 user_state_changed=speaking + agent speaking interrupting TTS');
3053
+ currentSession?.interrupt();
3103
3054
  }
3104
3055
  catch (err) {
3105
3056
  console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.55",
3
+ "version": "0.9.58",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {