osborn 0.9.53 → 0.9.54

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +24 -26
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -1205,16 +1205,6 @@ async function main() {
1205
1205
  // Session-level always-allow list: paths the user has approved for this session without prompting
1206
1206
  let sessionAlwaysAllowPaths = new Set();
1207
1207
  let userState = 'listening'; // Track user speech state for queue safety
1208
- // Leading-edge debounce for the TTS interrupt below — restores the same
1209
- // anti-flap protection the removed ActiveSpeakersChanged handler had pre-0.9.39
1210
- // (May 21 / c345c98). Wall-clock timestamp + ms compare; no setTimeout, no
1211
- // promise, no new API. Suppresses repeat interrupts within the window so a
1212
- // single user-input transition fires at most one interrupt() call per second.
1213
- // Without it, TTS echo bleeding through the mic causes user_state to oscillate
1214
- // speaking ↔ listening across rapid Deepgram frames, each transition firing a
1215
- // fresh interrupt — and even after 1.4.x's stricter error classification, the
1216
- // first one survives but the cascade kills the session.
1217
- let lastInterruptAt = 0;
1218
1208
  // Self-echo guard for the TTS interrupt below. Updated by the
1219
1209
  // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
1220
1210
  // user_state_changed carries NO speaker identity (verified against the SDK type
@@ -2072,6 +2062,19 @@ async function main() {
2072
2062
  minDelay: 500, // Wait 500ms after STT commits before generating reply
2073
2063
  maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
2074
2064
  },
2065
+ // Echo-driven false-interrupt protection at the SDK level (1.2.x has these knobs,
2066
+ // we just never set them — defaults are minDuration:500ms / minWords:0 which let
2067
+ // through every short echo blip). Both knobs gate the SDK's internal
2068
+ // interruptByAudioActivity() (agent_activity.js — runs on Deepgram interim
2069
+ // transcripts AND speechDuration updates), which is the path that was firing
2070
+ // even after our user_state_changed handler skipped the trigger.
2071
+ interruption: {
2072
+ minDuration: 750, // 500 → 750: require 750ms of sustained audio activity
2073
+ minWords: 2, // 0 → 2: require ≥2 transcript words (filters single-word echo blips)
2074
+ // SDK defaults kept: enabled=true, resumeFalseInterruption=true,
2075
+ // falseInterruptionTimeout=2000ms (that 2s timer is what resumed your audio
2076
+ // exactly where it stopped — confirmed working as designed).
2077
+ },
2075
2078
  },
2076
2079
  });
2077
2080
  return { session, agent };
@@ -2989,33 +2992,28 @@ async function main() {
2989
2992
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2990
2993
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2991
2994
  const now = Date.now();
2992
- // Self-echo guard FIRST. Reject this trigger entirely if no remote
2995
+ // Self-echo guard. Reject this trigger entirely if no remote
2993
2996
  // participant has been heard speaking in the last 500ms — at that
2994
2997
  // point user_state=speaking is almost certainly TTS bleeding through
2995
2998
  // the mic (Deepgram correctly identifies it as "speech", we add the
2996
2999
  // identity filter the high-level event lacks). 500ms is wider than
2997
3000
  // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
2998
3001
  // firing, so a real user is comfortably inside the window.
3002
+ //
3003
+ // The 1s leading-edge debounce that used to live here was removed in
3004
+ // 0.9.54 — the SDK-side `turnHandling.interruption.minDuration:750` +
3005
+ // `minWords:2` now do the heavy lifting on echo filtering, and stacking
3006
+ // an extra cooldown on top risked masking the SDK's own resume timing.
2999
3007
  if (now - lastRemoteSpeakerAt > 500) {
3000
3008
  console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
3001
3009
  return;
3002
3010
  }
3003
- // Leading-edge 1s debounce — verbatim shape of the removed
3004
- // ActiveSpeakersChanged handler's anti-flap (see lastInterruptAt
3005
- // declaration). Belt + suspenders with the self-echo guard above.
3006
- const debounced = now - lastInterruptAt < 1000;
3007
- lastInterruptAt = now;
3008
- if (debounced) {
3009
- console.log('🔇 user-state interrupt debounced (< 1s since last)');
3011
+ try {
3012
+ console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
3013
+ currentSession?.interrupt();
3010
3014
  }
3011
- else {
3012
- try {
3013
- console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
3014
- currentSession?.interrupt();
3015
- }
3016
- catch (err) {
3017
- console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3018
- }
3015
+ catch (err) {
3016
+ console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3019
3017
  }
3020
3018
  }
3021
3019
  // When user stops speaking, retry voice queue — items may be waiting
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.53",
3
+ "version": "0.9.54",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {