osborn 0.9.61 → 0.9.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +94 -35
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -2113,51 +2113,57 @@ async function main() {
2113
2113
  tts,
2114
2114
  turnDetection: 'stt',
2115
2115
  });
2116
+ // 0.9.62: REVERT to the AgentSession config that was deployed during the
2117
+ // user's known-good month (0.9.52, Jun 09). Pre-48h evidence shows the
2118
+ // explicit interruption block introduced in 0.9.60 + the timer bumps in
2119
+ // 0.9.61 made things WORSE, not better — osbornojure logs showed 5+
2120
+ // consecutive TTS stalls on a single TTS-say, each one re-triggering
2121
+ // because the underlying pause-and-resume deadlock (workflow finding:
2122
+ // waitUntilTimeout signal-blind, audioOutput.pause without _currentSpeech.interrupt,
2123
+ // captureFrame parked on playbackEnabledFuture) is INHERENT to the
2124
+ // 1.4.x pause path and our tuned thresholds (minDuration: 1000, minWords: 3)
2125
+ // simply make each rare-but-deadlocking trigger more catastrophic.
2126
+ //
2127
+ // Stripped back to SDK defaults for every interrupt-related knob. SDK
2128
+ // 1.4.6 defaults (aecWarmupDuration: 3000, minDuration: 500, minWords: 0,
2129
+ // falseInterruptionTimeout: 2000, resumeFalseInterruption: true,
2130
+ // discardAudioIfUninterruptible: true, ttsReadIdleTimeout: 10000,
2131
+ // maxUnrecoverableErrors: 3) are what was silently running via caret-resolved
2132
+ // 1.4.5 throughout the user's working month. Restoring them.
2116
2133
  const session = new voice.AgentSession({
2117
2134
  turnDetection: 'stt',
2118
2135
  preemptiveGeneration: false, // Only fire LLM on final committed transcript, not partial preemptives
2119
- // First-line echo defense: drop mic frames from BOTH the recognition stream
2120
- // and the realtime audio stream for this many ms after the agent first
2121
- // enters 'speaking' state. STT receives no audio during the warmup → no
2122
- // interim/final transcripts can fire → echo cannot trigger an interrupt.
2123
- // 1.4.x default is 3000; bumping to 5000 widens the safe zone at session start.
2124
- // One-shot per session (NOT re-armed each turn), so this protects only the
2125
- // first agent response. After that the in-block interruption settings handle it.
2126
- aecWarmupDuration: 5000,
2127
- // TTS stall mitigations (0.9.61). The 1.4.x SDK added a 10s default
2128
- // readIdleTimeout in generation.js:519 (PR livekit/agents-js#1461) — when
2129
- // the TTS stream goes silent for >10s, it force-closes via reader.cancel()
2130
- // which trips the OpenAI SDK's AbortSignal → APIUserAbortError →
2131
- // tts_error recoverable:false. Root cause is upstream: the OpenAI plugin
2132
- // BUFFERS the entire tts-1 PCM response (arrayBuffer()) before emitting a
2133
- // single frame. Long sentences intermittently exceed 10s end-to-end with
2134
- // tts-1. Raising both watchdogs to 30s gives slow OpenAI responses room
2135
- // to complete; raising maxUnrecoverableErrors from default 3 to 15 prevents
2136
- // a transient burst of stalls from killing the AgentSession outright (the
2137
- // counter resets on every successful speaking transition).
2138
- ttsReadIdleTimeout: 30_000,
2139
- forwardAudioIdleTimeout: 30_000,
2140
- connOptions: {
2141
- maxUnrecoverableErrors: 15,
2142
- },
2136
+ // Commented out — kept for reference. These were added across 0.9.60/0.9.61
2137
+ // to try to harden interrupt + TTS handling, but evidence (osbornojure
2138
+ // 2026-06-16/17 logs + the interrupt-stall workflow) showed they made
2139
+ // things worse: tighter gates concentrated the rare-but-deadlocking pause
2140
+ // path triggers into longer events that the SDK's signal-blind read loop
2141
+ // (utils.js:624 waitUntilTimeout) couldn't recover from. Defaults from
2142
+ // SDK 1.4.6 (matching what silently ran via caret-resolved 1.4.5 throughout
2143
+ // the user's last-working month) are restored by leaving these unset.
2144
+ //
2145
+ // aecWarmupDuration: 5000, // default 3000
2146
+ // ttsReadIdleTimeout: 30_000, // default 10000
2147
+ // forwardAudioIdleTimeout: 30_000, // default 10000
2148
+ // connOptions: {
2149
+ // maxUnrecoverableErrors: 15, // default 3
2150
+ // },
2143
2151
  turnHandling: {
2144
2152
  endpointing: {
2145
2153
  mode: 'fixed',
2146
2154
  minDelay: 500, // Wait 500ms after STT commits before generating reply
2147
2155
  maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
2148
2156
  },
2149
- // 1.4.x SDK fully wires these — minDuration now applies to the STT path
2150
- // (not just VAD), falseInterruptionTimeout actually fires the
2151
- // agentFalseInterruption event with auto-resume, discardAudioIfUninterruptible
2152
- // is checked at runtime. All inert in 1.2.1; live in 1.4.x.
2157
+ // Tightened gates: only commit to the pause path when the STT layer is
2158
+ // confident this is real speech, not echo. Once paused, give the user
2159
+ // a full 3s window to keep talking before deciding it was false and
2160
+ // resuming. Other two knobs left at SDK defaults.
2153
2161
  interruption: {
2154
- // enabled defaults true — kept default (don't set to false; cascades into
2155
- // allowInterruptions:false which breaks manual interrupt() calls).
2156
- minDuration: 1000, // 1.4.x: now gates STT-path; require 1s sustained speech
2157
- minWords: 3, // require ≥3 words in interim transcript
2158
- falseInterruptionTimeout: 2000, // emit agentFalseInterruption after 2s silence
2159
- resumeFalseInterruption: true, // auto-resume TTS on false interrupt detection
2160
- discardAudioIfUninterruptible: true, // drop buffered echo audio
2162
+ minDuration: 2000, // default 500 — require 2s sustained speech
2163
+ minWords: 3, // default 0 — require ≥3 transcript words
2164
+ falseInterruptionTimeout: 3000, // default 2000 — wait 3s before auto-resume
2165
+ // resumeFalseInterruption: true, // default true (unchanged)
2166
+ // discardAudioIfUninterruptible: true,// default true (unchanged)
2161
2167
  },
2162
2168
  },
2163
2169
  });
@@ -3079,6 +3085,59 @@ async function main() {
3079
3085
  setTimeout(() => processVoiceQueue(), 500);
3080
3086
  }
3081
3087
  });
3088
+ // ============================================================
3089
+ // Interrupt-debug instrumentation (0.9.63) — log every SDK event
3090
+ // that touches the pause/resume + transcript path so we can correlate
3091
+ // a "TTS stream stalled" or visible cutoff to the exact transcript
3092
+ // text + timing that triggered it.
3093
+ //
3094
+ // The events below are emitted by AgentSession in @livekit/agents 1.4.6.
3095
+ // Each line prints with a wall-clock timestamp so it can be cross-referenced
3096
+ // against the WARN/ERROR lines from the SDK itself.
3097
+ // ============================================================
3098
+ // user_input_transcribed — the actual transcript Deepgram emitted.
3099
+ // Fires for BOTH interim and final transcripts. This is the smoking-gun
3100
+ // log for false interrupts: if echo bleeds through and Deepgram transcribes
3101
+ // a 1-2 word fragment, you'll see it here a fraction of a second before
3102
+ // user_state_changed=speaking or the SDK fires interruptByAudioActivity.
3103
+ sess.on('user_input_transcribed', (ev) => {
3104
+ const t = ev.transcript ?? '';
3105
+ const isFinal = !!ev.isFinal;
3106
+ const words = t.trim().split(/\s+/).filter(Boolean).length;
3107
+ const tag = isFinal ? '📝 FINAL' : '✏️ interim';
3108
+ console.log(`${tag} transcript (${words}w, ${t.length}c) [${new Date().toISOString()}]: "${t.slice(0, 120)}${t.length > 120 ? '…' : ''}"`);
3109
+ });
3110
+ // overlapping_speech — SDK detected user audio while agent was speaking.
3111
+ // This is the moment the pause path fires (before any interrupt() call).
3112
+ sess.on('overlapping_speech', (ev) => {
3113
+ console.log(`🔁 OVERLAPPING SPEECH detected [${new Date().toISOString()}]:`, JSON.stringify({
3114
+ type: ev.type,
3115
+ isInterruption: ev.isInterruption,
3116
+ interruptedAt: ev.interruptedAt,
3117
+ // Whatever else SDK provides — dump it all for now
3118
+ fields: Object.keys(ev),
3119
+ }));
3120
+ });
3121
+ // agent_false_interruption — the SDK's "actually that was a false alarm,
3122
+ // resuming TTS" event. Fires falseInterruptionTimeout after a pause.
3123
+ // resumed:true means the TTS audio was resumed cleanly; resumed:false
3124
+ // means resume was attempted but blocked (canPause check, etc.) — the
3125
+ // canonical signal for our deadlock scenario.
3126
+ sess.on('agent_false_interruption', (ev) => {
3127
+ console.log(`✅ AGENT FALSE INTERRUPTION [${new Date().toISOString()}]:`, JSON.stringify({
3128
+ resumed: ev.resumed,
3129
+ createdAt: ev.createdAt,
3130
+ }));
3131
+ });
3132
+ // speech_created — every time TTS audio is queued. Lets us correlate
3133
+ // a speech-handle id back to the transcript that triggered it.
3134
+ sess.on('speech_created', (ev) => {
3135
+ console.log(`🗣️ SPEECH CREATED [${new Date().toISOString()}]:`, JSON.stringify({
3136
+ speechId: ev.speechHandle?.id,
3137
+ source: ev.source,
3138
+ userInitiated: ev.userInitiated,
3139
+ }));
3140
+ });
3082
3141
  // FALLBACK: playout_completed
3083
3142
  sess.on('playout_completed', (ev) => {
3084
3143
  const message = ev.message || ev.text || ev.content;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.61",
3
+ "version": "0.9.63",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {