osborn 0.9.58 → 0.9.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +40 -22
  2. package/package.json +9 -9
package/dist/index.js CHANGED
@@ -2116,28 +2116,48 @@ async function main() {
2116
2116
  const session = new voice.AgentSession({
2117
2117
  turnDetection: 'stt',
2118
2118
  preemptiveGeneration: false, // Only fire LLM on final committed transcript, not partial preemptives
2119
+ // First-line echo defense: drop mic frames from BOTH the recognition stream
2120
+ // and the realtime audio stream for this many ms after the agent first
2121
+ // enters 'speaking' state. STT receives no audio during the warmup → no
2122
+ // interim/final transcripts can fire → echo cannot trigger an interrupt.
2123
+ // 1.4.x default is 3000; bumping to 5000 widens the safe zone at session start.
2124
+ // One-shot per session (NOT re-armed each turn), so this protects only the
2125
+ // first agent response. After that the in-block interruption settings handle it.
2126
+ aecWarmupDuration: 5000,
2127
+ // TTS stall mitigations (0.9.61). The 1.4.x SDK added a 10s default
2128
+ // readIdleTimeout in generation.js:519 (PR livekit/agents-js#1461) — when
2129
+ // the TTS stream goes silent for >10s, it force-closes via reader.cancel()
2130
+ // which trips the OpenAI SDK's AbortSignal → APIUserAbortError →
2131
+ // tts_error recoverable:false. Root cause is upstream: the OpenAI plugin
2132
+ // BUFFERS the entire tts-1 PCM response (arrayBuffer()) before emitting a
2133
+ // single frame. Long sentences intermittently exceed 10s end-to-end with
2134
+ // tts-1. Raising both watchdogs to 30s gives slow OpenAI responses room
2135
+ // to complete; raising maxUnrecoverableErrors from default 3 to 15 prevents
2136
+ // a transient burst of stalls from killing the AgentSession outright (the
2137
+ // counter resets on every successful speaking transition).
2138
+ ttsReadIdleTimeout: 30_000,
2139
+ forwardAudioIdleTimeout: 30_000,
2140
+ connOptions: {
2141
+ maxUnrecoverableErrors: 15,
2142
+ },
2119
2143
  turnHandling: {
2120
2144
  endpointing: {
2121
2145
  mode: 'fixed',
2122
2146
  minDelay: 500, // Wait 500ms after STT commits before generating reply
2123
2147
  maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
2124
2148
  },
2125
- // 0.9.57: bump falseInterruptionTimeout from default 2000ms → 3000ms.
2126
- // This is the silence-after-interrupt window the SDK waits before
2127
- // emitting agentFalseInterruption + resuming. Extending it gives the
2128
- // user a fuller breath between low-level audio activity moments to
2129
- // accumulate a clean silence, which helps when echo or ambient noise
2130
- // keeps resetting the 2s window. Other tunables in this same block
2131
- // (NOT changed yet — try the timeout first, escalate if needed):
2132
- // - minDuration (default 500ms) — minimum sustained speech to count
2133
- // - minWords (default 0) — minimum word count in interim transcript
2134
- // - enabled (default true) — kept ON (auto-interrupt path active)
2135
- // - resumeFalseInterruption (default true) — auto-resume kept ON
2136
- // - discardAudioIfUninterruptible (default true)
2149
+ // 1.4.x SDK fully wires these — minDuration now applies to the STT path
2150
+ // (not just VAD), falseInterruptionTimeout actually fires the
2151
+ // agentFalseInterruption event with auto-resume, discardAudioIfUninterruptible
2152
+ // is checked at runtime. All inert in 1.2.1; live in 1.4.x.
2137
2153
  interruption: {
2138
- falseInterruptionTimeout: 3000, // 2000 → 3000 (extra second of silence before resume)
2139
- minDuration: 1000, // 500 → 1000 (need 1s sustained speech to count)
2140
- minWords: 3, // 0 → 3 (interim transcript needs ≥3 words)
2154
+ // enabled defaults true — kept default (don't set to false; cascades into
2155
+ // allowInterruptions:false which breaks manual interrupt() calls).
2156
+ minDuration: 1000, // 1.4.x: now gates STT-path; require 1s sustained speech
2157
+ minWords: 3, // require ≥3 words in interim transcript
2158
+ falseInterruptionTimeout: 2000, // emit agentFalseInterruption after 2s silence
2159
+ resumeFalseInterruption: true, // auto-resume TTS on false interrupt detection
2160
+ discardAudioIfUninterruptible: true, // drop buffered echo audio
2141
2161
  },
2142
2162
  },
2143
2163
  });
@@ -3041,13 +3061,11 @@ async function main() {
3041
3061
  userState = ev.newState;
3042
3062
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
3043
3063
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
3044
- // Reverted to the simple post-May-22 (c345c98 / 0.9.39) shape in 0.9.56.
3045
- // The self-echo guard via lastRemoteSpeakerAt was defeated by the same
3046
- // physics it was trying to filter — TTS bleeds into the user's mic →
3047
- // LiveKit registers their participant as a remote speaker → the guard
3048
- // passes → we interrupt anyway. Verified in osbornojure logs 2026-06-16
3049
- // (2 of 3 interrupts that session were from this handler firing on echo).
3050
- // Echo prevention moved to browser AEC on the publisher side.
3064
+ // Simple manual interrupt for echo-side defense fallback. With 1.4.x
3065
+ // the SDK's interrupt-by-audio-activity path is properly gated by
3066
+ // turnHandling.interruption.{minDuration, minWords, falseInterruptionTimeout},
3067
+ // and resumeFalseInterruption auto-recovers if echo was misclassified.
3068
+ // This handler stays as a secondary trigger only.
3051
3069
  try {
3052
3070
  console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
3053
3071
  currentSession?.interrupt();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.58",
3
+ "version": "0.9.61",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,14 +33,14 @@
33
33
  "@anthropic-ai/claude-agent-sdk": "^0.2.91",
34
34
  "@anthropic-ai/sdk": "^0.80.0",
35
35
  "@google/genai": "^1.0.0",
36
- "@livekit/agents": "1.2.1",
37
- "@livekit/agents-plugin-deepgram": "1.2.1",
38
- "@livekit/agents-plugin-elevenlabs": "1.2.1",
39
- "@livekit/agents-plugin-google": "1.2.1",
40
- "@livekit/agents-plugin-livekit": "1.2.1",
41
- "@livekit/agents-plugin-openai": "1.2.1",
42
- "@livekit/agents-plugin-silero": "1.2.1",
43
- "@livekit/rtc-node": "0.13.24",
36
+ "@livekit/agents": "1.4.6",
37
+ "@livekit/agents-plugin-deepgram": "1.4.6",
38
+ "@livekit/agents-plugin-elevenlabs": "1.4.6",
39
+ "@livekit/agents-plugin-google": "1.4.6",
40
+ "@livekit/agents-plugin-livekit": "1.4.6",
41
+ "@livekit/agents-plugin-openai": "1.4.6",
42
+ "@livekit/agents-plugin-silero": "1.4.6",
43
+ "@livekit/rtc-node": "0.13.29",
44
44
  "@modelcontextprotocol/sdk": "^1.29.0",
45
45
  "@openai/codex-sdk": "^0.77.0",
46
46
  "@smithery/api": "^0.48.0",