osborn 0.9.55 → 0.9.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +27 -74
- package/package.json +9 -9
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
|
|
4
|
+
import { Room, RoomEvent, } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -1275,16 +1275,6 @@ async function main() {
|
|
|
1275
1275
|
// Session-level always-allow list: paths the user has approved for this session without prompting
|
|
1276
1276
|
let sessionAlwaysAllowPaths = new Set();
|
|
1277
1277
|
let userState = 'listening'; // Track user speech state for queue safety
|
|
1278
|
-
// Self-echo guard for the TTS interrupt below. Updated by the
|
|
1279
|
-
// ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
|
|
1280
|
-
// user_state_changed carries NO speaker identity (verified against the SDK type
|
|
1281
|
-
// — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
|
|
1282
|
-
// remote-speaker timestamp is the only way to distinguish "real user spoke" from
|
|
1283
|
-
// "agent's own TTS echoed through the mic". Independent producer: rtc-node
|
|
1284
|
-
// emits activeSpeakersChanged from server WebRTC audio-level reports
|
|
1285
|
-
// (room.js:213), with NO reference to AgentSession or STT — so there's no
|
|
1286
|
-
// dependency loop with user_state_changed's STT-driven producer.
|
|
1287
|
-
let lastRemoteSpeakerAt = 0;
|
|
1288
1278
|
let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
|
|
1289
1279
|
let currentProvider = realtimeConfig.provider; // Track active realtime provider
|
|
1290
1280
|
// Authenticated Supabase userId from participant metadata. Used to scope
|
|
@@ -2126,35 +2116,32 @@ async function main() {
|
|
|
2126
2116
|
const session = new voice.AgentSession({
|
|
2127
2117
|
turnDetection: 'stt',
|
|
2128
2118
|
preemptiveGeneration: false, // Only fire LLM on final committed transcript, not partial preemptives
|
|
2119
|
+
// First-line echo defense: drop mic frames from BOTH the recognition stream
|
|
2120
|
+
// and the realtime audio stream for this many ms after the agent first
|
|
2121
|
+
// enters 'speaking' state. STT receives no audio during the warmup → no
|
|
2122
|
+
// interim/final transcripts can fire → echo cannot trigger an interrupt.
|
|
2123
|
+
// 1.4.x default is 3000; bumping to 5000 widens the safe zone at session start.
|
|
2124
|
+
// One-shot per session (NOT re-armed each turn), so this protects only the
|
|
2125
|
+
// first agent response. After that the in-block interruption settings handle it.
|
|
2126
|
+
aecWarmupDuration: 5000,
|
|
2129
2127
|
turnHandling: {
|
|
2130
2128
|
endpointing: {
|
|
2131
2129
|
mode: 'fixed',
|
|
2132
2130
|
minDelay: 500, // Wait 500ms after STT commits before generating reply
|
|
2133
2131
|
maxDelay: 2000, // Force end-of-turn after 2s to prevent hangs
|
|
2134
2132
|
},
|
|
2135
|
-
//
|
|
2136
|
-
//
|
|
2137
|
-
//
|
|
2138
|
-
//
|
|
2139
|
-
// transcripts AND speechDuration updates), which is the path that was firing
|
|
2140
|
-
// even after our user_state_changed handler skipped the trigger.
|
|
2133
|
+
// 1.4.x SDK fully wires these — minDuration now applies to the STT path
|
|
2134
|
+
// (not just VAD), falseInterruptionTimeout actually fires the
|
|
2135
|
+
// agentFalseInterruption event with auto-resume, discardAudioIfUninterruptible
|
|
2136
|
+
// is checked at runtime. All inert in 1.2.1; live in 1.4.x.
|
|
2141
2137
|
interruption: {
|
|
2142
|
-
//
|
|
2143
|
-
//
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
//
|
|
2148
|
-
|
|
2149
|
-
// With enabled:false the SDK won't fire interruptByAudioActivity at all;
|
|
2150
|
-
// our user_state_changed handler at index.ts:3162 with the self-echo
|
|
2151
|
-
// guard (lastRemoteSpeakerAt + ActiveSpeakersChanged) becomes the SOLE
|
|
2152
|
-
// interrupt path. We control timing, deduplication, and identity.
|
|
2153
|
-
enabled: false,
|
|
2154
|
-
// The values below have no effect with enabled:false but kept for
|
|
2155
|
-
// documentation in case enabled is flipped back on for testing.
|
|
2156
|
-
minDuration: 750,
|
|
2157
|
-
minWords: 2,
|
|
2138
|
+
// enabled defaults true — kept default (don't set to false; cascades into
|
|
2139
|
+
// allowInterruptions:false which breaks manual interrupt() calls).
|
|
2140
|
+
minDuration: 1000, // 1.4.x: now gates STT-path; require 1s sustained speech
|
|
2141
|
+
minWords: 3, // require ≥3 words in interim transcript
|
|
2142
|
+
falseInterruptionTimeout: 2000, // emit agentFalseInterruption after 2s silence
|
|
2143
|
+
resumeFalseInterruption: true, // auto-resume TTS on false interrupt detection
|
|
2144
|
+
discardAudioIfUninterruptible: true, // drop buffered echo audio
|
|
2158
2145
|
},
|
|
2159
2146
|
},
|
|
2160
2147
|
});
|
|
@@ -2693,20 +2680,6 @@ async function main() {
|
|
|
2693
2680
|
// rather than hold it indefinitely. Cancelled in ParticipantConnected.
|
|
2694
2681
|
armAloneTimer();
|
|
2695
2682
|
});
|
|
2696
|
-
// Self-echo guard producer. Server WebRTC audio-level reports drive this
|
|
2697
|
-
// (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
|
|
2698
|
-
// Deepgram STT classification, so by the time user_state_changed fires
|
|
2699
|
-
// lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
|
|
2700
|
-
// — LocalParticipant is the agent itself and including it would defeat the
|
|
2701
|
-
// whole point (the echo we're guarding against IS the agent's local audio).
|
|
2702
|
-
// This is the speaker-identity filter the removed ActiveSpeakersChanged
|
|
2703
|
-
// handler had (May 21 / c345c98) — minus the interrupt() call, since the
|
|
2704
|
-
// user_state_changed handler now owns interrupt firing.
|
|
2705
|
-
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2706
|
-
if (speakers.some((s) => s instanceof RemoteParticipant)) {
|
|
2707
|
-
lastRemoteSpeakerAt = Date.now();
|
|
2708
|
-
}
|
|
2709
|
-
});
|
|
2710
2683
|
// NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
|
|
2711
2684
|
// handler that interrupted TTS on any sustained audio activity (~50ms after
|
|
2712
2685
|
// mic onset). That fired too eagerly — coughs, paper rustles, the agent's
|
|
@@ -3072,34 +3045,14 @@ async function main() {
|
|
|
3072
3045
|
userState = ev.newState;
|
|
3073
3046
|
console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
|
|
3074
3047
|
if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
|
|
3075
|
-
|
|
3076
|
-
//
|
|
3077
|
-
//
|
|
3078
|
-
//
|
|
3079
|
-
//
|
|
3080
|
-
// identity filter the high-level event lacks). 500ms is wider than
|
|
3081
|
-
// the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
|
|
3082
|
-
// firing, so a real user is comfortably inside the window.
|
|
3083
|
-
//
|
|
3084
|
-
// The 1s leading-edge debounce that used to live here was removed in
|
|
3085
|
-
// 0.9.54 — the SDK-side `turnHandling.interruption.minDuration:750` +
|
|
3086
|
-
// `minWords:2` now do the heavy lifting on echo filtering, and stacking
|
|
3087
|
-
// an extra cooldown on top risked masking the SDK's own resume timing.
|
|
3088
|
-
if (now - lastRemoteSpeakerAt > 500) {
|
|
3089
|
-
console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
|
|
3090
|
-
return;
|
|
3091
|
-
}
|
|
3048
|
+
// Simple manual interrupt for echo-side defense fallback. With 1.4.x
|
|
3049
|
+
// the SDK's interrupt-by-audio-activity path is properly gated by
|
|
3050
|
+
// turnHandling.interruption.{minDuration, minWords, falseInterruptionTimeout},
|
|
3051
|
+
// and resumeFalseInterruption auto-recovers if echo was misclassified.
|
|
3052
|
+
// This handler stays as a secondary trigger only.
|
|
3092
3053
|
try {
|
|
3093
|
-
|
|
3094
|
-
|
|
3095
|
-
// sets allowInterruptions=false on every SpeechHandle (agent_activity.js:329-331),
|
|
3096
|
-
// which is what blocks the SDK's auto-interrupt path — but without
|
|
3097
|
-
// force:true, this manual call from our handler would also throw
|
|
3098
|
-
// "This generation handle does not allow interruptions". Combined,
|
|
3099
|
-
// they let US interrupt (with self-echo guard already verified above)
|
|
3100
|
-
// while keeping the SDK's auto-trigger off.
|
|
3101
|
-
console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS (force)');
|
|
3102
|
-
currentSession?.interrupt({ force: true });
|
|
3054
|
+
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
|
|
3055
|
+
currentSession?.interrupt();
|
|
3103
3056
|
}
|
|
3104
3057
|
catch (err) {
|
|
3105
3058
|
console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "osborn",
|
|
3
|
-
"version": "0.9.55",
|
|
3
|
+
"version": "0.9.60",
|
|
4
4
|
"description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -33,14 +33,14 @@
|
|
|
33
33
|
"@anthropic-ai/claude-agent-sdk": "^0.2.91",
|
|
34
34
|
"@anthropic-ai/sdk": "^0.80.0",
|
|
35
35
|
"@google/genai": "^1.0.0",
|
|
36
|
-
"@livekit/agents": "1.
|
|
37
|
-
"@livekit/agents-plugin-deepgram": "1.
|
|
38
|
-
"@livekit/agents-plugin-elevenlabs": "1.
|
|
39
|
-
"@livekit/agents-plugin-google": "1.
|
|
40
|
-
"@livekit/agents-plugin-livekit": "1.
|
|
41
|
-
"@livekit/agents-plugin-openai": "1.
|
|
42
|
-
"@livekit/agents-plugin-silero": "1.
|
|
43
|
-
"@livekit/rtc-node": "0.13.
|
|
36
|
+
"@livekit/agents": "1.4.6",
|
|
37
|
+
"@livekit/agents-plugin-deepgram": "1.4.6",
|
|
38
|
+
"@livekit/agents-plugin-elevenlabs": "1.4.6",
|
|
39
|
+
"@livekit/agents-plugin-google": "1.4.6",
|
|
40
|
+
"@livekit/agents-plugin-livekit": "1.4.6",
|
|
41
|
+
"@livekit/agents-plugin-openai": "1.4.6",
|
|
42
|
+
"@livekit/agents-plugin-silero": "1.4.6",
|
|
43
|
+
"@livekit/rtc-node": "0.13.29",
|
|
44
44
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
45
45
|
"@openai/codex-sdk": "^0.77.0",
|
|
46
46
|
"@smithery/api": "^0.48.0",
|