osborn 0.9.38 → 0.9.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +19 -53
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
|
|
4
|
+
import { Room, RoomEvent } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -2530,51 +2530,16 @@ async function main() {
|
|
|
2530
2530
|
console.log('✅ Connected to room:', roomName);
|
|
2531
2531
|
localParticipant = room.localParticipant;
|
|
2532
2532
|
});
|
|
2533
|
-
//
|
|
2534
|
-
//
|
|
2535
|
-
//
|
|
2536
|
-
//
|
|
2537
|
-
//
|
|
2538
|
-
//
|
|
2539
|
-
//
|
|
2540
|
-
//
|
|
2541
|
-
//
|
|
2542
|
-
//
|
|
2543
|
-
//
|
|
2544
|
-
// Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
|
|
2545
|
-
// room, and when its TTS plays it appears in the active-speakers list too. An earlier
|
|
2546
|
-
// attempt that compared `s.identity !== room.localParticipant?.identity` failed because
|
|
2547
|
-
// localParticipant.identity could be undefined at event-fire time, letting the agent's
|
|
2548
|
-
// own speech trigger a self-interrupt. The type check is bulletproof.
|
|
2549
|
-
//
|
|
2550
|
-
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2551
|
-
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2552
|
-
let lastActiveSpeakerInterruptAt = 0;
|
|
2553
|
-
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2554
|
-
if (!Array.isArray(speakers) || speakers.length === 0)
|
|
2555
|
-
return;
|
|
2556
|
-
const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
|
|
2557
|
-
if (remoteSpeakers.length === 0)
|
|
2558
|
-
return;
|
|
2559
|
-
if (currentVoiceMode === 'realtime')
|
|
2560
|
-
return;
|
|
2561
|
-
if (agentState !== 'speaking')
|
|
2562
|
-
return;
|
|
2563
|
-
const now = Date.now();
|
|
2564
|
-
const debounced = now - lastActiveSpeakerInterruptAt < 1000;
|
|
2565
|
-
lastActiveSpeakerInterruptAt = now;
|
|
2566
|
-
try {
|
|
2567
|
-
if (!debounced) {
|
|
2568
|
-
const ids = remoteSpeakers.map((s) => s.identity).join(',');
|
|
2569
|
-
console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
|
|
2570
|
-
}
|
|
2571
|
-
currentSession?.interrupt();
|
|
2572
|
-
}
|
|
2573
|
-
catch (err) {
|
|
2574
|
-
if (!debounced)
|
|
2575
|
-
console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
|
|
2576
|
-
}
|
|
2577
|
-
});
|
|
2533
|
+
// NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
|
|
2534
|
+
// handler that interrupted TTS on any sustained audio activity (~50ms after
|
|
2535
|
+
// mic onset). That fired too eagerly — coughs, paper rustles, the agent's
|
|
2536
|
+
// own TTS bleeding through the mic, and other non-speech sounds tripped it
|
|
2537
|
+
// ~10-15% of the time, leaving the agent silent with no recovery path
|
|
2538
|
+
// (because no STT transcript would follow). Dropped in favor of the
|
|
2539
|
+
// user_state_changed → 'speaking' handler below, which is fed by Deepgram
|
|
2540
|
+
// Flux STT's speech-vs-noise classification: slower (~100-300ms) but
|
|
2541
|
+
// confidence-aware. The latency tradeoff is worth eliminating the false
|
|
2542
|
+
// interrupts at the root.
|
|
2578
2543
|
room.on(RoomEvent.Disconnected, () => {
|
|
2579
2544
|
console.log('👋 Disconnected from room');
|
|
2580
2545
|
// Clean up active research and voice queue
|
|
@@ -2868,19 +2833,20 @@ async function main() {
|
|
|
2868
2833
|
}
|
|
2869
2834
|
});
|
|
2870
2835
|
// User state tracking — prevents queue from colliding with server-side VAD.
|
|
2871
|
-
// Also
|
|
2872
|
-
//
|
|
2873
|
-
//
|
|
2874
|
-
//
|
|
2875
|
-
//
|
|
2876
|
-
//
|
|
2836
|
+
// Also the PRIMARY interrupt trigger now that the over-eager ActiveSpeakersChanged
|
|
2837
|
+
// path is gone. Fires when Deepgram Flux STT classifies frames as speech (not noise)
|
|
2838
|
+
// and propagates via agent_activity.onStartOfSpeech → _updateUserState('speaking').
|
|
2839
|
+
// Latency ~100-300ms after mic onset, which is the cost of confidence-aware
|
|
2840
|
+
// detection — vs the prior ActiveSpeakers handler that fired at ~50ms on any audio
|
|
2841
|
+
// activity and tripped ~10-15% false interrupts on coughs, paper rustle, agent's
|
|
2842
|
+
// own TTS bleeding through the mic, etc.
|
|
2877
2843
|
sess.on('user_state_changed', (ev) => {
|
|
2878
2844
|
const prev = userState;
|
|
2879
2845
|
userState = ev.newState;
|
|
2880
2846
|
console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
|
|
2881
2847
|
if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
|
|
2882
2848
|
try {
|
|
2883
|
-
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS
|
|
2849
|
+
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
|
|
2884
2850
|
currentSession?.interrupt();
|
|
2885
2851
|
}
|
|
2886
2852
|
catch (err) {
|