osborn 0.9.34 → 0.9.36
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/dist/index.js +65 -21
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent } from '@livekit/rtc-node';
|
|
4
|
+
import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -2492,6 +2492,51 @@ async function main() {
|
|
|
2492
2492
|
console.log('✅ Connected to room:', roomName);
|
|
2493
2493
|
localParticipant = room.localParticipant;
|
|
2494
2494
|
});
|
|
2495
|
+
// EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
|
|
2496
|
+
// server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
|
|
2497
|
+
// after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
|
|
2498
|
+
//
|
|
2499
|
+
// Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant →
|
|
2500
|
+
// if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
|
|
2501
|
+
// The existing handleSpeechDone callback (around line 1320) captures the spoken-text
|
|
2502
|
+
// + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
|
|
2503
|
+
// chat() call to enrich the user's message with [INTERRUPTED] context — so the
|
|
2504
|
+
// post-interrupt note flow is preserved even though we're cutting TTS earlier.
|
|
2505
|
+
//
|
|
2506
|
+
// Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
|
|
2507
|
+
// room, and when its TTS plays it appears in the active-speakers list too. An earlier
|
|
2508
|
+
// attempt that compared `s.identity !== room.localParticipant?.identity` failed because
|
|
2509
|
+
// localParticipant.identity could be undefined at event-fire time, letting the agent's
|
|
2510
|
+
// own speech trigger a self-interrupt. The type check is bulletproof.
|
|
2511
|
+
//
|
|
2512
|
+
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2513
|
+
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2514
|
+
let lastActiveSpeakerInterruptAt = 0;
|
|
2515
|
+
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2516
|
+
if (!Array.isArray(speakers) || speakers.length === 0)
|
|
2517
|
+
return;
|
|
2518
|
+
const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
|
|
2519
|
+
if (remoteSpeakers.length === 0)
|
|
2520
|
+
return;
|
|
2521
|
+
if (currentVoiceMode === 'realtime')
|
|
2522
|
+
return;
|
|
2523
|
+
if (agentState !== 'speaking')
|
|
2524
|
+
return;
|
|
2525
|
+
const now = Date.now();
|
|
2526
|
+
const debounced = now - lastActiveSpeakerInterruptAt < 1000;
|
|
2527
|
+
lastActiveSpeakerInterruptAt = now;
|
|
2528
|
+
try {
|
|
2529
|
+
if (!debounced) {
|
|
2530
|
+
const ids = remoteSpeakers.map((s) => s.identity).join(',');
|
|
2531
|
+
console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
|
|
2532
|
+
}
|
|
2533
|
+
currentSession?.interrupt();
|
|
2534
|
+
}
|
|
2535
|
+
catch (err) {
|
|
2536
|
+
if (!debounced)
|
|
2537
|
+
console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
|
|
2538
|
+
}
|
|
2539
|
+
});
|
|
2495
2540
|
room.on(RoomEvent.Disconnected, () => {
|
|
2496
2541
|
console.log('👋 Disconnected from room');
|
|
2497
2542
|
// Clean up active research and voice queue
|
|
@@ -2784,10 +2829,26 @@ async function main() {
|
|
|
2784
2829
|
setTimeout(() => processVoiceQueue(), 500); // 500ms to let model settle
|
|
2785
2830
|
}
|
|
2786
2831
|
});
|
|
2787
|
-
// User state tracking — prevents queue from colliding with server-side VAD
|
|
2832
|
+
// User state tracking — prevents queue from colliding with server-side VAD.
|
|
2833
|
+
// Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
|
|
2834
|
+
// it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
|
|
2835
|
+
// Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
|
|
2836
|
+
// latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
|
|
2837
|
+
// case the room-level event drops. interrupt() is idempotent on an already-
|
|
2838
|
+
// interrupted SpeechHandle so calling both paths is harmless.
|
|
2788
2839
|
sess.on('user_state_changed', (ev) => {
|
|
2840
|
+
const prev = userState;
|
|
2789
2841
|
userState = ev.newState;
|
|
2790
|
-
console.log(`👤 User state: ${ev.newState}`);
|
|
2842
|
+
console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
|
|
2843
|
+
if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
|
|
2844
|
+
try {
|
|
2845
|
+
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
|
|
2846
|
+
currentSession?.interrupt();
|
|
2847
|
+
}
|
|
2848
|
+
catch (err) {
|
|
2849
|
+
console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
|
|
2850
|
+
}
|
|
2851
|
+
}
|
|
2791
2852
|
// When user stops speaking, retry voice queue — items may be waiting
|
|
2792
2853
|
if (ev.newState === 'listening' && voiceQueue.length > 0) {
|
|
2793
2854
|
setTimeout(() => processVoiceQueue(), 500);
|
|
@@ -3081,27 +3142,10 @@ async function main() {
|
|
|
3081
3142
|
clearInterval(readyInterval);
|
|
3082
3143
|
console.log('✅ agent_ready retries complete');
|
|
3083
3144
|
}, 20000);
|
|
3084
|
-
// Stop agent_ready retries on user speech
|
|
3085
|
-
// Previously the interrupt only fired when STT committed a full transcript (chat()
|
|
3086
|
-
// call), which let the agent talk over the user for the full utterance. Firing it
|
|
3087
|
-
// here cuts TTS the moment VAD detects speech.
|
|
3088
|
-
// Realtime providers (OpenAI/Gemini) handle interruption server-side via their own
|
|
3089
|
-
// VAD — calling interrupt() manually for Gemini specifically crashes its state
|
|
3090
|
-
// machine (code 1008, hangs in 'speaking'), so skip those.
|
|
3145
|
+
// Stop agent_ready retries on user speech
|
|
3091
3146
|
session.on('input_speech_started', () => {
|
|
3092
3147
|
readySent = true;
|
|
3093
3148
|
clearInterval(readyInterval);
|
|
3094
|
-
if (agentState !== 'speaking')
|
|
3095
|
-
return;
|
|
3096
|
-
if (sessionVoiceMode === 'realtime')
|
|
3097
|
-
return;
|
|
3098
|
-
try {
|
|
3099
|
-
console.log('🎤 VAD onset → interrupting agent TTS');
|
|
3100
|
-
currentSession?.interrupt();
|
|
3101
|
-
}
|
|
3102
|
-
catch (err) {
|
|
3103
|
-
console.warn('⚠️ VAD-onset interrupt failed:', err instanceof Error ? err.message : err);
|
|
3104
|
-
}
|
|
3105
3149
|
});
|
|
3106
3150
|
// Greet user via TTS (delayed if resume prompt will be shown)
|
|
3107
3151
|
// For realtime mode: use generateReply() since there's no standalone TTS
|