osborn 0.9.35 → 0.9.36
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those changes as they appear in the respective public registries.
- package/dist/index.js +34 -42
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent } from '@livekit/rtc-node';
|
|
4
|
+
import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -2495,46 +2495,46 @@ async function main() {
|
|
|
2495
2495
|
// EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
|
|
2496
2496
|
// server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
|
|
2497
2497
|
// after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
|
|
2498
|
-
// Same signal the LiveKit room uses to identify active speakers, so it's tuned for
|
|
2499
|
-
// real speech and ignores low-level noise.
|
|
2500
2498
|
//
|
|
2501
|
-
// Flow: user starts talking → ActiveSpeakersChanged includes
|
|
2502
|
-
// if agent is currently speaking → interrupt the SpeechHandle to flush TTS.
|
|
2503
|
-
// The existing handleSpeechDone callback captures the spoken-text
|
|
2504
|
-
//
|
|
2505
|
-
// enrich the user's message with [INTERRUPTED] context — so the
|
|
2506
|
-
// note flow is preserved even though we're
|
|
2499
|
+
// Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant →
|
|
2500
|
+
// if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
|
|
2501
|
+
// The existing handleSpeechDone callback (around line 1320) captures the spoken-text
|
|
2502
|
+
// + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
|
|
2503
|
+
// chat() call to enrich the user's message with [INTERRUPTED] context — so the
|
|
2504
|
+
// post-interrupt note flow is preserved even though we're cutting TTS earlier.
|
|
2505
|
+
//
|
|
2506
|
+
// Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
|
|
2507
|
+
// room, and when its TTS plays it appears in the active-speakers list too. An earlier
|
|
2508
|
+
// attempt that compared `s.identity !== room.localParticipant?.identity` failed because
|
|
2509
|
+
// localParticipant.identity could be undefined at event-fire time, letting the agent's
|
|
2510
|
+
// own speech trigger a self-interrupt. The type check does not rely on identity at all.
|
|
2507
2511
|
//
|
|
2508
2512
|
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2509
2513
|
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2510
2514
|
let lastActiveSpeakerInterruptAt = 0;
|
|
2511
2515
|
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2516
|
+
if (!Array.isArray(speakers) || speakers.length === 0)
|
|
2517
|
+
return;
|
|
2518
|
+
const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
|
|
2519
|
+
if (remoteSpeakers.length === 0)
|
|
2520
|
+
return;
|
|
2512
2521
|
if (currentVoiceMode === 'realtime')
|
|
2513
2522
|
return;
|
|
2514
2523
|
if (agentState !== 'speaking')
|
|
2515
2524
|
return;
|
|
2516
|
-
const localIdentity = room.localParticipant?.identity;
|
|
2517
|
-
const remoteSpeaking = Array.isArray(speakers) && speakers.some((s) => s && s.identity && s.identity !== localIdentity);
|
|
2518
|
-
if (!remoteSpeaking)
|
|
2519
|
-
return;
|
|
2520
|
-
// Debounce: avoid log spam when audio level oscillates above/below threshold.
|
|
2521
|
-
// interrupt() itself is idempotent on an already-interrupted SpeechHandle, but
|
|
2522
|
-
// we suppress duplicate logs within 1s.
|
|
2523
2525
|
const now = Date.now();
|
|
2524
|
-
|
|
2525
|
-
try {
|
|
2526
|
-
currentSession?.interrupt();
|
|
2527
|
-
}
|
|
2528
|
-
catch { }
|
|
2529
|
-
return;
|
|
2530
|
-
}
|
|
2526
|
+
const debounced = now - lastActiveSpeakerInterruptAt < 1000;
|
|
2531
2527
|
lastActiveSpeakerInterruptAt = now;
|
|
2532
2528
|
try {
|
|
2533
|
-
|
|
2529
|
+
if (!debounced) {
|
|
2530
|
+
const ids = remoteSpeakers.map((s) => s.identity).join(',');
|
|
2531
|
+
console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
|
|
2532
|
+
}
|
|
2534
2533
|
currentSession?.interrupt();
|
|
2535
2534
|
}
|
|
2536
2535
|
catch (err) {
|
|
2537
|
-
|
|
2536
|
+
if (!debounced)
|
|
2537
|
+
console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
|
|
2538
2538
|
}
|
|
2539
2539
|
});
|
|
2540
2540
|
room.on(RoomEvent.Disconnected, () => {
|
|
@@ -2830,28 +2830,23 @@ async function main() {
|
|
|
2830
2830
|
}
|
|
2831
2831
|
});
|
|
2832
2832
|
// User state tracking — prevents queue from colliding with server-side VAD.
|
|
2833
|
-
//
|
|
2834
|
-
//
|
|
2835
|
-
//
|
|
2836
|
-
//
|
|
2837
|
-
//
|
|
2838
|
-
//
|
|
2839
|
-
// a local VAD. interrupt() drains the currentSpeech + speech queue, killing TTS
|
|
2840
|
-
// playback in-flight. handleSpeechDone still captures the spoken-text + JSONL
|
|
2841
|
-
// context, consumed by PipelineDirectLLM on the next chat() call.
|
|
2842
|
-
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2843
|
-
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2833
|
+
// Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
|
|
2834
|
+
// it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
|
|
2835
|
+
// Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
|
|
2836
|
+
// latency vs LiveKit's ~50-100ms audio-level VAD) but acts as a redundant fallback in
|
|
2837
|
+
// case the room-level event drops. interrupt() is idempotent on an already-
|
|
2838
|
+
// interrupted SpeechHandle so calling both paths is harmless.
|
|
2844
2839
|
sess.on('user_state_changed', (ev) => {
|
|
2845
2840
|
const prev = userState;
|
|
2846
2841
|
userState = ev.newState;
|
|
2847
2842
|
console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
|
|
2848
2843
|
if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
|
|
2849
2844
|
try {
|
|
2850
|
-
console.log('🎤
|
|
2845
|
+
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
|
|
2851
2846
|
currentSession?.interrupt();
|
|
2852
2847
|
}
|
|
2853
2848
|
catch (err) {
|
|
2854
|
-
console.warn('⚠️ user-
|
|
2849
|
+
console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
|
|
2855
2850
|
}
|
|
2856
2851
|
}
|
|
2857
2852
|
// When user stops speaking, retry voice queue — items may be waiting
|
|
@@ -3147,10 +3142,7 @@ async function main() {
|
|
|
3147
3142
|
clearInterval(readyInterval);
|
|
3148
3143
|
console.log('✅ agent_ready retries complete');
|
|
3149
3144
|
}, 20000);
|
|
3150
|
-
// Stop agent_ready retries on user speech
|
|
3151
|
-
// NB: input_speech_started is realtime-only — the SDK never emits it in STT pipeline
|
|
3152
|
-
// mode. The earliest onset signal in pipeline mode is user_state_changed → 'speaking',
|
|
3153
|
-
// wired further down. Don't add interrupt logic here.
|
|
3145
|
+
// Stop agent_ready retries on user speech
|
|
3154
3146
|
session.on('input_speech_started', () => {
|
|
3155
3147
|
readySent = true;
|
|
3156
3148
|
clearInterval(readyInterval);
|