osborn 0.9.33 → 0.9.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +72 -3
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -2492,6 +2492,51 @@ async function main() {
2492
2492
  console.log('✅ Connected to room:', roomName);
2493
2493
  localParticipant = room.localParticipant;
2494
2494
  });
2495
+ // EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
2496
+ // server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
2497
+ // after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
2498
+ // Same signal the LiveKit room uses to identify active speakers, so it's tuned for
2499
+ // real speech and ignores low-level noise.
2500
+ //
2501
+ // Flow: user starts talking → ActiveSpeakersChanged includes the remote participant →
2502
+ // if agent is currently speaking → interrupt the SpeechHandle to flush TTS.
2503
+ // The existing handleSpeechDone callback captures the spoken-text + JSONL context
2504
+ // (lastInterruption) and PipelineDirectLLM consumes it on the next chat() call to
2505
+ // enrich the user's message with [INTERRUPTED] context — so the post-interrupt
2506
+ // note flow is preserved even though we're interrupting earlier.
2507
+ //
2508
+ // Realtime mode skipped — the SDK handles interruption internally there, and manual
2509
+ // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
2510
+ let lastActiveSpeakerInterruptAt = 0;
2511
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2512
+ if (currentVoiceMode === 'realtime')
2513
+ return;
2514
+ if (agentState !== 'speaking')
2515
+ return;
2516
+ const localIdentity = room.localParticipant?.identity;
2517
+ const remoteSpeaking = Array.isArray(speakers) && speakers.some((s) => s && s.identity && s.identity !== localIdentity);
2518
+ if (!remoteSpeaking)
2519
+ return;
2520
+ // Debounce: avoid log spam when audio level oscillates above/below threshold.
2521
+ // interrupt() itself is idempotent on an already-interrupted SpeechHandle, but
2522
+ // we suppress duplicate logs within 1s.
2523
+ const now = Date.now();
2524
+ if (now - lastActiveSpeakerInterruptAt < 1000) {
2525
+ try {
2526
+ currentSession?.interrupt();
2527
+ }
2528
+ catch { }
2529
+ return;
2530
+ }
2531
+ lastActiveSpeakerInterruptAt = now;
2532
+ try {
2533
+ console.log('🎤 ActiveSpeakersChanged: remote speaker + agent speaking → interrupting TTS');
2534
+ currentSession?.interrupt();
2535
+ }
2536
+ catch (err) {
2537
+ console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
2538
+ }
2539
+ });
2495
2540
  room.on(RoomEvent.Disconnected, () => {
2496
2541
  console.log('👋 Disconnected from room');
2497
2542
  // Clean up active research and voice queue
@@ -2784,10 +2829,31 @@ async function main() {
2784
2829
  setTimeout(() => processVoiceQueue(), 500); // 500ms to let model settle
2785
2830
  }
2786
2831
  });
2787
- // User state tracking — prevents queue from colliding with server-side VAD
2832
+ // User state tracking — prevents queue from colliding with server-side VAD.
2833
+ // ALSO: interrupt the agent's TTS the moment Deepgram STT says the user is speaking.
2834
+ // Why here: in STT pipeline mode without a local VAD, the SDK's own auto-interrupt
2835
+ // (interruptByAudioActivity, agent_activity.js:651) is dead because it only fires
2836
+ // from onVADInferenceDone. The STT path (Deepgram START_OF_SPEECH) reaches us via
2837
+ // agent_activity.onStartOfSpeech → _updateUserState('speaking') → this event. That
2838
+ // is the earliest "user is speaking, not noise" signal we get without bringing back
2839
+ // a local VAD. interrupt() drains the currentSpeech + speech queue, killing TTS
2840
+ // playback in-flight. handleSpeechDone still captures the spoken-text + JSONL
2841
+ // context, consumed by PipelineDirectLLM on the next chat() call.
2842
+ // Realtime mode skipped — the SDK handles interruption internally there, and manual
2843
+ // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
2788
2844
  sess.on('user_state_changed', (ev) => {
2845
+ const prev = userState;
2789
2846
  userState = ev.newState;
2790
- console.log(`👤 User state: ${ev.newState}`);
2847
+ console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2848
+ if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2849
+ try {
2850
+ console.log('🎤 User started speaking while agent was speaking → interrupting TTS');
2851
+ currentSession?.interrupt();
2852
+ }
2853
+ catch (err) {
2854
+ console.warn('⚠️ user-onset interrupt failed:', err instanceof Error ? err.message : err);
2855
+ }
2856
+ }
2791
2857
  // When user stops speaking, retry voice queue — items may be waiting
2792
2858
  if (ev.newState === 'listening' && voiceQueue.length > 0) {
2793
2859
  setTimeout(() => processVoiceQueue(), 500);
@@ -3081,7 +3147,10 @@ async function main() {
3081
3147
  clearInterval(readyInterval);
3082
3148
  console.log('✅ agent_ready retries complete');
3083
3149
  }, 20000);
3084
- // Stop agent_ready retries on user speech
3150
+ // Stop agent_ready retries on user speech.
3151
+ // NB: input_speech_started is realtime-only — the SDK never emits it in STT pipeline
3152
+ // mode. In pipeline mode, speech onset is handled by user_state_changed → 'speaking'
3153
+ // (and RoomEvent.ActiveSpeakersChanged), both wired earlier in this file. Don't add interrupt logic here.
3085
3154
  session.on('input_speech_started', () => {
3086
3155
  readySent = true;
3087
3156
  clearInterval(readyInterval);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.33",
3
+ "version": "0.9.35",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {