osborn 0.9.51 → 0.9.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +198 -6
  2. package/package.json +9 -9
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, RemoteParticipant, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -165,6 +165,30 @@ const livekitState = {
165
165
  lastAttemptAt: null,
166
166
  attemptCount: 0,
167
167
  };
168
+ // ── Room-presence lifecycle (2026-06-09) ──────────────────────────────────────
169
+ // The agent used to eager-connect to LiveKit on boot and hold the room for the
170
+ // machine's entire life. With 1 participant (the agent itself), LiveKit never
171
+ // considers the room empty, so it never closes — a single forgotten session
172
+ // burned 25h of connection-minutes (room osborn-jzs94j) before we caught it.
173
+ //
174
+ // Fix: the agent now LEAVES the LiveKit room when no user is present, and only
175
+ // rejoins when a user actually connects. Two triggers, both feeding room.disconnect():
176
+ // 1. Agent-side "alone" timer — armed in ParticipantDisconnected once a real
177
+ // session has ended; if no user rejoins within ALONE_GRACE_MS, the agent
178
+ // leaves on its own. This is tab-close-proof (does not depend on the
179
+ // frontend's JS still running — the exact gap that let the 25h room linger).
180
+ // 2. POST /leave-room — the frontend's explicit "leave" button leaves instantly.
181
+ // Rejoin happens via POST /connect-room (frontend connect flow) which re-runs the
182
+ // connect-with-retry loop.
183
+ //
184
+ // `intentionalLeave` distinguishes a voluntary leave from an involuntary LiveKit
185
+ // eviction. The ghost-agent fix in RoomEvent.Disconnected auto-rejoins on drop;
186
+ // that must NOT fire after a voluntary leave (it would recreate the burn we just
187
+ // stopped). The hooks below are populated by main() (which owns `room` and the
188
+ // connect-with-retry loop) so the module-level HTTP server can drive them.
189
+ let intentionalLeave = false;
190
+ let connectRoomHook = null;
191
+ let leaveRoomHook = null;
168
192
  function startApiServer(workingDir, port) {
169
193
  const server = createServer(async (req, res) => {
170
194
  // CORS headers for cloud frontend
@@ -282,6 +306,32 @@ function startApiServer(workingDir, port) {
282
306
  res.end(JSON.stringify({ roomCode: currentRoomCode }));
283
307
  return;
284
308
  }
309
+ // POST /connect-room — the frontend connect flow calls this so the agent
310
+ // joins LiveKit for an incoming user. Idempotent: if already connected it's
311
+ // a no-op. Must complete (agent in room) BEFORE the user joins, because the
312
+ // voice session is created from the ParticipantConnected event, which only
313
+ // fires for participants who join AFTER the agent. The frontend polls
314
+ // /health for livekit.status==='connected' before minting its token.
315
+ if (req.method === 'POST' && url.pathname === '/connect-room') {
316
+ res.writeHead(200, { 'Content-Type': 'application/json' });
317
+ res.end(JSON.stringify({ ok: true, status: livekitState.status }));
318
+ if (connectRoomHook) {
319
+ connectRoomHook().catch((e) => console.error('❌ /connect-room hook failed:', e));
320
+ }
321
+ return;
322
+ }
323
+ // POST /leave-room — the frontend's explicit "leave"/disconnect leaves the
324
+ // LiveKit room immediately so connection-minute burn stops the instant the
325
+ // user is done (no waiting for the agent-side alone timer). Sets
326
+ // intentionalLeave so the Disconnected handler does NOT auto-rejoin.
327
+ if (req.method === 'POST' && url.pathname === '/leave-room') {
328
+ res.writeHead(200, { 'Content-Type': 'application/json' });
329
+ res.end(JSON.stringify({ ok: true }));
330
+ if (leaveRoomHook) {
331
+ leaveRoomHook('frontend_leave').catch((e) => console.error('❌ /leave-room hook failed:', e));
332
+ }
333
+ return;
334
+ }
285
335
  // POST /restart — graceful process restart (process manager will restart)
286
336
  if (req.method === 'POST' && url.pathname === '/restart') {
287
337
  res.writeHead(200, { 'Content-Type': 'application/json' });
@@ -1069,6 +1119,35 @@ async function main() {
1069
1119
  let currentSession = null;
1070
1120
  let currentAgent = null; // For updateChatCtx() context injection
1071
1121
  let currentLLM = null;
1122
+ // Agent-side "alone in room" leave timer (see Room-presence lifecycle note up
1123
+ // top). Armed in ParticipantDisconnected once a user has left; if no one
1124
+ // rejoins within the grace window the agent leaves LiveKit on its own.
1125
+ // Cancelled in ParticipantConnected. 3 min: long enough to ride out a brief
1126
+ // reconnect (page refresh, network blip), short enough that a forgotten
1127
+ // session costs ~3 min of connection-minutes instead of hours.
1128
+ let aloneTimer = null;
1129
+ const ALONE_GRACE_MS = 3 * 60 * 1000;
1130
+ // Arm (or re-arm) the alone timer: if no remote participant is present, leave
1131
+ // the LiveKit room after the grace window. Called on Connected (covers a
1132
+ // machine woken but then abandoned before the user joined) and on
1133
+ // ParticipantDisconnected (covers a finished session). Cancelled the moment a
1134
+ // user joins. Net invariant: the agent never holds an empty room beyond
1135
+ // ALONE_GRACE_MS, in any scenario — the root cause of the 25h burn.
1136
+ const armAloneTimer = () => {
1137
+ if (aloneTimer)
1138
+ clearTimeout(aloneTimer);
1139
+ aloneTimer = null;
1140
+ if (room.remoteParticipants.size > 0)
1141
+ return;
1142
+ aloneTimer = setTimeout(() => {
1143
+ aloneTimer = null;
1144
+ if (room.remoteParticipants.size === 0 && livekitState.status === 'connected') {
1145
+ console.log(`🕊️ Alone in room ${ALONE_GRACE_MS / 1000}s — leaving LiveKit to stop connection-minute burn`);
1146
+ intentionalLeave = true;
1147
+ room.disconnect().catch((e) => console.error('alone-leave room.disconnect failed:', e));
1148
+ }
1149
+ }, ALONE_GRACE_MS);
1150
+ };
1072
1151
  /**
1073
1152
  * Hard-kill the in-flight Claude SDK query AND the persistent subprocess.
1074
1153
  *
@@ -1126,6 +1205,26 @@ async function main() {
1126
1205
  // Session-level always-allow list: paths the user has approved for this session without prompting
1127
1206
  let sessionAlwaysAllowPaths = new Set();
1128
1207
  let userState = 'listening'; // Track user speech state for queue safety
1208
+ // Leading-edge debounce for the TTS interrupt below — restores the same
1209
+ // anti-flap protection the removed ActiveSpeakersChanged handler had pre-0.9.39
1210
+ // (May 21 / c345c98). Wall-clock timestamp + ms compare; no setTimeout, no
1211
+ // promise, no new API. Suppresses repeat interrupts within the window so a
1212
+ // single user-input transition fires at most one interrupt() call per second.
1213
+ // Without it, TTS echo bleeding through the mic causes user_state to oscillate
1214
+ // speaking ↔ listening across rapid Deepgram frames, each transition firing a
1215
+ // fresh interrupt — and even after 1.4.x's stricter error classification, the
1216
+ // first one survives but the cascade kills the session.
1217
+ let lastInterruptAt = 0;
1218
+ // Self-echo guard for the TTS interrupt below. Updated by the
1219
+ // ActiveSpeakersChanged listener registered near the other room.on(...) handlers.
1220
+ // user_state_changed carries NO speaker identity (verified against the SDK type
1221
+ // — UserStateChangedEvent has only oldState/newState/createdAt), so a separate
1222
+ // remote-speaker timestamp is the only way to distinguish "real user spoke" from
1223
+ // "agent's own TTS echoed through the mic". Independent producer: rtc-node
1224
+ // emits activeSpeakersChanged from server WebRTC audio-level reports
1225
+ // (room.js:213), with NO reference to AgentSession or STT — so there's no
1226
+ // dependency loop with user_state_changed's STT-driven producer.
1227
+ let lastRemoteSpeakerAt = 0;
1129
1228
  let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
1130
1229
  let currentProvider = realtimeConfig.provider; // Track active realtime provider
1131
1230
  // Authenticated Supabase userId from participant metadata. Used to scope
@@ -2505,6 +2604,24 @@ async function main() {
2505
2604
  room.on(RoomEvent.Connected, () => {
2506
2605
  console.log('✅ Connected to room:', roomName);
2507
2606
  localParticipant = room.localParticipant;
2607
+ // Arm the alone timer: if we connected but no user joins within the grace
2608
+ // window (e.g. machine woken then abandoned mid-handshake), leave the room
2609
+ // rather than hold it indefinitely. Cancelled in ParticipantConnected.
2610
+ armAloneTimer();
2611
+ });
2612
+ // Self-echo guard producer. Server WebRTC audio-level reports drive this
2613
+ // (rtc-node room.js:213, ~50-100ms latency from mic onset — faster than
2614
+ // Deepgram STT classification, so by the time user_state_changed fires
2615
+ // lastRemoteSpeakerAt is already current). Filter speakers to RemoteParticipant
2616
+ // — LocalParticipant is the agent itself and including it would defeat the
2617
+ // whole point (the echo we're guarding against IS the agent's local audio).
2618
+ // This is the speaker-identity filter the removed ActiveSpeakersChanged
2619
+ // handler had (May 21 / c345c98) — minus the interrupt() call, since the
2620
+ // user_state_changed handler now owns interrupt firing.
2621
+ room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2622
+ if (speakers.some((s) => s instanceof RemoteParticipant)) {
2623
+ lastRemoteSpeakerAt = Date.now();
2624
+ }
2508
2625
  });
2509
2626
  // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2510
2627
  // handler that interrupted TTS on any sustained audio activity (~50ms after
@@ -2559,6 +2676,20 @@ async function main() {
2559
2676
  //
2560
2677
  // Note: we mark status='retrying' immediately so /health reflects the real
2561
2678
  // state — closing the lie window between Disconnected and the next attempt.
2679
+ // ── Voluntary-leave guard (2026-06-09) ──
2680
+ // If we left the room ON PURPOSE (user clicked leave → /leave-room, or the
2681
+ // agent-side alone timer fired), do NOT auto-rejoin — rejoining would
2682
+ // recreate the connection-minute burn we just stopped. Mark the connection
2683
+ // 'idle' (machine stays warm, /health still 200) and wait for the next
2684
+ // /connect-room. Reset the flag so a later involuntary drop still rejoins.
2685
+ if (intentionalLeave) {
2686
+ intentionalLeave = false;
2687
+ livekitState.status = 'idle';
2688
+ livekitState.error = null;
2689
+ livekitState.errorCode = null;
2690
+ console.log('🕊️ Left LiveKit room intentionally — idle, awaiting /connect-room (no auto-rejoin)');
2691
+ return;
2692
+ }
2562
2693
  livekitState.status = 'retrying';
2563
2694
  livekitState.error = 'LiveKit room disconnected; attempting to rejoin';
2564
2695
  livekitState.errorCode = 'disconnected';
@@ -2569,6 +2700,11 @@ async function main() {
2569
2700
  });
2570
2701
  room.on(RoomEvent.ParticipantConnected, async (participant) => {
2571
2702
  console.log(`\n👤 User joined: ${participant.identity}`);
2703
+ // A user is present — cancel any pending agent-side "alone" leave.
2704
+ if (aloneTimer) {
2705
+ clearTimeout(aloneTimer);
2706
+ aloneTimer = null;
2707
+ }
2572
2708
  // Wait for previous session's byte stream handler to fully deregister.
2573
2709
  // Quick reconnects (< ~6s) crash with "byte stream handler already set" without this.
2574
2710
  if (pendingSessionClose) {
@@ -2852,12 +2988,34 @@ async function main() {
2852
2988
  userState = ev.newState;
2853
2989
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2854
2990
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2855
- try {
2856
- console.log('🎤 user_state_changed=speaking + agent speaking interrupting TTS');
2857
- currentSession?.interrupt();
2991
+ const now = Date.now();
2992
+ // Self-echo guard FIRST. Reject this trigger entirely if no remote
2993
+ // participant has been heard speaking in the last 500ms — at that
2994
+ // point user_state=speaking is almost certainly TTS bleeding through
2995
+ // the mic (Deepgram correctly identifies it as "speech", we add the
2996
+ // identity filter the high-level event lacks). 500ms is wider than
2997
+ // the ~50-300ms gap between ActiveSpeakersChanged and user_state_changed
2998
+ // firing, so a real user is comfortably inside the window.
2999
+ if (now - lastRemoteSpeakerAt > 500) {
3000
+ console.log('🔇 Skipping interrupt — no recent remote-speaker activity (self-echo guard)');
3001
+ return;
2858
3002
  }
2859
- catch (err) {
2860
- console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3003
+ // Leading-edge 1s debounce — verbatim shape of the removed
3004
+ // ActiveSpeakersChanged handler's anti-flap (see lastInterruptAt
3005
+ // declaration). Belt + suspenders with the self-echo guard above.
3006
+ const debounced = now - lastInterruptAt < 1000;
3007
+ lastInterruptAt = now;
3008
+ if (debounced) {
3009
+ console.log('🔇 user-state interrupt debounced (< 1s since last)');
3010
+ }
3011
+ else {
3012
+ try {
3013
+ console.log('🎤 user_state_changed=speaking + agent speaking + remote-speaker confirmed → interrupting TTS');
3014
+ currentSession?.interrupt();
3015
+ }
3016
+ catch (err) {
3017
+ console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
3018
+ }
2861
3019
  }
2862
3020
  }
2863
3021
  // When user stops speaking, retry voice queue — items may be waiting
@@ -3291,6 +3449,10 @@ async function main() {
3291
3449
  activeMeetingBotId = null;
3292
3450
  }
3293
3451
  }
3452
+ // Arm the agent-side "alone" leave timer (tab-close-proof — runs on the
3453
+ // agent, not the frontend, so it fires even if the user closed the tab
3454
+ // without clicking leave).
3455
+ armAloneTimer();
3294
3456
  console.log('⏳ Waiting for new user...\n');
3295
3457
  });
3296
3458
  room.on(RoomEvent.DataReceived, async (payload, participant, kind, topic) => {
@@ -4092,6 +4254,36 @@ async function main() {
4092
4254
  }
4093
4255
  }
4094
4256
  };
4257
+ // Wire the module-level HTTP control hooks now that `room` and the
4258
+ // connect-with-retry loop exist (see Room-presence lifecycle note up top).
4259
+ // The /connect-room and /leave-room endpoints in startApiServer call these.
4260
+ connectRoomHook = async () => {
4261
+ intentionalLeave = false;
4262
+ if (aloneTimer) {
4263
+ clearTimeout(aloneTimer);
4264
+ aloneTimer = null;
4265
+ }
4266
+ if (livekitState.status === 'connected')
4267
+ return; // already in the room — no-op
4268
+ console.log('🔌 /connect-room — joining LiveKit for incoming user');
4269
+ await connectWithRetry();
4270
+ };
4271
+ leaveRoomHook = async (reason) => {
4272
+ if (aloneTimer) {
4273
+ clearTimeout(aloneTimer);
4274
+ aloneTimer = null;
4275
+ }
4276
+ if (livekitState.status !== 'connected')
4277
+ return; // already out — no-op
4278
+ intentionalLeave = true;
4279
+ console.log(`🚪 Leaving LiveKit room (${reason}) — stops connection-minute burn`);
4280
+ try {
4281
+ await room.disconnect();
4282
+ }
4283
+ catch (e) {
4284
+ console.error('leave-room room.disconnect failed:', e);
4285
+ }
4286
+ };
4095
4287
  // Fire and forget; the retry loop keeps the process alive on its own (so
4096
4288
  // we don't need the explicit `new Promise(() => {})` keepalive anymore).
4097
4289
  // Errors that escape the retry loop should never happen, but if they do,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.51",
3
+ "version": "0.9.53",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,14 +33,14 @@
33
33
  "@anthropic-ai/claude-agent-sdk": "^0.2.91",
34
34
  "@anthropic-ai/sdk": "^0.80.0",
35
35
  "@google/genai": "^1.0.0",
36
- "@livekit/agents": "^1.2.1",
37
- "@livekit/agents-plugin-deepgram": "^1.2.1",
38
- "@livekit/agents-plugin-elevenlabs": "^1.2.1",
39
- "@livekit/agents-plugin-google": "^1.2.1",
40
- "@livekit/agents-plugin-livekit": "^1.2.1",
41
- "@livekit/agents-plugin-openai": "^1.2.1",
42
- "@livekit/agents-plugin-silero": "^1.2.1",
43
- "@livekit/rtc-node": "^0.13.24",
36
+ "@livekit/agents": "1.2.1",
37
+ "@livekit/agents-plugin-deepgram": "1.2.1",
38
+ "@livekit/agents-plugin-elevenlabs": "1.2.1",
39
+ "@livekit/agents-plugin-google": "1.2.1",
40
+ "@livekit/agents-plugin-livekit": "1.2.1",
41
+ "@livekit/agents-plugin-openai": "1.2.1",
42
+ "@livekit/agents-plugin-silero": "1.2.1",
43
+ "@livekit/rtc-node": "0.13.24",
44
44
  "@modelcontextprotocol/sdk": "^1.29.0",
45
45
  "@openai/codex-sdk": "^0.77.0",
46
46
  "@smithery/api": "^0.48.0",