npm - osborn - Versions diffs - 0.9.38 → 0.9.40 - Mend

osborn 0.9.38 → 0.9.40

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (4)

package/dist/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 // Load environment variables FIRST before any other imports
 import 'dotenv/config';
 import { voice, initializeLogger } from '@livekit/agents';
-import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
+import { Room, RoomEvent } from '@livekit/rtc-node';
 import { AccessToken } from 'livekit-server-sdk';
 // Initialize logger before anything else
 initializeLogger({ pretty: true, level: 'info' });
@@ -1224,28 +1224,46 @@ async function main() {
     let lastCompletedResearch = null;
     // No manual queuing — the Claude SDK handles sequential queries internally
     // ============================================================
-    // Recall.ai — Meeting Transcript Routing
+    // Recall.ai — Meeting Transcript Listener
     // ============================================================
+    // NOTE: LLM-forwarding via Recall webhook STT was DISABLED in the Phase 2
+    // LiveKit-based meeting-bot migration. Reason: Recall sends transcripts as
+    // sentence-level fragments (e.g. "transcript.data" events fire ~once per
+    // sentence). The old code below called currentLLM.chat() PER FRAGMENT, which
+    // meant the agent fired ~10 chat() calls during a single user utterance —
+    // each one prompting a separate response. The agent ended up speaking over
+    // itself answering partial fragments.
+    //
+    // Phase 2 routes meeting audio through LiveKit instead (see
+    // frontend/src/app/meeting-bot/page.tsx). The agent's existing Deepgram Flux
+    // STT processes that audio via end-of-turn detection — ONE chat() call per
+    // actual completed utterance, no fragment storms.
+    //
+    // We keep the listener registered so we have a hook for future work (e.g.
+    // forwarding the live transcript to the frontend chat panel as a read-only
+    // "what was said in the meeting" display, separate from the LLM input path).
     const recall = getRecallClient();
     if (recall) {
-        console.log('🎥 Recall.ai client initialized (RECALL_API_KEY present)');
+        console.log('🎥 Recall.ai client initialized (webhook STT receiver — LLM forwarding disabled, see meeting-bot Phase 2)');
         recall.on('transcript', ({ botId, speaker, text }) => {
             console.log(`📝 Meeting transcript [${speaker}]: ${text}`);
-            // Route meeting transcripts to Claude as user text with speaker attribution
-            if (currentLLM && currentSession) {
-                const meetingText = `[Meeting — ${speaker}]: ${text}`;
-                // Use the same pipeline as user_text data channel messages
-                try {
-                    if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
-                        const chatCtx = new llm.ChatContext();
-                        chatCtx.addMessage({ role: 'user', content: meetingText });
-                        currentLLM.chat({ chatCtx });
-                    }
-                }
-                catch (err) {
-                    console.error('❌ Failed to route meeting transcript:', err);
-                }
-            }
+            // INTENTIONALLY DISABLED — see comment above. Audio path is now LiveKit
+            // → meeting-bot page publishes meeting audio → agent STT processes it.
+            // The line below is preserved as a reference for future re-enablement
+            // (e.g. as a display-only feature, NOT as LLM input).
+            //
+            // if (currentLLM && currentSession) {
+            //   const meetingText = `[Meeting — ${speaker}]: ${text}`
+            //   try {
+            //     if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
+            //       const chatCtx = new llm.ChatContext()
+            //       chatCtx.addMessage({ role: 'user', content: meetingText })
+            //       ;(currentLLM as any).chat({ chatCtx })
+            //     }
+            //   } catch (err) {
+            //     console.error('❌ Failed to route meeting transcript:', err)
+            //   }
+            // }
         });
     }
     // ============================================================
@@ -1656,9 +1674,34 @@ async function main() {
             skipTTSQueue: true,
             onCompactionEvent: (event) => {
                 try {
-                    // Forward every field — frontend renders stage + detail + skill list during compaction.
-                    // Spread covers compaction_started/progress/complete (different fields per type).
+                    // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
                     sendToFrontend({ ...event });
+                    // ALSO emit as a claude_output chat bubble — reuses the existing message path
+                    // that's already working end-to-end. PreCompact → in-progress bubble.
+                    // PostCompact → completion bubble with the skills summary. The dedicated
+                    // banner has been unreliable in production (data path works on backend, banner
+                    // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
+                    // are visible without dev tools.
+                    if (event.type === 'compaction_started') {
+                        const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
+                        sendToFrontend({
+                            type: 'claude_output',
+                            text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
+                            agentRole: 'direct',
+                        });
+                    }
+                    else if (event.type === 'compaction_complete') {
+                        const ev = event;
+                        const n = ev.skillsWritten ?? 0;
+                        const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
+                            ? ` — ${ev.skillNames.join(', ')}`
+                            : '';
+                        sendToFrontend({
+                            type: 'claude_output',
+                            text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
+                            agentRole: 'direct',
+                        });
+                    }
                 }
                 catch { /* non-fatal */ }
             },
@@ -1862,14 +1905,17 @@ async function main() {
             const sayId = Date.now(); // simple ID to correlate start/end logs
             console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
             // Forward spoken text + audio to meeting output page when bot is in a meeting.
-            // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
-            // so voice/provider stays consistent — no separate hardcoded provider.
+            // Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
+            // previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
+            // (Deepgram aura-2-asteria-en) when no user config exists, producing a different
+            // voice in the meeting than what the user hears in voice-native. Both paths now
+            // share the single source of truth.
             // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
             // Recall captures the browser page's audio output and injects it into the meeting.
             if (activeMeetingBotId) {
                 sendToMeetingOutput({ type: 'speak', text: data.text });
                 if (meetingOutputWs) {
-                    synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
+                    synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
                 }
             }
             try {
@@ -2011,9 +2057,34 @@ async function main() {
             resumeSessionId,
             onCompactionEvent: (event) => {
                 try {
-                    // Forward every field — frontend renders stage + detail + skill list during compaction.
-                    // Spread covers compaction_started/progress/complete (different fields per type).
+                    // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
                     sendToFrontend({ ...event });
+                    // ALSO emit as a claude_output chat bubble — reuses the existing message path
+                    // that's already working end-to-end. PreCompact → in-progress bubble.
+                    // PostCompact → completion bubble with the skills summary. The dedicated
+                    // banner has been unreliable in production (data path works on backend, banner
+                    // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
+                    // are visible without dev tools.
+                    if (event.type === 'compaction_started') {
+                        const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
+                        sendToFrontend({
+                            type: 'claude_output',
+                            text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
+                            agentRole: 'direct',
+                        });
+                    }
+                    else if (event.type === 'compaction_complete') {
+                        const ev = event;
+                        const n = ev.skillsWritten ?? 0;
+                        const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
+                            ? ` — ${ev.skillNames.join(', ')}`
+                            : '';
+                        sendToFrontend({
+                            type: 'claude_output',
+                            text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
+                            agentRole: 'direct',
+                        });
+                    }
                 }
                 catch { /* non-fatal */ }
             },
@@ -2530,51 +2601,16 @@ async function main() {
         console.log('✅ Connected to room:', roomName);
         localParticipant = room.localParticipant;
     });
-    // EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
-    // server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
-    // after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
-    //
-    // Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant →
-    // if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
-    // The existing handleSpeechDone callback (around line 1320) captures the spoken-text
-    // + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
-    // chat() call to enrich the user's message with [INTERRUPTED] context — so the
-    // post-interrupt note flow is preserved even though we're cutting TTS earlier.
-    //
-    // Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
-    // room, and when its TTS plays it appears in the active-speakers list too. An earlier
-    // attempt that compared `s.identity !== room.localParticipant?.identity` failed because
-    // localParticipant.identity could be undefined at event-fire time, letting the agent's
-    // own speech trigger a self-interrupt. The type check is bulletproof.
-    //
-    // Realtime mode skipped — the SDK handles interruption internally there, and manual
-    // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
-    let lastActiveSpeakerInterruptAt = 0;
-    room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
-        if (!Array.isArray(speakers) || speakers.length === 0)
-            return;
-        const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
-        if (remoteSpeakers.length === 0)
-            return;
-        if (currentVoiceMode === 'realtime')
-            return;
-        if (agentState !== 'speaking')
-            return;
-        const now = Date.now();
-        const debounced = now - lastActiveSpeakerInterruptAt < 1000;
-        lastActiveSpeakerInterruptAt = now;
-        try {
-            if (!debounced) {
-                const ids = remoteSpeakers.map((s) => s.identity).join(',');
-                console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
-            }
-            currentSession?.interrupt();
-        }
-        catch (err) {
-            if (!debounced)
-                console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
-        }
-    });
+    // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
+    // handler that interrupted TTS on any sustained audio activity (~50ms after
+    // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
+    // own TTS bleeding through the mic, and other non-speech sounds tripped it
+    // ~10-15% of the time, leaving the agent silent with no recovery path
+    // (because no STT transcript would follow). Dropped in favor of the
+    // user_state_changed → 'speaking' handler below, which is fed by Deepgram
+    // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
+    // confidence-aware. The latency tradeoff is worth eliminating the false
+    // interrupts at the root.
     room.on(RoomEvent.Disconnected, () => {
         console.log('👋 Disconnected from room');
         // Clean up active research and voice queue
@@ -2868,19 +2904,20 @@ async function main() {
                 }
             });
             // User state tracking — prevents queue from colliding with server-side VAD.
-            // Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
-            // it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
-            // Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
-            // latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
-            // case the room-level event drops. interrupt() is idempotent on an already-
-            // interrupted SpeechHandle so calling both paths is harmless.
+            // Also the PRIMARY interrupt trigger now that the over-eager ActiveSpeakersChanged
+            // path is gone. Fires when Deepgram Flux STT classifies frames as speech (not noise)
+            // and propagates via agent_activity.onStartOfSpeech → _updateUserState('speaking').
+            // Latency ~100-300ms after mic onset, which is the cost of confidence-aware
+            // detection — vs the prior ActiveSpeakers handler that fired at ~50ms on any audio
+            // activity and tripped ~10-15% false interrupts on coughs, paper rustle, agent's
+            // own TTS bleeding through the mic, etc.
             sess.on('user_state_changed', (ev) => {
                 const prev = userState;
                 userState = ev.newState;
                 console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
                 if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
                     try {
-                        console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
+                        console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
                         currentSession?.interrupt();
                     }
                     catch (err) {
@@ -3889,8 +3926,57 @@ async function main() {
                                 (process.env.FLY_APP_NAME
                                     ? `https://${process.env.FLY_APP_NAME}.fly.dev`
                                     : `http://localhost:${apiPort}`);
+                            // Try to mint a LiveKit bot token + construct the frontend-hosted
+                            // meeting-bot page URL. The bot page joins the same LiveKit room
+                            // as this agent so meeting audio flows through LiveKit directly
+                            // (no agent-side WebSocket+WAV pipe). Falls back to the legacy
+                            // /meeting-output webpage if no frontend URL is resolvable, so
+                            // the old code path keeps working during the migration window.
+                            //
+                            // Frontend URL resolution (in priority order):
+                            //   1. data.frontendBase — the public URL the user's browser is on,
+                            //      passed through the join_meeting data channel message. Works
+                            //      automatically for localhost dev + production without any
+                            //      env var.
+                            //   2. OSBORN_FRONTEND_URL — existing convention from sprites.ts
+                            //      (frontend/src/lib/sprites.ts:241) that injects the public
+                            //      frontend URL into sandbox env vars. Defense in depth.
+                            //
+                            // Auth: the endpoint uses LiveKit room-presence as the auth check
+                            // — no shared secret needed. The agent must already be in the
+                            // requested room (which it is by this point) for the mint to
+                            // succeed.
+                            let outputPageUrl;
+                            const frontendUrl = data.frontendBase
+                                || process.env.OSBORN_FRONTEND_URL;
+                            if (frontendUrl) {
+                                try {
+                                    const botLkId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
+                                    const tokenRes = await fetch(`${frontendUrl}/api/meeting-bot-token`, {
+                                        method: 'POST',
+                                        headers: { 'Content-Type': 'application/json' },
+                                        body: JSON.stringify({ botId: botLkId, roomName }),
+                                    });
+                                    if (tokenRes.ok) {
+                                        const { token, url } = await tokenRes.json();
+                                        const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
+                                        outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
+                                        console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
+                                    }
+                                    else {
+                                        const errText = await tokenRes.text().catch(() => '');
+                                        console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
+                                    }
+                                }
+                                catch (mintErr) {
+                                    console.warn(`⚠️ meeting-bot-token mint threw — falling back: ${mintErr.message}`);
+                                }
+                            }
+                            else {
+                                console.log('ℹ️ No frontend URL (data.frontendBase + OSBORN_FRONTEND_URL both empty) — using legacy /meeting-output path');
+                            }
                             await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
-                            const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
+                            const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase, { outputPageUrl });
                             const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
                             recallJoin.registerBot(botId, sessionId);
                             activeMeetingBotId = botId;

package/dist/recall-client.d.ts CHANGED Viewed

@@ -36,7 +36,23 @@ export interface TranscriptPayload {
 export declare class RecallClient extends EventEmitter {
     #private;
     constructor(apiKey: string);
-    joinMeeting(meetingUrl: string, webhookBaseUrl: string, botName?: string): Promise<string>;
+    /**
+     * Join a meeting via Recall.ai.
+     *
+     * @param meetingUrl       Zoom / Google Meet / Teams URL the bot should dial in to
+     * @param webhookBaseUrl   Base URL for the agent's HTTP endpoints (transcript webhook)
+     * @param opts.outputPageUrl  Full URL for the bot's camera/audio page. If provided,
+     *                            replaces the default `${webhookBaseUrl}/meeting-output`.
+     *                            Used to point at the frontend-hosted /meeting-bot page
+     *                            with token + room embedded as query params, so the page
+     *                            connects to LiveKit and audio flows through the same
+     *                            room as the osborn agent (no separate WebSocket+WAV pipe).
+     * @param opts.botName     Display name of the bot in the meeting
+     */
+    joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
+        outputPageUrl?: string;
+        botName?: string;
+    }): Promise<string>;
     leaveMeeting(botId: string): Promise<void>;
     getBotStatus(botId: string): Promise<string>;
     handleWebhook(payload: TranscriptPayload): void;

package/dist/recall-client.js CHANGED Viewed

@@ -8,7 +8,22 @@ export class RecallClient extends EventEmitter {
         super();
         this.#apiKey = apiKey;
     }
-    async joinMeeting(meetingUrl, webhookBaseUrl, botName = 'Osborn') {
+    /**
+     * Join a meeting via Recall.ai.
+     *
+     * @param meetingUrl       Zoom / Google Meet / Teams URL the bot should dial in to
+     * @param webhookBaseUrl   Base URL for the agent's HTTP endpoints (transcript webhook)
+     * @param opts.outputPageUrl  Full URL for the bot's camera/audio page. If provided,
+     *                            replaces the default `${webhookBaseUrl}/meeting-output`.
+     *                            Used to point at the frontend-hosted /meeting-bot page
+     *                            with token + room embedded as query params, so the page
+     *                            connects to LiveKit and audio flows through the same
+     *                            room as the osborn agent (no separate WebSocket+WAV pipe).
+     * @param opts.botName     Display name of the bot in the meeting
+     */
+    async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
+        const botName = opts?.botName ?? 'Osborn';
+        const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
         // Authoritative structure per https://docs.recall.ai/reference/bot_create
         // and https://docs.recall.ai/docs/real-time-transcription:
         //
@@ -49,10 +64,13 @@ export class RecallClient extends EventEmitter {
                 output_media: {
                     camera: {
                         // `kind` (not `type`) — confirmed from prior debugging.
-                        // Output webpage plays TTS audio so meeting participants can hear the agent.
+                        // The page Recall renders is responsible for joining the same LiveKit
+                        // room as the osborn agent: meeting audio captured via getUserMedia is
+                        // published into the room; osborn's TTS audio (already in the room) is
+                        // played by the page and captured by Recall as the bot's mic output.
                         kind: 'webpage',
                         config: {
-                            url: `${webhookBaseUrl}/meeting-output`,
+                            url: outputPageUrl,
                         },
                     },
                 },
@@ -63,7 +81,7 @@ export class RecallClient extends EventEmitter {
             throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
         }
         const bot = (await res.json());
-        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id}`);
+        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
         return bot.id;
     }
     async leaveMeeting(botId) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.38",
+  "version": "0.9.40",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {