npm - osborn - Versions diffs - 0.9.42 → 0.9.43 - Mend

osborn 0.9.42 → 0.9.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/.claude/skills/meetings/SKILL.md +73 -0
package/dist/index.js +111 -527
package/dist/meeting-transcript-poller.d.ts +60 -0
package/dist/meeting-transcript-poller.js +112 -0
package/dist/recall-client.d.ts +49 -2
package/dist/recall-client.js +41 -64
package/package.json +2 -2

package/.claude/skills/meetings/SKILL.md ADDED Viewed

@@ -0,0 +1,73 @@
+# Skill: Meetings
+Silent note-taking and TODO-tracking when osborn is sitting in a live meeting.
+## When to use
+When a user message arrives with the prefix `[MEETING — <botId>]:` (every ~30 seconds while a Recall.ai meeting bot is active). Also use this skill when the orchestrating system injects `[SYSTEM] You are now in a meeting ...`.
+**Do NOT use this skill** for normal user messages in the voice-native chat — those still get spoken responses as usual.
+## How to behave
+For every `[MEETING — *]:` message:
+1. **Do NOT speak.** No TTS output. No `tts_say`. No conversational reply. This is a silent observer path. If you must acknowledge that you processed the message, do it via a Write/Edit tool call (writing to the workspace), not via spoken or chat output.
+2. **Update `meeting-todos.md`** in the session workspace (`{workspace}/meeting-todos.md`). Append new action items, decisions, and open questions as they emerge in the transcript. Do not rewrite existing entries unless later transcript content contradicts them.
+3. **Optionally trigger background research silently.** If a topic in the meeting would benefit from a quick web/code lookup, dispatch a researcher sub-agent via the Task tool. Save its output to `{workspace}/library/meeting-research-<topic-slug>.md`. Do NOT speak the result.
+4. **Do not consume voice-native attention.** The user can still talk to you via the voice-native browser. When they do (a normal user message with no `[MEETING — *]` prefix), respond normally — speak. Treat the meeting transcript as background context they can ask about ("what did Sarah say about pricing?" → answer normally).
+## The `meeting-todos.md` file
+Keep it scannable. Structure:
+```markdown
+# Meeting Notes
+**Bot:** <botId> · **Started:** <ISO timestamp>
+## TODOs
+- [ ] <person>: <action item> — <context>
+- [ ] <person>: <action item>
+## Decisions
+- <date/time> — <what was decided> (raised by <person>)
+## Open Questions
+- <question> — raised by <person>, still unresolved
+- <question> — answered by <person>: <answer>
+## Highlights
+- <key moment or quote worth surfacing>
+```
+Update the same file across multiple poll cycles — don't create `meeting-todos-1.md`, `meeting-todos-2.md`. One file, evolving.
+## Workspace path
+The session workspace is `~/.claude/projects/<slug>/osb/<session-uuid>/`. Read the env variable or the spec.md header if you need to confirm the exact path. Write absolute paths in tool calls (e.g. `/Users/<user>/.claude/projects/.../osb/<uuid>/meeting-todos.md`).
+## On meeting end
+When the user leaves the meeting (the system stops sending `[MEETING — *]:` messages and may inject `[SYSTEM] meeting ended`), do a final pass on `meeting-todos.md` to:
+- Mark items the user has clearly committed to
+- Move resolved open questions to a `## Resolved` section
+- Add a `## Summary` section at the top with 3-5 lines distilling the meeting
+Stay silent during this final pass. The user will ask out loud if they want a recap.
+## When the user asks about the meeting
+When a non-meeting-tagged message references the meeting ("what's on the todo list?", "what did we decide about X?", "who's handling Y?"), respond normally — speak. Read `meeting-todos.md` first to ground the response. Don't make up speaker names or decisions; only state what's recorded.
+## Anti-patterns
+- ❌ Speaking in response to a `[MEETING — *]:` message
+- ❌ Creating a new file per poll cycle instead of updating one
+- ❌ Trying to drive the meeting (don't add "we should..." items unless someone in the meeting said them)
+- ❌ Asking the user clarifying questions during the meeting — they're not paying attention to chat
+- ❌ Re-transcribing what's in the message into the TODO file verbatim. Distill.

package/dist/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 // Load environment variables FIRST before any other imports
 import 'dotenv/config';
 import { voice, initializeLogger } from '@livekit/agents';
-import { Room, RoomEvent, AudioSource, AudioFrame, LocalAudioTrack, TrackPublishOptions, TrackSource, } from '@livekit/rtc-node';
+import { Room, RoomEvent, } from '@livekit/rtc-node';
 import { AccessToken } from 'livekit-server-sdk';
 // Initialize logger before anything else
 initializeLogger({ pretty: true, level: 'info' });
@@ -10,7 +10,6 @@ initializeLogger({ pretty: true, level: 'info' });
 import { setMaxListeners } from 'node:events';
 setMaxListeners(50);
 import { createServer } from 'http';
-import { WebSocket, WebSocketServer } from 'ws';
 import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
 import { dirname, join } from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -34,6 +33,7 @@ import { askHaiku, askFastBrain, updateSpecFromJSONL, processResearchCompletion,
 import { DIRECT_MODE_PROMPT, getRealtimeInstructions, getScriptInjection, getProactiveInjection, getNotificationInjection } from './prompts.js';
 import { MCP_CATALOG } from './config.js';
 import { getRecallClient } from './recall-client.js';
+import { MeetingTranscriptPoller } from './meeting-transcript-poller.js';
 import { llm } from '@livekit/agents';
 import { z } from 'zod';
 // ============================================================
@@ -147,79 +147,6 @@ process.on('uncaughtException', (error) => {
 // ============================================================
 // Module-level room code so the HTTP server can expose it via GET /room-code
 let currentRoomCode = null;
-// Meeting output WebSocket — module-level so both startApiServer and main() can access it
-let meetingOutputWs = null;
-// Module-level AgentSession reference so /meeting-audio-in WS handler can switch
-// the RoomIO-linked participant when meeting audio starts/stops (B2 design).
-let activeAgentSession = null;
-// Identity of the local user participant the session was originally listening to
-// — captured at the moment we switch to the meeting publisher, restored on cleanup.
-let preMeetingUserIdentity = null;
-function sendToMeetingOutput(msg) {
-    if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
-        try {
-            meetingOutputWs.send(JSON.stringify(msg));
-        }
-        catch { }
-    }
-}
-// Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
-// Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
-async function synthesizeForMeeting(text, ttsConfig) {
-    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN)
-        return;
-    const ttsInstance = createTTS(ttsConfig);
-    try {
-        const chunks = [];
-        let sampleRate = 24000;
-        let numChannels = 1;
-        const stream = ttsInstance.synthesize(text);
-        for await (const event of stream) {
-            if (event === Symbol.for('END_OF_STREAM'))
-                break;
-            const e = event;
-            if (e?.frame?.data) {
-                chunks.push(e.frame.data);
-                sampleRate = e.frame.sampleRate ?? sampleRate;
-                numChannels = e.frame.numChannels ?? numChannels;
-            }
-        }
-        if (chunks.length === 0)
-            return;
-        const totalSamples = chunks.reduce((s, c) => s + c.length, 0);
-        const pcm = new Int16Array(totalSamples);
-        let offset = 0;
-        for (const c of chunks) {
-            pcm.set(c, offset);
-            offset += c.length;
-        }
-        // WAV header (44 bytes) + PCM data
-        const dataBytes = pcm.length * 2;
-        const wav = Buffer.alloc(44 + dataBytes);
-        wav.write('RIFF', 0);
-        wav.writeUInt32LE(36 + dataBytes, 4);
-        wav.write('WAVE', 8);
-        wav.write('fmt ', 12);
-        wav.writeUInt32LE(16, 16);
-        wav.writeUInt16LE(1, 20);
-        wav.writeUInt16LE(numChannels, 22);
-        wav.writeUInt32LE(sampleRate, 24);
-        wav.writeUInt32LE(sampleRate * numChannels * 2, 28);
-        wav.writeUInt16LE(numChannels * 2, 32);
-        wav.writeUInt16LE(16, 34);
-        wav.write('data', 36);
-        wav.writeUInt32LE(dataBytes, 40);
-        for (let i = 0; i < pcm.length; i++)
-            wav.writeInt16LE(pcm[i], 44 + i * 2);
-        if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
-            meetingOutputWs.send(wav);
-            console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
-        }
-    }
-    finally {
-        await ttsInstance.close().catch(() => { });
-    }
-}
 function startApiServer(workingDir, port) {
     const server = createServer(async (req, res) => {
         // CORS headers for cloud frontend
@@ -317,40 +244,6 @@ function startApiServer(workingDir, port) {
             });
             return;
         }
-        // GET /meeting-output — Output Media webpage for Recall.ai bot audio.
-        //
-        // The file lives next to this compiled JS (copied by the build script from
-        // src/ to dist/). Resolve via __dirname rather than process.cwd() — in
-        // production cwd is the user's workspace, NOT the osborn package directory.
-        if (req.method === 'GET' && url.pathname === '/meeting-output') {
-            // Try the package-relative path first (post-build location), then fall
-            // back to source path for `tsx src/index.ts` dev runs.
-            const candidates = [
-                join(__dirname, 'meeting-output.html'), // dist/ (production)
-                join(__dirname, '..', 'src', 'meeting-output.html'), // dev: dist/ → src/
-                join(__dirname, '..', 'meeting-output.html'), // tsx run from src/
-            ];
-            let html = null;
-            let foundPath = null;
-            for (const p of candidates) {
-                try {
-                    html = readFileSync(p, 'utf-8');
-                    foundPath = p;
-                    break;
-                }
-                catch { }
-            }
-            if (html) {
-                res.writeHead(200, { 'Content-Type': 'text/html' });
-                res.end(html);
-            }
-            else {
-                console.warn(`[meeting-output] not found in any of: ${candidates.join(', ')}`);
-                res.writeHead(404, { 'Content-Type': 'text/plain' });
-                res.end('meeting-output.html not found');
-            }
-            return;
-        }
         if (req.method === 'GET' && url.pathname === '/room-code') {
             res.writeHead(200, { 'Content-Type': 'application/json' });
             res.end(JSON.stringify({ roomCode: currentRoomCode }));
@@ -965,286 +858,12 @@ function startApiServer(workingDir, port) {
     };
     cleanStaleUploadDirs();
     setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
-    // ============================================================
-    // Meeting Output WebSocket — /meeting-audio (LEGACY)
-    // ============================================================
-    // Recall's headless browser used to open meeting-output.html which connects
-    // here. With the new /meeting-bot Next.js page (Phase 2 + LiveKit), Recall
-    // points at frontend/meeting-bot instead — this handler exists only for
-    // backwards-compat with old machine images still serving the legacy path.
-    const meetingOutputWss = new WebSocketServer({ noServer: true });
-    meetingOutputWss.on('connection', (ws) => {
-        console.log('📺 Meeting output browser connected (legacy /meeting-audio)');
-        meetingOutputWs = ws;
-        ws.on('close', () => {
-            console.log('📺 Meeting output browser disconnected (legacy)');
-            if (meetingOutputWs === ws)
-                meetingOutputWs = null;
-        });
-    });
-    // ============================================================
-    // Recall.ai meeting-audio-in WebSocket — /meeting-audio-in
-    // ============================================================
-    // Recall.ai's per-participant real-time audio protocol. Bot is configured
-    // (in recall-client.ts joinMeeting) with audio_separate_raw + a realtime
-    // endpoint pointing at this URL. Recall sends JSON events containing
-    // base64-encoded PCM (S16LE, 16kHz, mono) for every meeting participant
-    // (bot's own audio NOT included by default — no feedback loop possible).
-    //
-    // Flow: Recall → /meeting-audio-in → open a SECOND LiveKit connection from
-    //       this agent process as a publisher participant → publish PCM as an
-    //       audio track in the same LiveKit room → the existing AgentSession's
-    //       STT subscribes to it as a remote track → routes to currentLLM.chat()
-    //       via the same pipeline as voice-native user mic.
-    //
-    // The advantage of this design vs a parallel STT pipeline: meeting audio
-    // becomes "just another participant" in the LiveKit room — same end-of-turn
-    // detection, same interrupt handling, same conversation context, no parallel
-    // chat() paths to maintain.
-    //
-    // Wait until activeAgentSession._roomIO exists AND the publisher participant
-    // is visible to the agent's room. Both can race against join_meeting:
-    //   - Agent session may still be starting up when Recall connects.
-    //   - LiveKit takes a moment to propagate the publisher's join to the agent
-    //     side after publishTrack() returns on our side.
-    // Bounded poll (200ms cadence) avoids both timing gaps.
-    async function waitForRoomIOAndParticipant(publisherIdentity, timeoutMs) {
-        const deadline = Date.now() + timeoutMs;
-        let roomIO = null;
-        let participantVisible = false;
-        while (Date.now() < deadline) {
-            roomIO = activeAgentSession?._roomIO;
-            if (roomIO && typeof roomIO.setParticipant === 'function') {
-                const agentRoom = roomIO.rtcRoom;
-                const remotes = agentRoom?.remoteParticipants;
-                if (remotes && typeof remotes.values === 'function') {
-                    for (const p of remotes.values()) {
-                        if (p?.identity === publisherIdentity) {
-                            participantVisible = true;
-                            break;
-                        }
-                    }
-                }
-                if (participantVisible)
-                    return { roomIO, participantVisible };
-            }
-            await new Promise(r => setTimeout(r, 200));
-        }
-        // Timed out — return whatever we have. Caller decides whether to proceed.
-        return { roomIO, participantVisible };
-    }
-    const meetingAudioInWss = new WebSocketServer({ noServer: true });
-    meetingAudioInWss.on('connection', async (recallWs) => {
-        console.log('🎙️ Recall audio-in WebSocket connected — setting up LiveKit publisher');
-        const livekitUrl = process.env.LIVEKIT_URL;
-        const apiKey = process.env.LIVEKIT_API_KEY;
-        const apiSecret = process.env.LIVEKIT_API_SECRET;
-        if (!livekitUrl || !apiKey || !apiSecret) {
-            console.warn('⚠️ LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET not set — meeting audio publisher disabled');
-            recallWs.close();
-            return;
-        }
-        if (!currentRoomCode) {
-            console.warn('⚠️ No active LiveKit room (currentRoomCode null) — meeting audio publisher cannot attach');
-            recallWs.close();
-            return;
-        }
-        const roomName = `osborn-${currentRoomCode}`;
-        // Mint a publisher token via livekit-server-sdk (already imported for
-        // /api/token style flows). Long TTL — meetings can run for hours.
-        const identity = `meeting-audio-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
-        const at = new AccessToken(apiKey, apiSecret, {
-            identity,
-            ttl: 14400, // 4 hours
-            metadata: JSON.stringify({ role: 'meeting-audio-publisher' }),
-        });
-        at.addGrant({ roomJoin: true, room: roomName, canPublish: true, canSubscribe: false });
-        const token = await at.toJwt();
-        let room = null;
-        let source = null;
-        let track = null;
-        const cleanup = async () => {
-            // Restore AgentSession STT input to the original user participant before
-            // tearing down the publisher track. If we don't switch back, the session
-            // will be stuck waiting on a participant that's about to disappear.
-            try {
-                const roomIO = activeAgentSession?._roomIO;
-                if (roomIO && typeof roomIO.setParticipant === 'function') {
-                    if (preMeetingUserIdentity) {
-                        roomIO.setParticipant(preMeetingUserIdentity);
-                        console.log(`🔁 Restored AgentSession STT input to user: ${preMeetingUserIdentity}`);
-                    }
-                    else {
-                        roomIO.unsetParticipant();
-                        console.log('🔁 Cleared AgentSession STT input (no original user to restore)');
-                    }
-                }
-            }
-            catch (err) {
-                console.warn('⚠️ Failed to restore RoomIO participant on cleanup:', err.message);
-            }
-            preMeetingUserIdentity = null;
-            try {
-                if (track)
-                    await track.close(true);
-            }
-            catch { }
-            try {
-                if (source)
-                    await source.close();
-            }
-            catch { }
-            try {
-                if (room)
-                    await room.disconnect();
-            }
-            catch { }
-            room = null;
-            source = null;
-            track = null;
-        };
-        try {
-            room = new Room();
-            await room.connect(livekitUrl, token);
-            if (!room.localParticipant)
-                throw new Error('LiveKit connected but localParticipant missing');
-            // Recall sends S16LE PCM at 16kHz mono. AudioSource matches the format.
-            source = new AudioSource(16000, 1);
-            track = LocalAudioTrack.createAudioTrack('meeting-audio', source);
-            await room.localParticipant.publishTrack(track, new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }));
-            console.log(`🎙️ Meeting audio publisher connected to ${roomName} as ${identity}`);
-            // B2 — switch the existing AgentSession's RoomIO input from the local user
-            // to this meeting-audio publisher. While the meeting is active, the user
-            // talks via the meeting (Recall captures it and sends PCM here), and the
-            // agent treats this publisher as the "speaking" participant for STT/EOT.
-            // Original user identity is stashed so cleanup() can restore it.
-            //
-            // 15s timeout accommodates: session-start race (agent still booting when
-            // user clicks "join meeting"), LiveKit participant-join propagation
-            // (~hundreds of ms), and Fly cold-path latency on first request.
-            try {
-                const { roomIO, participantVisible } = await waitForRoomIOAndParticipant(identity, 15000);
-                if (!roomIO) {
-                    console.warn('⚠️ Timed out waiting for AgentSession._roomIO (15s) — meeting audio published but STT not switched. Meeting audio will be ignored until a session starts.');
-                }
-                else if (!participantVisible) {
-                    // RoomIO exists but our publisher hasn't propagated to the agent's
-                    // room view yet. setParticipant stores the identity and links on
-                    // participant-connected event, so this is still safe to call —
-                    // RoomIO will pick up the link when the event arrives.
-                    preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
-                    roomIO.setParticipant(identity);
-                    console.log(`🔁 Switched AgentSession STT input (publisher not yet visible — will link on connect): ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
-                }
-                else {
-                    preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
-                    roomIO.setParticipant(identity);
-                    console.log(`🔁 Switched AgentSession STT input: ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
-                }
-            }
-            catch (err) {
-                console.warn('⚠️ Failed to switch RoomIO participant:', err.message);
-            }
-        }
-        catch (err) {
-            console.error('❌ Failed to set up LiveKit publisher for meeting audio:', err instanceof Error ? err.message : err);
-            try {
-                recallWs.close();
-            }
-            catch { }
-            await cleanup();
-            return;
-        }
-        // Recall → us: JSON events with base64-encoded PCM. Decode, wrap as
-        // AudioFrame, and capture into the source. AgentSession in the main room
-        // will subscribe to this published track and STT it via the normal pipeline.
-        // Payload shape from
-        // docs.recall.ai/docs/how-to-get-separate-audio-per-participant-realtime:
-        //   { event: 'audio_separate_raw.data', data: { data: { buffer: '<base64>', ... }, participant: {...} } }
-        //
-        // Diagnostic counters so we can tell from prod logs whether (a) Recall is
-        // streaming any frames at all, (b) they're decoding correctly, and (c)
-        // captureFrame is succeeding. Logged every 100 frames (~5s at 50fps).
-        let totalMessages = 0;
-        let audioFrames = 0;
-        let bytesIn = 0;
-        let lastSpeakerSeen;
-        const startTs = Date.now();
-        recallWs.on('message', async (raw) => {
-            totalMessages++;
-            if (!source)
-                return;
-            try {
-                const msg = JSON.parse(raw.toString());
-                if (msg.event !== 'audio_separate_raw.data') {
-                    // First-time event-type diagnostic — log unknown event types once so
-                    // we know if Recall's payload shape changed
-                    if (totalMessages <= 3) {
-                        console.log(`[meeting-audio-in] non-audio event: ${msg.event}`);
-                    }
-                    return;
-                }
-                const b64 = msg.data?.data?.buffer;
-                if (!b64) {
-                    if (audioFrames === 0) {
-                        console.warn(`[meeting-audio-in] first audio event had no buffer field. payload keys=${Object.keys(msg.data?.data ?? {}).join(',')}`);
-                    }
-                    return;
-                }
-                const pcmBuf = Buffer.from(b64, 'base64');
-                bytesIn += pcmBuf.byteLength;
-                const speakerName = msg.data?.data?.participant?.name || msg.data?.participant?.name;
-                if (speakerName && speakerName !== lastSpeakerSeen) {
-                    console.log(`[meeting-audio-in] now hearing: ${speakerName}`);
-                    lastSpeakerSeen = speakerName;
-                }
-                // AudioFrame expects Int16Array. The PCM buffer is S16LE — view it
-                // directly without copy. Length / 2 = samples (each sample 2 bytes).
-                const samplesPerChannel = pcmBuf.byteLength / 2;
-                const int16 = new Int16Array(pcmBuf.buffer, pcmBuf.byteOffset, samplesPerChannel);
-                const frame = new AudioFrame(int16, 16000, 1, samplesPerChannel);
-                await source.captureFrame(frame);
-                audioFrames++;
-                if (audioFrames === 1) {
-                    console.log(`[meeting-audio-in] FIRST audio frame captured (${pcmBuf.byteLength} bytes, ${samplesPerChannel} samples)`);
-                }
-                if (audioFrames % 100 === 0) {
-                    const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
-                    console.log(`[meeting-audio-in] heartbeat: ${audioFrames} frames, ${(bytesIn / 1024).toFixed(1)} KB in ${elapsed}s (last speaker: ${lastSpeakerSeen ?? 'unknown'})`);
-                }
-            }
-            catch (err) {
-                // Don't log every frame parse failure — could be noisy if Recall sends
-                // non-audio_separate_raw events on the same channel.
-                if (err.message?.includes('JSON'))
-                    return;
-                console.warn('⚠️ meeting audio capture error:', err instanceof Error ? err.message : err);
-            }
-        });
-        recallWs.on('close', async () => {
-            const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
-            console.log(`🎙️ Recall audio-in WebSocket closed — tearing down LiveKit publisher. Total: ${audioFrames} audio frames / ${totalMessages} messages / ${(bytesIn / 1024).toFixed(1)} KB over ${elapsed}s`);
-            await cleanup();
-        });
-        recallWs.on('error', (err) => {
-            console.warn('⚠️ Recall WS error:', err instanceof Error ? err.message : err);
-        });
-    });
-    server.on('upgrade', (req, socket, head) => {
-        const url = new URL(req.url || '/', `http://localhost:${port}`);
-        if (url.pathname === '/meeting-audio') {
-            meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
-                meetingOutputWss.emit('connection', ws, req);
-            });
-        }
-        else if (url.pathname === '/meeting-audio-in') {
-            meetingAudioInWss.handleUpgrade(req, socket, head, (ws) => {
-                meetingAudioInWss.emit('connection', ws, req);
-            });
-        }
-        else {
-            socket.destroy();
-        }
+    // No WebSocket upgrade routes — meeting audio in/out moved off LiveKit to
+    // a polling architecture (see MeetingTranscriptPoller). The /meeting-audio
+    // and /meeting-audio-in routes were the old WebSocket-audio pipeline; both
+    // are gone. Reject all upgrade attempts.
+    server.on('upgrade', (_req, socket) => {
+        socket.destroy();
     });
     server.on('error', (err) => {
         if (err.code === 'EADDRINUSE') {
@@ -1467,7 +1086,7 @@ async function main() {
     // session-only path (no user prefix).
     let currentUserId = '';
     let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
-    // meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
+    let activeMeetingPoller = null; // Transcript poller bound to that bot
     // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
     // Updated by resume_session, session_selected, continue_session, switch_session handlers
     let currentResumeSessionId;
@@ -1918,6 +1537,40 @@ async function main() {
             }
         }
     }
+    // Compaction event → frontend bridge. Forwards the raw event (consumed by the
+    // dedicated banner UI state machine) AND emits a `claude_output` chat bubble
+    // (so the activity is visible inline in chat even when the banner is hidden,
+    // collapsed, or unreliable on iPad/iPhone). Extracted as a helper because
+    // both direct-mode and pipeline-mode need to register it — the pipeline path
+    // previously skipped this entirely, so compaction events fired into the void
+    // in pipeline mode.
+    const buildOnCompactionEvent = () => (event) => {
+        try {
+            // Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
+            sendToFrontend({ ...event });
+            // Inline chat bubble — reuses the existing claude_output path that's already working.
+            if (event.type === 'compaction_started') {
+                const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
+                sendToFrontend({
+                    type: 'claude_output',
+                    text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
+                    agentRole: 'direct',
+                });
+            }
+            else if (event.type === 'compaction_complete') {
+                const n = event.skillsWritten ?? 0;
+                const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
+                    ? ` — ${event.skillNames.join(', ')}`
+                    : '';
+                sendToFrontend({
+                    type: 'claude_output',
+                    text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
+                    agentRole: 'direct',
+                });
+            }
+        }
+        catch { /* non-fatal */ }
+    };
     // Create DIRECT session (STT + Claude Agent SDK + TTS)
     async function createDirectSession(resumeSessionId, llmOverride) {
         console.log('🎯 Creating direct session...');
@@ -1933,39 +1586,7 @@ async function main() {
             resumeSessionId,
             voiceMode: 'direct',
             skipTTSQueue: true,
-            onCompactionEvent: (event) => {
-                try {
-                    // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
-                    sendToFrontend({ ...event });
-                    // ALSO emit as a claude_output chat bubble — reuses the existing message path
-                    // that's already working end-to-end. PreCompact → in-progress bubble.
-                    // PostCompact → completion bubble with the skills summary. The dedicated
-                    // banner has been unreliable in production (data path works on backend, banner
-                    // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
-                    // are visible without dev tools.
-                    if (event.type === 'compaction_started') {
-                        const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
-                        sendToFrontend({
-                            type: 'claude_output',
-                            text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
-                            agentRole: 'direct',
-                        });
-                    }
-                    else if (event.type === 'compaction_complete') {
-                        const ev = event;
-                        const n = ev.skillsWritten ?? 0;
-                        const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
-                            ? ` — ${ev.skillNames.join(', ')}`
-                            : '';
-                        sendToFrontend({
-                            type: 'claude_output',
-                            text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
-                            agentRole: 'direct',
-                        });
-                    }
-                }
-                catch { /* non-fatal */ }
-            },
+            onCompactionEvent: buildOnCompactionEvent(),
         });
         currentLLM = directLLM;
         // Reset the session always-allow list for each new direct session
@@ -2165,20 +1786,6 @@ async function main() {
             }
             const sayId = Date.now(); // simple ID to correlate start/end logs
             console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
-            // Forward spoken text + audio to meeting output page when bot is in a meeting.
-            // Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
-            // previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
-            // (Deepgram aura-2-asteria-en) when no user config exists, producing a different
-            // voice in the meeting than what the user hears in voice-native. Both paths now
-            // share the single source of truth.
-            // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
-            // Recall captures the browser page's audio output and injects it into the meeting.
-            if (activeMeetingBotId) {
-                sendToMeetingOutput({ type: 'speak', text: data.text });
-                if (meetingOutputWs) {
-                    synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
-                }
-            }
             try {
                 const handle = currentSession.say(data.text);
                 if (handle && typeof handle.addDoneCallback === 'function') {
@@ -2316,39 +1923,7 @@ async function main() {
             sessionBaseDir,
             mcpServers,
             resumeSessionId,
-            onCompactionEvent: (event) => {
-                try {
-                    // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
-                    sendToFrontend({ ...event });
-                    // ALSO emit as a claude_output chat bubble — reuses the existing message path
-                    // that's already working end-to-end. PreCompact → in-progress bubble.
-                    // PostCompact → completion bubble with the skills summary. The dedicated
-                    // banner has been unreliable in production (data path works on backend, banner
-                    // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
-                    // are visible without dev tools.
-                    if (event.type === 'compaction_started') {
-                        const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
-                        sendToFrontend({
-                            type: 'claude_output',
-                            text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
-                            agentRole: 'direct',
-                        });
-                    }
-                    else if (event.type === 'compaction_complete') {
-                        const ev = event;
-                        const n = ev.skillsWritten ?? 0;
-                        const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
-                            ? ` — ${ev.skillNames.join(', ')}`
-                            : '';
-                        sendToFrontend({
-                            type: 'claude_output',
-                            text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
-                            agentRole: 'direct',
-                        });
-                    }
-                }
-                catch { /* non-fatal */ }
-            },
+            onCompactionEvent: buildOnCompactionEvent(),
         });
         currentLLM = realtimeClaudeHandler;
         // For resumed sessions, eagerly create workspace (we know the real ID)
@@ -2891,7 +2466,6 @@ async function main() {
         }
         lastCompletedResearch = null;
         currentSession = null;
-        activeAgentSession = null;
         currentAgent = null;
         // Same disconnect-leak fix as the other two cleanup sites — kill the Claude SDK
         // subprocess BEFORE dropping the reference. See killCurrentLLM() for full context.
@@ -2937,7 +2511,6 @@ async function main() {
             }
             catch { }
             currentSession = null;
-            activeAgentSession = null;
             currentAgent = null;
             // Same disconnect-leak fix — kill the previous user's Claude subprocess
             // before binding currentLLM to the new user's session below.
@@ -3057,6 +2630,13 @@ async function main() {
                 resumeSessionId,
                 voiceMode: 'direct',
                 skipTTSQueue: true,
+                // PipelineDirectOptions extends ClaudeLLMOptions; passing this through
+                // forwards it into the inner `new ClaudeLLM(opts)`. Without this,
+                // pipeline mode silently drops every PreCompact/PostCompact event
+                // — banner never appears, chat bubble never appears — because
+                // createDirectSession's `createClaudeLLM(...)` call is skipped when
+                // an llmOverride is supplied (which is exactly what pipeline mode does).
+                onCompactionEvent: buildOnCompactionEvent(),
                 getChatHistory: () => getChatHistory(20).map(t => ({ role: t.role, content: t.text })),
                 getResearchContext: () => {
                     if (activeResearch?.researchLog.length) {
@@ -3092,7 +2672,6 @@ async function main() {
             agent = result.agent;
         }
         currentSession = session;
-        activeAgentSession = session;
         currentAgent = agent; // Store for updateChatCtx() context injection
         // ============================================================
         // Session event wiring — extracted into function for auto-recovery
@@ -3252,7 +2831,6 @@ async function main() {
                     }
                     catch { }
                     currentSession = null;
-                    activeAgentSession = null;
                     currentAgent = null;
                     // Clear stale state from crashed session
                     voiceQueue.length = 0;
@@ -3314,7 +2892,6 @@ async function main() {
                         const newSession = result.session;
                         const newAgent = result.agent;
                         currentSession = newSession;
-                        activeAgentSession = newSession;
                         currentAgent = newAgent;
                         // Re-wire event listeners on the new session
                         wireSessionEvents(newSession, newAgent);
@@ -3371,7 +2948,6 @@ async function main() {
                     }
                     catch { }
                     currentSession = null;
-                    activeAgentSession = null;
                     currentAgent = null;
                     // Clear voice queue — stale injections from the crashed session
                     voiceQueue.length = 0;
@@ -3395,7 +2971,6 @@ async function main() {
                         const newSession = result.session;
                         const newAgent = result.agent;
                         currentSession = newSession;
-                        activeAgentSession = newSession;
                         currentAgent = newAgent;
                         // Re-wire event listeners on the new session
                         wireSessionEvents(newSession, newAgent);
@@ -3590,7 +3165,6 @@ async function main() {
         if (currentSession) {
             const sessionToClose = currentSession;
             currentSession = null;
-            activeAgentSession = null;
             // Track async close so new connections can wait for byte stream handler to be released
             pendingSessionClose = (async () => {
                 try {
@@ -3612,6 +3186,10 @@ async function main() {
         clearFastBrainSession();
         clearPipelineFastBrainSession();
         // Auto-leave any active meeting bot when user disconnects from the room
+        if (activeMeetingPoller) {
+            activeMeetingPoller.stop();
+            activeMeetingPoller = null;
+        }
         if (activeMeetingBotId) {
             const recallDisconnect = getRecallClient();
             if (recallDisconnect) {
@@ -4195,61 +3773,61 @@ async function main() {
                                 (process.env.FLY_APP_NAME
                                     ? `https://${process.env.FLY_APP_NAME}.fly.dev`
                                     : `http://localhost:${apiPort}`);
-                            // Try to mint a LiveKit bot token + construct the frontend-hosted
-                            // meeting-bot page URL. The bot page joins the same LiveKit room
-                            // as this agent so meeting audio flows through LiveKit directly
-                            // (no agent-side WebSocket+WAV pipe). Falls back to the legacy
-                            // /meeting-output webpage if no frontend URL is resolvable, so
-                            // the old code path keeps working during the migration window.
-                            //
-                            // Frontend URL resolution (in priority order):
-                            //   1. data.frontendBase — the public URL the user's browser is on,
-                            //      passed through the join_meeting data channel message. Works
-                            //      automatically for localhost dev + production without any
-                            //      env var.
-                            //   2. OSBORN_FRONTEND_URL — existing convention from sprites.ts
-                            //      (frontend/src/lib/sprites.ts:241) that injects the public
-                            //      frontend URL into sandbox env vars. Defense in depth.
-                            //
-                            // Auth: the endpoint uses LiveKit room-presence as the auth check
-                            // — no shared secret needed. The agent must already be in the
-                            // requested room (which it is by this point) for the mint to
-                            // succeed.
-                            let outputPageUrl;
-                            const frontendUrl = data.frontendBase
-                                || process.env.OSBORN_FRONTEND_URL;
-                            if (frontendUrl) {
+                            // Polling architecture (post-2026-05-22): the bot joins by name
+                            // only — no output_media webpage, no LiveKit republish, no audio
+                            // pipeline at all. Recall captures the meeting audio internally
+                            // and we pull the transcript via its REST API every ~30s.
+                            await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
+                            const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
+                            const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
+                            recallJoin.registerBot(botId, sessionId);
+                            activeMeetingBotId = botId;
+                            await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
+                            // System injection so the LLM knows it's in a meeting and which
+                            // skill to apply. The meetings skill (agent/.claude/skills/meetings/SKILL.md)
+                            // teaches the agent: don't speak in response to [MEETING — *]:
+                            // messages, keep meeting-todos.md updated in the workspace, etc.
+                            if (currentLLM) {
                                 try {
-                                    const botLkId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
-                                    const tokenRes = await fetch(`${frontendUrl}/api/meeting-bot-token`, {
-                                        method: 'POST',
-                                        headers: { 'Content-Type': 'application/json' },
-                                        body: JSON.stringify({ botId: botLkId, roomName }),
+                                    const sysCtx = new llm.ChatContext();
+                                    sysCtx.addMessage({
+                                        role: 'user',
+                                        content: `[SYSTEM] You are now in a meeting (Recall bot ID: ${botId}, URL: ${meetingUrl}). Transcript chunks will arrive every ~30 seconds tagged \`[MEETING — ${botId}]:\`. Follow the meetings skill: do NOT speak in response (no TTS output), instead maintain meeting-todos.md in the session workspace, optionally trigger background research silently. The voice-native user can still interact normally — only the meeting-tagged messages are the silent-observer path. Acknowledge by writing the initial meeting-todos.md skeleton.`,
                                     });
-                                    if (tokenRes.ok) {
-                                        const { token, url } = await tokenRes.json();
-                                        const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
-                                        outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
-                                        console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
-                                    }
-                                    else {
-                                        const errText = await tokenRes.text().catch(() => '');
-                                        console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
-                                    }
+                                    currentLLM.chat({ chatCtx: sysCtx });
+                                    console.log('📓 Meeting system injection sent to LLM');
                                 }
-                                catch (mintErr) {
-                                    console.warn(`⚠️ meeting-bot-token mint threw — falling back: ${mintErr.message}`);
+                                catch (sysErr) {
+                                    console.warn('⚠️ Meeting system injection failed:', sysErr.message);
                                 }
                             }
-                            else {
-                                console.log('ℹ️ No frontend URL (data.frontendBase + OSBORN_FRONTEND_URL both empty) — using legacy /meeting-output path');
+                            // Start polling the transcript every 30s. Each batch of new turns
+                            // is pushed to currentLLM.chat() tagged [MEETING — botId]: so the
+                            // skill kicks in. Poller dedups via first-word timestamp cursor.
+                            if (activeMeetingPoller) {
+                                activeMeetingPoller.stop();
+                                activeMeetingPoller = null;
                             }
-                            await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
-                            const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase, { outputPageUrl });
-                            const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
-                            recallJoin.registerBot(botId, sessionId);
-                            activeMeetingBotId = botId;
-                            await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
+                            activeMeetingPoller = new MeetingTranscriptPoller({
+                                botId,
+                                recall: recallJoin,
+                                onTurns: async ({ formatted }) => {
+                                    if (!currentLLM) {
+                                        console.warn('📓 Meeting transcript arrived but currentLLM is null — dropping');
+                                        return;
+                                    }
+                                    const tagged = `[MEETING — ${botId}]:\n${formatted}`;
+                                    try {
+                                        const turnCtx = new llm.ChatContext();
+                                        turnCtx.addMessage({ role: 'user', content: tagged });
+                                        currentLLM.chat({ chatCtx: turnCtx });
+                                    }
+                                    catch (err) {
+                                        console.warn(`⚠️ Failed to forward meeting transcript to LLM: ${err.message}`);
+                                    }
+                                },
+                            });
+                            activeMeetingPoller.start();
                         }
                         catch (err) {
                             console.error('❌ Recall.ai join error:', err);
@@ -4263,6 +3841,12 @@ async function main() {
                 const recallLeave = getRecallClient();
                 if (recallLeave && botId) {
                     try {
+                        // Stop the transcript poller FIRST so no more transcript chunks get
+                        // forwarded to the LLM during the leave.
+                        if (activeMeetingPoller) {
+                            activeMeetingPoller.stop();
+                            activeMeetingPoller = null;
+                        }
                         await recallLeave.leaveMeeting(botId);
                         activeMeetingBotId = null;
                         await sendToFrontend({ type: 'meeting_left', botId });

package/dist/meeting-transcript-poller.d.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
+ * messages.
+ *
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
+ * never speaks in the meeting — it's a silent note-taker.
+ *
+ * Lifecycle:
+ *   const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
+ *   poller.start()
+ *   ...
+ *   poller.stop()  // on leave_meeting / disconnect / session switch
+ *
+ * Dedup strategy:
+ *   Each turn carries a `start_timestamp.relative` on its first word (seconds
+ *   since recording start). We track the highest cursor we've forwarded and
+ *   only send turns with a strictly greater first-word timestamp. This means
+ *   re-fetches don't double-deliver, and partial transcripts that get refined
+ *   later don't re-trigger LLM processing of already-handled turns.
+ *
+ * Error handling:
+ *   Transient fetch errors are logged + skipped (poll continues on next tick).
+ *   No backoff — Recall's transcript endpoint is stable enough that, at a 30s
+ *   cadence, "slow start" failures self-recover within one cycle.
+ */
+import type { RecallClient, TranscriptTurn } from './recall-client.js';
+/** Options accepted by the MeetingTranscriptPoller constructor. */
+export interface MeetingTranscriptPollerOptions {
+    /** Recall bot id; handed to `recall.getTranscript()` on every poll and echoed back in each `onTurns` chunk. */
+    botId: string;
+    /** Client used to fetch the transcript; the poller only calls `getTranscript(botId)` on it. */
+    recall: RecallClient;
+    /**
+     * Called with each de-duplicated batch of new turns. Only invoked when a poll
+     * yields at least one unseen turn and the batch formats to non-empty text;
+     * `formatted` is the `formatTurns()` rendering of `turns`. A returned promise
+     * is awaited, and no further poll starts until it settles.
+     */
+    onTurns: (chunk: {
+        botId: string;
+        turns: TranscriptTurn[];
+        formatted: string;
+    }) => void | Promise<void>;
+    /** Poll interval in milliseconds. Defaults to 30_000 (30s) when omitted. */
+    intervalMs?: number;
+    /** Optional error callback; receives the Error from any failed poll tick (the failure is also logged as a console warning). */
+    onError?: (err: Error) => void;
+}
+/** Polls a Recall bot's transcript on a fixed interval and hands unseen turns to `onTurns`. */
+export declare class MeetingTranscriptPoller {
+    #private;
+    constructor(opts: MeetingTranscriptPollerOptions);
+    /** Polls once immediately, then every `intervalMs`. No-op while a polling timer is already active. */
+    start(): void;
+    /** Clears the polling timer and marks the poller stopped so later ticks do nothing. Repeat calls are no-ops. */
+    stop(): void;
+}
+/**
+ * Format an array of turns into a single string for LLM consumption.
+ *
+ * Each turn becomes one line:
+ *   <Speaker>: <text>
+ * where <Speaker> is `turn.speaker`, else `turn.participant.name`, else
+ * 'Unknown', and <text> is the turn's word texts joined by single spaces.
+ * Lines are joined with '\n'.
+ *
+ * Whitespace-only words and zero-content turns are dropped. Returns empty
+ * string if nothing meaningful is in the batch.
+ */
+export declare function formatTurns(turns: TranscriptTurn[]): string;

package/dist/meeting-transcript-poller.js ADDED Viewed

@@ -0,0 +1,112 @@
+/**
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
+ * messages.
+ *
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
+ * never speaks in the meeting — it's a silent note-taker.
+ *
+ * Lifecycle:
+ *   const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
+ *   poller.start()
+ *   ...
+ *   poller.stop()  // on leave_meeting / disconnect / session switch
+ *
+ * Dedup strategy:
+ *   Each turn carries a `start_timestamp.relative` on its first word (seconds
+ *   since recording start). We track the highest cursor we've forwarded and
+ *   only send turns with a strictly greater first-word timestamp. This means
+ *   re-fetches don't double-deliver, and partial transcripts that get refined
+ *   later don't re-trigger LLM processing of already-handled turns.
+ *
+ * Error handling:
+ *   Transient fetch errors are logged + skipped (poll continues on next tick).
+ *   No backoff — Recall's transcript endpoint is stable enough that, at a 30s
+ *   cadence, "slow start" failures self-recover within one cycle.
+ */
+export class MeetingTranscriptPoller {
+    #opts;
+    #timer = null; // setInterval handle while polling, otherwise null
+    #cursor = -Infinity; // highest first-word.start_timestamp.relative we've forwarded
+    #inFlight = false; // prevent overlapping polls if one cycle runs long
+    #stopped = false; // latched by stop() and never reset — a stopped poller stays stopped
+    /**
+     * @param {object} opts
+     * @param {string} opts.botId - Bot id passed to `recall.getTranscript()` and echoed to `onTurns`.
+     * @param {{ getTranscript(botId: string): Promise<Array<object>> }} opts.recall - Transcript source.
+     * @param {(chunk: { botId: string, turns: Array<object>, formatted: string }) => (void|Promise<void>)} opts.onTurns
+     *   Receives each batch of not-yet-forwarded turns; a returned promise is awaited.
+     * @param {number} [opts.intervalMs=30000] - Delay between polls, in milliseconds.
+     * @param {(err: Error) => void} [opts.onError] - Notified when a poll tick fails.
+     */
+    constructor(opts) {
+        this.#opts = opts;
+    }
+    /**
+     * Poll once immediately, then every `intervalMs`.
+     *
+     * No-op when a timer is already running, and also after stop(): a stopped
+     * poller's ticks do nothing and stop() would refuse to clear a second
+     * timer, so starting one would only leak an interval.
+     */
+    start() {
+        if (this.#timer || this.#stopped)
+            return;
+        const interval = this.#opts.intervalMs ?? 30_000;
+        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
+        // Fire once immediately so the LLM sees the meeting started, then on interval.
+        void this.#tick();
+        this.#timer = setInterval(() => void this.#tick(), interval);
+    }
+    /**
+     * Stop polling. Clears the timer and latches the stopped flag, which also
+     * prevents a poll that is still awaiting its fetch from forwarding turns.
+     * Repeat calls are no-ops.
+     */
+    stop() {
+        if (this.#stopped)
+            return;
+        this.#stopped = true;
+        if (this.#timer) {
+            clearInterval(this.#timer);
+            this.#timer = null;
+        }
+        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
+    }
+    /** First-word start offset of a turn, or undefined when the turn has no timestamped first word. */
+    static #turnStart(turn) {
+        return turn.words?.[0]?.start_timestamp?.relative;
+    }
+    /**
+     * One poll cycle: fetch the full transcript, keep turns newer than the
+     * cursor, advance the cursor, and hand the formatted batch to `onTurns`.
+     * Failures are reported via `onError` + a console warning and never thrown.
+     */
+    async #tick() {
+        if (this.#inFlight || this.#stopped)
+            return;
+        this.#inFlight = true;
+        try {
+            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
+            // stop() may have been called while the fetch was pending; the caller
+            // expects that nothing is forwarded once stop() has returned.
+            if (this.#stopped)
+                return;
+            const fresh = all.filter(t => {
+                const firstWordTs = MeetingTranscriptPoller.#turnStart(t);
+                return typeof firstWordTs === 'number' && firstWordTs > this.#cursor;
+            });
+            if (fresh.length === 0)
+                return;
+            // Advance cursor to highest seen first-word ts (across all returned turns,
+            // not just the fresh ones — guards against Recall returning a paged subset).
+            // This happens BEFORE delivery, so a failing onTurns does not cause the
+            // same turns to be re-sent on the next tick.
+            for (const t of all) {
+                const ts = MeetingTranscriptPoller.#turnStart(t);
+                if (typeof ts === 'number' && ts > this.#cursor)
+                    this.#cursor = ts;
+            }
+            const formatted = formatTurns(fresh);
+            if (!formatted)
+                return; // pure-whitespace fresh batch — skip
+            console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${this.#cursor.toFixed(1)}s, chars=${formatted.length}`);
+            await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
+        }
+        catch (err) {
+            const e = err instanceof Error ? err : new Error(String(err));
+            // Ticks are fire-and-forget (`void this.#tick()`), so an exception from
+            // the user's onError callback would surface as an unhandled rejection.
+            try {
+                this.#opts.onError?.(e);
+            }
+            catch (cbErr) {
+                console.warn(`⚠️ MeetingTranscriptPoller onError callback threw: ${cbErr instanceof Error ? cbErr.message : String(cbErr)}`);
+            }
+            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
+        }
+        finally {
+            this.#inFlight = false;
+        }
+    }
+}
+/**
+ * Render transcript turns as plain text for the LLM, one line per turn:
+ *   <Speaker>: <text>
+ *
+ * The speaker label falls back from `speaker` to `participant.name` to
+ * 'Unknown'. A turn's text is its word texts joined by spaces, with runs of
+ * whitespace collapsed and the ends trimmed; turns whose text ends up empty
+ * are omitted. Yields '' when no turn contributes a line.
+ */
+export function formatTurns(turns) {
+    return [...turns]
+        .map((turn) => {
+            const who = turn.speaker || turn.participant?.name || 'Unknown';
+            const wordTexts = (turn.words ?? []).map((word) => word.text);
+            const spoken = wordTexts.join(' ').replace(/\s+/g, ' ').trim();
+            return spoken ? `${who}: ${spoken}` : null;
+        })
+        .filter((line) => line !== null)
+        .join('\n');
+}

package/dist/recall-client.d.ts CHANGED Viewed

@@ -4,6 +4,36 @@ export interface RecallBot {
     meeting_url: string;
     status: string;
 }
+/**
+ * One transcript turn = one speaker's continuous utterance.
+ * Shape returned by GET /api/v1/bot/{bot_id}/transcript.
+ *
+ * Per Recall docs each turn contains:
+ *   - speaker: participant name (or 'Unknown')
+ *   - words: array of { text, start_timestamp.relative, end_timestamp.relative }
+ *   - The `start_timestamp.relative` (seconds since recording start) on the
+ *     FIRST word is the turn's start; we use this as the dedup cursor.
+ */
+export interface TranscriptTurn {
+    // NOTE(review): the speaker name may arrive as `speaker` or as
+    // `participant.name` — formatTurns() checks both, in that order. Confirm
+    // which one Recall actually populates.
+    speaker?: string;
+    participant?: {
+        id?: number;
+        name?: string;
+        is_host?: boolean;
+    };
+    /** Words of the utterance in order; the first word's start timestamp identifies the turn. */
+    words: Array<{
+        text: string;
+        start_timestamp?: {
+            /** Seconds since recording start. */
+            relative?: number;
+            // presumably an ISO-8601 wall-clock timestamp — TODO confirm against Recall docs
+            absolute?: string;
+        };
+        end_timestamp?: {
+            /** Seconds since recording start. */
+            relative?: number;
+            // presumably an ISO-8601 wall-clock timestamp — TODO confirm against Recall docs
+            absolute?: string;
+        };
+    }>;
+    language?: string;
+}
 export interface TranscriptPayload {
     event: string;
     data: {
@@ -49,10 +79,27 @@ export declare class RecallClient extends EventEmitter {
      *                            room as the osborn agent (no separate WebSocket+WAV pipe).
      * @param opts.botName     Display name of the bot in the meeting
      */
-    joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
-        outputPageUrl?: string;
+    joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
         botName?: string;
     }): Promise<string>;
+    /**
+     * Fetch the bot's current transcript. Returns an array of "transcript turns"
+     * (each turn = one speaker's utterance) sorted by start time. Use the bot's
+     * `recordings[0].id` from getBotStatus / bot record to locate the recording,
+     * then list its transcripts.
+     *
+     * Per Recall docs:
+     *   GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
+     *   GET /api/v1/transcript/{transcript_id} → transcript with download_url
+     *   Download the transcript JSON from download_url to get the actual content.
+     *
+     * For the polling use case (called every ~30s), we use the simpler combined
+     * endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
+     * convenience and returns the full transcript so far in one call. The caller
+     * is responsible for de-duping (keeping a since-cursor) so the LLM only sees
+     * new turns.
+     */
+    getTranscript(botId: string): Promise<TranscriptTurn[]>;
     leaveMeeting(botId: string): Promise<void>;
     getBotStatus(botId: string): Promise<string>;
     handleWebhook(payload: TranscriptPayload): void;

package/dist/recall-client.js CHANGED Viewed

@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
      *                            room as the osborn agent (no separate WebSocket+WAV pipe).
      * @param opts.botName     Display name of the bot in the meeting
      */
-    async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
+    async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
         const botName = opts?.botName ?? 'Osborn';
-        const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
-        // Authoritative structure per https://docs.recall.ai/reference/bot_create
-        // and https://docs.recall.ai/docs/real-time-transcription:
+        // ARCHITECTURE (post-2026-05-22 polling redesign):
+        //   The bot joins by name only — visible in the meeting participant list as
+        //   "Osborn" but with no audio output and no avatar. We do NOT configure any
+        //   `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
+        //   the agent polls Recall's REST transcript API every ~30s
+        //   (see MeetingTranscriptPoller) and feeds new turns into the LLM as
+        //   `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
+        //   LLM not to respond out loud to those messages, only to take notes.
         //
-        //   recording_config.transcript.provider  — transcription provider config
-        //   recording_config.realtime_endpoints   — webhook/websocket delivery
-        //
-        // IMPORTANT:
-        //   - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
-        //   - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
-        //   - `transcription_options` does NOT exist — use `transcript.provider`
-        //   - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
-        //
-        // ARCHITECTURE (post-2026-05-22 redesign):
-        //   Input (meeting → osborn): Recall's documented WebSocket audio protocol.
-        //     `audio_separate_raw` config + websocket realtime endpoint streams
-        //     per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
-        //     /meeting-audio-in WS handler. Bot's own audio is excluded by default
-        //     → zero possibility of feedback loop, no echo cancellation needed.
-        //   Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
-        //     page subscribes to osborn's LiveKit audio track and plays it via
-        //     track.attach(); Recall captures the page's audio output and injects
-        //     into the meeting.
-        //   Webhook transcripts (transcript.data): retained as a SECONDARY signal —
-        //     the agent index.ts handler for this event currently logs but does NOT
-        //     forward to the LLM (intentionally disabled). The Deepgram WS path
-        //     above is the LLM input.
-        const httpBase = webhookBaseUrl.replace(/\/$/, '');
-        const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
+        //   We DO keep `recording_config.transcript.provider.recallai_streaming` so
+        //   Recall actually transcribes the meeting — the REST endpoint we poll
+        //   requires this to be configured, otherwise transcripts are empty.
         const res = await fetch(`${RECALL_BASE_URL}/bot`, {
             method: 'POST',
             headers: {
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
                 recording_config: {
                     transcript: {
                         provider: {
-                            // recallai_streaming is built-in — no external API key needed,
-                            // low-latency, works across all meeting platforms.
-                            // Kept for the secondary webhook signal (display / future use);
-                            // LLM input now comes from the Deepgram WS pipe below.
                             recallai_streaming: {
                                 mode: 'prioritize_low_latency',
                                 language_code: 'en',
                             },
                         },
                     },
-                    // Per-participant raw PCM audio stream. Bot's own audio is excluded
-                    // (we don't set include_bot_in_recording.audio:true).
-                    audio_separate_raw: {},
-                    realtime_endpoints: [
-                        {
-                            // Transcript webhook (secondary signal; LLM forwarding disabled).
-                            type: 'webhook',
-                            url: `${httpBase}/webhook/recall`,
-                            events: ['transcript.data'],
-                        },
-                        {
-                            // Per-participant PCM audio → agent's Deepgram STT pipe.
-                            type: 'websocket',
-                            url: `${wsBase}/meeting-audio-in`,
-                            events: ['audio_separate_raw.data'],
-                        },
-                    ],
-                },
-                output_media: {
-                    camera: {
-                        // `kind` (not `type`) — confirmed from prior debugging.
-                        // The page Recall renders connects to LiveKit and plays osborn's
-                        // TTS audio via track.attach(); Recall captures the page audio.
-                        // The page does NOT call getUserMedia anymore — input now comes
-                        // from the audio_separate_raw WebSocket above.
-                        kind: 'webpage',
-                        config: {
-                            url: outputPageUrl,
-                        },
-                    },
                 },
             }),
         });
@@ -112,9 +61,37 @@ export class RecallClient extends EventEmitter {
             throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
         }
         const bot = (await res.json());
-        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
+        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
         return bot.id;
     }
+    /**
+     * Fetch the bot's current transcript. Returns an array of "transcript turns"
+     * (each turn = one speaker's utterance) sorted by start time. Use the bot's
+     * `recordings[0].id` from getBotStatus / bot record to locate the recording,
+     * then list its transcripts.
+     *
+     * Per Recall docs:
+     *   GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
+     *   GET /api/v1/transcript/{transcript_id} → transcript with download_url
+     *   Download the transcript JSON from download_url to get the actual content.
+     *
+     * For the polling use case (called every ~30s), we use the simpler combined
+     * endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
+     * convenience and returns the full transcript so far in one call. The caller
+     * is responsible for de-duping (keeping a since-cursor) so the LLM only sees
+     * new turns.
+     */
+    async getTranscript(botId) {
+        const res = await fetch(`${RECALL_BASE_URL}/bot/${botId}/transcript`, {
+            headers: { 'Authorization': `Token ${this.#apiKey}` },
+        });
+        if (!res.ok) {
+            const err = await res.text().catch(() => '');
+            throw new Error(`Recall.ai transcript fetch failed: ${res.status} ${err.substring(0, 200)}`);
+        }
+        const turns = await res.json();
+        return Array.isArray(turns) ? turns : [];
+    }
     async leaveMeeting(botId) {
         await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
             method: 'POST',

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.42",
+  "version": "0.9.43",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {
@@ -11,7 +11,7 @@
     "dev:logged": "tsx scripts/dev-logged.ts",
     "review": "tsx scripts/review.ts",
     "start": "tsx src/index.ts",
-    "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts && cp src/meeting-output.html dist/",
+    "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
     "room": "tsx src/index.ts --room",
     "prepublishOnly": "npm run build"
   },