osborn 0.9.41 → 0.9.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ # Skill: Meetings
2
+
3
+ Silent note-taking and TODO-tracking when osborn is sitting in a live meeting.
4
+
5
+ ## When to use
6
+
7
+ When a user message arrives with the prefix `[MEETING — <botId>]:` (these messages arrive roughly every 30 seconds while a Recall.ai meeting bot is active). Also use this skill when the orchestrating system injects `[SYSTEM] You are now in a meeting ...`.
8
+
9
+ **Do NOT use this skill** for normal user messages in the voice-native chat — those still get spoken responses as usual.
10
+
11
+ ## How to behave
12
+
13
+ For every `[MEETING — *]:` message:
14
+
15
+ 1. **Do NOT speak.** No TTS output. No `tts_say`. No conversational reply. This is a silent observer path. If you must acknowledge that you processed the message, do it via a Write/Edit tool call (writing to the workspace), not via spoken or chat output.
16
+ 2. **Update `meeting-todos.md`** in the session workspace (`{workspace}/meeting-todos.md`). Append new action items, decisions, and open questions as they emerge in the transcript. Do not rewrite existing entries unless a later part of the transcript contradicts them.
17
+ 3. **Optionally trigger background research silently.** If a topic in the meeting would benefit from a quick web/code lookup, dispatch a researcher sub-agent via the Task tool. Save its output to `{workspace}/library/meeting-research-<topic-slug>.md`. Do NOT speak the result.
18
+ 4. **Do not consume voice-native attention.** The user can still talk to you via the voice-native browser. When they do (a normal user message with no `[MEETING — *]:` prefix), respond normally — speak. Treat the meeting transcript as background context they can ask about ("what did Sarah say about pricing?" → answer normally).
19
+
20
+ ## The `meeting-todos.md` file
21
+
22
+ Keep it scannable. Structure:
23
+
24
+ ```markdown
25
+ # Meeting Notes
26
+
27
+ **Bot:** <botId> · **Started:** <ISO timestamp>
28
+
29
+ ## TODOs
30
+
31
+ - [ ] <person>: <action item> — <context>
32
+ - [ ] <person>: <action item>
33
+
34
+ ## Decisions
35
+
36
+ - <date/time> — <what was decided> (raised by <person>)
37
+
38
+ ## Open Questions
39
+
40
+ - <question> — raised by <person>, still unresolved
41
+ - <question> — answered by <person>: <answer>
42
+
43
+ ## Highlights
44
+
45
+ - <key moment or quote worth surfacing>
46
+ ```
47
+
48
+ Update the same file across multiple poll cycles — don't create `meeting-todos-1.md`, `meeting-todos-2.md`. One file, evolving.
49
+
50
+ ## Workspace path
51
+
52
+ The session workspace is `~/.claude/projects/<slug>/osb/<session-uuid>/`. Read the env variable or the spec.md header if you need to confirm the exact path. Write absolute paths in tool calls (e.g. `/Users/<user>/.claude/projects/.../osb/<uuid>/meeting-todos.md`).
53
+
54
+ ## On meeting end
55
+
56
+ When the user leaves the meeting (the system stops sending `[MEETING — *]:` messages and may inject `[SYSTEM] meeting ended`), do a final pass on `meeting-todos.md` to:
57
+ - Mark items the user has clearly committed to
58
+ - Move resolved open questions to a `## Resolved` section
59
+ - Add a `## Summary` section at the top with 3-5 lines distilling the meeting
60
+
61
+ Still silent. The user will ask out loud if they want a recap.
62
+
63
+ ## When the user asks about the meeting
64
+
65
+ When a non-meeting-tagged message references the meeting ("what's on the todo list?", "what did we decide about X?", "who's handling Y?"), respond normally — speak. Read `meeting-todos.md` first to ground the response. Don't make up speaker names or decisions; only state what's recorded.
66
+
67
+ ## Anti-patterns
68
+
69
+ - ❌ Speaking in response to a `[MEETING — *]:` message
70
+ - ❌ Creating a new file per poll cycle instead of updating one
71
+ - ❌ Trying to drive the meeting (don't add "we should..." items unless someone in the meeting said them)
72
+ - ❌ Asking the user clarifying questions during the meeting — they're not paying attention to chat
73
+ - ❌ Re-transcribing what's in the message into the TODO file verbatim. Distill.
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, AudioSource, AudioFrame, LocalAudioTrack, TrackPublishOptions, TrackSource, } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -10,7 +10,6 @@ initializeLogger({ pretty: true, level: 'info' });
10
10
  import { setMaxListeners } from 'node:events';
11
11
  setMaxListeners(50);
12
12
  import { createServer } from 'http';
13
- import { WebSocket, WebSocketServer } from 'ws';
14
13
  import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
15
14
  import { dirname, join } from 'node:path';
16
15
  import { fileURLToPath } from 'node:url';
@@ -34,6 +33,7 @@ import { askHaiku, askFastBrain, updateSpecFromJSONL, processResearchCompletion,
34
33
  import { DIRECT_MODE_PROMPT, getRealtimeInstructions, getScriptInjection, getProactiveInjection, getNotificationInjection } from './prompts.js';
35
34
  import { MCP_CATALOG } from './config.js';
36
35
  import { getRecallClient } from './recall-client.js';
36
+ import { MeetingTranscriptPoller } from './meeting-transcript-poller.js';
37
37
  import { llm } from '@livekit/agents';
38
38
  import { z } from 'zod';
39
39
  // ============================================================
@@ -147,79 +147,6 @@ process.on('uncaughtException', (error) => {
147
147
  // ============================================================
148
148
  // Module-level room code so the HTTP server can expose it via GET /room-code
149
149
  let currentRoomCode = null;
150
- // Meeting output WebSocket — module-level so both startApiServer and main() can access it
151
- let meetingOutputWs = null;
152
- // Module-level AgentSession reference so /meeting-audio-in WS handler can switch
153
- // the RoomIO-linked participant when meeting audio starts/stops (B2 design).
154
- let activeAgentSession = null;
155
- // Identity of the local user participant the session was originally listening to
156
- // — captured at the moment we switch to the meeting publisher, restored on cleanup.
157
- let preMeetingUserIdentity = null;
158
- function sendToMeetingOutput(msg) {
159
- if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
160
- try {
161
- meetingOutputWs.send(JSON.stringify(msg));
162
- }
163
- catch { }
164
- }
165
- }
166
- // Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
167
- // Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
168
- async function synthesizeForMeeting(text, ttsConfig) {
169
- if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN)
170
- return;
171
- const ttsInstance = createTTS(ttsConfig);
172
- try {
173
- const chunks = [];
174
- let sampleRate = 24000;
175
- let numChannels = 1;
176
- const stream = ttsInstance.synthesize(text);
177
- for await (const event of stream) {
178
- if (event === Symbol.for('END_OF_STREAM'))
179
- break;
180
- const e = event;
181
- if (e?.frame?.data) {
182
- chunks.push(e.frame.data);
183
- sampleRate = e.frame.sampleRate ?? sampleRate;
184
- numChannels = e.frame.numChannels ?? numChannels;
185
- }
186
- }
187
- if (chunks.length === 0)
188
- return;
189
- const totalSamples = chunks.reduce((s, c) => s + c.length, 0);
190
- const pcm = new Int16Array(totalSamples);
191
- let offset = 0;
192
- for (const c of chunks) {
193
- pcm.set(c, offset);
194
- offset += c.length;
195
- }
196
- // WAV header (44 bytes) + PCM data
197
- const dataBytes = pcm.length * 2;
198
- const wav = Buffer.alloc(44 + dataBytes);
199
- wav.write('RIFF', 0);
200
- wav.writeUInt32LE(36 + dataBytes, 4);
201
- wav.write('WAVE', 8);
202
- wav.write('fmt ', 12);
203
- wav.writeUInt32LE(16, 16);
204
- wav.writeUInt16LE(1, 20);
205
- wav.writeUInt16LE(numChannels, 22);
206
- wav.writeUInt32LE(sampleRate, 24);
207
- wav.writeUInt32LE(sampleRate * numChannels * 2, 28);
208
- wav.writeUInt16LE(numChannels * 2, 32);
209
- wav.writeUInt16LE(16, 34);
210
- wav.write('data', 36);
211
- wav.writeUInt32LE(dataBytes, 40);
212
- for (let i = 0; i < pcm.length; i++)
213
- wav.writeInt16LE(pcm[i], 44 + i * 2);
214
- if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
215
- meetingOutputWs.send(wav);
216
- console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
217
- }
218
- }
219
- finally {
220
- await ttsInstance.close().catch(() => { });
221
- }
222
- }
223
150
  function startApiServer(workingDir, port) {
224
151
  const server = createServer(async (req, res) => {
225
152
  // CORS headers for cloud frontend
@@ -317,40 +244,6 @@ function startApiServer(workingDir, port) {
317
244
  });
318
245
  return;
319
246
  }
320
- // GET /meeting-output — Output Media webpage for Recall.ai bot audio.
321
- //
322
- // The file lives next to this compiled JS (copied by the build script from
323
- // src/ to dist/). Resolve via __dirname rather than process.cwd() — in
324
- // production cwd is the user's workspace, NOT the osborn package directory.
325
- if (req.method === 'GET' && url.pathname === '/meeting-output') {
326
- // Try the package-relative path first (post-build location), then fall
327
- // back to source path for `tsx src/index.ts` dev runs.
328
- const candidates = [
329
- join(__dirname, 'meeting-output.html'), // dist/ (production)
330
- join(__dirname, '..', 'src', 'meeting-output.html'), // dev: dist/ → src/
331
- join(__dirname, '..', 'meeting-output.html'), // tsx run from src/
332
- ];
333
- let html = null;
334
- let foundPath = null;
335
- for (const p of candidates) {
336
- try {
337
- html = readFileSync(p, 'utf-8');
338
- foundPath = p;
339
- break;
340
- }
341
- catch { }
342
- }
343
- if (html) {
344
- res.writeHead(200, { 'Content-Type': 'text/html' });
345
- res.end(html);
346
- }
347
- else {
348
- console.warn(`[meeting-output] not found in any of: ${candidates.join(', ')}`);
349
- res.writeHead(404, { 'Content-Type': 'text/plain' });
350
- res.end('meeting-output.html not found');
351
- }
352
- return;
353
- }
354
247
  if (req.method === 'GET' && url.pathname === '/room-code') {
355
248
  res.writeHead(200, { 'Content-Type': 'application/json' });
356
249
  res.end(JSON.stringify({ roomCode: currentRoomCode }));
@@ -965,251 +858,12 @@ function startApiServer(workingDir, port) {
965
858
  };
966
859
  cleanStaleUploadDirs();
967
860
  setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
968
- // ============================================================
969
- // Meeting Output WebSocket /meeting-audio (LEGACY)
970
- // ============================================================
971
- // Recall's headless browser used to open meeting-output.html which connects
972
- // here. With the new /meeting-bot Next.js page (Phase 2 + LiveKit), Recall
973
- // points at frontend/meeting-bot instead — this handler exists only for
974
- // backwards-compat with old machine images still serving the legacy path.
975
- const meetingOutputWss = new WebSocketServer({ noServer: true });
976
- meetingOutputWss.on('connection', (ws) => {
977
- console.log('📺 Meeting output browser connected (legacy /meeting-audio)');
978
- meetingOutputWs = ws;
979
- ws.on('close', () => {
980
- console.log('📺 Meeting output browser disconnected (legacy)');
981
- if (meetingOutputWs === ws)
982
- meetingOutputWs = null;
983
- });
984
- });
985
- // ============================================================
986
- // Recall.ai meeting-audio-in WebSocket — /meeting-audio-in
987
- // ============================================================
988
- // Recall.ai's per-participant real-time audio protocol. Bot is configured
989
- // (in recall-client.ts joinMeeting) with audio_separate_raw + a realtime
990
- // endpoint pointing at this URL. Recall sends JSON events containing
991
- // base64-encoded PCM (S16LE, 16kHz, mono) for every meeting participant
992
- // (bot's own audio NOT included by default — no feedback loop possible).
993
- //
994
- // Flow: Recall → /meeting-audio-in → open a SECOND LiveKit connection from
995
- // this agent process as a publisher participant → publish PCM as an
996
- // audio track in the same LiveKit room → the existing AgentSession's
997
- // STT subscribes to it as a remote track → routes to currentLLM.chat()
998
- // via the same pipeline as voice-native user mic.
999
- //
1000
- // The advantage of this design vs a parallel STT pipeline: meeting audio
1001
- // becomes "just another participant" in the LiveKit room — same end-of-turn
1002
- // detection, same interrupt handling, same conversation context, no parallel
1003
- // chat() paths to maintain.
1004
- //
1005
- // Wait until activeAgentSession._roomIO exists AND the publisher participant
1006
- // is visible to the agent's room. Both can race against join_meeting:
1007
- // - Agent session may still be starting up when Recall connects.
1008
- // - LiveKit takes a moment to propagate the publisher's join to the agent
1009
- // side after publishTrack() returns on our side.
1010
- // Bounded poll (200ms cadence) avoids both timing gaps.
1011
- async function waitForRoomIOAndParticipant(publisherIdentity, timeoutMs) {
1012
- const deadline = Date.now() + timeoutMs;
1013
- let roomIO = null;
1014
- let participantVisible = false;
1015
- while (Date.now() < deadline) {
1016
- roomIO = activeAgentSession?._roomIO;
1017
- if (roomIO && typeof roomIO.setParticipant === 'function') {
1018
- const agentRoom = roomIO.rtcRoom;
1019
- const remotes = agentRoom?.remoteParticipants;
1020
- if (remotes && typeof remotes.values === 'function') {
1021
- for (const p of remotes.values()) {
1022
- if (p?.identity === publisherIdentity) {
1023
- participantVisible = true;
1024
- break;
1025
- }
1026
- }
1027
- }
1028
- if (participantVisible)
1029
- return { roomIO, participantVisible };
1030
- }
1031
- await new Promise(r => setTimeout(r, 200));
1032
- }
1033
- // Timed out — return whatever we have. Caller decides whether to proceed.
1034
- return { roomIO, participantVisible };
1035
- }
1036
- const meetingAudioInWss = new WebSocketServer({ noServer: true });
1037
- meetingAudioInWss.on('connection', async (recallWs) => {
1038
- console.log('🎙️ Recall audio-in WebSocket connected — setting up LiveKit publisher');
1039
- const livekitUrl = process.env.LIVEKIT_URL;
1040
- const apiKey = process.env.LIVEKIT_API_KEY;
1041
- const apiSecret = process.env.LIVEKIT_API_SECRET;
1042
- if (!livekitUrl || !apiKey || !apiSecret) {
1043
- console.warn('⚠️ LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET not set — meeting audio publisher disabled');
1044
- recallWs.close();
1045
- return;
1046
- }
1047
- if (!currentRoomCode) {
1048
- console.warn('⚠️ No active LiveKit room (currentRoomCode null) — meeting audio publisher cannot attach');
1049
- recallWs.close();
1050
- return;
1051
- }
1052
- const roomName = `osborn-${currentRoomCode}`;
1053
- // Mint a publisher token via livekit-server-sdk (already imported for
1054
- // /api/token style flows). Long TTL — meetings can run for hours.
1055
- const identity = `meeting-audio-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
1056
- const at = new AccessToken(apiKey, apiSecret, {
1057
- identity,
1058
- ttl: 14400, // 4 hours
1059
- metadata: JSON.stringify({ role: 'meeting-audio-publisher' }),
1060
- });
1061
- at.addGrant({ roomJoin: true, room: roomName, canPublish: true, canSubscribe: false });
1062
- const token = await at.toJwt();
1063
- let room = null;
1064
- let source = null;
1065
- let track = null;
1066
- const cleanup = async () => {
1067
- // Restore AgentSession STT input to the original user participant before
1068
- // tearing down the publisher track. If we don't switch back, the session
1069
- // will be stuck waiting on a participant that's about to disappear.
1070
- try {
1071
- const roomIO = activeAgentSession?._roomIO;
1072
- if (roomIO && typeof roomIO.setParticipant === 'function') {
1073
- if (preMeetingUserIdentity) {
1074
- roomIO.setParticipant(preMeetingUserIdentity);
1075
- console.log(`🔁 Restored AgentSession STT input to user: ${preMeetingUserIdentity}`);
1076
- }
1077
- else {
1078
- roomIO.unsetParticipant();
1079
- console.log('🔁 Cleared AgentSession STT input (no original user to restore)');
1080
- }
1081
- }
1082
- }
1083
- catch (err) {
1084
- console.warn('⚠️ Failed to restore RoomIO participant on cleanup:', err.message);
1085
- }
1086
- preMeetingUserIdentity = null;
1087
- try {
1088
- if (track)
1089
- await track.close(true);
1090
- }
1091
- catch { }
1092
- try {
1093
- if (source)
1094
- await source.close();
1095
- }
1096
- catch { }
1097
- try {
1098
- if (room)
1099
- await room.disconnect();
1100
- }
1101
- catch { }
1102
- room = null;
1103
- source = null;
1104
- track = null;
1105
- };
1106
- try {
1107
- room = new Room();
1108
- await room.connect(livekitUrl, token);
1109
- if (!room.localParticipant)
1110
- throw new Error('LiveKit connected but localParticipant missing');
1111
- // Recall sends S16LE PCM at 16kHz mono. AudioSource matches the format.
1112
- source = new AudioSource(16000, 1);
1113
- track = LocalAudioTrack.createAudioTrack('meeting-audio', source);
1114
- await room.localParticipant.publishTrack(track, new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }));
1115
- console.log(`🎙️ Meeting audio publisher connected to ${roomName} as ${identity}`);
1116
- // B2 — switch the existing AgentSession's RoomIO input from the local user
1117
- // to this meeting-audio publisher. While the meeting is active, the user
1118
- // talks via the meeting (Recall captures it and sends PCM here), and the
1119
- // agent treats this publisher as the "speaking" participant for STT/EOT.
1120
- // Original user identity is stashed so cleanup() can restore it.
1121
- //
1122
- // 15s timeout accommodates: session-start race (agent still booting when
1123
- // user clicks "join meeting"), LiveKit participant-join propagation
1124
- // (~hundreds of ms), and Fly cold-path latency on first request.
1125
- try {
1126
- const { roomIO, participantVisible } = await waitForRoomIOAndParticipant(identity, 15000);
1127
- if (!roomIO) {
1128
- console.warn('⚠️ Timed out waiting for AgentSession._roomIO (15s) — meeting audio published but STT not switched. Meeting audio will be ignored until a session starts.');
1129
- }
1130
- else if (!participantVisible) {
1131
- // RoomIO exists but our publisher hasn't propagated to the agent's
1132
- // room view yet. setParticipant stores the identity and links on
1133
- // participant-connected event, so this is still safe to call —
1134
- // RoomIO will pick up the link when the event arrives.
1135
- preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
1136
- roomIO.setParticipant(identity);
1137
- console.log(`🔁 Switched AgentSession STT input (publisher not yet visible — will link on connect): ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
1138
- }
1139
- else {
1140
- preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
1141
- roomIO.setParticipant(identity);
1142
- console.log(`🔁 Switched AgentSession STT input: ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
1143
- }
1144
- }
1145
- catch (err) {
1146
- console.warn('⚠️ Failed to switch RoomIO participant:', err.message);
1147
- }
1148
- }
1149
- catch (err) {
1150
- console.error('❌ Failed to set up LiveKit publisher for meeting audio:', err instanceof Error ? err.message : err);
1151
- try {
1152
- recallWs.close();
1153
- }
1154
- catch { }
1155
- await cleanup();
1156
- return;
1157
- }
1158
- // Recall → us: JSON events with base64-encoded PCM. Decode, wrap as
1159
- // AudioFrame, and capture into the source. AgentSession in the main room
1160
- // will subscribe to this published track and STT it via the normal pipeline.
1161
- // Payload shape from
1162
- // docs.recall.ai/docs/how-to-get-separate-audio-per-participant-realtime:
1163
- // { event: 'audio_separate_raw.data', data: { data: { buffer: '<base64>', ... }, participant: {...} } }
1164
- recallWs.on('message', async (raw) => {
1165
- if (!source)
1166
- return;
1167
- try {
1168
- const msg = JSON.parse(raw.toString());
1169
- if (msg.event !== 'audio_separate_raw.data')
1170
- return;
1171
- const b64 = msg.data?.data?.buffer;
1172
- if (!b64)
1173
- return;
1174
- const pcmBuf = Buffer.from(b64, 'base64');
1175
- // AudioFrame expects Int16Array. The PCM buffer is S16LE — view it
1176
- // directly without copy. Length / 2 = samples (each sample 2 bytes).
1177
- const samplesPerChannel = pcmBuf.byteLength / 2;
1178
- const int16 = new Int16Array(pcmBuf.buffer, pcmBuf.byteOffset, samplesPerChannel);
1179
- const frame = new AudioFrame(int16, 16000, 1, samplesPerChannel);
1180
- await source.captureFrame(frame);
1181
- }
1182
- catch (err) {
1183
- // Don't log every frame parse failure — could be noisy if Recall sends
1184
- // non-audio_separate_raw events on the same channel.
1185
- if (err.message?.includes('JSON'))
1186
- return;
1187
- console.warn('⚠️ meeting audio capture error:', err instanceof Error ? err.message : err);
1188
- }
1189
- });
1190
- recallWs.on('close', async () => {
1191
- console.log('🎙️ Recall audio-in WebSocket closed — tearing down LiveKit publisher');
1192
- await cleanup();
1193
- });
1194
- recallWs.on('error', (err) => {
1195
- console.warn('⚠️ Recall WS error:', err instanceof Error ? err.message : err);
1196
- });
1197
- });
1198
- server.on('upgrade', (req, socket, head) => {
1199
- const url = new URL(req.url || '/', `http://localhost:${port}`);
1200
- if (url.pathname === '/meeting-audio') {
1201
- meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
1202
- meetingOutputWss.emit('connection', ws, req);
1203
- });
1204
- }
1205
- else if (url.pathname === '/meeting-audio-in') {
1206
- meetingAudioInWss.handleUpgrade(req, socket, head, (ws) => {
1207
- meetingAudioInWss.emit('connection', ws, req);
1208
- });
1209
- }
1210
- else {
1211
- socket.destroy();
1212
- }
861
+ // No WebSocket upgrade routes — meeting audio in/out moved off LiveKit to
862
+ // a polling architecture (see MeetingTranscriptPoller). The /meeting-audio
863
+ // and /meeting-audio-in routes were the old WebSocket-audio pipeline; both
864
+ // are gone. Reject all upgrade attempts.
865
+ server.on('upgrade', (_req, socket) => {
866
+ socket.destroy();
1213
867
  });
1214
868
  server.on('error', (err) => {
1215
869
  if (err.code === 'EADDRINUSE') {
@@ -1432,7 +1086,7 @@ async function main() {
1432
1086
  // session-only path (no user prefix).
1433
1087
  let currentUserId = '';
1434
1088
  let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
1435
- // meetingOutputWs is module-level (see top of file) shared between startApiServer and main()
1089
+ let activeMeetingPoller = null; // Transcript poller bound to that bot
1436
1090
  // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
1437
1091
  // Updated by resume_session, session_selected, continue_session, switch_session handlers
1438
1092
  let currentResumeSessionId;
@@ -1883,6 +1537,40 @@ async function main() {
1883
1537
  }
1884
1538
  }
1885
1539
  }
1540
+ // Compaction event → frontend bridge. Forwards the raw event (consumed by the
1541
+ // dedicated banner UI state machine) AND emits a `claude_output` chat bubble
1542
+ // (so the activity is visible inline in chat even when the banner is hidden,
1543
+ // collapsed, or unreliable on iPad/iPhone). Extracted as a helper because
1544
+ // both direct-mode and pipeline-mode need to register it — the pipeline path
1545
+ // previously skipped this entirely, so compaction events fired into the void
1546
+ // in pipeline mode.
1547
+ const buildOnCompactionEvent = () => (event) => {
1548
+ try {
1549
+ // Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
1550
+ sendToFrontend({ ...event });
1551
+ // Inline chat bubble — reuses the existing claude_output path that's already working.
1552
+ if (event.type === 'compaction_started') {
1553
+ const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
1554
+ sendToFrontend({
1555
+ type: 'claude_output',
1556
+ text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
1557
+ agentRole: 'direct',
1558
+ });
1559
+ }
1560
+ else if (event.type === 'compaction_complete') {
1561
+ const n = event.skillsWritten ?? 0;
1562
+ const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
1563
+ ? ` — ${event.skillNames.join(', ')}`
1564
+ : '';
1565
+ sendToFrontend({
1566
+ type: 'claude_output',
1567
+ text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
1568
+ agentRole: 'direct',
1569
+ });
1570
+ }
1571
+ }
1572
+ catch { /* non-fatal */ }
1573
+ };
1886
1574
  // Create DIRECT session (STT + Claude Agent SDK + TTS)
1887
1575
  async function createDirectSession(resumeSessionId, llmOverride) {
1888
1576
  console.log('🎯 Creating direct session...');
@@ -1898,39 +1586,7 @@ async function main() {
1898
1586
  resumeSessionId,
1899
1587
  voiceMode: 'direct',
1900
1588
  skipTTSQueue: true,
1901
- onCompactionEvent: (event) => {
1902
- try {
1903
- // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
1904
- sendToFrontend({ ...event });
1905
- // ALSO emit as a claude_output chat bubble — reuses the existing message path
1906
- // that's already working end-to-end. PreCompact → in-progress bubble.
1907
- // PostCompact → completion bubble with the skills summary. The dedicated
1908
- // banner has been unreliable in production (data path works on backend, banner
1909
- // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
1910
- // are visible without dev tools.
1911
- if (event.type === 'compaction_started') {
1912
- const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
1913
- sendToFrontend({
1914
- type: 'claude_output',
1915
- text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
1916
- agentRole: 'direct',
1917
- });
1918
- }
1919
- else if (event.type === 'compaction_complete') {
1920
- const ev = event;
1921
- const n = ev.skillsWritten ?? 0;
1922
- const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
1923
- ? ` — ${ev.skillNames.join(', ')}`
1924
- : '';
1925
- sendToFrontend({
1926
- type: 'claude_output',
1927
- text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
1928
- agentRole: 'direct',
1929
- });
1930
- }
1931
- }
1932
- catch { /* non-fatal */ }
1933
- },
1589
+ onCompactionEvent: buildOnCompactionEvent(),
1934
1590
  });
1935
1591
  currentLLM = directLLM;
1936
1592
  // Reset the session always-allow list for each new direct session
@@ -2130,20 +1786,6 @@ async function main() {
2130
1786
  }
2131
1787
  const sayId = Date.now(); // simple ID to correlate start/end logs
2132
1788
  console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
2133
- // Forward spoken text + audio to meeting output page when bot is in a meeting.
2134
- // Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
2135
- // previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
2136
- // (Deepgram aura-2-asteria-en) when no user config exists, producing a different
2137
- // voice in the meeting than what the user hears in voice-native. Both paths now
2138
- // share the single source of truth.
2139
- // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
2140
- // Recall captures the browser page's audio output and injects it into the meeting.
2141
- if (activeMeetingBotId) {
2142
- sendToMeetingOutput({ type: 'speak', text: data.text });
2143
- if (meetingOutputWs) {
2144
- synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
2145
- }
2146
- }
2147
1789
  try {
2148
1790
  const handle = currentSession.say(data.text);
2149
1791
  if (handle && typeof handle.addDoneCallback === 'function') {
@@ -2281,39 +1923,7 @@ async function main() {
2281
1923
  sessionBaseDir,
2282
1924
  mcpServers,
2283
1925
  resumeSessionId,
2284
- onCompactionEvent: (event) => {
2285
- try {
2286
- // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
2287
- sendToFrontend({ ...event });
2288
- // ALSO emit as a claude_output chat bubble — reuses the existing message path
2289
- // that's already working end-to-end. PreCompact → in-progress bubble.
2290
- // PostCompact → completion bubble with the skills summary. The dedicated
2291
- // banner has been unreliable in production (data path works on backend, banner
2292
- // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
2293
- // are visible without dev tools.
2294
- if (event.type === 'compaction_started') {
2295
- const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
2296
- sendToFrontend({
2297
- type: 'claude_output',
2298
- text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
2299
- agentRole: 'direct',
2300
- });
2301
- }
2302
- else if (event.type === 'compaction_complete') {
2303
- const ev = event;
2304
- const n = ev.skillsWritten ?? 0;
2305
- const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
2306
- ? ` — ${ev.skillNames.join(', ')}`
2307
- : '';
2308
- sendToFrontend({
2309
- type: 'claude_output',
2310
- text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
2311
- agentRole: 'direct',
2312
- });
2313
- }
2314
- }
2315
- catch { /* non-fatal */ }
2316
- },
1926
+ onCompactionEvent: buildOnCompactionEvent(),
2317
1927
  });
2318
1928
  currentLLM = realtimeClaudeHandler;
2319
1929
  // For resumed sessions, eagerly create workspace (we know the real ID)
@@ -2856,7 +2466,6 @@ async function main() {
2856
2466
  }
2857
2467
  lastCompletedResearch = null;
2858
2468
  currentSession = null;
2859
- activeAgentSession = null;
2860
2469
  currentAgent = null;
2861
2470
  // Same disconnect-leak fix as the other two cleanup sites — kill the Claude SDK
2862
2471
  // subprocess BEFORE dropping the reference. See killCurrentLLM() for full context.
@@ -2902,7 +2511,6 @@ async function main() {
2902
2511
  }
2903
2512
  catch { }
2904
2513
  currentSession = null;
2905
- activeAgentSession = null;
2906
2514
  currentAgent = null;
2907
2515
  // Same disconnect-leak fix — kill the previous user's Claude subprocess
2908
2516
  // before binding currentLLM to the new user's session below.
@@ -3022,6 +2630,13 @@ async function main() {
3022
2630
  resumeSessionId,
3023
2631
  voiceMode: 'direct',
3024
2632
  skipTTSQueue: true,
2633
+ // PipelineDirectOptions extends ClaudeLLMOptions; passing this through
2634
+ // forwards it into the inner `new ClaudeLLM(opts)`. Without this,
2635
+ // pipeline mode silently drops every PreCompact/PostCompact event
2636
+ // — banner never appears, chat bubble never appears — because
2637
+ // createDirectSession's `createClaudeLLM(...)` call is skipped when
2638
+ // an llmOverride is supplied (which is exactly what pipeline mode does).
2639
+ onCompactionEvent: buildOnCompactionEvent(),
3025
2640
  getChatHistory: () => getChatHistory(20).map(t => ({ role: t.role, content: t.text })),
3026
2641
  getResearchContext: () => {
3027
2642
  if (activeResearch?.researchLog.length) {
@@ -3057,7 +2672,6 @@ async function main() {
3057
2672
  agent = result.agent;
3058
2673
  }
3059
2674
  currentSession = session;
3060
- activeAgentSession = session;
3061
2675
  currentAgent = agent; // Store for updateChatCtx() context injection
3062
2676
  // ============================================================
3063
2677
  // Session event wiring — extracted into function for auto-recovery
@@ -3217,7 +2831,6 @@ async function main() {
3217
2831
  }
3218
2832
  catch { }
3219
2833
  currentSession = null;
3220
- activeAgentSession = null;
3221
2834
  currentAgent = null;
3222
2835
  // Clear stale state from crashed session
3223
2836
  voiceQueue.length = 0;
@@ -3279,7 +2892,6 @@ async function main() {
3279
2892
  const newSession = result.session;
3280
2893
  const newAgent = result.agent;
3281
2894
  currentSession = newSession;
3282
- activeAgentSession = newSession;
3283
2895
  currentAgent = newAgent;
3284
2896
  // Re-wire event listeners on the new session
3285
2897
  wireSessionEvents(newSession, newAgent);
@@ -3336,7 +2948,6 @@ async function main() {
3336
2948
  }
3337
2949
  catch { }
3338
2950
  currentSession = null;
3339
- activeAgentSession = null;
3340
2951
  currentAgent = null;
3341
2952
  // Clear voice queue — stale injections from the crashed session
3342
2953
  voiceQueue.length = 0;
@@ -3360,7 +2971,6 @@ async function main() {
3360
2971
  const newSession = result.session;
3361
2972
  const newAgent = result.agent;
3362
2973
  currentSession = newSession;
3363
- activeAgentSession = newSession;
3364
2974
  currentAgent = newAgent;
3365
2975
  // Re-wire event listeners on the new session
3366
2976
  wireSessionEvents(newSession, newAgent);
@@ -3555,7 +3165,6 @@ async function main() {
3555
3165
  if (currentSession) {
3556
3166
  const sessionToClose = currentSession;
3557
3167
  currentSession = null;
3558
- activeAgentSession = null;
3559
3168
  // Track async close so new connections can wait for byte stream handler to be released
3560
3169
  pendingSessionClose = (async () => {
3561
3170
  try {
@@ -3577,6 +3186,10 @@ async function main() {
3577
3186
  clearFastBrainSession();
3578
3187
  clearPipelineFastBrainSession();
3579
3188
  // Auto-leave any active meeting bot when user disconnects from the room
3189
+ if (activeMeetingPoller) {
3190
+ activeMeetingPoller.stop();
3191
+ activeMeetingPoller = null;
3192
+ }
3580
3193
  if (activeMeetingBotId) {
3581
3194
  const recallDisconnect = getRecallClient();
3582
3195
  if (recallDisconnect) {
@@ -4160,61 +3773,61 @@ async function main() {
4160
3773
  (process.env.FLY_APP_NAME
4161
3774
  ? `https://${process.env.FLY_APP_NAME}.fly.dev`
4162
3775
  : `http://localhost:${apiPort}`);
4163
- // Try to mint a LiveKit bot token + construct the frontend-hosted
4164
- // meeting-bot page URL. The bot page joins the same LiveKit room
4165
- // as this agent so meeting audio flows through LiveKit directly
4166
- // (no agent-side WebSocket+WAV pipe). Falls back to the legacy
4167
- // /meeting-output webpage if no frontend URL is resolvable, so
4168
- // the old code path keeps working during the migration window.
4169
- //
4170
- // Frontend URL resolution (in priority order):
4171
- // 1. data.frontendBase — the public URL the user's browser is on,
4172
- // passed through the join_meeting data channel message. Works
4173
- // automatically for localhost dev + production without any
4174
- // env var.
4175
- // 2. OSBORN_FRONTEND_URL existing convention from sprites.ts
4176
- // (frontend/src/lib/sprites.ts:241) that injects the public
4177
- // frontend URL into sandbox env vars. Defense in depth.
4178
- //
4179
- // Auth: the endpoint uses LiveKit room-presence as the auth check
4180
- // — no shared secret needed. The agent must already be in the
4181
- // requested room (which it is by this point) for the mint to
4182
- // succeed.
4183
- let outputPageUrl;
4184
- const frontendUrl = data.frontendBase
4185
- || process.env.OSBORN_FRONTEND_URL;
4186
- if (frontendUrl) {
3776
+ // Polling architecture (post-2026-05-22): the bot joins by name
3777
+ // only — no output_media webpage, no LiveKit republish, no audio
3778
+ // pipeline at all. Recall captures the meeting audio internally
3779
+ // and we pull the transcript via its REST API every ~30s.
3780
+ await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
3781
+ const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
3782
+ const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
3783
+ recallJoin.registerBot(botId, sessionId);
3784
+ activeMeetingBotId = botId;
3785
+ await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
3786
+ // System injection so the LLM knows it's in a meeting and which
3787
+ // skill to apply. The meetings skill (agent/.claude/skills/meetings/SKILL.md)
3788
+ // teaches the agent: don't speak in response to [MEETING — *]:
3789
+ // messages, keep meeting-todos.md updated in the workspace, etc.
3790
+ if (currentLLM) {
4187
3791
  try {
4188
- const botLkId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
4189
- const tokenRes = await fetch(`${frontendUrl}/api/meeting-bot-token`, {
4190
- method: 'POST',
4191
- headers: { 'Content-Type': 'application/json' },
4192
- body: JSON.stringify({ botId: botLkId, roomName }),
3792
+ const sysCtx = new llm.ChatContext();
3793
+ sysCtx.addMessage({
3794
+ role: 'user',
3795
+ content: `[SYSTEM] You are now in a meeting (Recall bot ID: ${botId}, URL: ${meetingUrl}). Transcript chunks will arrive every ~30 seconds tagged \`[MEETING — ${botId}]:\`. Follow the meetings skill: do NOT speak in response (no TTS output), instead maintain meeting-todos.md in the session workspace, optionally trigger background research silently. The voice-native user can still interact normally — only the meeting-tagged messages are the silent-observer path. Acknowledge by writing the initial meeting-todos.md skeleton.`,
4193
3796
  });
4194
- if (tokenRes.ok) {
4195
- const { token, url } = await tokenRes.json();
4196
- const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
4197
- outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
4198
- console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
4199
- }
4200
- else {
4201
- const errText = await tokenRes.text().catch(() => '');
4202
- console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
4203
- }
3797
+ currentLLM.chat({ chatCtx: sysCtx });
3798
+ console.log('📓 Meeting system injection sent to LLM');
4204
3799
  }
4205
- catch (mintErr) {
4206
- console.warn(`⚠️ meeting-bot-token mint threw — falling back: ${mintErr.message}`);
3800
+ catch (sysErr) {
3801
+ console.warn('⚠️ Meeting system injection failed:', sysErr.message);
4207
3802
  }
4208
3803
  }
4209
- else {
4210
- console.log('ℹ️ No frontend URL (data.frontendBase + OSBORN_FRONTEND_URL both empty) using legacy /meeting-output path');
3804
+ // Start polling the transcript every 30s. Each batch of new turns
3805
+ // is pushed to currentLLM.chat() tagged `[MEETING — <botId>]:` so the
3806
+ // skill kicks in. Poller dedups via first-word timestamp cursor.
3807
+ if (activeMeetingPoller) {
3808
+ activeMeetingPoller.stop();
3809
+ activeMeetingPoller = null;
4211
3810
  }
4212
- await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
4213
- const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase, { outputPageUrl });
4214
- const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
4215
- recallJoin.registerBot(botId, sessionId);
4216
- activeMeetingBotId = botId;
4217
- await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
3811
+ activeMeetingPoller = new MeetingTranscriptPoller({
3812
+ botId,
3813
+ recall: recallJoin,
3814
+ onTurns: async ({ formatted }) => {
3815
+ if (!currentLLM) {
3816
+ console.warn('📓 Meeting transcript arrived but currentLLM is null dropping');
3817
+ return;
3818
+ }
3819
+ const tagged = `[MEETING — ${botId}]:\n${formatted}`;
3820
+ try {
3821
+ const turnCtx = new llm.ChatContext();
3822
+ turnCtx.addMessage({ role: 'user', content: tagged });
3823
+ currentLLM.chat({ chatCtx: turnCtx });
3824
+ }
3825
+ catch (err) {
3826
+ console.warn(`⚠️ Failed to forward meeting transcript to LLM: ${err.message}`);
3827
+ }
3828
+ },
3829
+ });
3830
+ activeMeetingPoller.start();
4218
3831
  }
4219
3832
  catch (err) {
4220
3833
  console.error('❌ Recall.ai join error:', err);
@@ -4228,6 +3841,12 @@ async function main() {
4228
3841
  const recallLeave = getRecallClient();
4229
3842
  if (recallLeave && botId) {
4230
3843
  try {
3844
+ // Stop the transcript poller FIRST so no more transcript chunks get
3845
+ // forwarded to the LLM during the leave.
3846
+ if (activeMeetingPoller) {
3847
+ activeMeetingPoller.stop();
3848
+ activeMeetingPoller = null;
3849
+ }
4231
3850
  await recallLeave.leaveMeeting(botId);
4232
3851
  activeMeetingBotId = null;
4233
3852
  await sendToFrontend({ type: 'meeting_left', botId });
@@ -0,0 +1,60 @@
1
+ /**
2
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
3
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
4
+ * messages.
5
+ *
6
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
7
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
8
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
9
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
10
+ * never speaks in the meeting — it's a silent note-taker.
11
+ *
12
+ * Lifecycle:
13
+ * const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
14
+ * poller.start()
15
+ * ...
16
+ * poller.stop() // on leave_meeting / disconnect / session switch
17
+ *
18
+ * Dedup strategy:
19
+ * Each turn carries a `start_timestamp.relative` on its first word (seconds
20
+ * since recording start). We track the highest cursor we've forwarded and
21
+ * only send turns with a strictly greater first-word timestamp. This means
22
+ * re-fetches don't double-deliver, and partial transcripts that get refined
23
+ * later don't re-trigger LLM processing of already-handled turns.
24
+ *
25
+ * Error handling:
26
+ * Transient fetch errors are logged + skipped (poll continues on next tick).
27
+ * No backoff — Recall's transcript endpoint is stable enough that a 30s
28
+ * cadence makes "slow start" non-issues self-recover within one cycle.
29
+ */
30
+ import type { RecallClient, TranscriptTurn } from './recall-client.js';
31
+ export interface MeetingTranscriptPollerOptions {
32
+ botId: string;
33
+ recall: RecallClient;
34
+ /** Called when new transcript turns arrive (de-duped). Get a fresh batch each tick. */
35
+ onTurns: (chunk: {
36
+ botId: string;
37
+ turns: TranscriptTurn[];
38
+ formatted: string;
39
+ }) => void | Promise<void>;
40
+ /** Default 30s — matches the user's stated cadence. */
41
+ intervalMs?: number;
42
+ /** Optional debug logger. */
43
+ onError?: (err: Error) => void;
44
+ }
45
+ export declare class MeetingTranscriptPoller {
46
+ #private;
47
+ constructor(opts: MeetingTranscriptPollerOptions);
48
+ start(): void;
49
+ stop(): void;
50
+ }
51
+ /**
52
+ * Format an array of turns into a single string for LLM consumption.
53
+ *
54
+ * Each turn becomes:
55
+ * <Speaker>: <text>
56
+ *
57
+ * Whitespace-only words and zero-content turns are dropped. Returns empty
58
+ * string if nothing meaningful is in the batch.
59
+ */
60
+ export declare function formatTurns(turns: TranscriptTurn[]): string;
@@ -0,0 +1,112 @@
1
+ /**
2
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
3
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
4
+ * messages.
5
+ *
6
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
7
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
8
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
9
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
10
+ * never speaks in the meeting — it's a silent note-taker.
11
+ *
12
+ * Lifecycle:
13
+ * const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
14
+ * poller.start()
15
+ * ...
16
+ * poller.stop() // on leave_meeting / disconnect / session switch
17
+ *
18
+ * Dedup strategy:
19
+ * Each turn carries a `start_timestamp.relative` on its first word (seconds
20
+ * since recording start). We track the highest cursor we've forwarded and
21
+ * only send turns with a strictly greater first-word timestamp. This means
22
+ * re-fetches don't double-deliver, and partial transcripts that get refined
23
+ * later don't re-trigger LLM processing of already-handled turns.
24
+ *
25
+ * Error handling:
26
+ * Transient fetch errors are logged + skipped (poll continues on next tick).
27
+ * No backoff — Recall's transcript endpoint is stable enough that a 30s
28
+ * cadence makes "slow start" non-issues self-recover within one cycle.
29
+ */
30
export class MeetingTranscriptPoller {
    // Options supplied by the caller: { botId, recall, onTurns, intervalMs?, onError? }.
    #opts;
    // Handle returned by setInterval while the poller is running, otherwise null.
    #timer = null;
    // Highest first-word start_timestamp.relative that has been successfully delivered.
    #cursor = -Infinity;
    // True while a tick is awaiting the fetch or the onTurns callback; blocks overlapping polls.
    #inFlight = false;
    // True between stop() and the next start(); ticks are no-ops while set.
    #stopped = false;
    /**
     * @param {object} opts
     * @param {string} opts.botId - Recall bot whose transcript is polled.
     * @param {{ getTranscript(botId: string): Promise<Array> }} opts.recall - Transcript source.
     * @param {(chunk: { botId: string, turns: Array, formatted: string }) => (void|Promise<void>)} opts.onTurns
     *   Receives each batch of not-yet-delivered turns.
     * @param {number} [opts.intervalMs=30000] - Poll period in milliseconds.
     * @param {(err: Error) => void} [opts.onError] - Optional hook invoked on a failed tick.
     */
    constructor(opts) {
        this.#opts = opts;
    }
    /**
     * Begin polling: one immediate tick, then one every `intervalMs`.
     * No-op while already running. Calling it after stop() resumes polling from
     * the existing cursor (previously this armed an interval whose ticks never
     * ran and which stop() could no longer clear).
     */
    start() {
        if (this.#timer)
            return;
        this.#stopped = false;
        const interval = this.#opts.intervalMs ?? 30_000;
        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
        // Fire once immediately so the LLM sees the meeting started, then on interval.
        void this.#tick();
        this.#timer = setInterval(() => void this.#tick(), interval);
    }
    /**
     * Stop polling. Idempotent. A fetch that is already in flight is allowed to
     * finish, but its result is discarded instead of being forwarded.
     */
    stop() {
        if (this.#stopped)
            return;
        this.#stopped = true;
        if (this.#timer) {
            clearInterval(this.#timer);
            this.#timer = null;
        }
        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
    }
    /**
     * One poll cycle: fetch the full transcript, keep the turns whose first-word
     * timestamp is beyond the cursor, forward them, then advance the cursor.
     * Never rejects — every failure is logged and reported via onError.
     */
    async #tick() {
        if (this.#inFlight || this.#stopped)
            return;
        this.#inFlight = true;
        try {
            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
            // stop() may have been called while the request was pending; do not
            // forward anything once the owner has asked us to stop.
            if (this.#stopped)
                return;
            const startOf = (t) => t?.words?.[0]?.start_timestamp?.relative;
            const fresh = all.filter((t) => {
                const ts = startOf(t);
                return typeof ts === 'number' && ts > this.#cursor;
            });
            if (fresh.length === 0)
                return;
            // Every turn beyond the cursor is in `fresh`, so its maximum is the
            // highest timestamp seen in this response.
            let nextCursor = this.#cursor;
            for (const t of fresh) {
                const ts = startOf(t);
                if (ts > nextCursor)
                    nextCursor = ts;
            }
            const formatted = formatTurns(fresh);
            if (formatted) {
                console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${nextCursor.toFixed(1)}s, chars=${formatted.length}`);
                await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
            }
            // Commit only after delivery succeeded (or the batch was pure
            // whitespace). If onTurns throws, the same turns are retried on the
            // next tick instead of being silently lost.
            this.#cursor = nextCursor;
        }
        catch (err) {
            const e = err instanceof Error ? err : new Error(String(err));
            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
            try {
                this.#opts.onError?.(e);
            }
            catch (hookErr) {
                // A faulty hook must not surface as an unhandled rejection from the
                // fire-and-forget `void this.#tick()` calls.
                console.warn(`⚠️ MeetingTranscriptPoller onError hook threw: ${hookErr instanceof Error ? hookErr.message : String(hookErr)}`);
            }
        }
        finally {
            this.#inFlight = false;
        }
    }
}
/**
 * Format an array of turns into a single string for LLM consumption.
 *
 * Each turn becomes:
 *   <Speaker>: <text>
 *
 * The speaker label is `turn.speaker`, else `turn.participant.name`, else
 * 'Unknown'. Whitespace-only words and zero-content turns are dropped, and
 * null/undefined turn or word entries are ignored. Returns an empty string if
 * nothing meaningful is in the batch.
 *
 * @param {Array} turns - Transcript turns (each with an optional `words` array).
 * @returns {string} Newline-joined "<Speaker>: <text>" lines.
 */
export function formatTurns(turns) {
    const lines = [];
    for (const t of turns ?? []) {
        if (!t)
            continue;
        const speaker = t.speaker || t.participant?.name || 'Unknown';
        const text = (t.words ?? []).map((w) => w?.text ?? '').join(' ').replace(/\s+/g, ' ').trim();
        if (!text)
            continue;
        lines.push(`${speaker}: ${text}`);
    }
    return lines.join('\n');
}
@@ -4,6 +4,36 @@ export interface RecallBot {
4
4
  meeting_url: string;
5
5
  status: string;
6
6
  }
7
+ /**
8
+ * One transcript turn = one speaker's continuous utterance.
9
+ * Shape returned by GET /api/v1/bot/{bot_id}/transcript.
10
+ *
11
+ * Per Recall docs each turn contains:
12
+ * - speaker: participant name (or 'Unknown')
13
+ * - words: array of { text, start_timestamp.relative, end_timestamp.relative }
14
+ * - The `start_timestamp.relative` (seconds since recording start) on the
15
+ * FIRST word is the turn's start; we use this as the dedup cursor.
16
+ */
17
+ export interface TranscriptTurn {
18
+ speaker?: string;
19
+ participant?: {
20
+ id?: number;
21
+ name?: string;
22
+ is_host?: boolean;
23
+ };
24
+ words: Array<{
25
+ text: string;
26
+ start_timestamp?: {
27
+ relative?: number;
28
+ absolute?: string;
29
+ };
30
+ end_timestamp?: {
31
+ relative?: number;
32
+ absolute?: string;
33
+ };
34
+ }>;
35
+ language?: string;
36
+ }
7
37
  export interface TranscriptPayload {
8
38
  event: string;
9
39
  data: {
@@ -49,10 +79,27 @@ export declare class RecallClient extends EventEmitter {
49
79
  * room as the osborn agent (no separate WebSocket+WAV pipe).
50
80
  * @param opts.botName Display name of the bot in the meeting
51
81
  */
52
- joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
53
- outputPageUrl?: string;
82
+ joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
54
83
  botName?: string;
55
84
  }): Promise<string>;
85
+ /**
86
+ * Fetch the bot's current transcript. Returns an array of "transcript turns"
87
+ * (each turn = one speaker's utterance) sorted by start time. Use the bot's
88
+ * `recordings[0].id` from getBotStatus / bot record to locate the recording,
89
+ * then list its transcripts.
90
+ *
91
+ * Per Recall docs:
92
+ * GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
93
+ * GET /api/v1/transcript/{transcript_id} → transcript with download_url
94
+ * Download the transcript JSON from download_url to get the actual content.
95
+ *
96
+ * For the polling use case (called every ~30s), we use the simpler combined
97
+ * endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
98
+ * convenience and returns the full transcript so far in one call. The caller
99
+ * is responsible for de-duping (keeping a since-cursor) so the LLM only sees
100
+ * new turns.
101
+ */
102
+ getTranscript(botId: string): Promise<TranscriptTurn[]>;
56
103
  leaveMeeting(botId: string): Promise<void>;
57
104
  getBotStatus(botId: string): Promise<string>;
58
105
  handleWebhook(payload: TranscriptPayload): void;
@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
21
21
  * room as the osborn agent (no separate WebSocket+WAV pipe).
22
22
  * @param opts.botName Display name of the bot in the meeting
23
23
  */
24
- async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
24
+ async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
25
25
  const botName = opts?.botName ?? 'Osborn';
26
- const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
27
- // Authoritative structure per https://docs.recall.ai/reference/bot_create
28
- // and https://docs.recall.ai/docs/real-time-transcription:
26
+ // ARCHITECTURE (post-2026-05-22 polling redesign):
27
+ // The bot joins by name only — visible in the meeting participant list as
28
+ // "Osborn" but with no audio output and no avatar. We do NOT configure any
29
+ // `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
30
+ // the agent polls Recall's REST transcript API every ~30s
31
+ // (see MeetingTranscriptPoller) and feeds new turns into the LLM as
32
+ // `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
33
+ // LLM not to respond out loud to those messages, only to take notes.
29
34
  //
30
- // recording_config.transcript.provider transcription provider config
31
- // recording_config.realtime_endpoints webhook/websocket delivery
32
- //
33
- // IMPORTANT:
34
- // - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
35
- // - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
36
- // - `transcription_options` does NOT exist — use `transcript.provider`
37
- // - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
38
- //
39
- // ARCHITECTURE (post-2026-05-22 redesign):
40
- // Input (meeting → osborn): Recall's documented WebSocket audio protocol.
41
- // `audio_separate_raw` config + websocket realtime endpoint streams
42
- // per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
43
- // /meeting-audio-in WS handler. Bot's own audio is excluded by default
44
- // → zero possibility of feedback loop, no echo cancellation needed.
45
- // Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
46
- // page subscribes to osborn's LiveKit audio track and plays it via
47
- // track.attach(); Recall captures the page's audio output and injects
48
- // into the meeting.
49
- // Webhook transcripts (transcript.data): retained as a SECONDARY signal —
50
- // the agent index.ts handler for this event currently logs but does NOT
51
- // forward to the LLM (intentionally disabled). The Deepgram WS path
52
- // above is the LLM input.
53
- const httpBase = webhookBaseUrl.replace(/\/$/, '');
54
- const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
35
+ // We DO keep `recording_config.transcript.provider.recallai_streaming` so
36
+ // Recall actually transcribes the meeting — the REST endpoint we poll
37
+ // requires this to be configured, otherwise transcripts are empty.
55
38
  const res = await fetch(`${RECALL_BASE_URL}/bot`, {
56
39
  method: 'POST',
57
40
  headers: {
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
64
47
  recording_config: {
65
48
  transcript: {
66
49
  provider: {
67
- // recallai_streaming is built-in — no external API key needed,
68
- // low-latency, works across all meeting platforms.
69
- // Kept for the secondary webhook signal (display / future use);
70
- // LLM input now comes from the Deepgram WS pipe below.
71
50
  recallai_streaming: {
72
51
  mode: 'prioritize_low_latency',
73
52
  language_code: 'en',
74
53
  },
75
54
  },
76
55
  },
77
- // Per-participant raw PCM audio stream. Bot's own audio is excluded
78
- // (we don't set include_bot_in_recording.audio:true).
79
- audio_separate_raw: {},
80
- realtime_endpoints: [
81
- {
82
- // Transcript webhook (secondary signal; LLM forwarding disabled).
83
- type: 'webhook',
84
- url: `${httpBase}/webhook/recall`,
85
- events: ['transcript.data'],
86
- },
87
- {
88
- // Per-participant PCM audio → agent's Deepgram STT pipe.
89
- type: 'websocket',
90
- url: `${wsBase}/meeting-audio-in`,
91
- events: ['audio_separate_raw.data'],
92
- },
93
- ],
94
- },
95
- output_media: {
96
- camera: {
97
- // `kind` (not `type`) — confirmed from prior debugging.
98
- // The page Recall renders connects to LiveKit and plays osborn's
99
- // TTS audio via track.attach(); Recall captures the page audio.
100
- // The page does NOT call getUserMedia anymore — input now comes
101
- // from the audio_separate_raw WebSocket above.
102
- kind: 'webpage',
103
- config: {
104
- url: outputPageUrl,
105
- },
106
- },
107
56
  },
108
57
  }),
109
58
  });
@@ -112,9 +61,37 @@ export class RecallClient extends EventEmitter {
112
61
  throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
113
62
  }
114
63
  const bot = (await res.json());
115
- console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
64
+ console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
116
65
  return bot.id;
117
66
  }
67
+ /**
68
+ * Fetch the bot's current transcript. Returns an array of "transcript turns"
69
+ * (each turn = one speaker's utterance) sorted by start time. Use the bot's
70
+ * `recordings[0].id` from getBotStatus / bot record to locate the recording,
71
+ * then list its transcripts.
72
+ *
73
+ * Per Recall docs:
74
+ * GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
75
+ * GET /api/v1/transcript/{transcript_id} → transcript with download_url
76
+ * Download the transcript JSON from download_url to get the actual content.
77
+ *
78
+ * For the polling use case (called every ~30s), we use the simpler combined
79
+ * endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
80
+ * convenience and returns the full transcript so far in one call. The caller
81
+ * is responsible for de-duping (keeping a since-cursor) so the LLM only sees
82
+ * new turns.
83
+ */
84
+ async getTranscript(botId) {
85
+ const res = await fetch(`${RECALL_BASE_URL}/bot/${botId}/transcript`, {
86
+ headers: { 'Authorization': `Token ${this.#apiKey}` },
87
+ });
88
+ if (!res.ok) {
89
+ const err = await res.text().catch(() => '');
90
+ throw new Error(`Recall.ai transcript fetch failed: ${res.status} ${err.substring(0, 200)}`);
91
+ }
92
+ const turns = await res.json();
93
+ return Array.isArray(turns) ? turns : [];
94
+ }
118
95
  async leaveMeeting(botId) {
119
96
  await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
120
97
  method: 'POST',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.41",
3
+ "version": "0.9.43",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -11,7 +11,7 @@
11
11
  "dev:logged": "tsx scripts/dev-logged.ts",
12
12
  "review": "tsx scripts/review.ts",
13
13
  "start": "tsx src/index.ts",
14
- "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts && cp src/meeting-output.html dist/",
14
+ "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
15
15
  "room": "tsx src/index.ts --room",
16
16
  "prepublishOnly": "npm run build"
17
17
  },