osborn 0.9.42 → 0.9.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/meetings/SKILL.md +73 -0
- package/dist/index.js +111 -527
- package/dist/meeting-transcript-poller.d.ts +60 -0
- package/dist/meeting-transcript-poller.js +112 -0
- package/dist/recall-client.d.ts +49 -2
- package/dist/recall-client.js +41 -64
- package/package.json +2 -2
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Skill: Meetings
|
|
2
|
+
|
|
3
|
+
Silent note-taking and TODO-tracking when osborn is sitting in a live meeting.
|
|
4
|
+
|
|
5
|
+
## When to use
|
|
6
|
+
|
|
7
|
+
When a user message arrives with the prefix `[MEETING — <botId>]:` (every ~30 seconds while a Recall.ai meeting bot is active). Also use this skill when the orchestrating system injects `[SYSTEM] You are now in a meeting ...`.
|
|
8
|
+
|
|
9
|
+
**Do NOT use this skill** for normal user messages in the voice-native chat — those still get spoken responses as usual.
|
|
10
|
+
|
|
11
|
+
## How to behave
|
|
12
|
+
|
|
13
|
+
For every `[MEETING — *]:` message:
|
|
14
|
+
|
|
15
|
+
1. **Do NOT speak.** No TTS output. No `tts_say`. No conversational reply. This is a silent observer path. If you must acknowledge that you processed the message, do it via a Write/Edit tool call (writing to the workspace), not via spoken or chat output.
|
|
16
|
+
2. **Update `meeting-todos.md`** in the session workspace (`{workspace}/meeting-todos.md`). Append new action items, decisions, and open questions as they emerge in the transcript. Do not rewrite existing entries unless a later part of the transcript contradicts them.
|
|
17
|
+
3. **Optionally trigger background research silently.** If a topic in the meeting would benefit from a quick web/code lookup, dispatch a researcher sub-agent via the Task tool. Save its output to `{workspace}/library/meeting-research-<topic-slug>.md`. Do NOT speak the result.
|
|
18
|
+
4. **Do not consume voice-native attention.** The user can still talk to you via the voice-native browser. When they do (a normal user message with no `[MEETING — *]` prefix), respond normally — speak. Treat the meeting transcript as background context they can ask about ("what did Sarah say about pricing?" → answer normally).
|
|
19
|
+
|
|
20
|
+
## The `meeting-todos.md` file
|
|
21
|
+
|
|
22
|
+
Keep it scannable. Structure:
|
|
23
|
+
|
|
24
|
+
```markdown
|
|
25
|
+
# Meeting Notes
|
|
26
|
+
|
|
27
|
+
**Bot:** <botId> · **Started:** <ISO timestamp>
|
|
28
|
+
|
|
29
|
+
## TODOs
|
|
30
|
+
|
|
31
|
+
- [ ] <person>: <action item> — <context>
|
|
32
|
+
- [ ] <person>: <action item>
|
|
33
|
+
|
|
34
|
+
## Decisions
|
|
35
|
+
|
|
36
|
+
- <date/time> — <what was decided> (raised by <person>)
|
|
37
|
+
|
|
38
|
+
## Open Questions
|
|
39
|
+
|
|
40
|
+
- <question> — raised by <person>, still unresolved
|
|
41
|
+
- <question> — answered by <person>: <answer>
|
|
42
|
+
|
|
43
|
+
## Highlights
|
|
44
|
+
|
|
45
|
+
- <key moment or quote worth surfacing>
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Update the same file across multiple poll cycles — don't create `meeting-todos-1.md`, `meeting-todos-2.md`. One file, evolving.
|
|
49
|
+
|
|
50
|
+
## Workspace path
|
|
51
|
+
|
|
52
|
+
The session workspace is `~/.claude/projects/<slug>/osb/<session-uuid>/`. Read the env variable or the spec.md header if you need to confirm the exact path. Write absolute paths in tool calls (e.g. `/Users/<user>/.claude/projects/.../osb/<uuid>/meeting-todos.md`).
|
|
53
|
+
|
|
54
|
+
## On meeting end
|
|
55
|
+
|
|
56
|
+
When the user leaves the meeting (the system stops sending `[MEETING — *]:` messages and may inject `[SYSTEM] meeting ended`), do a final pass on `meeting-todos.md` to:
|
|
57
|
+
- Mark items the user has clearly committed to
|
|
58
|
+
- Move resolved open questions to a `## Resolved` section
|
|
59
|
+
- Add a `## Summary` section at the top with 3-5 lines distilling the meeting
|
|
60
|
+
|
|
61
|
+
Stay silent during this final pass as well. The user will ask out loud if they want a recap.
|
|
62
|
+
|
|
63
|
+
## When the user asks about the meeting
|
|
64
|
+
|
|
65
|
+
When a non-meeting-tagged message references the meeting ("what's on the todo list?", "what did we decide about X?", "who's handling Y?"), respond normally — speak. Read `meeting-todos.md` first to ground the response. Don't make up speaker names or decisions; only state what's recorded.
|
|
66
|
+
|
|
67
|
+
## Anti-patterns
|
|
68
|
+
|
|
69
|
+
- ❌ Speaking in response to a `[MEETING — *]:` message
|
|
70
|
+
- ❌ Creating a new file per poll cycle instead of updating one
|
|
71
|
+
- ❌ Trying to drive the meeting (don't add "we should..." items unless someone in the meeting said them)
|
|
72
|
+
- ❌ Asking the user clarifying questions during the meeting — they're not paying attention to chat
|
|
73
|
+
- ❌ Re-transcribing what's in the message into the TODO file verbatim. Distill.
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent,
|
|
4
|
+
import { Room, RoomEvent, } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -10,7 +10,6 @@ initializeLogger({ pretty: true, level: 'info' });
|
|
|
10
10
|
import { setMaxListeners } from 'node:events';
|
|
11
11
|
setMaxListeners(50);
|
|
12
12
|
import { createServer } from 'http';
|
|
13
|
-
import { WebSocket, WebSocketServer } from 'ws';
|
|
14
13
|
import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
|
|
15
14
|
import { dirname, join } from 'node:path';
|
|
16
15
|
import { fileURLToPath } from 'node:url';
|
|
@@ -34,6 +33,7 @@ import { askHaiku, askFastBrain, updateSpecFromJSONL, processResearchCompletion,
|
|
|
34
33
|
import { DIRECT_MODE_PROMPT, getRealtimeInstructions, getScriptInjection, getProactiveInjection, getNotificationInjection } from './prompts.js';
|
|
35
34
|
import { MCP_CATALOG } from './config.js';
|
|
36
35
|
import { getRecallClient } from './recall-client.js';
|
|
36
|
+
import { MeetingTranscriptPoller } from './meeting-transcript-poller.js';
|
|
37
37
|
import { llm } from '@livekit/agents';
|
|
38
38
|
import { z } from 'zod';
|
|
39
39
|
// ============================================================
|
|
@@ -147,79 +147,6 @@ process.on('uncaughtException', (error) => {
|
|
|
147
147
|
// ============================================================
|
|
148
148
|
// Module-level room code so the HTTP server can expose it via GET /room-code
|
|
149
149
|
let currentRoomCode = null;
|
|
150
|
-
// Meeting output WebSocket — module-level so both startApiServer and main() can access it
|
|
151
|
-
let meetingOutputWs = null;
|
|
152
|
-
// Module-level AgentSession reference so /meeting-audio-in WS handler can switch
|
|
153
|
-
// the RoomIO-linked participant when meeting audio starts/stops (B2 design).
|
|
154
|
-
let activeAgentSession = null;
|
|
155
|
-
// Identity of the local user participant the session was originally listening to
|
|
156
|
-
// — captured at the moment we switch to the meeting publisher, restored on cleanup.
|
|
157
|
-
let preMeetingUserIdentity = null;
|
|
158
|
-
function sendToMeetingOutput(msg) {
|
|
159
|
-
if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
|
|
160
|
-
try {
|
|
161
|
-
meetingOutputWs.send(JSON.stringify(msg));
|
|
162
|
-
}
|
|
163
|
-
catch { }
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
// Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
|
|
167
|
-
// Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
|
|
168
|
-
async function synthesizeForMeeting(text, ttsConfig) {
|
|
169
|
-
if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN)
|
|
170
|
-
return;
|
|
171
|
-
const ttsInstance = createTTS(ttsConfig);
|
|
172
|
-
try {
|
|
173
|
-
const chunks = [];
|
|
174
|
-
let sampleRate = 24000;
|
|
175
|
-
let numChannels = 1;
|
|
176
|
-
const stream = ttsInstance.synthesize(text);
|
|
177
|
-
for await (const event of stream) {
|
|
178
|
-
if (event === Symbol.for('END_OF_STREAM'))
|
|
179
|
-
break;
|
|
180
|
-
const e = event;
|
|
181
|
-
if (e?.frame?.data) {
|
|
182
|
-
chunks.push(e.frame.data);
|
|
183
|
-
sampleRate = e.frame.sampleRate ?? sampleRate;
|
|
184
|
-
numChannels = e.frame.numChannels ?? numChannels;
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
if (chunks.length === 0)
|
|
188
|
-
return;
|
|
189
|
-
const totalSamples = chunks.reduce((s, c) => s + c.length, 0);
|
|
190
|
-
const pcm = new Int16Array(totalSamples);
|
|
191
|
-
let offset = 0;
|
|
192
|
-
for (const c of chunks) {
|
|
193
|
-
pcm.set(c, offset);
|
|
194
|
-
offset += c.length;
|
|
195
|
-
}
|
|
196
|
-
// WAV header (44 bytes) + PCM data
|
|
197
|
-
const dataBytes = pcm.length * 2;
|
|
198
|
-
const wav = Buffer.alloc(44 + dataBytes);
|
|
199
|
-
wav.write('RIFF', 0);
|
|
200
|
-
wav.writeUInt32LE(36 + dataBytes, 4);
|
|
201
|
-
wav.write('WAVE', 8);
|
|
202
|
-
wav.write('fmt ', 12);
|
|
203
|
-
wav.writeUInt32LE(16, 16);
|
|
204
|
-
wav.writeUInt16LE(1, 20);
|
|
205
|
-
wav.writeUInt16LE(numChannels, 22);
|
|
206
|
-
wav.writeUInt32LE(sampleRate, 24);
|
|
207
|
-
wav.writeUInt32LE(sampleRate * numChannels * 2, 28);
|
|
208
|
-
wav.writeUInt16LE(numChannels * 2, 32);
|
|
209
|
-
wav.writeUInt16LE(16, 34);
|
|
210
|
-
wav.write('data', 36);
|
|
211
|
-
wav.writeUInt32LE(dataBytes, 40);
|
|
212
|
-
for (let i = 0; i < pcm.length; i++)
|
|
213
|
-
wav.writeInt16LE(pcm[i], 44 + i * 2);
|
|
214
|
-
if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
|
|
215
|
-
meetingOutputWs.send(wav);
|
|
216
|
-
console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
finally {
|
|
220
|
-
await ttsInstance.close().catch(() => { });
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
150
|
function startApiServer(workingDir, port) {
|
|
224
151
|
const server = createServer(async (req, res) => {
|
|
225
152
|
// CORS headers for cloud frontend
|
|
@@ -317,40 +244,6 @@ function startApiServer(workingDir, port) {
|
|
|
317
244
|
});
|
|
318
245
|
return;
|
|
319
246
|
}
|
|
320
|
-
// GET /meeting-output — Output Media webpage for Recall.ai bot audio.
|
|
321
|
-
//
|
|
322
|
-
// The file lives next to this compiled JS (copied by the build script from
|
|
323
|
-
// src/ to dist/). Resolve via __dirname rather than process.cwd() — in
|
|
324
|
-
// production cwd is the user's workspace, NOT the osborn package directory.
|
|
325
|
-
if (req.method === 'GET' && url.pathname === '/meeting-output') {
|
|
326
|
-
// Try the package-relative path first (post-build location), then fall
|
|
327
|
-
// back to source path for `tsx src/index.ts` dev runs.
|
|
328
|
-
const candidates = [
|
|
329
|
-
join(__dirname, 'meeting-output.html'), // dist/ (production)
|
|
330
|
-
join(__dirname, '..', 'src', 'meeting-output.html'), // dev: dist/ → src/
|
|
331
|
-
join(__dirname, '..', 'meeting-output.html'), // tsx run from src/
|
|
332
|
-
];
|
|
333
|
-
let html = null;
|
|
334
|
-
let foundPath = null;
|
|
335
|
-
for (const p of candidates) {
|
|
336
|
-
try {
|
|
337
|
-
html = readFileSync(p, 'utf-8');
|
|
338
|
-
foundPath = p;
|
|
339
|
-
break;
|
|
340
|
-
}
|
|
341
|
-
catch { }
|
|
342
|
-
}
|
|
343
|
-
if (html) {
|
|
344
|
-
res.writeHead(200, { 'Content-Type': 'text/html' });
|
|
345
|
-
res.end(html);
|
|
346
|
-
}
|
|
347
|
-
else {
|
|
348
|
-
console.warn(`[meeting-output] not found in any of: ${candidates.join(', ')}`);
|
|
349
|
-
res.writeHead(404, { 'Content-Type': 'text/plain' });
|
|
350
|
-
res.end('meeting-output.html not found');
|
|
351
|
-
}
|
|
352
|
-
return;
|
|
353
|
-
}
|
|
354
247
|
if (req.method === 'GET' && url.pathname === '/room-code') {
|
|
355
248
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
356
249
|
res.end(JSON.stringify({ roomCode: currentRoomCode }));
|
|
@@ -965,286 +858,12 @@ function startApiServer(workingDir, port) {
|
|
|
965
858
|
};
|
|
966
859
|
cleanStaleUploadDirs();
|
|
967
860
|
setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
|
|
968
|
-
//
|
|
969
|
-
//
|
|
970
|
-
//
|
|
971
|
-
//
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
// backwards-compat with old machine images still serving the legacy path.
|
|
975
|
-
const meetingOutputWss = new WebSocketServer({ noServer: true });
|
|
976
|
-
meetingOutputWss.on('connection', (ws) => {
|
|
977
|
-
console.log('📺 Meeting output browser connected (legacy /meeting-audio)');
|
|
978
|
-
meetingOutputWs = ws;
|
|
979
|
-
ws.on('close', () => {
|
|
980
|
-
console.log('📺 Meeting output browser disconnected (legacy)');
|
|
981
|
-
if (meetingOutputWs === ws)
|
|
982
|
-
meetingOutputWs = null;
|
|
983
|
-
});
|
|
984
|
-
});
|
|
985
|
-
// ============================================================
|
|
986
|
-
// Recall.ai meeting-audio-in WebSocket — /meeting-audio-in
|
|
987
|
-
// ============================================================
|
|
988
|
-
// Recall.ai's per-participant real-time audio protocol. Bot is configured
|
|
989
|
-
// (in recall-client.ts joinMeeting) with audio_separate_raw + a realtime
|
|
990
|
-
// endpoint pointing at this URL. Recall sends JSON events containing
|
|
991
|
-
// base64-encoded PCM (S16LE, 16kHz, mono) for every meeting participant
|
|
992
|
-
// (bot's own audio NOT included by default — no feedback loop possible).
|
|
993
|
-
//
|
|
994
|
-
// Flow: Recall → /meeting-audio-in → open a SECOND LiveKit connection from
|
|
995
|
-
// this agent process as a publisher participant → publish PCM as an
|
|
996
|
-
// audio track in the same LiveKit room → the existing AgentSession's
|
|
997
|
-
// STT subscribes to it as a remote track → routes to currentLLM.chat()
|
|
998
|
-
// via the same pipeline as voice-native user mic.
|
|
999
|
-
//
|
|
1000
|
-
// The advantage of this design vs a parallel STT pipeline: meeting audio
|
|
1001
|
-
// becomes "just another participant" in the LiveKit room — same end-of-turn
|
|
1002
|
-
// detection, same interrupt handling, same conversation context, no parallel
|
|
1003
|
-
// chat() paths to maintain.
|
|
1004
|
-
//
|
|
1005
|
-
// Wait until activeAgentSession._roomIO exists AND the publisher participant
|
|
1006
|
-
// is visible to the agent's room. Both can race against join_meeting:
|
|
1007
|
-
// - Agent session may still be starting up when Recall connects.
|
|
1008
|
-
// - LiveKit takes a moment to propagate the publisher's join to the agent
|
|
1009
|
-
// side after publishTrack() returns on our side.
|
|
1010
|
-
// Bounded poll (200ms cadence) avoids both timing gaps.
|
|
1011
|
-
async function waitForRoomIOAndParticipant(publisherIdentity, timeoutMs) {
|
|
1012
|
-
const deadline = Date.now() + timeoutMs;
|
|
1013
|
-
let roomIO = null;
|
|
1014
|
-
let participantVisible = false;
|
|
1015
|
-
while (Date.now() < deadline) {
|
|
1016
|
-
roomIO = activeAgentSession?._roomIO;
|
|
1017
|
-
if (roomIO && typeof roomIO.setParticipant === 'function') {
|
|
1018
|
-
const agentRoom = roomIO.rtcRoom;
|
|
1019
|
-
const remotes = agentRoom?.remoteParticipants;
|
|
1020
|
-
if (remotes && typeof remotes.values === 'function') {
|
|
1021
|
-
for (const p of remotes.values()) {
|
|
1022
|
-
if (p?.identity === publisherIdentity) {
|
|
1023
|
-
participantVisible = true;
|
|
1024
|
-
break;
|
|
1025
|
-
}
|
|
1026
|
-
}
|
|
1027
|
-
}
|
|
1028
|
-
if (participantVisible)
|
|
1029
|
-
return { roomIO, participantVisible };
|
|
1030
|
-
}
|
|
1031
|
-
await new Promise(r => setTimeout(r, 200));
|
|
1032
|
-
}
|
|
1033
|
-
// Timed out — return whatever we have. Caller decides whether to proceed.
|
|
1034
|
-
return { roomIO, participantVisible };
|
|
1035
|
-
}
|
|
1036
|
-
const meetingAudioInWss = new WebSocketServer({ noServer: true });
|
|
1037
|
-
meetingAudioInWss.on('connection', async (recallWs) => {
|
|
1038
|
-
console.log('🎙️ Recall audio-in WebSocket connected — setting up LiveKit publisher');
|
|
1039
|
-
const livekitUrl = process.env.LIVEKIT_URL;
|
|
1040
|
-
const apiKey = process.env.LIVEKIT_API_KEY;
|
|
1041
|
-
const apiSecret = process.env.LIVEKIT_API_SECRET;
|
|
1042
|
-
if (!livekitUrl || !apiKey || !apiSecret) {
|
|
1043
|
-
console.warn('⚠️ LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET not set — meeting audio publisher disabled');
|
|
1044
|
-
recallWs.close();
|
|
1045
|
-
return;
|
|
1046
|
-
}
|
|
1047
|
-
if (!currentRoomCode) {
|
|
1048
|
-
console.warn('⚠️ No active LiveKit room (currentRoomCode null) — meeting audio publisher cannot attach');
|
|
1049
|
-
recallWs.close();
|
|
1050
|
-
return;
|
|
1051
|
-
}
|
|
1052
|
-
const roomName = `osborn-${currentRoomCode}`;
|
|
1053
|
-
// Mint a publisher token via livekit-server-sdk (already imported for
|
|
1054
|
-
// /api/token style flows). Long TTL — meetings can run for hours.
|
|
1055
|
-
const identity = `meeting-audio-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
|
1056
|
-
const at = new AccessToken(apiKey, apiSecret, {
|
|
1057
|
-
identity,
|
|
1058
|
-
ttl: 14400, // 4 hours
|
|
1059
|
-
metadata: JSON.stringify({ role: 'meeting-audio-publisher' }),
|
|
1060
|
-
});
|
|
1061
|
-
at.addGrant({ roomJoin: true, room: roomName, canPublish: true, canSubscribe: false });
|
|
1062
|
-
const token = await at.toJwt();
|
|
1063
|
-
let room = null;
|
|
1064
|
-
let source = null;
|
|
1065
|
-
let track = null;
|
|
1066
|
-
const cleanup = async () => {
|
|
1067
|
-
// Restore AgentSession STT input to the original user participant before
|
|
1068
|
-
// tearing down the publisher track. If we don't switch back, the session
|
|
1069
|
-
// will be stuck waiting on a participant that's about to disappear.
|
|
1070
|
-
try {
|
|
1071
|
-
const roomIO = activeAgentSession?._roomIO;
|
|
1072
|
-
if (roomIO && typeof roomIO.setParticipant === 'function') {
|
|
1073
|
-
if (preMeetingUserIdentity) {
|
|
1074
|
-
roomIO.setParticipant(preMeetingUserIdentity);
|
|
1075
|
-
console.log(`🔁 Restored AgentSession STT input to user: ${preMeetingUserIdentity}`);
|
|
1076
|
-
}
|
|
1077
|
-
else {
|
|
1078
|
-
roomIO.unsetParticipant();
|
|
1079
|
-
console.log('🔁 Cleared AgentSession STT input (no original user to restore)');
|
|
1080
|
-
}
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
catch (err) {
|
|
1084
|
-
console.warn('⚠️ Failed to restore RoomIO participant on cleanup:', err.message);
|
|
1085
|
-
}
|
|
1086
|
-
preMeetingUserIdentity = null;
|
|
1087
|
-
try {
|
|
1088
|
-
if (track)
|
|
1089
|
-
await track.close(true);
|
|
1090
|
-
}
|
|
1091
|
-
catch { }
|
|
1092
|
-
try {
|
|
1093
|
-
if (source)
|
|
1094
|
-
await source.close();
|
|
1095
|
-
}
|
|
1096
|
-
catch { }
|
|
1097
|
-
try {
|
|
1098
|
-
if (room)
|
|
1099
|
-
await room.disconnect();
|
|
1100
|
-
}
|
|
1101
|
-
catch { }
|
|
1102
|
-
room = null;
|
|
1103
|
-
source = null;
|
|
1104
|
-
track = null;
|
|
1105
|
-
};
|
|
1106
|
-
try {
|
|
1107
|
-
room = new Room();
|
|
1108
|
-
await room.connect(livekitUrl, token);
|
|
1109
|
-
if (!room.localParticipant)
|
|
1110
|
-
throw new Error('LiveKit connected but localParticipant missing');
|
|
1111
|
-
// Recall sends S16LE PCM at 16kHz mono. AudioSource matches the format.
|
|
1112
|
-
source = new AudioSource(16000, 1);
|
|
1113
|
-
track = LocalAudioTrack.createAudioTrack('meeting-audio', source);
|
|
1114
|
-
await room.localParticipant.publishTrack(track, new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }));
|
|
1115
|
-
console.log(`🎙️ Meeting audio publisher connected to ${roomName} as ${identity}`);
|
|
1116
|
-
// B2 — switch the existing AgentSession's RoomIO input from the local user
|
|
1117
|
-
// to this meeting-audio publisher. While the meeting is active, the user
|
|
1118
|
-
// talks via the meeting (Recall captures it and sends PCM here), and the
|
|
1119
|
-
// agent treats this publisher as the "speaking" participant for STT/EOT.
|
|
1120
|
-
// Original user identity is stashed so cleanup() can restore it.
|
|
1121
|
-
//
|
|
1122
|
-
// 15s timeout accommodates: session-start race (agent still booting when
|
|
1123
|
-
// user clicks "join meeting"), LiveKit participant-join propagation
|
|
1124
|
-
// (~hundreds of ms), and Fly cold-path latency on first request.
|
|
1125
|
-
try {
|
|
1126
|
-
const { roomIO, participantVisible } = await waitForRoomIOAndParticipant(identity, 15000);
|
|
1127
|
-
if (!roomIO) {
|
|
1128
|
-
console.warn('⚠️ Timed out waiting for AgentSession._roomIO (15s) — meeting audio published but STT not switched. Meeting audio will be ignored until a session starts.');
|
|
1129
|
-
}
|
|
1130
|
-
else if (!participantVisible) {
|
|
1131
|
-
// RoomIO exists but our publisher hasn't propagated to the agent's
|
|
1132
|
-
// room view yet. setParticipant stores the identity and links on
|
|
1133
|
-
// participant-connected event, so this is still safe to call —
|
|
1134
|
-
// RoomIO will pick up the link when the event arrives.
|
|
1135
|
-
preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
|
|
1136
|
-
roomIO.setParticipant(identity);
|
|
1137
|
-
console.log(`🔁 Switched AgentSession STT input (publisher not yet visible — will link on connect): ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
|
|
1138
|
-
}
|
|
1139
|
-
else {
|
|
1140
|
-
preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
|
|
1141
|
-
roomIO.setParticipant(identity);
|
|
1142
|
-
console.log(`🔁 Switched AgentSession STT input: ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
|
|
1143
|
-
}
|
|
1144
|
-
}
|
|
1145
|
-
catch (err) {
|
|
1146
|
-
console.warn('⚠️ Failed to switch RoomIO participant:', err.message);
|
|
1147
|
-
}
|
|
1148
|
-
}
|
|
1149
|
-
catch (err) {
|
|
1150
|
-
console.error('❌ Failed to set up LiveKit publisher for meeting audio:', err instanceof Error ? err.message : err);
|
|
1151
|
-
try {
|
|
1152
|
-
recallWs.close();
|
|
1153
|
-
}
|
|
1154
|
-
catch { }
|
|
1155
|
-
await cleanup();
|
|
1156
|
-
return;
|
|
1157
|
-
}
|
|
1158
|
-
// Recall → us: JSON events with base64-encoded PCM. Decode, wrap as
|
|
1159
|
-
// AudioFrame, and capture into the source. AgentSession in the main room
|
|
1160
|
-
// will subscribe to this published track and STT it via the normal pipeline.
|
|
1161
|
-
// Payload shape from
|
|
1162
|
-
// docs.recall.ai/docs/how-to-get-separate-audio-per-participant-realtime:
|
|
1163
|
-
// { event: 'audio_separate_raw.data', data: { data: { buffer: '<base64>', ... }, participant: {...} } }
|
|
1164
|
-
//
|
|
1165
|
-
// Diagnostic counters so we can tell from prod logs whether (a) Recall is
|
|
1166
|
-
// streaming any frames at all, (b) they're decoding correctly, and (c)
|
|
1167
|
-
// captureFrame is succeeding. Logged every 100 frames (~5s at 50fps).
|
|
1168
|
-
let totalMessages = 0;
|
|
1169
|
-
let audioFrames = 0;
|
|
1170
|
-
let bytesIn = 0;
|
|
1171
|
-
let lastSpeakerSeen;
|
|
1172
|
-
const startTs = Date.now();
|
|
1173
|
-
recallWs.on('message', async (raw) => {
|
|
1174
|
-
totalMessages++;
|
|
1175
|
-
if (!source)
|
|
1176
|
-
return;
|
|
1177
|
-
try {
|
|
1178
|
-
const msg = JSON.parse(raw.toString());
|
|
1179
|
-
if (msg.event !== 'audio_separate_raw.data') {
|
|
1180
|
-
// First-time event-type diagnostic — log unknown event types once so
|
|
1181
|
-
// we know if Recall's payload shape changed
|
|
1182
|
-
if (totalMessages <= 3) {
|
|
1183
|
-
console.log(`[meeting-audio-in] non-audio event: ${msg.event}`);
|
|
1184
|
-
}
|
|
1185
|
-
return;
|
|
1186
|
-
}
|
|
1187
|
-
const b64 = msg.data?.data?.buffer;
|
|
1188
|
-
if (!b64) {
|
|
1189
|
-
if (audioFrames === 0) {
|
|
1190
|
-
console.warn(`[meeting-audio-in] first audio event had no buffer field. payload keys=${Object.keys(msg.data?.data ?? {}).join(',')}`);
|
|
1191
|
-
}
|
|
1192
|
-
return;
|
|
1193
|
-
}
|
|
1194
|
-
const pcmBuf = Buffer.from(b64, 'base64');
|
|
1195
|
-
bytesIn += pcmBuf.byteLength;
|
|
1196
|
-
const speakerName = msg.data?.data?.participant?.name || msg.data?.participant?.name;
|
|
1197
|
-
if (speakerName && speakerName !== lastSpeakerSeen) {
|
|
1198
|
-
console.log(`[meeting-audio-in] now hearing: ${speakerName}`);
|
|
1199
|
-
lastSpeakerSeen = speakerName;
|
|
1200
|
-
}
|
|
1201
|
-
// AudioFrame expects Int16Array. The PCM buffer is S16LE — view it
|
|
1202
|
-
// directly without copy. Length / 2 = samples (each sample 2 bytes).
|
|
1203
|
-
const samplesPerChannel = pcmBuf.byteLength / 2;
|
|
1204
|
-
const int16 = new Int16Array(pcmBuf.buffer, pcmBuf.byteOffset, samplesPerChannel);
|
|
1205
|
-
const frame = new AudioFrame(int16, 16000, 1, samplesPerChannel);
|
|
1206
|
-
await source.captureFrame(frame);
|
|
1207
|
-
audioFrames++;
|
|
1208
|
-
if (audioFrames === 1) {
|
|
1209
|
-
console.log(`[meeting-audio-in] FIRST audio frame captured (${pcmBuf.byteLength} bytes, ${samplesPerChannel} samples)`);
|
|
1210
|
-
}
|
|
1211
|
-
if (audioFrames % 100 === 0) {
|
|
1212
|
-
const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
|
|
1213
|
-
console.log(`[meeting-audio-in] heartbeat: ${audioFrames} frames, ${(bytesIn / 1024).toFixed(1)} KB in ${elapsed}s (last speaker: ${lastSpeakerSeen ?? 'unknown'})`);
|
|
1214
|
-
}
|
|
1215
|
-
}
|
|
1216
|
-
catch (err) {
|
|
1217
|
-
// Don't log every frame parse failure — could be noisy if Recall sends
|
|
1218
|
-
// non-audio_separate_raw events on the same channel.
|
|
1219
|
-
if (err.message?.includes('JSON'))
|
|
1220
|
-
return;
|
|
1221
|
-
console.warn('⚠️ meeting audio capture error:', err instanceof Error ? err.message : err);
|
|
1222
|
-
}
|
|
1223
|
-
});
|
|
1224
|
-
recallWs.on('close', async () => {
|
|
1225
|
-
const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
|
|
1226
|
-
console.log(`🎙️ Recall audio-in WebSocket closed — tearing down LiveKit publisher. Total: ${audioFrames} audio frames / ${totalMessages} messages / ${(bytesIn / 1024).toFixed(1)} KB over ${elapsed}s`);
|
|
1227
|
-
await cleanup();
|
|
1228
|
-
});
|
|
1229
|
-
recallWs.on('error', (err) => {
|
|
1230
|
-
console.warn('⚠️ Recall WS error:', err instanceof Error ? err.message : err);
|
|
1231
|
-
});
|
|
1232
|
-
});
|
|
1233
|
-
server.on('upgrade', (req, socket, head) => {
|
|
1234
|
-
const url = new URL(req.url || '/', `http://localhost:${port}`);
|
|
1235
|
-
if (url.pathname === '/meeting-audio') {
|
|
1236
|
-
meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
|
|
1237
|
-
meetingOutputWss.emit('connection', ws, req);
|
|
1238
|
-
});
|
|
1239
|
-
}
|
|
1240
|
-
else if (url.pathname === '/meeting-audio-in') {
|
|
1241
|
-
meetingAudioInWss.handleUpgrade(req, socket, head, (ws) => {
|
|
1242
|
-
meetingAudioInWss.emit('connection', ws, req);
|
|
1243
|
-
});
|
|
1244
|
-
}
|
|
1245
|
-
else {
|
|
1246
|
-
socket.destroy();
|
|
1247
|
-
}
|
|
861
|
+
// No WebSocket upgrade routes — meeting audio in/out moved off LiveKit to
|
|
862
|
+
// a polling architecture (see MeetingTranscriptPoller). The /meeting-audio
|
|
863
|
+
// and /meeting-audio-in routes were the old WebSocket-audio pipeline; both
|
|
864
|
+
// are gone. Reject all upgrade attempts.
|
|
865
|
+
server.on('upgrade', (_req, socket) => {
|
|
866
|
+
socket.destroy();
|
|
1248
867
|
});
|
|
1249
868
|
server.on('error', (err) => {
|
|
1250
869
|
if (err.code === 'EADDRINUSE') {
|
|
@@ -1467,7 +1086,7 @@ async function main() {
|
|
|
1467
1086
|
// session-only path (no user prefix).
|
|
1468
1087
|
let currentUserId = '';
|
|
1469
1088
|
let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
|
|
1470
|
-
|
|
1089
|
+
let activeMeetingPoller = null; // Transcript poller bound to that bot
|
|
1471
1090
|
// Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
|
|
1472
1091
|
// Updated by resume_session, session_selected, continue_session, switch_session handlers
|
|
1473
1092
|
let currentResumeSessionId;
|
|
@@ -1918,6 +1537,40 @@ async function main() {
|
|
|
1918
1537
|
}
|
|
1919
1538
|
}
|
|
1920
1539
|
}
|
|
1540
|
+
// Compaction event → frontend bridge. Forwards the raw event (consumed by the
|
|
1541
|
+
// dedicated banner UI state machine) AND emits a `claude_output` chat bubble
|
|
1542
|
+
// (so the activity is visible inline in chat even when the banner is hidden,
|
|
1543
|
+
// collapsed, or unreliable on iPad/iPhone). Extracted as a helper because
|
|
1544
|
+
// both direct-mode and pipeline-mode need to register it — the pipeline path
|
|
1545
|
+
// previously skipped this entirely, so compaction events fired into the void
|
|
1546
|
+
// in pipeline mode.
|
|
1547
|
+
const buildOnCompactionEvent = () => (event) => {
|
|
1548
|
+
try {
|
|
1549
|
+
// Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
|
|
1550
|
+
sendToFrontend({ ...event });
|
|
1551
|
+
// Inline chat bubble — reuses the existing claude_output path that's already working.
|
|
1552
|
+
if (event.type === 'compaction_started') {
|
|
1553
|
+
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
1554
|
+
sendToFrontend({
|
|
1555
|
+
type: 'claude_output',
|
|
1556
|
+
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
1557
|
+
agentRole: 'direct',
|
|
1558
|
+
});
|
|
1559
|
+
}
|
|
1560
|
+
else if (event.type === 'compaction_complete') {
|
|
1561
|
+
const n = event.skillsWritten ?? 0;
|
|
1562
|
+
const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
|
|
1563
|
+
? ` — ${event.skillNames.join(', ')}`
|
|
1564
|
+
: '';
|
|
1565
|
+
sendToFrontend({
|
|
1566
|
+
type: 'claude_output',
|
|
1567
|
+
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
1568
|
+
agentRole: 'direct',
|
|
1569
|
+
});
|
|
1570
|
+
}
|
|
1571
|
+
}
|
|
1572
|
+
catch { /* non-fatal */ }
|
|
1573
|
+
};
|
|
1921
1574
|
// Create DIRECT session (STT + Claude Agent SDK + TTS)
|
|
1922
1575
|
async function createDirectSession(resumeSessionId, llmOverride) {
|
|
1923
1576
|
console.log('🎯 Creating direct session...');
|
|
@@ -1933,39 +1586,7 @@ async function main() {
|
|
|
1933
1586
|
resumeSessionId,
|
|
1934
1587
|
voiceMode: 'direct',
|
|
1935
1588
|
skipTTSQueue: true,
|
|
1936
|
-
onCompactionEvent: (event) => {
|
|
1937
|
-
try {
|
|
1938
|
-
// Forward the raw event so the dedicated banner UI can render it (if/when fixed).
|
|
1939
|
-
sendToFrontend({ ...event });
|
|
1940
|
-
// ALSO emit as a claude_output chat bubble — reuses the existing message path
|
|
1941
|
-
// that's already working end-to-end. PreCompact → in-progress bubble.
|
|
1942
|
-
// PostCompact → completion bubble with the skills summary. The dedicated
|
|
1943
|
-
// banner has been unreliable in production (data path works on backend, banner
|
|
1944
|
-
// never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
|
|
1945
|
-
// are visible without dev tools.
|
|
1946
|
-
if (event.type === 'compaction_started') {
|
|
1947
|
-
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
1948
|
-
sendToFrontend({
|
|
1949
|
-
type: 'claude_output',
|
|
1950
|
-
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
1951
|
-
agentRole: 'direct',
|
|
1952
|
-
});
|
|
1953
|
-
}
|
|
1954
|
-
else if (event.type === 'compaction_complete') {
|
|
1955
|
-
const ev = event;
|
|
1956
|
-
const n = ev.skillsWritten ?? 0;
|
|
1957
|
-
const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
|
|
1958
|
-
? ` — ${ev.skillNames.join(', ')}`
|
|
1959
|
-
: '';
|
|
1960
|
-
sendToFrontend({
|
|
1961
|
-
type: 'claude_output',
|
|
1962
|
-
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
1963
|
-
agentRole: 'direct',
|
|
1964
|
-
});
|
|
1965
|
-
}
|
|
1966
|
-
}
|
|
1967
|
-
catch { /* non-fatal */ }
|
|
1968
|
-
},
|
|
1589
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
1969
1590
|
});
|
|
1970
1591
|
currentLLM = directLLM;
|
|
1971
1592
|
// Reset the session always-allow list for each new direct session
|
|
@@ -2165,20 +1786,6 @@ async function main() {
|
|
|
2165
1786
|
}
|
|
2166
1787
|
const sayId = Date.now(); // simple ID to correlate start/end logs
|
|
2167
1788
|
console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
|
|
2168
|
-
// Forward spoken text + audio to meeting output page when bot is in a meeting.
|
|
2169
|
-
// Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
|
|
2170
|
-
// previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
|
|
2171
|
-
// (Deepgram aura-2-asteria-en) when no user config exists, producing a different
|
|
2172
|
-
// voice in the meeting than what the user hears in voice-native. Both paths now
|
|
2173
|
-
// share the single source of truth.
|
|
2174
|
-
// PCM frames are WAV-encoded and pushed as binary WebSocket frames.
|
|
2175
|
-
// Recall captures the browser page's audio output and injects it into the meeting.
|
|
2176
|
-
if (activeMeetingBotId) {
|
|
2177
|
-
sendToMeetingOutput({ type: 'speak', text: data.text });
|
|
2178
|
-
if (meetingOutputWs) {
|
|
2179
|
-
synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
|
|
2180
|
-
}
|
|
2181
|
-
}
|
|
2182
1789
|
try {
|
|
2183
1790
|
const handle = currentSession.say(data.text);
|
|
2184
1791
|
if (handle && typeof handle.addDoneCallback === 'function') {
|
|
@@ -2316,39 +1923,7 @@ async function main() {
|
|
|
2316
1923
|
sessionBaseDir,
|
|
2317
1924
|
mcpServers,
|
|
2318
1925
|
resumeSessionId,
|
|
2319
|
-
onCompactionEvent: (
|
|
2320
|
-
try {
|
|
2321
|
-
// Forward the raw event so the dedicated banner UI can render it (if/when fixed).
|
|
2322
|
-
sendToFrontend({ ...event });
|
|
2323
|
-
// ALSO emit as a claude_output chat bubble — reuses the existing message path
|
|
2324
|
-
// that's already working end-to-end. PreCompact → in-progress bubble.
|
|
2325
|
-
// PostCompact → completion bubble with the skills summary. The dedicated
|
|
2326
|
-
// banner has been unreliable in production (data path works on backend, banner
|
|
2327
|
-
// never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
|
|
2328
|
-
// are visible without dev tools.
|
|
2329
|
-
if (event.type === 'compaction_started') {
|
|
2330
|
-
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
2331
|
-
sendToFrontend({
|
|
2332
|
-
type: 'claude_output',
|
|
2333
|
-
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
2334
|
-
agentRole: 'direct',
|
|
2335
|
-
});
|
|
2336
|
-
}
|
|
2337
|
-
else if (event.type === 'compaction_complete') {
|
|
2338
|
-
const ev = event;
|
|
2339
|
-
const n = ev.skillsWritten ?? 0;
|
|
2340
|
-
const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
|
|
2341
|
-
? ` — ${ev.skillNames.join(', ')}`
|
|
2342
|
-
: '';
|
|
2343
|
-
sendToFrontend({
|
|
2344
|
-
type: 'claude_output',
|
|
2345
|
-
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
2346
|
-
agentRole: 'direct',
|
|
2347
|
-
});
|
|
2348
|
-
}
|
|
2349
|
-
}
|
|
2350
|
-
catch { /* non-fatal */ }
|
|
2351
|
-
},
|
|
1926
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
2352
1927
|
});
|
|
2353
1928
|
currentLLM = realtimeClaudeHandler;
|
|
2354
1929
|
// For resumed sessions, eagerly create workspace (we know the real ID)
|
|
@@ -2891,7 +2466,6 @@ async function main() {
|
|
|
2891
2466
|
}
|
|
2892
2467
|
lastCompletedResearch = null;
|
|
2893
2468
|
currentSession = null;
|
|
2894
|
-
activeAgentSession = null;
|
|
2895
2469
|
currentAgent = null;
|
|
2896
2470
|
// Same disconnect-leak fix as the other two cleanup sites — kill the Claude SDK
|
|
2897
2471
|
// subprocess BEFORE dropping the reference. See killCurrentLLM() for full context.
|
|
@@ -2937,7 +2511,6 @@ async function main() {
|
|
|
2937
2511
|
}
|
|
2938
2512
|
catch { }
|
|
2939
2513
|
currentSession = null;
|
|
2940
|
-
activeAgentSession = null;
|
|
2941
2514
|
currentAgent = null;
|
|
2942
2515
|
// Same disconnect-leak fix — kill the previous user's Claude subprocess
|
|
2943
2516
|
// before binding currentLLM to the new user's session below.
|
|
@@ -3057,6 +2630,13 @@ async function main() {
|
|
|
3057
2630
|
resumeSessionId,
|
|
3058
2631
|
voiceMode: 'direct',
|
|
3059
2632
|
skipTTSQueue: true,
|
|
2633
|
+
// PipelineDirectOptions extends ClaudeLLMOptions; passing this through
|
|
2634
|
+
// forwards it into the inner `new ClaudeLLM(opts)`. Without this,
|
|
2635
|
+
// pipeline mode silently drops every PreCompact/PostCompact event
|
|
2636
|
+
// — banner never appears, chat bubble never appears — because
|
|
2637
|
+
// createDirectSession's `createClaudeLLM(...)` call is skipped when
|
|
2638
|
+
// an llmOverride is supplied (which is exactly what pipeline mode does).
|
|
2639
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
3060
2640
|
getChatHistory: () => getChatHistory(20).map(t => ({ role: t.role, content: t.text })),
|
|
3061
2641
|
getResearchContext: () => {
|
|
3062
2642
|
if (activeResearch?.researchLog.length) {
|
|
@@ -3092,7 +2672,6 @@ async function main() {
|
|
|
3092
2672
|
agent = result.agent;
|
|
3093
2673
|
}
|
|
3094
2674
|
currentSession = session;
|
|
3095
|
-
activeAgentSession = session;
|
|
3096
2675
|
currentAgent = agent; // Store for updateChatCtx() context injection
|
|
3097
2676
|
// ============================================================
|
|
3098
2677
|
// Session event wiring — extracted into function for auto-recovery
|
|
@@ -3252,7 +2831,6 @@ async function main() {
|
|
|
3252
2831
|
}
|
|
3253
2832
|
catch { }
|
|
3254
2833
|
currentSession = null;
|
|
3255
|
-
activeAgentSession = null;
|
|
3256
2834
|
currentAgent = null;
|
|
3257
2835
|
// Clear stale state from crashed session
|
|
3258
2836
|
voiceQueue.length = 0;
|
|
@@ -3314,7 +2892,6 @@ async function main() {
|
|
|
3314
2892
|
const newSession = result.session;
|
|
3315
2893
|
const newAgent = result.agent;
|
|
3316
2894
|
currentSession = newSession;
|
|
3317
|
-
activeAgentSession = newSession;
|
|
3318
2895
|
currentAgent = newAgent;
|
|
3319
2896
|
// Re-wire event listeners on the new session
|
|
3320
2897
|
wireSessionEvents(newSession, newAgent);
|
|
@@ -3371,7 +2948,6 @@ async function main() {
|
|
|
3371
2948
|
}
|
|
3372
2949
|
catch { }
|
|
3373
2950
|
currentSession = null;
|
|
3374
|
-
activeAgentSession = null;
|
|
3375
2951
|
currentAgent = null;
|
|
3376
2952
|
// Clear voice queue — stale injections from the crashed session
|
|
3377
2953
|
voiceQueue.length = 0;
|
|
@@ -3395,7 +2971,6 @@ async function main() {
|
|
|
3395
2971
|
const newSession = result.session;
|
|
3396
2972
|
const newAgent = result.agent;
|
|
3397
2973
|
currentSession = newSession;
|
|
3398
|
-
activeAgentSession = newSession;
|
|
3399
2974
|
currentAgent = newAgent;
|
|
3400
2975
|
// Re-wire event listeners on the new session
|
|
3401
2976
|
wireSessionEvents(newSession, newAgent);
|
|
@@ -3590,7 +3165,6 @@ async function main() {
|
|
|
3590
3165
|
if (currentSession) {
|
|
3591
3166
|
const sessionToClose = currentSession;
|
|
3592
3167
|
currentSession = null;
|
|
3593
|
-
activeAgentSession = null;
|
|
3594
3168
|
// Track async close so new connections can wait for byte stream handler to be released
|
|
3595
3169
|
pendingSessionClose = (async () => {
|
|
3596
3170
|
try {
|
|
@@ -3612,6 +3186,10 @@ async function main() {
|
|
|
3612
3186
|
clearFastBrainSession();
|
|
3613
3187
|
clearPipelineFastBrainSession();
|
|
3614
3188
|
// Auto-leave any active meeting bot when user disconnects from the room
|
|
3189
|
+
if (activeMeetingPoller) {
|
|
3190
|
+
activeMeetingPoller.stop();
|
|
3191
|
+
activeMeetingPoller = null;
|
|
3192
|
+
}
|
|
3615
3193
|
if (activeMeetingBotId) {
|
|
3616
3194
|
const recallDisconnect = getRecallClient();
|
|
3617
3195
|
if (recallDisconnect) {
|
|
@@ -4195,61 +3773,61 @@ async function main() {
|
|
|
4195
3773
|
(process.env.FLY_APP_NAME
|
|
4196
3774
|
? `https://${process.env.FLY_APP_NAME}.fly.dev`
|
|
4197
3775
|
: `http://localhost:${apiPort}`);
|
|
4198
|
-
//
|
|
4199
|
-
//
|
|
4200
|
-
//
|
|
4201
|
-
//
|
|
4202
|
-
|
|
4203
|
-
|
|
4204
|
-
|
|
4205
|
-
|
|
4206
|
-
|
|
4207
|
-
|
|
4208
|
-
//
|
|
4209
|
-
//
|
|
4210
|
-
//
|
|
4211
|
-
//
|
|
4212
|
-
|
|
4213
|
-
//
|
|
4214
|
-
// Auth: the endpoint uses LiveKit room-presence as the auth check
|
|
4215
|
-
// — no shared secret needed. The agent must already be in the
|
|
4216
|
-
// requested room (which it is by this point) for the mint to
|
|
4217
|
-
// succeed.
|
|
4218
|
-
let outputPageUrl;
|
|
4219
|
-
const frontendUrl = data.frontendBase
|
|
4220
|
-
|| process.env.OSBORN_FRONTEND_URL;
|
|
4221
|
-
if (frontendUrl) {
|
|
3776
|
+
// Polling architecture (post-2026-05-22): the bot joins by name
|
|
3777
|
+
// only — no output_media webpage, no LiveKit republish, no audio
|
|
3778
|
+
// pipeline at all. Recall captures the meeting audio internally
|
|
3779
|
+
// and we pull the transcript via its REST API every ~30s.
|
|
3780
|
+
await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
|
|
3781
|
+
const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
|
|
3782
|
+
const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
|
|
3783
|
+
recallJoin.registerBot(botId, sessionId);
|
|
3784
|
+
activeMeetingBotId = botId;
|
|
3785
|
+
await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
|
|
3786
|
+
// System injection so the LLM knows it's in a meeting and which
|
|
3787
|
+
// skill to apply. The meetings skill (agent/.claude/skills/meetings/SKILL.md)
|
|
3788
|
+
// teaches the agent: don't speak in response to [MEETING — *]:
|
|
3789
|
+
// messages, keep meeting-todos.md updated in the workspace, etc.
|
|
3790
|
+
if (currentLLM) {
|
|
4222
3791
|
try {
|
|
4223
|
-
const
|
|
4224
|
-
|
|
4225
|
-
|
|
4226
|
-
|
|
4227
|
-
body: JSON.stringify({ botId: botLkId, roomName }),
|
|
3792
|
+
const sysCtx = new llm.ChatContext();
|
|
3793
|
+
sysCtx.addMessage({
|
|
3794
|
+
role: 'user',
|
|
3795
|
+
content: `[SYSTEM] You are now in a meeting (Recall bot ID: ${botId}, URL: ${meetingUrl}). Transcript chunks will arrive every ~30 seconds tagged \`[MEETING — ${botId}]:\`. Follow the meetings skill: do NOT speak in response (no TTS output), instead maintain meeting-todos.md in the session workspace, optionally trigger background research silently. The voice-native user can still interact normally — only the meeting-tagged messages are the silent-observer path. Acknowledge by writing the initial meeting-todos.md skeleton.`,
|
|
4228
3796
|
});
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
|
|
4232
|
-
outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
|
|
4233
|
-
console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
|
|
4234
|
-
}
|
|
4235
|
-
else {
|
|
4236
|
-
const errText = await tokenRes.text().catch(() => '');
|
|
4237
|
-
console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
|
|
4238
|
-
}
|
|
3797
|
+
currentLLM.chat({ chatCtx: sysCtx });
|
|
3798
|
+
console.log('📓 Meeting system injection sent to LLM');
|
|
4239
3799
|
}
|
|
4240
|
-
catch (
|
|
4241
|
-
console.warn(
|
|
3800
|
+
catch (sysErr) {
|
|
3801
|
+
console.warn('⚠️ Meeting system injection failed:', sysErr.message);
|
|
4242
3802
|
}
|
|
4243
3803
|
}
|
|
4244
|
-
|
|
4245
|
-
|
|
3804
|
+
// Start polling the transcript every 30s. Each batch of new turns
|
|
3805
|
+
// is pushed to currentLLM.chat() tagged [MEETING — botId]: so the
|
|
3806
|
+
// skill kicks in. Poller dedups via first-word timestamp cursor.
|
|
3807
|
+
if (activeMeetingPoller) {
|
|
3808
|
+
activeMeetingPoller.stop();
|
|
3809
|
+
activeMeetingPoller = null;
|
|
4246
3810
|
}
|
|
4247
|
-
|
|
4248
|
-
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
3811
|
+
activeMeetingPoller = new MeetingTranscriptPoller({
|
|
3812
|
+
botId,
|
|
3813
|
+
recall: recallJoin,
|
|
3814
|
+
onTurns: async ({ formatted }) => {
|
|
3815
|
+
if (!currentLLM) {
|
|
3816
|
+
console.warn('📓 Meeting transcript arrived but currentLLM is null — dropping');
|
|
3817
|
+
return;
|
|
3818
|
+
}
|
|
3819
|
+
const tagged = `[MEETING — ${botId}]:\n${formatted}`;
|
|
3820
|
+
try {
|
|
3821
|
+
const turnCtx = new llm.ChatContext();
|
|
3822
|
+
turnCtx.addMessage({ role: 'user', content: tagged });
|
|
3823
|
+
currentLLM.chat({ chatCtx: turnCtx });
|
|
3824
|
+
}
|
|
3825
|
+
catch (err) {
|
|
3826
|
+
console.warn(`⚠️ Failed to forward meeting transcript to LLM: ${err.message}`);
|
|
3827
|
+
}
|
|
3828
|
+
},
|
|
3829
|
+
});
|
|
3830
|
+
activeMeetingPoller.start();
|
|
4253
3831
|
}
|
|
4254
3832
|
catch (err) {
|
|
4255
3833
|
console.error('❌ Recall.ai join error:', err);
|
|
@@ -4263,6 +3841,12 @@ async function main() {
|
|
|
4263
3841
|
const recallLeave = getRecallClient();
|
|
4264
3842
|
if (recallLeave && botId) {
|
|
4265
3843
|
try {
|
|
3844
|
+
// Stop the transcript poller FIRST so no more transcript chunks get
|
|
3845
|
+
// forwarded to the LLM during the leave.
|
|
3846
|
+
if (activeMeetingPoller) {
|
|
3847
|
+
activeMeetingPoller.stop();
|
|
3848
|
+
activeMeetingPoller = null;
|
|
3849
|
+
}
|
|
4266
3850
|
await recallLeave.leaveMeeting(botId);
|
|
4267
3851
|
activeMeetingBotId = null;
|
|
4268
3852
|
await sendToFrontend({ type: 'meeting_left', botId });
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
|
|
3
|
+
* interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
|
|
4
|
+
* messages.
|
|
5
|
+
*
|
|
6
|
+
* This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
|
|
7
|
+
* PCM from Recall into a LiveKit room. The polling architecture is simpler
|
|
8
|
+
* (no parallel STT, no audio pipeline, no participant juggling), survives
|
|
9
|
+
* agent restarts (Recall keeps the transcript on its side), and the LLM
|
|
10
|
+
* never speaks in the meeting — it's a silent note-taker.
|
|
11
|
+
*
|
|
12
|
+
* Lifecycle:
|
|
13
|
+
* const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
|
|
14
|
+
* poller.start()
|
|
15
|
+
* ...
|
|
16
|
+
* poller.stop() // on leave_meeting / disconnect / session switch
|
|
17
|
+
*
|
|
18
|
+
* Dedup strategy:
|
|
19
|
+
* Each turn carries a `start_timestamp.relative` on its first word (seconds
|
|
20
|
+
* since recording start). We track the highest cursor we've forwarded and
|
|
21
|
+
* only send turns with a strictly greater first-word timestamp. This means
|
|
22
|
+
* re-fetches don't double-deliver, and partial transcripts that get refined
|
|
23
|
+
* later don't re-trigger LLM processing of already-handled turns.
|
|
24
|
+
*
|
|
25
|
+
* Error handling:
|
|
26
|
+
* Transient fetch errors are logged + skipped (poll continues on next tick).
|
|
27
|
+
* No backoff — Recall's transcript endpoint is stable enough that a 30s
|
|
28
|
+
* cadence makes "slow start" non-issues self-recover within one cycle.
|
|
29
|
+
*/
|
|
30
|
+
import type { RecallClient, TranscriptTurn } from './recall-client.js';
|
|
31
|
+
export interface MeetingTranscriptPollerOptions {
|
|
32
|
+
botId: string;
|
|
33
|
+
recall: RecallClient;
|
|
34
|
+
/** Called with the de-duplicated batch of new transcript turns. Invoked at most once per tick, and only when the batch contains non-empty text. */
|
|
35
|
+
onTurns: (chunk: {
|
|
36
|
+
botId: string;
|
|
37
|
+
turns: TranscriptTurn[];
|
|
38
|
+
formatted: string;
|
|
39
|
+
}) => void | Promise<void>;
|
|
40
|
+
/** Polling interval in milliseconds. Defaults to 30 000 (30 s) when omitted. */
|
|
41
|
+
intervalMs?: number;
|
|
42
|
+
/** Optional callback invoked with the Error whenever a poll tick fails (fetch or onTurns threw). */
|
|
43
|
+
onError?: (err: Error) => void;
|
|
44
|
+
}
|
|
45
|
+
export declare class MeetingTranscriptPoller {
|
|
46
|
+
#private;
|
|
47
|
+
constructor(opts: MeetingTranscriptPollerOptions);
|
|
48
|
+
start(): void;
|
|
49
|
+
stop(): void;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Format an array of turns into a single string for LLM consumption.
|
|
53
|
+
*
|
|
54
|
+
* Each turn becomes:
|
|
55
|
+
* <Speaker>: <text>
|
|
56
|
+
*
|
|
57
|
+
* Whitespace-only words and zero-content turns are dropped. Returns empty
|
|
58
|
+
* string if nothing meaningful is in the batch.
|
|
59
|
+
*/
|
|
60
|
+
export declare function formatTurns(turns: TranscriptTurn[]): string;
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
|
|
3
|
+
* interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
|
|
4
|
+
* messages.
|
|
5
|
+
*
|
|
6
|
+
* This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
|
|
7
|
+
* PCM from Recall into a LiveKit room. The polling architecture is simpler
|
|
8
|
+
* (no parallel STT, no audio pipeline, no participant juggling), survives
|
|
9
|
+
* agent restarts (Recall keeps the transcript on its side), and the LLM
|
|
10
|
+
* never speaks in the meeting — it's a silent note-taker.
|
|
11
|
+
*
|
|
12
|
+
* Lifecycle:
|
|
13
|
+
* const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
|
|
14
|
+
* poller.start()
|
|
15
|
+
* ...
|
|
16
|
+
* poller.stop() // on leave_meeting / disconnect / session switch
|
|
17
|
+
*
|
|
18
|
+
* Dedup strategy:
|
|
19
|
+
* Each turn carries a `start_timestamp.relative` on its first word (seconds
|
|
20
|
+
* since recording start). We track the highest cursor we've forwarded and
|
|
21
|
+
* only send turns with a strictly greater first-word timestamp. This means
|
|
22
|
+
* re-fetches don't double-deliver, and partial transcripts that get refined
|
|
23
|
+
* later don't re-trigger LLM processing of already-handled turns.
|
|
24
|
+
*
|
|
25
|
+
* Error handling:
|
|
26
|
+
* Transient fetch errors are logged + skipped (poll continues on next tick).
|
|
27
|
+
* No backoff — Recall's transcript endpoint is stable enough that a 30s
|
|
28
|
+
* cadence makes "slow start" non-issues self-recover within one cycle.
|
|
29
|
+
*/
|
|
30
|
+
export class MeetingTranscriptPoller {
    /** Caller-supplied options: botId, recall, onTurns, and optional intervalMs / onError. */
    #opts;
    /** Handle returned by setInterval while polling is active; null otherwise. */
    #timer = null;
    /** Highest first-word `start_timestamp.relative` seen so far; turns at or below it are skipped. */
    #cursor = -Infinity;
    /** True while a tick is awaiting the fetch or the onTurns callback, so ticks never overlap. */
    #inFlight = false;
    /** Set by stop(). A stopped poller never polls or delivers again. */
    #stopped = false;

    /**
     * @param {object} opts
     * @param {string} opts.botId Bot whose transcript is polled.
     * @param {{ getTranscript: (botId: string) => Promise<Array<object>> }} opts.recall
     *   Client used to fetch the full transcript on every tick.
     * @param {(chunk: { botId: string, turns: Array<object>, formatted: string }) => (void | Promise<void>)} opts.onTurns
     *   Receives each batch of not-yet-forwarded turns.
     * @param {number} [opts.intervalMs] Polling period in milliseconds (30 000 when omitted).
     * @param {(err: Error) => void} [opts.onError] Notified when a tick fails.
     */
    constructor(opts) {
        this.#opts = opts;
    }

    /**
     * Begin polling: one tick immediately, then one every `intervalMs`.
     * Does nothing if the poller is already running or has been stopped.
     */
    start() {
        // A stopped poller is terminal: every tick would bail out on #stopped and
        // stop() would refuse to clear the interval again, so creating a timer here
        // would only leak it.
        if (this.#timer || this.#stopped)
            return;
        const interval = this.#opts.intervalMs ?? 30_000;
        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
        // Fire once immediately so the LLM sees the meeting started, then on interval.
        void this.#tick();
        this.#timer = setInterval(() => void this.#tick(), interval);
    }

    /**
     * Stop polling permanently. Idempotent. A fetch that is already in flight
     * is allowed to finish, but its result is discarded rather than delivered.
     */
    stop() {
        if (this.#stopped)
            return;
        this.#stopped = true;
        if (this.#timer) {
            clearInterval(this.#timer);
            this.#timer = null;
        }
        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
    }

    /**
     * One poll cycle: fetch the whole transcript, keep the turns whose first-word
     * timestamp is beyond the cursor, advance the cursor, and hand the formatted
     * batch to `onTurns`. Failures are reported and swallowed so the next tick
     * still runs.
     */
    async #tick() {
        if (this.#inFlight || this.#stopped)
            return;
        this.#inFlight = true;
        try {
            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
            // stop() may have been called while the fetch was pending; honour it so
            // nothing is forwarded after the caller asked us to stop.
            if (this.#stopped)
                return;
            const firstWordTs = (turn) => turn.words?.[0]?.start_timestamp?.relative;
            const fresh = all.filter((turn) => {
                const ts = firstWordTs(turn);
                return typeof ts === 'number' && ts > this.#cursor;
            });
            if (fresh.length === 0)
                return;
            // Advance cursor to highest seen first-word ts (across all returned turns,
            // not just the fresh ones — guards against Recall returning a paged subset).
            for (const turn of all) {
                const ts = firstWordTs(turn);
                if (typeof ts === 'number' && ts > this.#cursor)
                    this.#cursor = ts;
            }
            const formatted = formatTurns(fresh);
            if (!formatted)
                return; // pure-whitespace fresh batch — skip
            console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${this.#cursor.toFixed(1)}s, chars=${formatted.length}`);
            await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
        }
        catch (err) {
            const e = err instanceof Error ? err : new Error(String(err));
            // The tick promise is fire-and-forget (`void`), so an exception thrown by
            // the user hook must not escape — it would become an unhandled rejection.
            try {
                this.#opts.onError?.(e);
            }
            catch (hookErr) {
                const detail = hookErr instanceof Error ? hookErr.message : String(hookErr);
                console.warn(`⚠️ MeetingTranscriptPoller onError handler threw: ${detail}`);
            }
            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
        }
        finally {
            this.#inFlight = false;
        }
    }
}
|
|
93
|
+
/**
|
|
94
|
+
* Format an array of turns into a single string for LLM consumption.
|
|
95
|
+
*
|
|
96
|
+
* Each turn becomes:
|
|
97
|
+
* <Speaker>: <text>
|
|
98
|
+
*
|
|
99
|
+
* Whitespace-only words and zero-content turns are dropped. Returns empty
|
|
100
|
+
* string if nothing meaningful is in the batch.
|
|
101
|
+
*/
|
|
102
|
+
export function formatTurns(turns) {
    // Render every turn as "<speaker>: <text>", then discard the ones that had no text.
    return turns
        .map((turn) => {
            // Prefer the explicit speaker label, then the participant's name.
            const who = turn.speaker || turn.participant?.name || 'Unknown';
            const words = turn.words ?? [];
            // Join the word texts and collapse any run of whitespace to one space.
            const said = words
                .map((word) => word.text)
                .join(' ')
                .replace(/\s+/g, ' ')
                .trim();
            return said ? `${who}: ${said}` : null;
        })
        .filter((line) => line !== null)
        .join('\n');
}
|
package/dist/recall-client.d.ts
CHANGED
|
@@ -4,6 +4,36 @@ export interface RecallBot {
|
|
|
4
4
|
meeting_url: string;
|
|
5
5
|
status: string;
|
|
6
6
|
}
|
|
7
|
+
/**
|
|
8
|
+
* One transcript turn = one speaker's continuous utterance.
|
|
9
|
+
* Shape returned by GET /api/v1/bot/{bot_id}/transcript.
|
|
10
|
+
*
|
|
11
|
+
* Per Recall docs each turn contains:
|
|
12
|
+
* - speaker: participant name (or 'Unknown')
|
|
13
|
+
* - words: array of { text, start_timestamp.relative, end_timestamp.relative }
|
|
14
|
+
* - The `start_timestamp.relative` (seconds since recording start) on the
|
|
15
|
+
* FIRST word is the turn's start; we use this as the dedup cursor.
|
|
16
|
+
*/
|
|
17
|
+
export interface TranscriptTurn {
|
|
18
|
+
speaker?: string;
|
|
19
|
+
participant?: {
|
|
20
|
+
id?: number;
|
|
21
|
+
name?: string;
|
|
22
|
+
is_host?: boolean;
|
|
23
|
+
};
|
|
24
|
+
words: Array<{
|
|
25
|
+
text: string;
|
|
26
|
+
start_timestamp?: {
|
|
27
|
+
relative?: number;
|
|
28
|
+
absolute?: string;
|
|
29
|
+
};
|
|
30
|
+
end_timestamp?: {
|
|
31
|
+
relative?: number;
|
|
32
|
+
absolute?: string;
|
|
33
|
+
};
|
|
34
|
+
}>;
|
|
35
|
+
language?: string;
|
|
36
|
+
}
|
|
7
37
|
export interface TranscriptPayload {
|
|
8
38
|
event: string;
|
|
9
39
|
data: {
|
|
@@ -49,10 +79,27 @@ export declare class RecallClient extends EventEmitter {
|
|
|
49
79
|
* room as the osborn agent (no separate WebSocket+WAV pipe).
|
|
50
80
|
* @param opts.botName Display name of the bot in the meeting
|
|
51
81
|
*/
|
|
52
|
-
joinMeeting(meetingUrl: string,
|
|
53
|
-
outputPageUrl?: string;
|
|
82
|
+
joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
|
|
54
83
|
botName?: string;
|
|
55
84
|
}): Promise<string>;
|
|
85
|
+
/**
|
|
86
|
+
* Fetch the bot's current transcript. Returns an array of "transcript turns"
|
|
87
|
+
* (each turn = one speaker's utterance) sorted by start time. Use the bot's
|
|
88
|
+
* `recordings[0].id` from getBotStatus / bot record to locate the recording,
|
|
89
|
+
* then list its transcripts.
|
|
90
|
+
*
|
|
91
|
+
* Per Recall docs:
|
|
92
|
+
* GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
|
|
93
|
+
* GET /api/v1/transcript/{transcript_id} → transcript with download_url
|
|
94
|
+
* Download the transcript JSON from download_url to get the actual content.
|
|
95
|
+
*
|
|
96
|
+
* For the polling use case (called every ~30s), we use the simpler combined
|
|
97
|
+
* endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
|
|
98
|
+
* convenience and returns the full transcript so far in one call. The caller
|
|
99
|
+
* is responsible for de-duping (keeping a since-cursor) so the LLM only sees
|
|
100
|
+
* new turns.
|
|
101
|
+
*/
|
|
102
|
+
getTranscript(botId: string): Promise<TranscriptTurn[]>;
|
|
56
103
|
leaveMeeting(botId: string): Promise<void>;
|
|
57
104
|
getBotStatus(botId: string): Promise<string>;
|
|
58
105
|
handleWebhook(payload: TranscriptPayload): void;
|
package/dist/recall-client.js
CHANGED
|
@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
|
|
|
21
21
|
* room as the osborn agent (no separate WebSocket+WAV pipe).
|
|
22
22
|
* @param opts.botName Display name of the bot in the meeting
|
|
23
23
|
*/
|
|
24
|
-
async joinMeeting(meetingUrl,
|
|
24
|
+
async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
|
|
25
25
|
const botName = opts?.botName ?? 'Osborn';
|
|
26
|
-
|
|
27
|
-
//
|
|
28
|
-
// and
|
|
26
|
+
// ARCHITECTURE (post-2026-05-22 polling redesign):
|
|
27
|
+
// The bot joins by name only — visible in the meeting participant list as
|
|
28
|
+
// "Osborn" but with no audio output and no avatar. We do NOT configure any
|
|
29
|
+
// `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
|
|
30
|
+
// the agent polls Recall's REST transcript API every ~30s
|
|
31
|
+
// (see MeetingTranscriptPoller) and feeds new turns into the LLM as
|
|
32
|
+
// `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
|
|
33
|
+
// LLM not to respond out loud to those messages, only to take notes.
|
|
29
34
|
//
|
|
30
|
-
// recording_config.transcript.provider
|
|
31
|
-
//
|
|
32
|
-
//
|
|
33
|
-
// IMPORTANT:
|
|
34
|
-
// - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
|
|
35
|
-
// - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
|
|
36
|
-
// - `transcription_options` does NOT exist — use `transcript.provider`
|
|
37
|
-
// - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
|
|
38
|
-
//
|
|
39
|
-
// ARCHITECTURE (post-2026-05-22 redesign):
|
|
40
|
-
// Input (meeting → osborn): Recall's documented WebSocket audio protocol.
|
|
41
|
-
// `audio_separate_raw` config + websocket realtime endpoint streams
|
|
42
|
-
// per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
|
|
43
|
-
// /meeting-audio-in WS handler. Bot's own audio is excluded by default
|
|
44
|
-
// → zero possibility of feedback loop, no echo cancellation needed.
|
|
45
|
-
// Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
|
|
46
|
-
// page subscribes to osborn's LiveKit audio track and plays it via
|
|
47
|
-
// track.attach(); Recall captures the page's audio output and injects
|
|
48
|
-
// into the meeting.
|
|
49
|
-
// Webhook transcripts (transcript.data): retained as a SECONDARY signal —
|
|
50
|
-
// the agent index.ts handler for this event currently logs but does NOT
|
|
51
|
-
// forward to the LLM (intentionally disabled). The Deepgram WS path
|
|
52
|
-
// above is the LLM input.
|
|
53
|
-
const httpBase = webhookBaseUrl.replace(/\/$/, '');
|
|
54
|
-
const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
|
|
35
|
+
// We DO keep `recording_config.transcript.provider.recallai_streaming` so
|
|
36
|
+
// Recall actually transcribes the meeting — the REST endpoint we poll
|
|
37
|
+
// requires this to be configured, otherwise transcripts are empty.
|
|
55
38
|
const res = await fetch(`${RECALL_BASE_URL}/bot`, {
|
|
56
39
|
method: 'POST',
|
|
57
40
|
headers: {
|
|
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
|
|
|
64
47
|
recording_config: {
|
|
65
48
|
transcript: {
|
|
66
49
|
provider: {
|
|
67
|
-
// recallai_streaming is built-in — no external API key needed,
|
|
68
|
-
// low-latency, works across all meeting platforms.
|
|
69
|
-
// Kept for the secondary webhook signal (display / future use);
|
|
70
|
-
// LLM input now comes from the Deepgram WS pipe below.
|
|
71
50
|
recallai_streaming: {
|
|
72
51
|
mode: 'prioritize_low_latency',
|
|
73
52
|
language_code: 'en',
|
|
74
53
|
},
|
|
75
54
|
},
|
|
76
55
|
},
|
|
77
|
-
// Per-participant raw PCM audio stream. Bot's own audio is excluded
|
|
78
|
-
// (we don't set include_bot_in_recording.audio:true).
|
|
79
|
-
audio_separate_raw: {},
|
|
80
|
-
realtime_endpoints: [
|
|
81
|
-
{
|
|
82
|
-
// Transcript webhook (secondary signal; LLM forwarding disabled).
|
|
83
|
-
type: 'webhook',
|
|
84
|
-
url: `${httpBase}/webhook/recall`,
|
|
85
|
-
events: ['transcript.data'],
|
|
86
|
-
},
|
|
87
|
-
{
|
|
88
|
-
// Per-participant PCM audio → agent's Deepgram STT pipe.
|
|
89
|
-
type: 'websocket',
|
|
90
|
-
url: `${wsBase}/meeting-audio-in`,
|
|
91
|
-
events: ['audio_separate_raw.data'],
|
|
92
|
-
},
|
|
93
|
-
],
|
|
94
|
-
},
|
|
95
|
-
output_media: {
|
|
96
|
-
camera: {
|
|
97
|
-
// `kind` (not `type`) — confirmed from prior debugging.
|
|
98
|
-
// The page Recall renders connects to LiveKit and plays osborn's
|
|
99
|
-
// TTS audio via track.attach(); Recall captures the page audio.
|
|
100
|
-
// The page does NOT call getUserMedia anymore — input now comes
|
|
101
|
-
// from the audio_separate_raw WebSocket above.
|
|
102
|
-
kind: 'webpage',
|
|
103
|
-
config: {
|
|
104
|
-
url: outputPageUrl,
|
|
105
|
-
},
|
|
106
|
-
},
|
|
107
56
|
},
|
|
108
57
|
}),
|
|
109
58
|
});
|
|
@@ -112,9 +61,37 @@ export class RecallClient extends EventEmitter {
|
|
|
112
61
|
throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
|
|
113
62
|
}
|
|
114
63
|
const bot = (await res.json());
|
|
115
|
-
console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (
|
|
64
|
+
console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
|
|
116
65
|
return bot.id;
|
|
117
66
|
}
|
|
67
|
+
/**
|
|
68
|
+
* Fetch the bot's current transcript. Returns an array of "transcript turns"
|
|
69
|
+
* (each turn = one speaker's utterance) sorted by start time. Use the bot's
|
|
70
|
+
* `recordings[0].id` from getBotStatus / bot record to locate the recording,
|
|
71
|
+
* then list its transcripts.
|
|
72
|
+
*
|
|
73
|
+
* Per Recall docs:
|
|
74
|
+
* GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
|
|
75
|
+
* GET /api/v1/transcript/{transcript_id} → transcript with download_url
|
|
76
|
+
* Download the transcript JSON from download_url to get the actual content.
|
|
77
|
+
*
|
|
78
|
+
* For the polling use case (called every ~30s), we use the simpler combined
|
|
79
|
+
* endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
|
|
80
|
+
* convenience and returns the full transcript so far in one call. The caller
|
|
81
|
+
* is responsible for de-duping (keeping a since-cursor) so the LLM only sees
|
|
82
|
+
* new turns.
|
|
83
|
+
*/
|
|
84
|
+
async getTranscript(botId) {
|
|
85
|
+
const res = await fetch(`${RECALL_BASE_URL}/bot/${botId}/transcript`, {
|
|
86
|
+
headers: { 'Authorization': `Token ${this.#apiKey}` },
|
|
87
|
+
});
|
|
88
|
+
if (!res.ok) {
|
|
89
|
+
const err = await res.text().catch(() => '');
|
|
90
|
+
throw new Error(`Recall.ai transcript fetch failed: ${res.status} ${err.substring(0, 200)}`);
|
|
91
|
+
}
|
|
92
|
+
const turns = await res.json();
|
|
93
|
+
return Array.isArray(turns) ? turns : [];
|
|
94
|
+
}
|
|
118
95
|
async leaveMeeting(botId) {
|
|
119
96
|
await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
|
|
120
97
|
method: 'POST',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "osborn",
|
|
3
|
-
"version": "0.9.
|
|
3
|
+
"version": "0.9.43",
|
|
4
4
|
"description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"dev:logged": "tsx scripts/dev-logged.ts",
|
|
12
12
|
"review": "tsx scripts/review.ts",
|
|
13
13
|
"start": "tsx src/index.ts",
|
|
14
|
-
"build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts
|
|
14
|
+
"build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
|
|
15
15
|
"room": "tsx src/index.ts --room",
|
|
16
16
|
"prepublishOnly": "npm run build"
|
|
17
17
|
},
|