osborn 0.9.41 → 0.9.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/meetings/SKILL.md +73 -0
- package/dist/index.js +111 -492
- package/dist/meeting-transcript-poller.d.ts +60 -0
- package/dist/meeting-transcript-poller.js +112 -0
- package/dist/recall-client.d.ts +49 -2
- package/dist/recall-client.js +41 -64
- package/package.json +2 -2
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Skill: Meetings
|
|
2
|
+
|
|
3
|
+
Silent note-taking and TODO-tracking when osborn is sitting in a live meeting.
|
|
4
|
+
|
|
5
|
+
## When to use
|
|
6
|
+
|
|
7
|
+
Use this skill when a user message arrives with the prefix `[MEETING — <botId>]:` — these arrive roughly every 30 seconds while a Recall.ai meeting bot is active. Also use this skill when the orchestrating system injects `[SYSTEM] You are now in a meeting ...`.
|
|
8
|
+
|
|
9
|
+
**Do NOT use this skill** for normal user messages in the voice-native chat — those still get spoken responses as usual.
|
|
10
|
+
|
|
11
|
+
## How to behave
|
|
12
|
+
|
|
13
|
+
For every `[MEETING — *]:` message:
|
|
14
|
+
|
|
15
|
+
1. **Do NOT speak.** No TTS output. No `tts_say`. No conversational reply. This is a silent observer path. If you must acknowledge that you processed the message, do it via a Write/Edit tool call (writing to the workspace), not via spoken or chat output.
|
|
16
|
+
2. **Update `meeting-todos.md`** in the session workspace (`{workspace}/meeting-todos.md`). Append new action items, decisions, and open questions as they emerge in the transcript. Do not rewrite an existing entry unless later transcript content contradicts it.
|
|
17
|
+
3. **Optionally trigger background research silently.** If a topic in the meeting would benefit from a quick web/code lookup, dispatch a researcher sub-agent via the Task tool. Save its output to `{workspace}/library/meeting-research-<topic-slug>.md`. Do NOT speak the result.
|
|
18
|
+
4. **Do not consume voice-native attention.** The user can still talk to you via the voice-native browser. When they do (a normal user message with no `[MEETING — *]` prefix), respond normally — speak. Treat the meeting transcript as background context they can ask about ("what did Sarah say about pricing?" → answer normally).
|
|
19
|
+
|
|
20
|
+
## The `meeting-todos.md` file
|
|
21
|
+
|
|
22
|
+
Keep it scannable. Structure:
|
|
23
|
+
|
|
24
|
+
```markdown
|
|
25
|
+
# Meeting Notes
|
|
26
|
+
|
|
27
|
+
**Bot:** <botId> · **Started:** <ISO timestamp>
|
|
28
|
+
|
|
29
|
+
## TODOs
|
|
30
|
+
|
|
31
|
+
- [ ] <person>: <action item> — <context>
|
|
32
|
+
- [ ] <person>: <action item>
|
|
33
|
+
|
|
34
|
+
## Decisions
|
|
35
|
+
|
|
36
|
+
- <date/time> — <what was decided> (raised by <person>)
|
|
37
|
+
|
|
38
|
+
## Open Questions
|
|
39
|
+
|
|
40
|
+
- <question> — raised by <person>, still unresolved
|
|
41
|
+
- <question> — answered by <person>: <answer>
|
|
42
|
+
|
|
43
|
+
## Highlights
|
|
44
|
+
|
|
45
|
+
- <key moment or quote worth surfacing>
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Update the same file across multiple poll cycles — don't create `meeting-todos-1.md`, `meeting-todos-2.md`. One file, evolving.
|
|
49
|
+
|
|
50
|
+
## Workspace path
|
|
51
|
+
|
|
52
|
+
The session workspace is `~/.claude/projects/<slug>/osb/<session-uuid>/`. Read the env variable or the spec.md header if you need to confirm the exact path. Write absolute paths in tool calls (e.g. `/Users/<user>/.claude/projects/.../osb/<uuid>/meeting-todos.md`).
|
|
53
|
+
|
|
54
|
+
## On meeting end
|
|
55
|
+
|
|
56
|
+
When the user leaves the meeting (the system stops sending `[MEETING — *]:` messages and may inject `[SYSTEM] meeting ended`), do a final pass on `meeting-todos.md` to:
|
|
57
|
+
- Mark items the user has clearly committed to
|
|
58
|
+
- Move resolved open questions to a `## Resolved` section
|
|
59
|
+
- Add a `## Summary` section at the top with 3-5 lines distilling the meeting
|
|
60
|
+
|
|
61
|
+
Still silent. The user will ask out loud if they want a recap.
|
|
62
|
+
|
|
63
|
+
## When the user asks about the meeting
|
|
64
|
+
|
|
65
|
+
When a non-meeting-tagged message references the meeting ("what's on the todo list?", "what did we decide about X?", "who's handling Y?"), respond normally — speak. Read `meeting-todos.md` first to ground the response. Don't make up speaker names or decisions; only state what's recorded.
|
|
66
|
+
|
|
67
|
+
## Anti-patterns
|
|
68
|
+
|
|
69
|
+
- ❌ Speaking in response to a `[MEETING — *]:` message
|
|
70
|
+
- ❌ Creating a new file per poll cycle instead of updating one
|
|
71
|
+
- ❌ Trying to drive the meeting (don't add "we should..." items unless someone in the meeting said them)
|
|
72
|
+
- ❌ Asking the user clarifying questions during the meeting — they're not paying attention to chat
|
|
73
|
+
- ❌ Re-transcribing what's in the message into the TODO file verbatim. Distill.
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent,
|
|
4
|
+
import { Room, RoomEvent, } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -10,7 +10,6 @@ initializeLogger({ pretty: true, level: 'info' });
|
|
|
10
10
|
import { setMaxListeners } from 'node:events';
|
|
11
11
|
setMaxListeners(50);
|
|
12
12
|
import { createServer } from 'http';
|
|
13
|
-
import { WebSocket, WebSocketServer } from 'ws';
|
|
14
13
|
import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
|
|
15
14
|
import { dirname, join } from 'node:path';
|
|
16
15
|
import { fileURLToPath } from 'node:url';
|
|
@@ -34,6 +33,7 @@ import { askHaiku, askFastBrain, updateSpecFromJSONL, processResearchCompletion,
|
|
|
34
33
|
import { DIRECT_MODE_PROMPT, getRealtimeInstructions, getScriptInjection, getProactiveInjection, getNotificationInjection } from './prompts.js';
|
|
35
34
|
import { MCP_CATALOG } from './config.js';
|
|
36
35
|
import { getRecallClient } from './recall-client.js';
|
|
36
|
+
import { MeetingTranscriptPoller } from './meeting-transcript-poller.js';
|
|
37
37
|
import { llm } from '@livekit/agents';
|
|
38
38
|
import { z } from 'zod';
|
|
39
39
|
// ============================================================
|
|
@@ -147,79 +147,6 @@ process.on('uncaughtException', (error) => {
|
|
|
147
147
|
// ============================================================
|
|
148
148
|
// Module-level room code so the HTTP server can expose it via GET /room-code
|
|
149
149
|
let currentRoomCode = null;
|
|
150
|
-
// Meeting output WebSocket — module-level so both startApiServer and main() can access it
|
|
151
|
-
let meetingOutputWs = null;
|
|
152
|
-
// Module-level AgentSession reference so /meeting-audio-in WS handler can switch
|
|
153
|
-
// the RoomIO-linked participant when meeting audio starts/stops (B2 design).
|
|
154
|
-
let activeAgentSession = null;
|
|
155
|
-
// Identity of the local user participant the session was originally listening to
|
|
156
|
-
// — captured at the moment we switch to the meeting publisher, restored on cleanup.
|
|
157
|
-
let preMeetingUserIdentity = null;
|
|
158
|
-
function sendToMeetingOutput(msg) {
|
|
159
|
-
if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
|
|
160
|
-
try {
|
|
161
|
-
meetingOutputWs.send(JSON.stringify(msg));
|
|
162
|
-
}
|
|
163
|
-
catch { }
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
// Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
|
|
167
|
-
// Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
|
|
168
|
-
async function synthesizeForMeeting(text, ttsConfig) {
|
|
169
|
-
if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN)
|
|
170
|
-
return;
|
|
171
|
-
const ttsInstance = createTTS(ttsConfig);
|
|
172
|
-
try {
|
|
173
|
-
const chunks = [];
|
|
174
|
-
let sampleRate = 24000;
|
|
175
|
-
let numChannels = 1;
|
|
176
|
-
const stream = ttsInstance.synthesize(text);
|
|
177
|
-
for await (const event of stream) {
|
|
178
|
-
if (event === Symbol.for('END_OF_STREAM'))
|
|
179
|
-
break;
|
|
180
|
-
const e = event;
|
|
181
|
-
if (e?.frame?.data) {
|
|
182
|
-
chunks.push(e.frame.data);
|
|
183
|
-
sampleRate = e.frame.sampleRate ?? sampleRate;
|
|
184
|
-
numChannels = e.frame.numChannels ?? numChannels;
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
if (chunks.length === 0)
|
|
188
|
-
return;
|
|
189
|
-
const totalSamples = chunks.reduce((s, c) => s + c.length, 0);
|
|
190
|
-
const pcm = new Int16Array(totalSamples);
|
|
191
|
-
let offset = 0;
|
|
192
|
-
for (const c of chunks) {
|
|
193
|
-
pcm.set(c, offset);
|
|
194
|
-
offset += c.length;
|
|
195
|
-
}
|
|
196
|
-
// WAV header (44 bytes) + PCM data
|
|
197
|
-
const dataBytes = pcm.length * 2;
|
|
198
|
-
const wav = Buffer.alloc(44 + dataBytes);
|
|
199
|
-
wav.write('RIFF', 0);
|
|
200
|
-
wav.writeUInt32LE(36 + dataBytes, 4);
|
|
201
|
-
wav.write('WAVE', 8);
|
|
202
|
-
wav.write('fmt ', 12);
|
|
203
|
-
wav.writeUInt32LE(16, 16);
|
|
204
|
-
wav.writeUInt16LE(1, 20);
|
|
205
|
-
wav.writeUInt16LE(numChannels, 22);
|
|
206
|
-
wav.writeUInt32LE(sampleRate, 24);
|
|
207
|
-
wav.writeUInt32LE(sampleRate * numChannels * 2, 28);
|
|
208
|
-
wav.writeUInt16LE(numChannels * 2, 32);
|
|
209
|
-
wav.writeUInt16LE(16, 34);
|
|
210
|
-
wav.write('data', 36);
|
|
211
|
-
wav.writeUInt32LE(dataBytes, 40);
|
|
212
|
-
for (let i = 0; i < pcm.length; i++)
|
|
213
|
-
wav.writeInt16LE(pcm[i], 44 + i * 2);
|
|
214
|
-
if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
|
|
215
|
-
meetingOutputWs.send(wav);
|
|
216
|
-
console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
finally {
|
|
220
|
-
await ttsInstance.close().catch(() => { });
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
150
|
function startApiServer(workingDir, port) {
|
|
224
151
|
const server = createServer(async (req, res) => {
|
|
225
152
|
// CORS headers for cloud frontend
|
|
@@ -317,40 +244,6 @@ function startApiServer(workingDir, port) {
|
|
|
317
244
|
});
|
|
318
245
|
return;
|
|
319
246
|
}
|
|
320
|
-
// GET /meeting-output — Output Media webpage for Recall.ai bot audio.
|
|
321
|
-
//
|
|
322
|
-
// The file lives next to this compiled JS (copied by the build script from
|
|
323
|
-
// src/ to dist/). Resolve via __dirname rather than process.cwd() — in
|
|
324
|
-
// production cwd is the user's workspace, NOT the osborn package directory.
|
|
325
|
-
if (req.method === 'GET' && url.pathname === '/meeting-output') {
|
|
326
|
-
// Try the package-relative path first (post-build location), then fall
|
|
327
|
-
// back to source path for `tsx src/index.ts` dev runs.
|
|
328
|
-
const candidates = [
|
|
329
|
-
join(__dirname, 'meeting-output.html'), // dist/ (production)
|
|
330
|
-
join(__dirname, '..', 'src', 'meeting-output.html'), // dev: dist/ → src/
|
|
331
|
-
join(__dirname, '..', 'meeting-output.html'), // tsx run from src/
|
|
332
|
-
];
|
|
333
|
-
let html = null;
|
|
334
|
-
let foundPath = null;
|
|
335
|
-
for (const p of candidates) {
|
|
336
|
-
try {
|
|
337
|
-
html = readFileSync(p, 'utf-8');
|
|
338
|
-
foundPath = p;
|
|
339
|
-
break;
|
|
340
|
-
}
|
|
341
|
-
catch { }
|
|
342
|
-
}
|
|
343
|
-
if (html) {
|
|
344
|
-
res.writeHead(200, { 'Content-Type': 'text/html' });
|
|
345
|
-
res.end(html);
|
|
346
|
-
}
|
|
347
|
-
else {
|
|
348
|
-
console.warn(`[meeting-output] not found in any of: ${candidates.join(', ')}`);
|
|
349
|
-
res.writeHead(404, { 'Content-Type': 'text/plain' });
|
|
350
|
-
res.end('meeting-output.html not found');
|
|
351
|
-
}
|
|
352
|
-
return;
|
|
353
|
-
}
|
|
354
247
|
if (req.method === 'GET' && url.pathname === '/room-code') {
|
|
355
248
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
356
249
|
res.end(JSON.stringify({ roomCode: currentRoomCode }));
|
|
@@ -965,251 +858,12 @@ function startApiServer(workingDir, port) {
|
|
|
965
858
|
};
|
|
966
859
|
cleanStaleUploadDirs();
|
|
967
860
|
setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
|
|
968
|
-
//
|
|
969
|
-
//
|
|
970
|
-
//
|
|
971
|
-
//
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
// backwards-compat with old machine images still serving the legacy path.
|
|
975
|
-
const meetingOutputWss = new WebSocketServer({ noServer: true });
|
|
976
|
-
meetingOutputWss.on('connection', (ws) => {
|
|
977
|
-
console.log('📺 Meeting output browser connected (legacy /meeting-audio)');
|
|
978
|
-
meetingOutputWs = ws;
|
|
979
|
-
ws.on('close', () => {
|
|
980
|
-
console.log('📺 Meeting output browser disconnected (legacy)');
|
|
981
|
-
if (meetingOutputWs === ws)
|
|
982
|
-
meetingOutputWs = null;
|
|
983
|
-
});
|
|
984
|
-
});
|
|
985
|
-
// ============================================================
|
|
986
|
-
// Recall.ai meeting-audio-in WebSocket — /meeting-audio-in
|
|
987
|
-
// ============================================================
|
|
988
|
-
// Recall.ai's per-participant real-time audio protocol. Bot is configured
|
|
989
|
-
// (in recall-client.ts joinMeeting) with audio_separate_raw + a realtime
|
|
990
|
-
// endpoint pointing at this URL. Recall sends JSON events containing
|
|
991
|
-
// base64-encoded PCM (S16LE, 16kHz, mono) for every meeting participant
|
|
992
|
-
// (bot's own audio NOT included by default — no feedback loop possible).
|
|
993
|
-
//
|
|
994
|
-
// Flow: Recall → /meeting-audio-in → open a SECOND LiveKit connection from
|
|
995
|
-
// this agent process as a publisher participant → publish PCM as an
|
|
996
|
-
// audio track in the same LiveKit room → the existing AgentSession's
|
|
997
|
-
// STT subscribes to it as a remote track → routes to currentLLM.chat()
|
|
998
|
-
// via the same pipeline as voice-native user mic.
|
|
999
|
-
//
|
|
1000
|
-
// The advantage of this design vs a parallel STT pipeline: meeting audio
|
|
1001
|
-
// becomes "just another participant" in the LiveKit room — same end-of-turn
|
|
1002
|
-
// detection, same interrupt handling, same conversation context, no parallel
|
|
1003
|
-
// chat() paths to maintain.
|
|
1004
|
-
//
|
|
1005
|
-
// Wait until activeAgentSession._roomIO exists AND the publisher participant
|
|
1006
|
-
// is visible to the agent's room. Both can race against join_meeting:
|
|
1007
|
-
// - Agent session may still be starting up when Recall connects.
|
|
1008
|
-
// - LiveKit takes a moment to propagate the publisher's join to the agent
|
|
1009
|
-
// side after publishTrack() returns on our side.
|
|
1010
|
-
// Bounded poll (200ms cadence) avoids both timing gaps.
|
|
1011
|
-
async function waitForRoomIOAndParticipant(publisherIdentity, timeoutMs) {
|
|
1012
|
-
const deadline = Date.now() + timeoutMs;
|
|
1013
|
-
let roomIO = null;
|
|
1014
|
-
let participantVisible = false;
|
|
1015
|
-
while (Date.now() < deadline) {
|
|
1016
|
-
roomIO = activeAgentSession?._roomIO;
|
|
1017
|
-
if (roomIO && typeof roomIO.setParticipant === 'function') {
|
|
1018
|
-
const agentRoom = roomIO.rtcRoom;
|
|
1019
|
-
const remotes = agentRoom?.remoteParticipants;
|
|
1020
|
-
if (remotes && typeof remotes.values === 'function') {
|
|
1021
|
-
for (const p of remotes.values()) {
|
|
1022
|
-
if (p?.identity === publisherIdentity) {
|
|
1023
|
-
participantVisible = true;
|
|
1024
|
-
break;
|
|
1025
|
-
}
|
|
1026
|
-
}
|
|
1027
|
-
}
|
|
1028
|
-
if (participantVisible)
|
|
1029
|
-
return { roomIO, participantVisible };
|
|
1030
|
-
}
|
|
1031
|
-
await new Promise(r => setTimeout(r, 200));
|
|
1032
|
-
}
|
|
1033
|
-
// Timed out — return whatever we have. Caller decides whether to proceed.
|
|
1034
|
-
return { roomIO, participantVisible };
|
|
1035
|
-
}
|
|
1036
|
-
const meetingAudioInWss = new WebSocketServer({ noServer: true });
|
|
1037
|
-
meetingAudioInWss.on('connection', async (recallWs) => {
|
|
1038
|
-
console.log('🎙️ Recall audio-in WebSocket connected — setting up LiveKit publisher');
|
|
1039
|
-
const livekitUrl = process.env.LIVEKIT_URL;
|
|
1040
|
-
const apiKey = process.env.LIVEKIT_API_KEY;
|
|
1041
|
-
const apiSecret = process.env.LIVEKIT_API_SECRET;
|
|
1042
|
-
if (!livekitUrl || !apiKey || !apiSecret) {
|
|
1043
|
-
console.warn('⚠️ LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET not set — meeting audio publisher disabled');
|
|
1044
|
-
recallWs.close();
|
|
1045
|
-
return;
|
|
1046
|
-
}
|
|
1047
|
-
if (!currentRoomCode) {
|
|
1048
|
-
console.warn('⚠️ No active LiveKit room (currentRoomCode null) — meeting audio publisher cannot attach');
|
|
1049
|
-
recallWs.close();
|
|
1050
|
-
return;
|
|
1051
|
-
}
|
|
1052
|
-
const roomName = `osborn-${currentRoomCode}`;
|
|
1053
|
-
// Mint a publisher token via livekit-server-sdk (already imported for
|
|
1054
|
-
// /api/token style flows). Long TTL — meetings can run for hours.
|
|
1055
|
-
const identity = `meeting-audio-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
|
1056
|
-
const at = new AccessToken(apiKey, apiSecret, {
|
|
1057
|
-
identity,
|
|
1058
|
-
ttl: 14400, // 4 hours
|
|
1059
|
-
metadata: JSON.stringify({ role: 'meeting-audio-publisher' }),
|
|
1060
|
-
});
|
|
1061
|
-
at.addGrant({ roomJoin: true, room: roomName, canPublish: true, canSubscribe: false });
|
|
1062
|
-
const token = await at.toJwt();
|
|
1063
|
-
let room = null;
|
|
1064
|
-
let source = null;
|
|
1065
|
-
let track = null;
|
|
1066
|
-
const cleanup = async () => {
|
|
1067
|
-
// Restore AgentSession STT input to the original user participant before
|
|
1068
|
-
// tearing down the publisher track. If we don't switch back, the session
|
|
1069
|
-
// will be stuck waiting on a participant that's about to disappear.
|
|
1070
|
-
try {
|
|
1071
|
-
const roomIO = activeAgentSession?._roomIO;
|
|
1072
|
-
if (roomIO && typeof roomIO.setParticipant === 'function') {
|
|
1073
|
-
if (preMeetingUserIdentity) {
|
|
1074
|
-
roomIO.setParticipant(preMeetingUserIdentity);
|
|
1075
|
-
console.log(`🔁 Restored AgentSession STT input to user: ${preMeetingUserIdentity}`);
|
|
1076
|
-
}
|
|
1077
|
-
else {
|
|
1078
|
-
roomIO.unsetParticipant();
|
|
1079
|
-
console.log('🔁 Cleared AgentSession STT input (no original user to restore)');
|
|
1080
|
-
}
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
catch (err) {
|
|
1084
|
-
console.warn('⚠️ Failed to restore RoomIO participant on cleanup:', err.message);
|
|
1085
|
-
}
|
|
1086
|
-
preMeetingUserIdentity = null;
|
|
1087
|
-
try {
|
|
1088
|
-
if (track)
|
|
1089
|
-
await track.close(true);
|
|
1090
|
-
}
|
|
1091
|
-
catch { }
|
|
1092
|
-
try {
|
|
1093
|
-
if (source)
|
|
1094
|
-
await source.close();
|
|
1095
|
-
}
|
|
1096
|
-
catch { }
|
|
1097
|
-
try {
|
|
1098
|
-
if (room)
|
|
1099
|
-
await room.disconnect();
|
|
1100
|
-
}
|
|
1101
|
-
catch { }
|
|
1102
|
-
room = null;
|
|
1103
|
-
source = null;
|
|
1104
|
-
track = null;
|
|
1105
|
-
};
|
|
1106
|
-
try {
|
|
1107
|
-
room = new Room();
|
|
1108
|
-
await room.connect(livekitUrl, token);
|
|
1109
|
-
if (!room.localParticipant)
|
|
1110
|
-
throw new Error('LiveKit connected but localParticipant missing');
|
|
1111
|
-
// Recall sends S16LE PCM at 16kHz mono. AudioSource matches the format.
|
|
1112
|
-
source = new AudioSource(16000, 1);
|
|
1113
|
-
track = LocalAudioTrack.createAudioTrack('meeting-audio', source);
|
|
1114
|
-
await room.localParticipant.publishTrack(track, new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }));
|
|
1115
|
-
console.log(`🎙️ Meeting audio publisher connected to ${roomName} as ${identity}`);
|
|
1116
|
-
// B2 — switch the existing AgentSession's RoomIO input from the local user
|
|
1117
|
-
// to this meeting-audio publisher. While the meeting is active, the user
|
|
1118
|
-
// talks via the meeting (Recall captures it and sends PCM here), and the
|
|
1119
|
-
// agent treats this publisher as the "speaking" participant for STT/EOT.
|
|
1120
|
-
// Original user identity is stashed so cleanup() can restore it.
|
|
1121
|
-
//
|
|
1122
|
-
// 15s timeout accommodates: session-start race (agent still booting when
|
|
1123
|
-
// user clicks "join meeting"), LiveKit participant-join propagation
|
|
1124
|
-
// (~hundreds of ms), and Fly cold-path latency on first request.
|
|
1125
|
-
try {
|
|
1126
|
-
const { roomIO, participantVisible } = await waitForRoomIOAndParticipant(identity, 15000);
|
|
1127
|
-
if (!roomIO) {
|
|
1128
|
-
console.warn('⚠️ Timed out waiting for AgentSession._roomIO (15s) — meeting audio published but STT not switched. Meeting audio will be ignored until a session starts.');
|
|
1129
|
-
}
|
|
1130
|
-
else if (!participantVisible) {
|
|
1131
|
-
// RoomIO exists but our publisher hasn't propagated to the agent's
|
|
1132
|
-
// room view yet. setParticipant stores the identity and links on
|
|
1133
|
-
// participant-connected event, so this is still safe to call —
|
|
1134
|
-
// RoomIO will pick up the link when the event arrives.
|
|
1135
|
-
preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
|
|
1136
|
-
roomIO.setParticipant(identity);
|
|
1137
|
-
console.log(`🔁 Switched AgentSession STT input (publisher not yet visible — will link on connect): ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
|
|
1138
|
-
}
|
|
1139
|
-
else {
|
|
1140
|
-
preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
|
|
1141
|
-
roomIO.setParticipant(identity);
|
|
1142
|
-
console.log(`🔁 Switched AgentSession STT input: ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
|
|
1143
|
-
}
|
|
1144
|
-
}
|
|
1145
|
-
catch (err) {
|
|
1146
|
-
console.warn('⚠️ Failed to switch RoomIO participant:', err.message);
|
|
1147
|
-
}
|
|
1148
|
-
}
|
|
1149
|
-
catch (err) {
|
|
1150
|
-
console.error('❌ Failed to set up LiveKit publisher for meeting audio:', err instanceof Error ? err.message : err);
|
|
1151
|
-
try {
|
|
1152
|
-
recallWs.close();
|
|
1153
|
-
}
|
|
1154
|
-
catch { }
|
|
1155
|
-
await cleanup();
|
|
1156
|
-
return;
|
|
1157
|
-
}
|
|
1158
|
-
// Recall → us: JSON events with base64-encoded PCM. Decode, wrap as
|
|
1159
|
-
// AudioFrame, and capture into the source. AgentSession in the main room
|
|
1160
|
-
// will subscribe to this published track and STT it via the normal pipeline.
|
|
1161
|
-
// Payload shape from
|
|
1162
|
-
// docs.recall.ai/docs/how-to-get-separate-audio-per-participant-realtime:
|
|
1163
|
-
// { event: 'audio_separate_raw.data', data: { data: { buffer: '<base64>', ... }, participant: {...} } }
|
|
1164
|
-
recallWs.on('message', async (raw) => {
|
|
1165
|
-
if (!source)
|
|
1166
|
-
return;
|
|
1167
|
-
try {
|
|
1168
|
-
const msg = JSON.parse(raw.toString());
|
|
1169
|
-
if (msg.event !== 'audio_separate_raw.data')
|
|
1170
|
-
return;
|
|
1171
|
-
const b64 = msg.data?.data?.buffer;
|
|
1172
|
-
if (!b64)
|
|
1173
|
-
return;
|
|
1174
|
-
const pcmBuf = Buffer.from(b64, 'base64');
|
|
1175
|
-
// AudioFrame expects Int16Array. The PCM buffer is S16LE — view it
|
|
1176
|
-
// directly without copy. Length / 2 = samples (each sample 2 bytes).
|
|
1177
|
-
const samplesPerChannel = pcmBuf.byteLength / 2;
|
|
1178
|
-
const int16 = new Int16Array(pcmBuf.buffer, pcmBuf.byteOffset, samplesPerChannel);
|
|
1179
|
-
const frame = new AudioFrame(int16, 16000, 1, samplesPerChannel);
|
|
1180
|
-
await source.captureFrame(frame);
|
|
1181
|
-
}
|
|
1182
|
-
catch (err) {
|
|
1183
|
-
// Don't log every frame parse failure — could be noisy if Recall sends
|
|
1184
|
-
// non-audio_separate_raw events on the same channel.
|
|
1185
|
-
if (err.message?.includes('JSON'))
|
|
1186
|
-
return;
|
|
1187
|
-
console.warn('⚠️ meeting audio capture error:', err instanceof Error ? err.message : err);
|
|
1188
|
-
}
|
|
1189
|
-
});
|
|
1190
|
-
recallWs.on('close', async () => {
|
|
1191
|
-
console.log('🎙️ Recall audio-in WebSocket closed — tearing down LiveKit publisher');
|
|
1192
|
-
await cleanup();
|
|
1193
|
-
});
|
|
1194
|
-
recallWs.on('error', (err) => {
|
|
1195
|
-
console.warn('⚠️ Recall WS error:', err instanceof Error ? err.message : err);
|
|
1196
|
-
});
|
|
1197
|
-
});
|
|
1198
|
-
server.on('upgrade', (req, socket, head) => {
|
|
1199
|
-
const url = new URL(req.url || '/', `http://localhost:${port}`);
|
|
1200
|
-
if (url.pathname === '/meeting-audio') {
|
|
1201
|
-
meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
|
|
1202
|
-
meetingOutputWss.emit('connection', ws, req);
|
|
1203
|
-
});
|
|
1204
|
-
}
|
|
1205
|
-
else if (url.pathname === '/meeting-audio-in') {
|
|
1206
|
-
meetingAudioInWss.handleUpgrade(req, socket, head, (ws) => {
|
|
1207
|
-
meetingAudioInWss.emit('connection', ws, req);
|
|
1208
|
-
});
|
|
1209
|
-
}
|
|
1210
|
-
else {
|
|
1211
|
-
socket.destroy();
|
|
1212
|
-
}
|
|
861
|
+
// No WebSocket upgrade routes — meeting audio in/out moved off LiveKit to
|
|
862
|
+
// a polling architecture (see MeetingTranscriptPoller). The /meeting-audio
|
|
863
|
+
// and /meeting-audio-in routes were the old WebSocket-audio pipeline; both
|
|
864
|
+
// are gone. Reject all upgrade attempts.
|
|
865
|
+
server.on('upgrade', (_req, socket) => {
|
|
866
|
+
socket.destroy();
|
|
1213
867
|
});
|
|
1214
868
|
server.on('error', (err) => {
|
|
1215
869
|
if (err.code === 'EADDRINUSE') {
|
|
@@ -1432,7 +1086,7 @@ async function main() {
|
|
|
1432
1086
|
// session-only path (no user prefix).
|
|
1433
1087
|
let currentUserId = '';
|
|
1434
1088
|
let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
|
|
1435
|
-
|
|
1089
|
+
let activeMeetingPoller = null; // Transcript poller bound to that bot
|
|
1436
1090
|
// Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
|
|
1437
1091
|
// Updated by resume_session, session_selected, continue_session, switch_session handlers
|
|
1438
1092
|
let currentResumeSessionId;
|
|
@@ -1883,6 +1537,40 @@ async function main() {
|
|
|
1883
1537
|
}
|
|
1884
1538
|
}
|
|
1885
1539
|
}
|
|
1540
|
+
// Compaction event → frontend bridge. Forwards the raw event (consumed by the
|
|
1541
|
+
// dedicated banner UI state machine) AND emits a `claude_output` chat bubble
|
|
1542
|
+
// (so the activity is visible inline in chat even when the banner is hidden,
|
|
1543
|
+
// collapsed, or unreliable on iPad/iPhone). Extracted as a helper because
|
|
1544
|
+
// both direct-mode and pipeline-mode need to register it — the pipeline path
|
|
1545
|
+
// previously skipped this entirely, so compaction events fired into the void
|
|
1546
|
+
// in pipeline mode.
|
|
1547
|
+
const buildOnCompactionEvent = () => (event) => {
|
|
1548
|
+
try {
|
|
1549
|
+
// Raw event → banner state machine (compaction_started/progress/complete handlers in VoiceRoom.tsx).
|
|
1550
|
+
sendToFrontend({ ...event });
|
|
1551
|
+
// Inline chat bubble — reuses the existing claude_output path that's already working.
|
|
1552
|
+
if (event.type === 'compaction_started') {
|
|
1553
|
+
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
1554
|
+
sendToFrontend({
|
|
1555
|
+
type: 'claude_output',
|
|
1556
|
+
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
1557
|
+
agentRole: 'direct',
|
|
1558
|
+
});
|
|
1559
|
+
}
|
|
1560
|
+
else if (event.type === 'compaction_complete') {
|
|
1561
|
+
const n = event.skillsWritten ?? 0;
|
|
1562
|
+
const names = Array.isArray(event.skillNames) && event.skillNames.length > 0
|
|
1563
|
+
? ` — ${event.skillNames.join(', ')}`
|
|
1564
|
+
: '';
|
|
1565
|
+
sendToFrontend({
|
|
1566
|
+
type: 'claude_output',
|
|
1567
|
+
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
1568
|
+
agentRole: 'direct',
|
|
1569
|
+
});
|
|
1570
|
+
}
|
|
1571
|
+
}
|
|
1572
|
+
catch { /* non-fatal */ }
|
|
1573
|
+
};
|
|
1886
1574
|
// Create DIRECT session (STT + Claude Agent SDK + TTS)
|
|
1887
1575
|
async function createDirectSession(resumeSessionId, llmOverride) {
|
|
1888
1576
|
console.log('🎯 Creating direct session...');
|
|
@@ -1898,39 +1586,7 @@ async function main() {
|
|
|
1898
1586
|
resumeSessionId,
|
|
1899
1587
|
voiceMode: 'direct',
|
|
1900
1588
|
skipTTSQueue: true,
|
|
1901
|
-
onCompactionEvent: (
|
|
1902
|
-
try {
|
|
1903
|
-
// Forward the raw event so the dedicated banner UI can render it (if/when fixed).
|
|
1904
|
-
sendToFrontend({ ...event });
|
|
1905
|
-
// ALSO emit as a claude_output chat bubble — reuses the existing message path
|
|
1906
|
-
// that's already working end-to-end. PreCompact → in-progress bubble.
|
|
1907
|
-
// PostCompact → completion bubble with the skills summary. The dedicated
|
|
1908
|
-
// banner has been unreliable in production (data path works on backend, banner
|
|
1909
|
-
// never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
|
|
1910
|
-
// are visible without dev tools.
|
|
1911
|
-
if (event.type === 'compaction_started') {
|
|
1912
|
-
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
1913
|
-
sendToFrontend({
|
|
1914
|
-
type: 'claude_output',
|
|
1915
|
-
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
1916
|
-
agentRole: 'direct',
|
|
1917
|
-
});
|
|
1918
|
-
}
|
|
1919
|
-
else if (event.type === 'compaction_complete') {
|
|
1920
|
-
const ev = event;
|
|
1921
|
-
const n = ev.skillsWritten ?? 0;
|
|
1922
|
-
const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
|
|
1923
|
-
? ` — ${ev.skillNames.join(', ')}`
|
|
1924
|
-
: '';
|
|
1925
|
-
sendToFrontend({
|
|
1926
|
-
type: 'claude_output',
|
|
1927
|
-
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
1928
|
-
agentRole: 'direct',
|
|
1929
|
-
});
|
|
1930
|
-
}
|
|
1931
|
-
}
|
|
1932
|
-
catch { /* non-fatal */ }
|
|
1933
|
-
},
|
|
1589
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
1934
1590
|
});
|
|
1935
1591
|
currentLLM = directLLM;
|
|
1936
1592
|
// Reset the session always-allow list for each new direct session
|
|
@@ -2130,20 +1786,6 @@ async function main() {
|
|
|
2130
1786
|
}
|
|
2131
1787
|
const sayId = Date.now(); // simple ID to correlate start/end logs
|
|
2132
1788
|
console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
|
|
2133
|
-
// Forward spoken text + audio to meeting output page when bot is in a meeting.
|
|
2134
|
-
// Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
|
|
2135
|
-
// previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
|
|
2136
|
-
// (Deepgram aura-2-asteria-en) when no user config exists, producing a different
|
|
2137
|
-
// voice in the meeting than what the user hears in voice-native. Both paths now
|
|
2138
|
-
// share the single source of truth.
|
|
2139
|
-
// PCM frames are WAV-encoded and pushed as binary WebSocket frames.
|
|
2140
|
-
// Recall captures the browser page's audio output and injects it into the meeting.
|
|
2141
|
-
if (activeMeetingBotId) {
|
|
2142
|
-
sendToMeetingOutput({ type: 'speak', text: data.text });
|
|
2143
|
-
if (meetingOutputWs) {
|
|
2144
|
-
synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
|
|
2145
|
-
}
|
|
2146
|
-
}
|
|
2147
1789
|
try {
|
|
2148
1790
|
const handle = currentSession.say(data.text);
|
|
2149
1791
|
if (handle && typeof handle.addDoneCallback === 'function') {
|
|
@@ -2281,39 +1923,7 @@ async function main() {
|
|
|
2281
1923
|
sessionBaseDir,
|
|
2282
1924
|
mcpServers,
|
|
2283
1925
|
resumeSessionId,
|
|
2284
|
-
onCompactionEvent: (
|
|
2285
|
-
try {
|
|
2286
|
-
// Forward the raw event so the dedicated banner UI can render it (if/when fixed).
|
|
2287
|
-
sendToFrontend({ ...event });
|
|
2288
|
-
// ALSO emit as a claude_output chat bubble — reuses the existing message path
|
|
2289
|
-
// that's already working end-to-end. PreCompact → in-progress bubble.
|
|
2290
|
-
// PostCompact → completion bubble with the skills summary. The dedicated
|
|
2291
|
-
// banner has been unreliable in production (data path works on backend, banner
|
|
2292
|
-
// never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
|
|
2293
|
-
// are visible without dev tools.
|
|
2294
|
-
if (event.type === 'compaction_started') {
|
|
2295
|
-
const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
|
|
2296
|
-
sendToFrontend({
|
|
2297
|
-
type: 'claude_output',
|
|
2298
|
-
text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
|
|
2299
|
-
agentRole: 'direct',
|
|
2300
|
-
});
|
|
2301
|
-
}
|
|
2302
|
-
else if (event.type === 'compaction_complete') {
|
|
2303
|
-
const ev = event;
|
|
2304
|
-
const n = ev.skillsWritten ?? 0;
|
|
2305
|
-
const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
|
|
2306
|
-
? ` — ${ev.skillNames.join(', ')}`
|
|
2307
|
-
: '';
|
|
2308
|
-
sendToFrontend({
|
|
2309
|
-
type: 'claude_output',
|
|
2310
|
-
text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
|
|
2311
|
-
agentRole: 'direct',
|
|
2312
|
-
});
|
|
2313
|
-
}
|
|
2314
|
-
}
|
|
2315
|
-
catch { /* non-fatal */ }
|
|
2316
|
-
},
|
|
1926
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
2317
1927
|
});
|
|
2318
1928
|
currentLLM = realtimeClaudeHandler;
|
|
2319
1929
|
// For resumed sessions, eagerly create workspace (we know the real ID)
|
|
@@ -2856,7 +2466,6 @@ async function main() {
|
|
|
2856
2466
|
}
|
|
2857
2467
|
lastCompletedResearch = null;
|
|
2858
2468
|
currentSession = null;
|
|
2859
|
-
activeAgentSession = null;
|
|
2860
2469
|
currentAgent = null;
|
|
2861
2470
|
// Same disconnect-leak fix as the other two cleanup sites — kill the Claude SDK
|
|
2862
2471
|
// subprocess BEFORE dropping the reference. See killCurrentLLM() for full context.
|
|
@@ -2902,7 +2511,6 @@ async function main() {
|
|
|
2902
2511
|
}
|
|
2903
2512
|
catch { }
|
|
2904
2513
|
currentSession = null;
|
|
2905
|
-
activeAgentSession = null;
|
|
2906
2514
|
currentAgent = null;
|
|
2907
2515
|
// Same disconnect-leak fix — kill the previous user's Claude subprocess
|
|
2908
2516
|
// before binding currentLLM to the new user's session below.
|
|
@@ -3022,6 +2630,13 @@ async function main() {
|
|
|
3022
2630
|
resumeSessionId,
|
|
3023
2631
|
voiceMode: 'direct',
|
|
3024
2632
|
skipTTSQueue: true,
|
|
2633
|
+
// PipelineDirectOptions extends ClaudeLLMOptions; passing this through
|
|
2634
|
+
// forwards it into the inner `new ClaudeLLM(opts)`. Without this,
|
|
2635
|
+
// pipeline mode silently drops every PreCompact/PostCompact event
|
|
2636
|
+
// — banner never appears, chat bubble never appears — because
|
|
2637
|
+
// createDirectSession's `createClaudeLLM(...)` call is skipped when
|
|
2638
|
+
// an llmOverride is supplied (which is exactly what pipeline mode does).
|
|
2639
|
+
onCompactionEvent: buildOnCompactionEvent(),
|
|
3025
2640
|
getChatHistory: () => getChatHistory(20).map(t => ({ role: t.role, content: t.text })),
|
|
3026
2641
|
getResearchContext: () => {
|
|
3027
2642
|
if (activeResearch?.researchLog.length) {
|
|
@@ -3057,7 +2672,6 @@ async function main() {
|
|
|
3057
2672
|
agent = result.agent;
|
|
3058
2673
|
}
|
|
3059
2674
|
currentSession = session;
|
|
3060
|
-
activeAgentSession = session;
|
|
3061
2675
|
currentAgent = agent; // Store for updateChatCtx() context injection
|
|
3062
2676
|
// ============================================================
|
|
3063
2677
|
// Session event wiring — extracted into function for auto-recovery
|
|
@@ -3217,7 +2831,6 @@ async function main() {
|
|
|
3217
2831
|
}
|
|
3218
2832
|
catch { }
|
|
3219
2833
|
currentSession = null;
|
|
3220
|
-
activeAgentSession = null;
|
|
3221
2834
|
currentAgent = null;
|
|
3222
2835
|
// Clear stale state from crashed session
|
|
3223
2836
|
voiceQueue.length = 0;
|
|
@@ -3279,7 +2892,6 @@ async function main() {
|
|
|
3279
2892
|
const newSession = result.session;
|
|
3280
2893
|
const newAgent = result.agent;
|
|
3281
2894
|
currentSession = newSession;
|
|
3282
|
-
activeAgentSession = newSession;
|
|
3283
2895
|
currentAgent = newAgent;
|
|
3284
2896
|
// Re-wire event listeners on the new session
|
|
3285
2897
|
wireSessionEvents(newSession, newAgent);
|
|
@@ -3336,7 +2948,6 @@ async function main() {
|
|
|
3336
2948
|
}
|
|
3337
2949
|
catch { }
|
|
3338
2950
|
currentSession = null;
|
|
3339
|
-
activeAgentSession = null;
|
|
3340
2951
|
currentAgent = null;
|
|
3341
2952
|
// Clear voice queue — stale injections from the crashed session
|
|
3342
2953
|
voiceQueue.length = 0;
|
|
@@ -3360,7 +2971,6 @@ async function main() {
|
|
|
3360
2971
|
const newSession = result.session;
|
|
3361
2972
|
const newAgent = result.agent;
|
|
3362
2973
|
currentSession = newSession;
|
|
3363
|
-
activeAgentSession = newSession;
|
|
3364
2974
|
currentAgent = newAgent;
|
|
3365
2975
|
// Re-wire event listeners on the new session
|
|
3366
2976
|
wireSessionEvents(newSession, newAgent);
|
|
@@ -3555,7 +3165,6 @@ async function main() {
|
|
|
3555
3165
|
if (currentSession) {
|
|
3556
3166
|
const sessionToClose = currentSession;
|
|
3557
3167
|
currentSession = null;
|
|
3558
|
-
activeAgentSession = null;
|
|
3559
3168
|
// Track async close so new connections can wait for byte stream handler to be released
|
|
3560
3169
|
pendingSessionClose = (async () => {
|
|
3561
3170
|
try {
|
|
@@ -3577,6 +3186,10 @@ async function main() {
|
|
|
3577
3186
|
clearFastBrainSession();
|
|
3578
3187
|
clearPipelineFastBrainSession();
|
|
3579
3188
|
// Auto-leave any active meeting bot when user disconnects from the room
|
|
3189
|
+
if (activeMeetingPoller) {
|
|
3190
|
+
activeMeetingPoller.stop();
|
|
3191
|
+
activeMeetingPoller = null;
|
|
3192
|
+
}
|
|
3580
3193
|
if (activeMeetingBotId) {
|
|
3581
3194
|
const recallDisconnect = getRecallClient();
|
|
3582
3195
|
if (recallDisconnect) {
|
|
@@ -4160,61 +3773,61 @@ async function main() {
|
|
|
4160
3773
|
(process.env.FLY_APP_NAME
|
|
4161
3774
|
? `https://${process.env.FLY_APP_NAME}.fly.dev`
|
|
4162
3775
|
: `http://localhost:${apiPort}`);
|
|
4163
|
-
//
|
|
4164
|
-
//
|
|
4165
|
-
//
|
|
4166
|
-
//
|
|
4167
|
-
|
|
4168
|
-
|
|
4169
|
-
|
|
4170
|
-
|
|
4171
|
-
|
|
4172
|
-
|
|
4173
|
-
//
|
|
4174
|
-
//
|
|
4175
|
-
//
|
|
4176
|
-
//
|
|
4177
|
-
|
|
4178
|
-
//
|
|
4179
|
-
// Auth: the endpoint uses LiveKit room-presence as the auth check
|
|
4180
|
-
// — no shared secret needed. The agent must already be in the
|
|
4181
|
-
// requested room (which it is by this point) for the mint to
|
|
4182
|
-
// succeed.
|
|
4183
|
-
let outputPageUrl;
|
|
4184
|
-
const frontendUrl = data.frontendBase
|
|
4185
|
-
|| process.env.OSBORN_FRONTEND_URL;
|
|
4186
|
-
if (frontendUrl) {
|
|
3776
|
+
// Polling architecture (post-2026-05-22): the bot joins by name
|
|
3777
|
+
// only — no output_media webpage, no LiveKit republish, no audio
|
|
3778
|
+
// pipeline at all. Recall captures the meeting audio internally
|
|
3779
|
+
// and we pull the transcript via its REST API every ~30s.
|
|
3780
|
+
await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
|
|
3781
|
+
const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
|
|
3782
|
+
const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
|
|
3783
|
+
recallJoin.registerBot(botId, sessionId);
|
|
3784
|
+
activeMeetingBotId = botId;
|
|
3785
|
+
await sendToFrontend({ type: 'meeting_joined', botId, message: 'Osborn has joined the meeting' });
|
|
3786
|
+
// System injection so the LLM knows it's in a meeting and which
|
|
3787
|
+
// skill to apply. The meetings skill (agent/.claude/skills/meetings/SKILL.md)
|
|
3788
|
+
// teaches the agent: don't speak in response to [MEETING — *]:
|
|
3789
|
+
// messages, keep meeting-todos.md updated in the workspace, etc.
|
|
3790
|
+
if (currentLLM) {
|
|
4187
3791
|
try {
|
|
4188
|
-
const
|
|
4189
|
-
|
|
4190
|
-
|
|
4191
|
-
|
|
4192
|
-
body: JSON.stringify({ botId: botLkId, roomName }),
|
|
3792
|
+
const sysCtx = new llm.ChatContext();
|
|
3793
|
+
sysCtx.addMessage({
|
|
3794
|
+
role: 'user',
|
|
3795
|
+
content: `[SYSTEM] You are now in a meeting (Recall bot ID: ${botId}, URL: ${meetingUrl}). Transcript chunks will arrive every ~30 seconds tagged \`[MEETING — ${botId}]:\`. Follow the meetings skill: do NOT speak in response (no TTS output), instead maintain meeting-todos.md in the session workspace, optionally trigger background research silently. The voice-native user can still interact normally — only the meeting-tagged messages are the silent-observer path. Acknowledge by writing the initial meeting-todos.md skeleton.`,
|
|
4193
3796
|
});
|
|
4194
|
-
|
|
4195
|
-
|
|
4196
|
-
const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
|
|
4197
|
-
outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
|
|
4198
|
-
console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
|
|
4199
|
-
}
|
|
4200
|
-
else {
|
|
4201
|
-
const errText = await tokenRes.text().catch(() => '');
|
|
4202
|
-
console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
|
|
4203
|
-
}
|
|
3797
|
+
currentLLM.chat({ chatCtx: sysCtx });
|
|
3798
|
+
console.log('📓 Meeting system injection sent to LLM');
|
|
4204
3799
|
}
|
|
4205
|
-
catch (
|
|
4206
|
-
console.warn(
|
|
3800
|
+
catch (sysErr) {
|
|
3801
|
+
console.warn('⚠️ Meeting system injection failed:', sysErr.message);
|
|
4207
3802
|
}
|
|
4208
3803
|
}
|
|
4209
|
-
|
|
4210
|
-
|
|
3804
|
+
// Start polling the transcript every 30s. Each batch of new turns
|
|
3805
|
+
// is pushed to currentLLM.chat() tagged [MEETING — botId]: so the
|
|
3806
|
+
// skill kicks in. Poller dedups via first-word timestamp cursor.
|
|
3807
|
+
if (activeMeetingPoller) {
|
|
3808
|
+
activeMeetingPoller.stop();
|
|
3809
|
+
activeMeetingPoller = null;
|
|
4211
3810
|
}
|
|
4212
|
-
|
|
4213
|
-
|
|
4214
|
-
|
|
4215
|
-
|
|
4216
|
-
|
|
4217
|
-
|
|
3811
|
+
activeMeetingPoller = new MeetingTranscriptPoller({
|
|
3812
|
+
botId,
|
|
3813
|
+
recall: recallJoin,
|
|
3814
|
+
onTurns: async ({ formatted }) => {
|
|
3815
|
+
if (!currentLLM) {
|
|
3816
|
+
console.warn('📓 Meeting transcript arrived but currentLLM is null — dropping');
|
|
3817
|
+
return;
|
|
3818
|
+
}
|
|
3819
|
+
const tagged = `[MEETING — ${botId}]:\n${formatted}`;
|
|
3820
|
+
try {
|
|
3821
|
+
const turnCtx = new llm.ChatContext();
|
|
3822
|
+
turnCtx.addMessage({ role: 'user', content: tagged });
|
|
3823
|
+
currentLLM.chat({ chatCtx: turnCtx });
|
|
3824
|
+
}
|
|
3825
|
+
catch (err) {
|
|
3826
|
+
console.warn(`⚠️ Failed to forward meeting transcript to LLM: ${err.message}`);
|
|
3827
|
+
}
|
|
3828
|
+
},
|
|
3829
|
+
});
|
|
3830
|
+
activeMeetingPoller.start();
|
|
4218
3831
|
}
|
|
4219
3832
|
catch (err) {
|
|
4220
3833
|
console.error('❌ Recall.ai join error:', err);
|
|
@@ -4228,6 +3841,12 @@ async function main() {
|
|
|
4228
3841
|
const recallLeave = getRecallClient();
|
|
4229
3842
|
if (recallLeave && botId) {
|
|
4230
3843
|
try {
|
|
3844
|
+
// Stop the transcript poller FIRST so no more transcript chunks get
|
|
3845
|
+
// forwarded to the LLM during the leave.
|
|
3846
|
+
if (activeMeetingPoller) {
|
|
3847
|
+
activeMeetingPoller.stop();
|
|
3848
|
+
activeMeetingPoller = null;
|
|
3849
|
+
}
|
|
4231
3850
|
await recallLeave.leaveMeeting(botId);
|
|
4232
3851
|
activeMeetingBotId = null;
|
|
4233
3852
|
await sendToFrontend({ type: 'meeting_left', botId });
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
|
|
3
|
+
* interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
|
|
4
|
+
* messages.
|
|
5
|
+
*
|
|
6
|
+
* This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
|
|
7
|
+
* PCM from Recall into a LiveKit room. The polling architecture is simpler
|
|
8
|
+
* (no parallel STT, no audio pipeline, no participant juggling), survives
|
|
9
|
+
* agent restarts (Recall keeps the transcript on its side), and the LLM
|
|
10
|
+
* never speaks in the meeting — it's a silent note-taker.
|
|
11
|
+
*
|
|
12
|
+
* Lifecycle:
|
|
13
|
+
* const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
|
|
14
|
+
* poller.start()
|
|
15
|
+
* ...
|
|
16
|
+
* poller.stop() // on leave_meeting / disconnect / session switch
|
|
17
|
+
*
|
|
18
|
+
* Dedup strategy:
|
|
19
|
+
* Each turn carries a `start_timestamp.relative` on its first word (seconds
|
|
20
|
+
* since recording start). We track the highest cursor we've forwarded and
|
|
21
|
+
* only send turns with a strictly greater first-word timestamp. This means
|
|
22
|
+
* re-fetches don't double-deliver, and partial transcripts that get refined
|
|
23
|
+
* later don't re-trigger LLM processing of already-handled turns.
|
|
24
|
+
*
|
|
25
|
+
* Error handling:
|
|
26
|
+
* Transient fetch errors are logged + skipped (poll continues on next tick).
|
|
27
|
+
* No backoff — Recall's transcript endpoint is stable enough that a 30s
|
|
28
|
+
* cadence lets "slow start" hiccups self-recover within one cycle.
|
|
29
|
+
*/
|
|
30
|
+
import type { RecallClient, TranscriptTurn } from './recall-client.js';
|
|
31
|
+
export interface MeetingTranscriptPollerOptions {
|
|
32
|
+
botId: string;
|
|
33
|
+
recall: RecallClient;
|
|
34
|
+
/** Called with each batch of new (de-duped) transcript turns; ticks that find nothing new do not call it. */
|
|
35
|
+
onTurns: (chunk: {
|
|
36
|
+
botId: string;
|
|
37
|
+
turns: TranscriptTurn[];
|
|
38
|
+
formatted: string;
|
|
39
|
+
}) => void | Promise<void>;
|
|
40
|
+
/** Default 30s — matches the user's stated cadence. */
|
|
41
|
+
intervalMs?: number;
|
|
42
|
+
/** Optional callback invoked with the error whenever a poll tick fails. */
|
|
43
|
+
onError?: (err: Error) => void;
|
|
44
|
+
}
|
|
45
|
+
export declare class MeetingTranscriptPoller {
|
|
46
|
+
#private;
|
|
47
|
+
constructor(opts: MeetingTranscriptPollerOptions);
|
|
48
|
+
start(): void;
|
|
49
|
+
stop(): void;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Format an array of turns into a single string for LLM consumption.
|
|
53
|
+
*
|
|
54
|
+
* Each turn becomes:
|
|
55
|
+
* <Speaker>: <text>
|
|
56
|
+
*
|
|
57
|
+
* Whitespace-only words and zero-content turns are dropped. Returns empty
|
|
58
|
+
* string if nothing meaningful is in the batch.
|
|
59
|
+
*/
|
|
60
|
+
export declare function formatTurns(turns: TranscriptTurn[]): string;
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
|
|
3
|
+
* interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
|
|
4
|
+
* messages.
|
|
5
|
+
*
|
|
6
|
+
* This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
|
|
7
|
+
* PCM from Recall into a LiveKit room. The polling architecture is simpler
|
|
8
|
+
* (no parallel STT, no audio pipeline, no participant juggling), survives
|
|
9
|
+
* agent restarts (Recall keeps the transcript on its side), and the LLM
|
|
10
|
+
* never speaks in the meeting — it's a silent note-taker.
|
|
11
|
+
*
|
|
12
|
+
* Lifecycle:
|
|
13
|
+
* const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
|
|
14
|
+
* poller.start()
|
|
15
|
+
* ...
|
|
16
|
+
* poller.stop() // on leave_meeting / disconnect / session switch
|
|
17
|
+
*
|
|
18
|
+
* Dedup strategy:
|
|
19
|
+
* Each turn carries a `start_timestamp.relative` on its first word (seconds
|
|
20
|
+
* since recording start). We track the highest cursor we've forwarded and
|
|
21
|
+
* only send turns with a strictly greater first-word timestamp. This means
|
|
22
|
+
* re-fetches don't double-deliver, and partial transcripts that get refined
|
|
23
|
+
* later don't re-trigger LLM processing of already-handled turns.
|
|
24
|
+
*
|
|
25
|
+
* Error handling:
|
|
26
|
+
* Transient fetch errors are logged + skipped (poll continues on next tick).
|
|
27
|
+
* No backoff — Recall's transcript endpoint is stable enough that a 30s
|
|
28
|
+
* cadence lets "slow start" hiccups self-recover within one cycle.
|
|
29
|
+
*/
|
|
30
|
+
export class MeetingTranscriptPoller {
    /** Options passed to the constructor (botId, recall, onTurns, intervalMs, onError). */
    #opts;
    /** Handle returned by setInterval while polling is active, otherwise null. */
    #timer = null;
    /** Highest first-word start timestamp forwarded so far; -Infinity until the first delivery. */
    #cursor = -Infinity;
    /** True while a tick is awaiting the fetch or the callback — prevents overlapping polls. */
    #inFlight = false;
    /** Set once by stop(); a stopped poller never polls or delivers again. */
    #stopped = false;

    /**
     * @param {object} opts
     * @param {string} opts.botId - Bot whose transcript is polled.
     * @param {{ getTranscript(botId: string): Promise<Array<object>> }} opts.recall - Transcript source.
     * @param {(chunk: { botId: string, turns: Array<object>, formatted: string }) => (void|Promise<void>)} opts.onTurns
     *   Receives each batch of not-yet-forwarded turns.
     * @param {number} [opts.intervalMs=30000] - Delay between polls, in milliseconds.
     * @param {(err: Error) => void} [opts.onError] - Notified when a tick fails.
     */
    constructor(opts) {
        this.#opts = opts;
    }

    /**
     * Begin polling: one tick immediately, then one per interval.
     * No-op if already running. Also a no-op after stop(): a stopped poller is
     * single-use, and arming a timer here would leak it, because stop() returns
     * early once the poller is marked stopped and could never clear it.
     */
    start() {
        if (this.#stopped) {
            console.warn(`⚠️ MeetingTranscriptPoller start() ignored — poller for bot=${this.#opts.botId.substring(0, 8)} was already stopped`);
            return;
        }
        if (this.#timer)
            return;
        const interval = this.#opts.intervalMs ?? 30_000;
        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
        // Fire once immediately so the LLM sees the meeting started, then on interval.
        void this.#tick();
        this.#timer = setInterval(() => void this.#tick(), interval);
    }

    /**
     * Stop polling permanently. Idempotent. A tick that is mid-fetch when this
     * is called discards its result instead of delivering it.
     */
    stop() {
        if (this.#stopped)
            return;
        this.#stopped = true;
        if (this.#timer) {
            clearInterval(this.#timer);
            this.#timer = null;
        }
        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
    }

    /**
     * Start time of a turn, read from its first word.
     * Reads `start_timestamp.relative`; also accepts a bare numeric
     * `start_timestamp`. NOTE(review): the numeric form is believed to be what
     * Recall's legacy transcript endpoint returns — confirm against the live API.
     *
     * @param {object} turn
     * @returns {number|undefined} The timestamp, or undefined when the turn has none.
     */
    static #turnStart(turn) {
        const start = turn?.words?.[0]?.start_timestamp;
        const ts = typeof start === 'number' ? start : start?.relative;
        return typeof ts === 'number' ? ts : undefined;
    }

    /**
     * One poll cycle: fetch the transcript, keep the turns that start after the
     * cursor, advance the cursor, and hand the batch to onTurns. Never rejects:
     * every failure is reported through onError and console.warn.
     */
    async #tick() {
        if (this.#inFlight || this.#stopped)
            return;
        this.#inFlight = true;
        try {
            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
            // stop() may have been called while the fetch was pending; honour it
            // so nothing is forwarded after the caller asked us to stop.
            if (this.#stopped)
                return;
            const previousCursor = this.#cursor;
            const fresh = [];
            for (const t of all) {
                const ts = MeetingTranscriptPoller.#turnStart(t);
                if (ts === undefined)
                    continue;
                if (ts > previousCursor)
                    fresh.push(t);
                // Advance to the highest first-word timestamp seen in this response.
                if (ts > this.#cursor)
                    this.#cursor = ts;
            }
            if (fresh.length === 0)
                return;
            const formatted = formatTurns(fresh);
            if (!formatted)
                return; // pure-whitespace fresh batch — skip
            console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${this.#cursor.toFixed(1)}s, chars=${formatted.length}`);
            await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
        }
        catch (err) {
            const e = err instanceof Error ? err : new Error(String(err));
            // Ticks are fire-and-forget (`void this.#tick()`), so a throwing
            // onError must not escape — it would surface as an unhandled rejection.
            try {
                this.#opts.onError?.(e);
            }
            catch (callbackErr) {
                const detail = callbackErr instanceof Error ? callbackErr.message : String(callbackErr);
                console.warn(`⚠️ MeetingTranscriptPoller onError callback threw: ${detail}`);
            }
            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
        }
        finally {
            this.#inFlight = false;
        }
    }
}
|
|
93
|
+
/**
|
|
94
|
+
* Format an array of turns into a single string for LLM consumption.
|
|
95
|
+
*
|
|
96
|
+
* Each turn becomes:
|
|
97
|
+
* <Speaker>: <text>
|
|
98
|
+
*
|
|
99
|
+
* Whitespace-only words and zero-content turns are dropped. Returns empty
|
|
100
|
+
* string if nothing meaningful is in the batch.
|
|
101
|
+
*/
|
|
102
|
+
export function formatTurns(turns) {
    // One "<label>: <utterance>" line per turn that has spoken content.
    const rendered = [];
    for (const { speaker, participant, words } of turns) {
        // Join the word texts, then collapse whitespace runs and trim the ends.
        const utterance = (words ?? [])
            .map(({ text }) => text)
            .join(' ')
            .replace(/\s+/g, ' ')
            .trim();
        if (utterance.length > 0) {
            // Label preference: explicit speaker, then participant name, then a placeholder.
            const label = speaker || participant?.name || 'Unknown';
            rendered.push(`${label}: ${utterance}`);
        }
    }
    return rendered.join('\n');
}
|
package/dist/recall-client.d.ts
CHANGED
|
@@ -4,6 +4,36 @@ export interface RecallBot {
|
|
|
4
4
|
meeting_url: string;
|
|
5
5
|
status: string;
|
|
6
6
|
}
|
|
7
|
+
/**
|
|
8
|
+
* One transcript turn = one speaker's continuous utterance.
|
|
9
|
+
* Shape returned by GET /api/v1/bot/{bot_id}/transcript.
|
|
10
|
+
*
|
|
11
|
+
* Per Recall docs each turn contains:
|
|
12
|
+
* - speaker: participant name (or 'Unknown')
|
|
13
|
+
* - words: array of { text, start_timestamp.relative, end_timestamp.relative }
|
|
14
|
+
* - The `start_timestamp.relative` (seconds since recording start) on the
|
|
15
|
+
* FIRST word is the turn's start; we use this as the dedup cursor.
|
|
16
|
+
*/
|
|
17
|
+
export interface TranscriptTurn {
|
|
18
|
+
speaker?: string;
|
|
19
|
+
participant?: {
|
|
20
|
+
id?: number;
|
|
21
|
+
name?: string;
|
|
22
|
+
is_host?: boolean;
|
|
23
|
+
};
|
|
24
|
+
words: Array<{
|
|
25
|
+
text: string;
|
|
26
|
+
start_timestamp?: {
|
|
27
|
+
relative?: number;
|
|
28
|
+
absolute?: string;
|
|
29
|
+
};
|
|
30
|
+
end_timestamp?: {
|
|
31
|
+
relative?: number;
|
|
32
|
+
absolute?: string;
|
|
33
|
+
};
|
|
34
|
+
}>;
|
|
35
|
+
language?: string;
|
|
36
|
+
}
|
|
7
37
|
export interface TranscriptPayload {
|
|
8
38
|
event: string;
|
|
9
39
|
data: {
|
|
@@ -49,10 +79,27 @@ export declare class RecallClient extends EventEmitter {
|
|
|
49
79
|
* room as the osborn agent (no separate WebSocket+WAV pipe).
|
|
50
80
|
* @param opts.botName Display name of the bot in the meeting
|
|
51
81
|
*/
|
|
52
|
-
joinMeeting(meetingUrl: string,
|
|
53
|
-
outputPageUrl?: string;
|
|
82
|
+
joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
|
|
54
83
|
botName?: string;
|
|
55
84
|
}): Promise<string>;
|
|
85
|
+
/**
|
|
86
|
+
* Fetch the bot's current transcript. Returns an array of "transcript turns"
|
|
87
|
+
* (each turn = one speaker's utterance) sorted by start time. Use the bot's
|
|
88
|
+
* `recordings[0].id` from getBotStatus / bot record to locate the recording,
|
|
89
|
+
* then list its transcripts.
|
|
90
|
+
*
|
|
91
|
+
* Per Recall docs:
|
|
92
|
+
* GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
|
|
93
|
+
* GET /api/v1/transcript/{transcript_id} → transcript with download_url
|
|
94
|
+
* Download the transcript JSON from download_url to get the actual content.
|
|
95
|
+
*
|
|
96
|
+
* For the polling use case (called every ~30s), we use the simpler combined
|
|
97
|
+
* endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
|
|
98
|
+
* convenience and returns the full transcript so far in one call. The caller
|
|
99
|
+
* is responsible for de-duping (keeping a since-cursor) so the LLM only sees
|
|
100
|
+
* new turns.
|
|
101
|
+
*/
|
|
102
|
+
getTranscript(botId: string): Promise<TranscriptTurn[]>;
|
|
56
103
|
leaveMeeting(botId: string): Promise<void>;
|
|
57
104
|
getBotStatus(botId: string): Promise<string>;
|
|
58
105
|
handleWebhook(payload: TranscriptPayload): void;
|
package/dist/recall-client.js
CHANGED
|
@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
|
|
|
21
21
|
* room as the osborn agent (no separate WebSocket+WAV pipe).
|
|
22
22
|
* @param opts.botName Display name of the bot in the meeting
|
|
23
23
|
*/
|
|
24
|
-
async joinMeeting(meetingUrl,
|
|
24
|
+
async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
|
|
25
25
|
const botName = opts?.botName ?? 'Osborn';
|
|
26
|
-
|
|
27
|
-
//
|
|
28
|
-
// and
|
|
26
|
+
// ARCHITECTURE (post-2026-05-22 polling redesign):
|
|
27
|
+
// The bot joins by name only — visible in the meeting participant list as
|
|
28
|
+
// "Osborn" but with no audio output and no avatar. We do NOT configure any
|
|
29
|
+
// `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
|
|
30
|
+
// the agent polls Recall's REST transcript API every ~30s
|
|
31
|
+
// (see MeetingTranscriptPoller) and feeds new turns into the LLM as
|
|
32
|
+
// `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
|
|
33
|
+
// LLM not to respond out loud to those messages, only to take notes.
|
|
29
34
|
//
|
|
30
|
-
// recording_config.transcript.provider
|
|
31
|
-
//
|
|
32
|
-
//
|
|
33
|
-
// IMPORTANT:
|
|
34
|
-
// - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
|
|
35
|
-
// - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
|
|
36
|
-
// - `transcription_options` does NOT exist — use `transcript.provider`
|
|
37
|
-
// - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
|
|
38
|
-
//
|
|
39
|
-
// ARCHITECTURE (post-2026-05-22 redesign):
|
|
40
|
-
// Input (meeting → osborn): Recall's documented WebSocket audio protocol.
|
|
41
|
-
// `audio_separate_raw` config + websocket realtime endpoint streams
|
|
42
|
-
// per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
|
|
43
|
-
// /meeting-audio-in WS handler. Bot's own audio is excluded by default
|
|
44
|
-
// → zero possibility of feedback loop, no echo cancellation needed.
|
|
45
|
-
// Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
|
|
46
|
-
// page subscribes to osborn's LiveKit audio track and plays it via
|
|
47
|
-
// track.attach(); Recall captures the page's audio output and injects
|
|
48
|
-
// into the meeting.
|
|
49
|
-
// Webhook transcripts (transcript.data): retained as a SECONDARY signal —
|
|
50
|
-
// the agent index.ts handler for this event currently logs but does NOT
|
|
51
|
-
// forward to the LLM (intentionally disabled). The Deepgram WS path
|
|
52
|
-
// above is the LLM input.
|
|
53
|
-
const httpBase = webhookBaseUrl.replace(/\/$/, '');
|
|
54
|
-
const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
|
|
35
|
+
// We DO keep `recording_config.transcript.provider.recallai_streaming` so
|
|
36
|
+
// Recall actually transcribes the meeting — the REST endpoint we poll
|
|
37
|
+
// requires this to be configured, otherwise transcripts are empty.
|
|
55
38
|
const res = await fetch(`${RECALL_BASE_URL}/bot`, {
|
|
56
39
|
method: 'POST',
|
|
57
40
|
headers: {
|
|
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
|
|
|
64
47
|
recording_config: {
|
|
65
48
|
transcript: {
|
|
66
49
|
provider: {
|
|
67
|
-
// recallai_streaming is built-in — no external API key needed,
|
|
68
|
-
// low-latency, works across all meeting platforms.
|
|
69
|
-
// Kept for the secondary webhook signal (display / future use);
|
|
70
|
-
// LLM input now comes from the Deepgram WS pipe below.
|
|
71
50
|
recallai_streaming: {
|
|
72
51
|
mode: 'prioritize_low_latency',
|
|
73
52
|
language_code: 'en',
|
|
74
53
|
},
|
|
75
54
|
},
|
|
76
55
|
},
|
|
77
|
-
// Per-participant raw PCM audio stream. Bot's own audio is excluded
|
|
78
|
-
// (we don't set include_bot_in_recording.audio:true).
|
|
79
|
-
audio_separate_raw: {},
|
|
80
|
-
realtime_endpoints: [
|
|
81
|
-
{
|
|
82
|
-
// Transcript webhook (secondary signal; LLM forwarding disabled).
|
|
83
|
-
type: 'webhook',
|
|
84
|
-
url: `${httpBase}/webhook/recall`,
|
|
85
|
-
events: ['transcript.data'],
|
|
86
|
-
},
|
|
87
|
-
{
|
|
88
|
-
// Per-participant PCM audio → agent's Deepgram STT pipe.
|
|
89
|
-
type: 'websocket',
|
|
90
|
-
url: `${wsBase}/meeting-audio-in`,
|
|
91
|
-
events: ['audio_separate_raw.data'],
|
|
92
|
-
},
|
|
93
|
-
],
|
|
94
|
-
},
|
|
95
|
-
output_media: {
|
|
96
|
-
camera: {
|
|
97
|
-
// `kind` (not `type`) — confirmed from prior debugging.
|
|
98
|
-
// The page Recall renders connects to LiveKit and plays osborn's
|
|
99
|
-
// TTS audio via track.attach(); Recall captures the page audio.
|
|
100
|
-
// The page does NOT call getUserMedia anymore — input now comes
|
|
101
|
-
// from the audio_separate_raw WebSocket above.
|
|
102
|
-
kind: 'webpage',
|
|
103
|
-
config: {
|
|
104
|
-
url: outputPageUrl,
|
|
105
|
-
},
|
|
106
|
-
},
|
|
107
56
|
},
|
|
108
57
|
}),
|
|
109
58
|
});
|
|
@@ -112,9 +61,37 @@ export class RecallClient extends EventEmitter {
|
|
|
112
61
|
throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
|
|
113
62
|
}
|
|
114
63
|
const bot = (await res.json());
|
|
115
|
-
console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (
|
|
64
|
+
console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
|
|
116
65
|
return bot.id;
|
|
117
66
|
}
|
|
67
|
+
/**
|
|
68
|
+
* Fetch the bot's current transcript. Returns an array of "transcript turns"
|
|
69
|
+
* (each turn = one speaker's utterance) sorted by start time. Use the bot's
|
|
70
|
+
* `recordings[0].id` from getBotStatus / bot record to locate the recording,
|
|
71
|
+
* then list its transcripts.
|
|
72
|
+
*
|
|
73
|
+
* Per Recall docs:
|
|
74
|
+
* GET /api/v1/bot/{bot_id} → bot record incl. `recordings: [...]`
|
|
75
|
+
* GET /api/v1/transcript/{transcript_id} → transcript with download_url
|
|
76
|
+
* Download the transcript JSON from download_url to get the actual content.
|
|
77
|
+
*
|
|
78
|
+
* For the polling use case (called every ~30s), we use the simpler combined
|
|
79
|
+
* endpoint: `GET /api/v1/bot/{bot_id}/transcript` which Recall exposes as a
|
|
80
|
+
* convenience and returns the full transcript so far in one call. The caller
|
|
81
|
+
* is responsible for de-duping (keeping a since-cursor) so the LLM only sees
|
|
82
|
+
* new turns.
|
|
83
|
+
*/
|
|
84
|
+
async getTranscript(botId) {
|
|
85
|
+
const res = await fetch(`${RECALL_BASE_URL}/bot/${botId}/transcript`, {
|
|
86
|
+
headers: { 'Authorization': `Token ${this.#apiKey}` },
|
|
87
|
+
});
|
|
88
|
+
if (!res.ok) {
|
|
89
|
+
const err = await res.text().catch(() => '');
|
|
90
|
+
throw new Error(`Recall.ai transcript fetch failed: ${res.status} ${err.substring(0, 200)}`);
|
|
91
|
+
}
|
|
92
|
+
const turns = await res.json();
|
|
93
|
+
return Array.isArray(turns) ? turns : [];
|
|
94
|
+
}
|
|
118
95
|
async leaveMeeting(botId) {
|
|
119
96
|
await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
|
|
120
97
|
method: 'POST',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "osborn",
|
|
3
|
-
"version": "0.9.
|
|
3
|
+
"version": "0.9.43",
|
|
4
4
|
"description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"dev:logged": "tsx scripts/dev-logged.ts",
|
|
12
12
|
"review": "tsx scripts/review.ts",
|
|
13
13
|
"start": "tsx src/index.ts",
|
|
14
|
-
"build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts
|
|
14
|
+
"build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
|
|
15
15
|
"room": "tsx src/index.ts --room",
|
|
16
16
|
"prepublishOnly": "npm run build"
|
|
17
17
|
},
|