osborn 0.9.40 → 0.9.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent, AudioSource, AudioFrame, LocalAudioTrack, TrackPublishOptions, TrackSource, } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -149,6 +149,12 @@ process.on('uncaughtException', (error) => {
149
149
  let currentRoomCode = null;
150
150
  // Meeting output WebSocket — module-level so both startApiServer and main() can access it
151
151
  let meetingOutputWs = null;
152
+ // Module-level AgentSession reference so /meeting-audio-in WS handler can switch
153
+ // the RoomIO-linked participant when meeting audio starts/stops (B2 design).
154
+ let activeAgentSession = null;
155
+ // Identity of the local user participant the session was originally listening to
156
+ // — captured at the moment we switch to the meeting publisher, restored on cleanup.
157
+ let preMeetingUserIdentity = null;
152
158
  function sendToMeetingOutput(msg) {
153
159
  if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
154
160
  try {
@@ -960,20 +966,270 @@ function startApiServer(workingDir, port) {
960
966
  cleanStaleUploadDirs();
961
967
  setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
962
968
  // ============================================================
963
- // Meeting Output WebSocket — /meeting-audio
969
+ // Meeting Output WebSocket — /meeting-audio (LEGACY)
964
970
  // ============================================================
965
- // Recall's headless browser opens meeting-output.html which connects here.
966
- // We push: JSON { type: 'speak', text } for display, binary PCM for audio (future).
971
+ // Recall's headless browser used to open meeting-output.html which connects
972
+ // here. With the new /meeting-bot Next.js page (Phase 2 + LiveKit), Recall
973
+ // points at frontend/meeting-bot instead — this handler exists only for
974
+ // backwards-compat with old machine images still serving the legacy path.
967
975
  const meetingOutputWss = new WebSocketServer({ noServer: true });
968
976
  meetingOutputWss.on('connection', (ws) => {
969
- console.log('📺 Meeting output browser connected');
977
+ console.log('📺 Meeting output browser connected (legacy /meeting-audio)');
970
978
  meetingOutputWs = ws;
971
979
  ws.on('close', () => {
972
- console.log('📺 Meeting output browser disconnected');
980
+ console.log('📺 Meeting output browser disconnected (legacy)');
973
981
  if (meetingOutputWs === ws)
974
982
  meetingOutputWs = null;
975
983
  });
976
984
  });
985
+ // ============================================================
986
+ // Recall.ai meeting-audio-in WebSocket — /meeting-audio-in
987
+ // ============================================================
988
+ // Recall.ai's per-participant real-time audio protocol. Bot is configured
989
+ // (in recall-client.ts joinMeeting) with audio_separate_raw + a realtime
990
+ // endpoint pointing at this URL. Recall sends JSON events containing
991
+ // base64-encoded PCM (S16LE, 16kHz, mono) for every meeting participant
992
+ // (bot's own audio NOT included by default — no feedback loop possible).
993
+ //
994
+ // Flow: Recall → /meeting-audio-in → open a SECOND LiveKit connection from
995
+ // this agent process as a publisher participant → publish PCM as an
996
+ // audio track in the same LiveKit room → the existing AgentSession's
997
+ // STT subscribes to it as a remote track → routes to currentLLM.chat()
998
+ // via the same pipeline as voice-native user mic.
999
+ //
1000
+ // The advantage of this design vs a parallel STT pipeline: meeting audio
1001
+ // becomes "just another participant" in the LiveKit room — same end-of-turn
1002
+ // detection, same interrupt handling, same conversation context, no parallel
1003
+ // chat() paths to maintain.
1004
+ //
1005
+ // Wait until activeAgentSession._roomIO exists AND the publisher participant
1006
+ // is visible to the agent's room. Both can race against join_meeting:
1007
+ // - Agent session may still be starting up when Recall connects.
1008
+ // - LiveKit takes a moment to propagate the publisher's join to the agent
1009
+ // side after publishTrack() returns on our side.
1010
+ // Bounded poll (200ms cadence) avoids both timing gaps.
1011
+ async function waitForRoomIOAndParticipant(publisherIdentity, timeoutMs) {
1012
+ const deadline = Date.now() + timeoutMs;
1013
+ let roomIO = null;
1014
+ let participantVisible = false;
1015
+ while (Date.now() < deadline) {
1016
+ roomIO = activeAgentSession?._roomIO;
1017
+ if (roomIO && typeof roomIO.setParticipant === 'function') {
1018
+ const agentRoom = roomIO.rtcRoom;
1019
+ const remotes = agentRoom?.remoteParticipants;
1020
+ if (remotes && typeof remotes.values === 'function') {
1021
+ for (const p of remotes.values()) {
1022
+ if (p?.identity === publisherIdentity) {
1023
+ participantVisible = true;
1024
+ break;
1025
+ }
1026
+ }
1027
+ }
1028
+ if (participantVisible)
1029
+ return { roomIO, participantVisible };
1030
+ }
1031
+ await new Promise(r => setTimeout(r, 200));
1032
+ }
1033
+ // Timed out — return whatever we have. Caller decides whether to proceed.
1034
+ return { roomIO, participantVisible };
1035
+ }
1036
+ const meetingAudioInWss = new WebSocketServer({ noServer: true });
1037
+ meetingAudioInWss.on('connection', async (recallWs) => {
1038
+ console.log('🎙️ Recall audio-in WebSocket connected — setting up LiveKit publisher');
1039
+ const livekitUrl = process.env.LIVEKIT_URL;
1040
+ const apiKey = process.env.LIVEKIT_API_KEY;
1041
+ const apiSecret = process.env.LIVEKIT_API_SECRET;
1042
+ if (!livekitUrl || !apiKey || !apiSecret) {
1043
+ console.warn('⚠️ LIVEKIT_URL / LIVEKIT_API_KEY / LIVEKIT_API_SECRET not set — meeting audio publisher disabled');
1044
+ recallWs.close();
1045
+ return;
1046
+ }
1047
+ if (!currentRoomCode) {
1048
+ console.warn('⚠️ No active LiveKit room (currentRoomCode null) — meeting audio publisher cannot attach');
1049
+ recallWs.close();
1050
+ return;
1051
+ }
1052
+ const roomName = `osborn-${currentRoomCode}`;
1053
+ // Mint a publisher token via livekit-server-sdk (already imported for
1054
+ // /api/token style flows). Long TTL — meetings can run for hours.
1055
+ const identity = `meeting-audio-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
1056
+ const at = new AccessToken(apiKey, apiSecret, {
1057
+ identity,
1058
+ ttl: 14400, // 4 hours
1059
+ metadata: JSON.stringify({ role: 'meeting-audio-publisher' }),
1060
+ });
1061
+ at.addGrant({ roomJoin: true, room: roomName, canPublish: true, canSubscribe: false });
1062
+ const token = await at.toJwt();
1063
+ let room = null;
1064
+ let source = null;
1065
+ let track = null;
1066
+ const cleanup = async () => {
1067
+ // Restore AgentSession STT input to the original user participant before
1068
+ // tearing down the publisher track. If we don't switch back, the session
1069
+ // will be stuck waiting on a participant that's about to disappear.
1070
+ try {
1071
+ const roomIO = activeAgentSession?._roomIO;
1072
+ if (roomIO && typeof roomIO.setParticipant === 'function') {
1073
+ if (preMeetingUserIdentity) {
1074
+ roomIO.setParticipant(preMeetingUserIdentity);
1075
+ console.log(`🔁 Restored AgentSession STT input to user: ${preMeetingUserIdentity}`);
1076
+ }
1077
+ else {
1078
+ roomIO.unsetParticipant();
1079
+ console.log('🔁 Cleared AgentSession STT input (no original user to restore)');
1080
+ }
1081
+ }
1082
+ }
1083
+ catch (err) {
1084
+ console.warn('⚠️ Failed to restore RoomIO participant on cleanup:', err.message);
1085
+ }
1086
+ preMeetingUserIdentity = null;
1087
+ try {
1088
+ if (track)
1089
+ await track.close(true);
1090
+ }
1091
+ catch { }
1092
+ try {
1093
+ if (source)
1094
+ await source.close();
1095
+ }
1096
+ catch { }
1097
+ try {
1098
+ if (room)
1099
+ await room.disconnect();
1100
+ }
1101
+ catch { }
1102
+ room = null;
1103
+ source = null;
1104
+ track = null;
1105
+ };
1106
+ try {
1107
+ room = new Room();
1108
+ await room.connect(livekitUrl, token);
1109
+ if (!room.localParticipant)
1110
+ throw new Error('LiveKit connected but localParticipant missing');
1111
+ // Recall sends S16LE PCM at 16kHz mono. AudioSource matches the format.
1112
+ source = new AudioSource(16000, 1);
1113
+ track = LocalAudioTrack.createAudioTrack('meeting-audio', source);
1114
+ await room.localParticipant.publishTrack(track, new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }));
1115
+ console.log(`🎙️ Meeting audio publisher connected to ${roomName} as ${identity}`);
1116
+ // B2 — switch the existing AgentSession's RoomIO input from the local user
1117
+ // to this meeting-audio publisher. While the meeting is active, the user
1118
+ // talks via the meeting (Recall captures it and sends PCM here), and the
1119
+ // agent treats this publisher as the "speaking" participant for STT/EOT.
1120
+ // Original user identity is stashed so cleanup() can restore it.
1121
+ //
1122
+ // 15s timeout accommodates: session-start race (agent still booting when
1123
+ // user clicks "join meeting"), LiveKit participant-join propagation
1124
+ // (~hundreds of ms), and Fly cold-path latency on first request.
1125
+ try {
1126
+ const { roomIO, participantVisible } = await waitForRoomIOAndParticipant(identity, 15000);
1127
+ if (!roomIO) {
1128
+ console.warn('⚠️ Timed out waiting for AgentSession._roomIO (15s) — meeting audio published but STT not switched. Meeting audio will be ignored until a session starts.');
1129
+ }
1130
+ else if (!participantVisible) {
1131
+ // RoomIO exists but our publisher hasn't propagated to the agent's
1132
+ // room view yet. setParticipant stores the identity and links on
1133
+ // participant-connected event, so this is still safe to call —
1134
+ // RoomIO will pick up the link when the event arrives.
1135
+ preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
1136
+ roomIO.setParticipant(identity);
1137
+ console.log(`🔁 Switched AgentSession STT input (publisher not yet visible — will link on connect): ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
1138
+ }
1139
+ else {
1140
+ preMeetingUserIdentity = roomIO.linkedParticipant?.identity ?? null;
1141
+ roomIO.setParticipant(identity);
1142
+ console.log(`🔁 Switched AgentSession STT input: ${preMeetingUserIdentity ?? '(none)'} → ${identity}`);
1143
+ }
1144
+ }
1145
+ catch (err) {
1146
+ console.warn('⚠️ Failed to switch RoomIO participant:', err.message);
1147
+ }
1148
+ }
1149
+ catch (err) {
1150
+ console.error('❌ Failed to set up LiveKit publisher for meeting audio:', err instanceof Error ? err.message : err);
1151
+ try {
1152
+ recallWs.close();
1153
+ }
1154
+ catch { }
1155
+ await cleanup();
1156
+ return;
1157
+ }
1158
+ // Recall → us: JSON events with base64-encoded PCM. Decode, wrap as
1159
+ // AudioFrame, and capture into the source. AgentSession in the main room
1160
+ // will subscribe to this published track and STT it via the normal pipeline.
1161
+ // Payload shape from
1162
+ // docs.recall.ai/docs/how-to-get-separate-audio-per-participant-realtime:
1163
+ // { event: 'audio_separate_raw.data', data: { data: { buffer: '<base64>', ... }, participant: {...} } }
1164
+ //
1165
+ // Diagnostic counters so we can tell from prod logs whether (a) Recall is
1166
+ // streaming any frames at all, (b) they're decoding correctly, and (c)
1167
+ // captureFrame is succeeding. Logged every 100 frames (~2s if frames arrive at 50fps).
1168
+ let totalMessages = 0;
1169
+ let audioFrames = 0;
1170
+ let bytesIn = 0;
1171
+ let lastSpeakerSeen;
1172
+ const startTs = Date.now();
1173
+ recallWs.on('message', async (raw) => {
1174
+ totalMessages++;
1175
+ if (!source)
1176
+ return;
1177
+ try {
1178
+ const msg = JSON.parse(raw.toString());
1179
+ if (msg.event !== 'audio_separate_raw.data') {
1180
+ // Early event-type diagnostic — log non-audio event types seen within the
1181
+ // first 3 messages so we know if Recall's payload shape changed
1182
+ if (totalMessages <= 3) {
1183
+ console.log(`[meeting-audio-in] non-audio event: ${msg.event}`);
1184
+ }
1185
+ return;
1186
+ }
1187
+ const b64 = msg.data?.data?.buffer;
1188
+ if (!b64) {
1189
+ if (audioFrames === 0) {
1190
+ console.warn(`[meeting-audio-in] first audio event had no buffer field. payload keys=${Object.keys(msg.data?.data ?? {}).join(',')}`);
1191
+ }
1192
+ return;
1193
+ }
1194
+ const pcmBuf = Buffer.from(b64, 'base64');
1195
+ bytesIn += pcmBuf.byteLength;
1196
+ const speakerName = msg.data?.data?.participant?.name || msg.data?.participant?.name;
1197
+ if (speakerName && speakerName !== lastSpeakerSeen) {
1198
+ console.log(`[meeting-audio-in] now hearing: ${speakerName}`);
1199
+ lastSpeakerSeen = speakerName;
1200
+ }
1201
+ // AudioFrame expects Int16Array. The PCM buffer is S16LE — view it
1202
+ // directly without copy. Length / 2 = samples (each sample 2 bytes).
1203
+ const samplesPerChannel = pcmBuf.byteLength / 2;
1204
+ const int16 = new Int16Array(pcmBuf.buffer, pcmBuf.byteOffset, samplesPerChannel);
1205
+ const frame = new AudioFrame(int16, 16000, 1, samplesPerChannel);
1206
+ await source.captureFrame(frame);
1207
+ audioFrames++;
1208
+ if (audioFrames === 1) {
1209
+ console.log(`[meeting-audio-in] FIRST audio frame captured (${pcmBuf.byteLength} bytes, ${samplesPerChannel} samples)`);
1210
+ }
1211
+ if (audioFrames % 100 === 0) {
1212
+ const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
1213
+ console.log(`[meeting-audio-in] heartbeat: ${audioFrames} frames, ${(bytesIn / 1024).toFixed(1)} KB in ${elapsed}s (last speaker: ${lastSpeakerSeen ?? 'unknown'})`);
1214
+ }
1215
+ }
1216
+ catch (err) {
1217
+ // Don't log every frame parse failure — could be noisy if Recall sends
1218
+ // non-audio_separate_raw events on the same channel.
1219
+ if (err.message?.includes('JSON'))
1220
+ return;
1221
+ console.warn('⚠️ meeting audio capture error:', err instanceof Error ? err.message : err);
1222
+ }
1223
+ });
1224
+ recallWs.on('close', async () => {
1225
+ const elapsed = ((Date.now() - startTs) / 1000).toFixed(1);
1226
+ console.log(`🎙️ Recall audio-in WebSocket closed — tearing down LiveKit publisher. Total: ${audioFrames} audio frames / ${totalMessages} messages / ${(bytesIn / 1024).toFixed(1)} KB over ${elapsed}s`);
1227
+ await cleanup();
1228
+ });
1229
+ recallWs.on('error', (err) => {
1230
+ console.warn('⚠️ Recall WS error:', err instanceof Error ? err.message : err);
1231
+ });
1232
+ });
977
1233
  server.on('upgrade', (req, socket, head) => {
978
1234
  const url = new URL(req.url || '/', `http://localhost:${port}`);
979
1235
  if (url.pathname === '/meeting-audio') {
@@ -981,6 +1237,11 @@ function startApiServer(workingDir, port) {
981
1237
  meetingOutputWss.emit('connection', ws, req);
982
1238
  });
983
1239
  }
1240
+ else if (url.pathname === '/meeting-audio-in') {
1241
+ meetingAudioInWss.handleUpgrade(req, socket, head, (ws) => {
1242
+ meetingAudioInWss.emit('connection', ws, req);
1243
+ });
1244
+ }
984
1245
  else {
985
1246
  socket.destroy();
986
1247
  }
@@ -2630,6 +2891,7 @@ async function main() {
2630
2891
  }
2631
2892
  lastCompletedResearch = null;
2632
2893
  currentSession = null;
2894
+ activeAgentSession = null;
2633
2895
  currentAgent = null;
2634
2896
  // Same disconnect-leak fix as the other two cleanup sites — kill the Claude SDK
2635
2897
  // subprocess BEFORE dropping the reference. See killCurrentLLM() for full context.
@@ -2675,6 +2937,7 @@ async function main() {
2675
2937
  }
2676
2938
  catch { }
2677
2939
  currentSession = null;
2940
+ activeAgentSession = null;
2678
2941
  currentAgent = null;
2679
2942
  // Same disconnect-leak fix — kill the previous user's Claude subprocess
2680
2943
  // before binding currentLLM to the new user's session below.
@@ -2829,6 +3092,7 @@ async function main() {
2829
3092
  agent = result.agent;
2830
3093
  }
2831
3094
  currentSession = session;
3095
+ activeAgentSession = session;
2832
3096
  currentAgent = agent; // Store for updateChatCtx() context injection
2833
3097
  // ============================================================
2834
3098
  // Session event wiring — extracted into function for auto-recovery
@@ -2988,6 +3252,7 @@ async function main() {
2988
3252
  }
2989
3253
  catch { }
2990
3254
  currentSession = null;
3255
+ activeAgentSession = null;
2991
3256
  currentAgent = null;
2992
3257
  // Clear stale state from crashed session
2993
3258
  voiceQueue.length = 0;
@@ -3049,6 +3314,7 @@ async function main() {
3049
3314
  const newSession = result.session;
3050
3315
  const newAgent = result.agent;
3051
3316
  currentSession = newSession;
3317
+ activeAgentSession = newSession;
3052
3318
  currentAgent = newAgent;
3053
3319
  // Re-wire event listeners on the new session
3054
3320
  wireSessionEvents(newSession, newAgent);
@@ -3105,6 +3371,7 @@ async function main() {
3105
3371
  }
3106
3372
  catch { }
3107
3373
  currentSession = null;
3374
+ activeAgentSession = null;
3108
3375
  currentAgent = null;
3109
3376
  // Clear voice queue — stale injections from the crashed session
3110
3377
  voiceQueue.length = 0;
@@ -3128,6 +3395,7 @@ async function main() {
3128
3395
  const newSession = result.session;
3129
3396
  const newAgent = result.agent;
3130
3397
  currentSession = newSession;
3398
+ activeAgentSession = newSession;
3131
3399
  currentAgent = newAgent;
3132
3400
  // Re-wire event listeners on the new session
3133
3401
  wireSessionEvents(newSession, newAgent);
@@ -3322,6 +3590,7 @@ async function main() {
3322
3590
  if (currentSession) {
3323
3591
  const sessionToClose = currentSession;
3324
3592
  currentSession = null;
3593
+ activeAgentSession = null;
3325
3594
  // Track async close so new connections can wait for byte stream handler to be released
3326
3595
  pendingSessionClose = (async () => {
3327
3596
  try {
@@ -35,6 +35,23 @@ export class RecallClient extends EventEmitter {
35
35
  // - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
36
36
  // - `transcription_options` does NOT exist — use `transcript.provider`
37
37
  // - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
38
+ //
39
+ // ARCHITECTURE (post-2026-05-22 redesign):
40
+ // Input (meeting → osborn): Recall's documented WebSocket audio protocol.
41
+ // `audio_separate_raw` config + websocket realtime endpoint streams
42
+ // per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
43
+ // /meeting-audio-in WS handler. Bot's own audio is excluded by default
44
+ // → zero possibility of feedback loop, no echo cancellation needed.
45
+ // Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
46
+ // page subscribes to osborn's LiveKit audio track and plays it via
47
+ // track.attach(); Recall captures the page's audio output and injects
48
+ // into the meeting.
49
+ // Webhook transcripts (transcript.data): retained as a SECONDARY signal —
50
+ // the agent index.ts handler for this event currently logs but does NOT
51
+ // forward to the LLM (intentionally disabled). The Deepgram WS path
52
+ // above is the LLM input.
53
+ const httpBase = webhookBaseUrl.replace(/\/$/, '');
54
+ const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
38
55
  const res = await fetch(`${RECALL_BASE_URL}/bot`, {
39
56
  method: 'POST',
40
57
  headers: {
@@ -49,25 +66,39 @@ export class RecallClient extends EventEmitter {
49
66
  provider: {
50
67
  // recallai_streaming is built-in — no external API key needed,
51
68
  // low-latency, works across all meeting platforms.
69
+ // Kept for the secondary webhook signal (display / future use);
70
+ // LLM input now comes from the Deepgram WS pipe below.
52
71
  recallai_streaming: {
53
72
  mode: 'prioritize_low_latency',
54
73
  language_code: 'en',
55
74
  },
56
75
  },
57
76
  },
58
- realtime_endpoints: [{
77
+ // Per-participant raw PCM audio stream. Bot's own audio is excluded
78
+ // (we don't set include_bot_in_recording.audio:true).
79
+ audio_separate_raw: {},
80
+ realtime_endpoints: [
81
+ {
82
+ // Transcript webhook (secondary signal; LLM forwarding disabled).
59
83
  type: 'webhook',
60
- url: `${webhookBaseUrl}/webhook/recall`,
84
+ url: `${httpBase}/webhook/recall`,
61
85
  events: ['transcript.data'],
62
- }],
86
+ },
87
+ {
88
+ // Per-participant PCM audio → agent's Deepgram STT pipe.
89
+ type: 'websocket',
90
+ url: `${wsBase}/meeting-audio-in`,
91
+ events: ['audio_separate_raw.data'],
92
+ },
93
+ ],
63
94
  },
64
95
  output_media: {
65
96
  camera: {
66
97
  // `kind` (not `type`) — confirmed from prior debugging.
67
- // The page Recall renders is responsible for joining the same LiveKit
68
- // room as the osborn agent: meeting audio captured via getUserMedia is
69
- // published into the room; osborn's TTS audio (already in the room) is
70
- // played by the page and captured by Recall as the bot's mic output.
98
+ // The page Recall renders connects to LiveKit and plays osborn's
99
+ // TTS audio via track.attach(); Recall captures the page audio.
100
+ // The page does NOT call getUserMedia anymore — input now comes
101
+ // from the audio_separate_raw WebSocket above.
71
102
  kind: 'webpage',
72
103
  config: {
73
104
  url: outputPageUrl,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.40",
3
+ "version": "0.9.42",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {