osborn 0.9.39 → 0.9.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1224,28 +1224,46 @@ async function main() {
1224
1224
  let lastCompletedResearch = null;
1225
1225
  // No manual queuing — the Claude SDK handles sequential queries internally
1226
1226
  // ============================================================
1227
- // Recall.ai — Meeting Transcript Routing
1227
+ // Recall.ai — Meeting Transcript Listener
1228
1228
  // ============================================================
1229
+ // NOTE: LLM-forwarding via Recall webhook STT was DISABLED in the Phase 2
1230
+ // LiveKit-based meeting-bot migration. Reason: Recall sends transcripts as
1231
+ // sentence-level fragments (e.g. "transcript.data" events fire ~once per
1232
+ // sentence). The old code below called currentLLM.chat() PER FRAGMENT, which
1233
+ // meant the agent fired ~10 chat() calls during a single user utterance —
1234
+ // each one prompting a separate response. The agent ended up speaking over
1235
+ // itself answering partial fragments.
1236
+ //
1237
+ // Phase 2 routes meeting audio through LiveKit instead (see
1238
+ // frontend/src/app/meeting-bot/page.tsx). The agent's existing Deepgram Flux
1239
+ // STT processes that audio via end-of-turn detection — ONE chat() call per
1240
+ // actual completed utterance, no fragment storms.
1241
+ //
1242
+ // We keep the listener registered so we have a hook for future work (e.g.
1243
+ // forwarding the live transcript to the frontend chat panel as a read-only
1244
+ // "what was said in the meeting" display, separate from the LLM input path).
1229
1245
  const recall = getRecallClient();
1230
1246
  if (recall) {
1231
- console.log('🎥 Recall.ai client initialized (RECALL_API_KEY present)');
1247
+ console.log('🎥 Recall.ai client initialized (webhook STT receiver — LLM forwarding disabled, see meeting-bot Phase 2)');
1232
1248
  recall.on('transcript', ({ botId, speaker, text }) => {
1233
1249
  console.log(`📝 Meeting transcript [${speaker}]: ${text}`);
1234
- // Route meeting transcripts to Claude as user text with speaker attribution
1235
- if (currentLLM && currentSession) {
1236
- const meetingText = `[Meeting ${speaker}]: ${text}`;
1237
- // Use the same pipeline as user_text data channel messages
1238
- try {
1239
- if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
1240
- const chatCtx = new llm.ChatContext();
1241
- chatCtx.addMessage({ role: 'user', content: meetingText });
1242
- currentLLM.chat({ chatCtx });
1243
- }
1244
- }
1245
- catch (err) {
1246
- console.error('❌ Failed to route meeting transcript:', err);
1247
- }
1248
- }
1250
+ // INTENTIONALLY DISABLED — see comment above. Audio path is now LiveKit:
1251
+ // meeting-bot page publishes meeting audio → agent STT processes it.
1252
+ // The code below is preserved as a reference for future re-enablement
1253
+ // (e.g. as a display-only feature, NOT as LLM input).
1254
+ //
1255
+ // if (currentLLM && currentSession) {
1256
+ // const meetingText = `[Meeting — ${speaker}]: ${text}`
1257
+ // try {
1258
+ // if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
1259
+ // const chatCtx = new llm.ChatContext()
1260
+ // chatCtx.addMessage({ role: 'user', content: meetingText })
1261
+ // ;(currentLLM as any).chat({ chatCtx })
1262
+ // }
1263
+ // } catch (err) {
1264
+ // console.error('❌ Failed to route meeting transcript:', err)
1265
+ // }
1266
+ // }
1249
1267
  });
1250
1268
  }
1251
1269
  // ============================================================
@@ -1656,9 +1674,34 @@ async function main() {
1656
1674
  skipTTSQueue: true,
1657
1675
  onCompactionEvent: (event) => {
1658
1676
  try {
1659
- // Forward every field — frontend renders stage + detail + skill list during compaction.
1660
- // Spread covers compaction_started/progress/complete (different fields per type).
1677
+ // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
1661
1678
  sendToFrontend({ ...event });
1679
+ // ALSO emit as a claude_output chat bubble — reuses the existing message path
1680
+ // that's already working end-to-end. PreCompact → in-progress bubble.
1681
+ // PostCompact → completion bubble with the skills summary. The dedicated
1682
+ // banner has been unreliable in production (data path works on backend, banner
1683
+ // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
1684
+ // are visible without dev tools.
1685
+ if (event.type === 'compaction_started') {
1686
+ const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
1687
+ sendToFrontend({
1688
+ type: 'claude_output',
1689
+ text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
1690
+ agentRole: 'direct',
1691
+ });
1692
+ }
1693
+ else if (event.type === 'compaction_complete') {
1694
+ const ev = event;
1695
+ const n = ev.skillsWritten ?? 0;
1696
+ const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
1697
+ ? ` — ${ev.skillNames.join(', ')}`
1698
+ : '';
1699
+ sendToFrontend({
1700
+ type: 'claude_output',
1701
+ text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
1702
+ agentRole: 'direct',
1703
+ });
1704
+ }
1662
1705
  }
1663
1706
  catch { /* non-fatal */ }
1664
1707
  },
@@ -1862,14 +1905,17 @@ async function main() {
1862
1905
  const sayId = Date.now(); // simple ID to correlate start/end logs
1863
1906
  console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
1864
1907
  // Forward spoken text + audio to meeting output page when bot is in a meeting.
1865
- // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
1866
- // so voice/provider stays consistent — no separate hardcoded provider.
1908
+ // Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session) — was
1909
+ // previously using directConfig.tts which falls back to DEFAULT_CONFIG.direct.tts
1910
+ // (Deepgram aura-2-asteria-en) when no user config exists, producing a different
1911
+ // voice in the meeting than what the user hears in voice-native. Both paths now
1912
+ // share the single source of truth.
1867
1913
  // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
1868
1914
  // Recall captures the browser page's audio output and injects it into the meeting.
1869
1915
  if (activeMeetingBotId) {
1870
1916
  sendToMeetingOutput({ type: 'speak', text: data.text });
1871
1917
  if (meetingOutputWs) {
1872
- synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1918
+ synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1873
1919
  }
1874
1920
  }
1875
1921
  try {
@@ -2011,9 +2057,34 @@ async function main() {
2011
2057
  resumeSessionId,
2012
2058
  onCompactionEvent: (event) => {
2013
2059
  try {
2014
- // Forward every field — frontend renders stage + detail + skill list during compaction.
2015
- // Spread covers compaction_started/progress/complete (different fields per type).
2060
+ // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
2016
2061
  sendToFrontend({ ...event });
2062
+ // ALSO emit as a claude_output chat bubble — reuses the existing message path
2063
+ // that's already working end-to-end. PreCompact → in-progress bubble.
2064
+ // PostCompact → completion bubble with the skills summary. The dedicated
2065
+ // banner has been unreliable in production (data path works on backend, banner
2066
+ // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
2067
+ // are visible without dev tools.
2068
+ if (event.type === 'compaction_started') {
2069
+ const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
2070
+ sendToFrontend({
2071
+ type: 'claude_output',
2072
+ text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
2073
+ agentRole: 'direct',
2074
+ });
2075
+ }
2076
+ else if (event.type === 'compaction_complete') {
2077
+ const ev = event;
2078
+ const n = ev.skillsWritten ?? 0;
2079
+ const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
2080
+ ? ` — ${ev.skillNames.join(', ')}`
2081
+ : '';
2082
+ sendToFrontend({
2083
+ type: 'claude_output',
2084
+ text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
2085
+ agentRole: 'direct',
2086
+ });
2087
+ }
2017
2088
  }
2018
2089
  catch { /* non-fatal */ }
2019
2090
  },
@@ -3855,8 +3926,57 @@ async function main() {
3855
3926
  (process.env.FLY_APP_NAME
3856
3927
  ? `https://${process.env.FLY_APP_NAME}.fly.dev`
3857
3928
  : `http://localhost:${apiPort}`);
3929
+ // Try to mint a LiveKit bot token + construct the frontend-hosted
3930
+ // meeting-bot page URL. The bot page joins the same LiveKit room
3931
+ // as this agent so meeting audio flows through LiveKit directly
3932
+ // (no agent-side WebSocket+WAV pipe). Falls back to the legacy
3933
+ // /meeting-output webpage if no frontend URL is resolvable, so
3934
+ // the old code path keeps working during the migration window.
3935
+ //
3936
+ // Frontend URL resolution (in priority order):
3937
+ // 1. data.frontendBase — the public URL the user's browser is on,
3938
+ // passed through the join_meeting data channel message. Works
3939
+ // automatically for localhost dev + production without any
3940
+ // env var.
3941
+ // 2. OSBORN_FRONTEND_URL — existing convention from sprites.ts
3942
+ // (frontend/src/lib/sprites.ts:241) that injects the public
3943
+ // frontend URL into sandbox env vars. Defense in depth.
3944
+ //
3945
+ // Auth: the endpoint uses LiveKit room-presence as the auth check
3946
+ // — no shared secret needed. The agent must already be in the
3947
+ // requested room (which it is by this point) for the mint to
3948
+ // succeed.
3949
+ let outputPageUrl;
3950
+ const frontendUrl = data.frontendBase
3951
+ || process.env.OSBORN_FRONTEND_URL;
3952
+ if (frontendUrl) {
3953
+ try {
3954
+ const botLkId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
3955
+ const tokenRes = await fetch(`${frontendUrl}/api/meeting-bot-token`, {
3956
+ method: 'POST',
3957
+ headers: { 'Content-Type': 'application/json' },
3958
+ body: JSON.stringify({ botId: botLkId, roomName }),
3959
+ });
3960
+ if (tokenRes.ok) {
3961
+ const { token, url } = await tokenRes.json();
3962
+ const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
3963
+ outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
3964
+ console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
3965
+ }
3966
+ else {
3967
+ const errText = await tokenRes.text().catch(() => '');
3968
+ console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
3969
+ }
3970
+ }
3971
+ catch (mintErr) {
3972
+ console.warn(`⚠️ meeting-bot-token mint threw — falling back: ${mintErr.message}`);
3973
+ }
3974
+ }
3975
+ else {
3976
+ console.log('ℹ️ No frontend URL (data.frontendBase + OSBORN_FRONTEND_URL both empty) — using legacy /meeting-output path');
3977
+ }
3858
3978
  await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
3859
- const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
3979
+ const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase, { outputPageUrl });
3860
3980
  const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
3861
3981
  recallJoin.registerBot(botId, sessionId);
3862
3982
  activeMeetingBotId = botId;
@@ -36,7 +36,23 @@ export interface TranscriptPayload {
36
36
  export declare class RecallClient extends EventEmitter {
37
37
  #private;
38
38
  constructor(apiKey: string);
39
- joinMeeting(meetingUrl: string, webhookBaseUrl: string, botName?: string): Promise<string>;
39
+ /**
40
+ * Join a meeting via Recall.ai.
41
+ *
42
+ * @param meetingUrl Zoom / Google Meet / Teams URL the bot should dial in to
43
+ * @param webhookBaseUrl Base URL for the agent's HTTP endpoints (transcript webhook)
44
+ * @param opts.outputPageUrl Full URL for the bot's camera/audio page. If provided,
45
+ * replaces the default `${webhookBaseUrl}/meeting-output`.
46
+ * Used to point at the frontend-hosted /meeting-bot page
47
+ * with token + room embedded as query params, so the page
48
+ * connects to LiveKit and audio flows through the same
49
+ * room as the osborn agent (no separate WebSocket+WAV pipe).
50
+ * @param opts.botName Display name of the bot in the meeting
51
+ */
52
+ joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
53
+ outputPageUrl?: string;
54
+ botName?: string;
55
+ }): Promise<string>;
40
56
  leaveMeeting(botId: string): Promise<void>;
41
57
  getBotStatus(botId: string): Promise<string>;
42
58
  handleWebhook(payload: TranscriptPayload): void;
@@ -8,7 +8,22 @@ export class RecallClient extends EventEmitter {
8
8
  super();
9
9
  this.#apiKey = apiKey;
10
10
  }
11
- async joinMeeting(meetingUrl, webhookBaseUrl, botName = 'Osborn') {
11
+ /**
12
+ * Join a meeting via Recall.ai.
13
+ *
14
+ * @param meetingUrl Zoom / Google Meet / Teams URL the bot should dial in to
15
+ * @param webhookBaseUrl Base URL for the agent's HTTP endpoints (transcript webhook)
16
+ * @param opts.outputPageUrl Full URL for the bot's camera/audio page. If provided,
17
+ * replaces the default `${webhookBaseUrl}/meeting-output`.
18
+ * Used to point at the frontend-hosted /meeting-bot page
19
+ * with token + room embedded as query params, so the page
20
+ * connects to LiveKit and audio flows through the same
21
+ * room as the osborn agent (no separate WebSocket+WAV pipe).
22
+ * @param opts.botName Display name of the bot in the meeting
23
+ */
24
+ async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
25
+ const botName = opts?.botName ?? 'Osborn';
26
+ const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
12
27
  // Authoritative structure per https://docs.recall.ai/reference/bot_create
13
28
  // and https://docs.recall.ai/docs/real-time-transcription:
14
29
  //
@@ -49,10 +64,13 @@ export class RecallClient extends EventEmitter {
49
64
  output_media: {
50
65
  camera: {
51
66
  // `kind` (not `type`) — confirmed from prior debugging.
52
- // Output webpage plays TTS audio so meeting participants can hear the agent.
67
+ // The page Recall renders is responsible for joining the same LiveKit
68
+ // room as the osborn agent: meeting audio captured via getUserMedia is
69
+ // published into the room; osborn's TTS audio (already in the room) is
70
+ // played by the page and captured by Recall as the bot's mic output.
53
71
  kind: 'webpage',
54
72
  config: {
55
- url: `${webhookBaseUrl}/meeting-output`,
73
+ url: outputPageUrl,
56
74
  },
57
75
  },
58
76
  },
@@ -63,7 +81,7 @@ export class RecallClient extends EventEmitter {
63
81
  throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
64
82
  }
65
83
  const bot = (await res.json());
66
- console.log(`🤖 Recall.ai bot joined meeting: ${bot.id}`);
84
+ console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
67
85
  return bot.id;
68
86
  }
69
87
  async leaveMeeting(botId) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.39",
3
+ "version": "0.9.40",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {