osborn 0.9.38 → 0.9.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  // Load environment variables FIRST before any other imports
2
2
  import 'dotenv/config';
3
3
  import { voice, initializeLogger } from '@livekit/agents';
4
- import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
4
+ import { Room, RoomEvent } from '@livekit/rtc-node';
5
5
  import { AccessToken } from 'livekit-server-sdk';
6
6
  // Initialize logger before anything else
7
7
  initializeLogger({ pretty: true, level: 'info' });
@@ -1224,28 +1224,46 @@ async function main() {
1224
1224
  let lastCompletedResearch = null;
1225
1225
  // No manual queuing — the Claude SDK handles sequential queries internally
1226
1226
  // ============================================================
1227
- // Recall.ai — Meeting Transcript Routing
1227
+ // Recall.ai — Meeting Transcript Listener
1228
1228
  // ============================================================
1229
+ // NOTE: LLM-forwarding via Recall webhook STT was DISABLED in the Phase 2
1230
+ // LiveKit-based meeting-bot migration. Reason: Recall sends transcripts as
1231
+ // sentence-level fragments (e.g. "transcript.data" events fire ~once per
1232
+ // sentence). The old code below called currentLLM.chat() PER FRAGMENT, which
1233
+ // meant the agent fired ~10 chat() calls during a single user utterance —
1234
+ // each one prompting a separate response. The agent ended up speaking over
1235
+ // itself answering partial fragments.
1236
+ //
1237
+ // Phase 2 routes meeting audio through LiveKit instead (see
1238
+ // frontend/src/app/meeting-bot/page.tsx). The agent's existing Deepgram Flux
1239
+ // STT processes that audio via end-of-turn detection — ONE chat() call per
1240
+ // actual completed utterance, no fragment storms.
1241
+ //
1242
+ // We keep the listener registered so we have a hook for future work (e.g.
1243
+ // forwarding the live transcript to the frontend chat panel as a read-only
1244
+ // "what was said in the meeting" display, separate from the LLM input path).
1229
1245
  const recall = getRecallClient();
1230
1246
  if (recall) {
1231
- console.log('🎥 Recall.ai client initialized (RECALL_API_KEY present)');
1247
+ console.log('🎥 Recall.ai client initialized (webhook STT receiver — LLM forwarding disabled, see meeting-bot Phase 2)');
1232
1248
  recall.on('transcript', ({ botId, speaker, text }) => {
1233
1249
  console.log(`📝 Meeting transcript [${speaker}]: ${text}`);
1234
- // Route meeting transcripts to Claude as user text with speaker attribution
1235
- if (currentLLM && currentSession) {
1236
- const meetingText = `[Meeting ${speaker}]: ${text}`;
1237
- // Use the same pipeline as user_text data channel messages
1238
- try {
1239
- if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
1240
- const chatCtx = new llm.ChatContext();
1241
- chatCtx.addMessage({ role: 'user', content: meetingText });
1242
- currentLLM.chat({ chatCtx });
1243
- }
1244
- }
1245
- catch (err) {
1246
- console.error('❌ Failed to route meeting transcript:', err);
1247
- }
1248
- }
1250
+ // INTENTIONALLY DISABLED — see comment above. The audio path is now LiveKit:
1251
+ // the meeting-bot page publishes meeting audio → agent STT processes it.
1252
+ // The code below is preserved as a reference for future re-enablement
1253
+ // (e.g. as a display-only feature, NOT as LLM input).
1254
+ //
1255
+ // if (currentLLM && currentSession) {
1256
+ // const meetingText = `[Meeting — ${speaker}]: ${text}`
1257
+ // try {
1258
+ // if (currentVoiceMode === 'pipeline' || currentVoiceMode === 'direct') {
1259
+ // const chatCtx = new llm.ChatContext()
1260
+ // chatCtx.addMessage({ role: 'user', content: meetingText })
1261
+ // ;(currentLLM as any).chat({ chatCtx })
1262
+ // }
1263
+ // } catch (err) {
1264
+ // console.error('❌ Failed to route meeting transcript:', err)
1265
+ // }
1266
+ // }
1249
1267
  });
1250
1268
  }
1251
1269
  // ============================================================
@@ -1656,9 +1674,34 @@ async function main() {
1656
1674
  skipTTSQueue: true,
1657
1675
  onCompactionEvent: (event) => {
1658
1676
  try {
1659
- // Forward every field — frontend renders stage + detail + skill list during compaction.
1660
- // Spread covers compaction_started/progress/complete (different fields per type).
1677
+ // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
1661
1678
  sendToFrontend({ ...event });
1679
+ // ALSO emit as a claude_output chat bubble — reuses the existing message path
1680
+ // that's already working end-to-end. PreCompact → in-progress bubble.
1681
+ // PostCompact → completion bubble with the skills summary. The dedicated
1682
+ // banner has been unreliable in production (data path works on backend, banner
1683
+ // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
1684
+ // are visible without dev tools.
1685
+ if (event.type === 'compaction_started') {
1686
+ const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
1687
+ sendToFrontend({
1688
+ type: 'claude_output',
1689
+ text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
1690
+ agentRole: 'direct',
1691
+ });
1692
+ }
1693
+ else if (event.type === 'compaction_complete') {
1694
+ const ev = event;
1695
+ const n = ev.skillsWritten ?? 0;
1696
+ const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
1697
+ ? ` — ${ev.skillNames.join(', ')}`
1698
+ : '';
1699
+ sendToFrontend({
1700
+ type: 'claude_output',
1701
+ text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
1702
+ agentRole: 'direct',
1703
+ });
1704
+ }
1662
1705
  }
1663
1706
  catch { /* non-fatal */ }
1664
1707
  },
@@ -1862,14 +1905,17 @@ async function main() {
1862
1905
  const sayId = Date.now(); // simple ID to correlate start/end logs
1863
1906
  console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
1864
1907
  // Forward spoken text + audio to meeting output page when bot is in a meeting.
1865
- // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
1866
- // so voice/provider stays consistent — no separate hardcoded provider.
1908
+ // Uses DIRECT_MODE_TTS (same OpenAI fable voice as the live session). It
1909
+ // previously used directConfig.tts, which falls back to DEFAULT_CONFIG.direct.tts
1910
+ // (Deepgram aura-2-asteria-en) when no user config exists, producing a different
1911
+ // voice in the meeting than what the user hears in voice-native. Both paths now
1912
+ // share the single source of truth.
1867
1913
  // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
1868
1914
  // Recall captures the browser page's audio output and injects it into the meeting.
1869
1915
  if (activeMeetingBotId) {
1870
1916
  sendToMeetingOutput({ type: 'speak', text: data.text });
1871
1917
  if (meetingOutputWs) {
1872
- synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1918
+ synthesizeForMeeting(data.text, DIRECT_MODE_TTS).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1873
1919
  }
1874
1920
  }
1875
1921
  try {
@@ -2011,9 +2057,34 @@ async function main() {
2011
2057
  resumeSessionId,
2012
2058
  onCompactionEvent: (event) => {
2013
2059
  try {
2014
- // Forward every field — frontend renders stage + detail + skill list during compaction.
2015
- // Spread covers compaction_started/progress/complete (different fields per type).
2060
+ // Forward the raw event so the dedicated banner UI can render it (if/when fixed).
2016
2061
  sendToFrontend({ ...event });
2062
+ // ALSO emit as a claude_output chat bubble — reuses the existing message path
2063
+ // that's already working end-to-end. PreCompact → in-progress bubble.
2064
+ // PostCompact → completion bubble with the skills summary. The dedicated
2065
+ // banner has been unreliable in production (data path works on backend, banner
2066
+ // never appears on iPad/iPhone where dev tools aren't accessible). Chat bubbles
2067
+ // are visible without dev tools.
2068
+ if (event.type === 'compaction_started') {
2069
+ const triggerLabel = event.trigger ? ` (${event.trigger})` : '';
2070
+ sendToFrontend({
2071
+ type: 'claude_output',
2072
+ text: `🧠 _Crystallizing session memory…_${triggerLabel}`,
2073
+ agentRole: 'direct',
2074
+ });
2075
+ }
2076
+ else if (event.type === 'compaction_complete') {
2077
+ const ev = event;
2078
+ const n = ev.skillsWritten ?? 0;
2079
+ const names = Array.isArray(ev.skillNames) && ev.skillNames.length > 0
2080
+ ? ` — ${ev.skillNames.join(', ')}`
2081
+ : '';
2082
+ sendToFrontend({
2083
+ type: 'claude_output',
2084
+ text: `🧠 Memory crystallized — ${n} skill${n === 1 ? '' : 's'} updated${names}.`,
2085
+ agentRole: 'direct',
2086
+ });
2087
+ }
2017
2088
  }
2018
2089
  catch { /* non-fatal */ }
2019
2090
  },
@@ -2530,51 +2601,16 @@ async function main() {
2530
2601
  console.log('✅ Connected to room:', roomName);
2531
2602
  localParticipant = room.localParticipant;
2532
2603
  });
2533
- // EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
2534
- // server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
2535
- // after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
2536
- //
2537
- // Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant
2538
- // → if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
2539
- // The existing handleSpeechDone callback (around line 1320) captures the spoken-text
2540
- // + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
2541
- // chat() call to enrich the user's message with [INTERRUPTED] context — so the
2542
- // post-interrupt note flow is preserved even though we're cutting TTS earlier.
2543
- //
2544
- // Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
2545
- // room, and when its TTS plays it appears in the active-speakers list too. An earlier
2546
- // attempt that compared `s.identity !== room.localParticipant?.identity` failed because
2547
- // localParticipant.identity could be undefined at event-fire time, letting the agent's
2548
- // own speech trigger a self-interrupt. The type check is bulletproof.
2549
- //
2550
- // Realtime mode skipped — the SDK handles interruption internally there, and manual
2551
- // interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
2552
- let lastActiveSpeakerInterruptAt = 0;
2553
- room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
2554
- if (!Array.isArray(speakers) || speakers.length === 0)
2555
- return;
2556
- const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
2557
- if (remoteSpeakers.length === 0)
2558
- return;
2559
- if (currentVoiceMode === 'realtime')
2560
- return;
2561
- if (agentState !== 'speaking')
2562
- return;
2563
- const now = Date.now();
2564
- const debounced = now - lastActiveSpeakerInterruptAt < 1000;
2565
- lastActiveSpeakerInterruptAt = now;
2566
- try {
2567
- if (!debounced) {
2568
- const ids = remoteSpeakers.map((s) => s.identity).join(',');
2569
- console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
2570
- }
2571
- currentSession?.interrupt();
2572
- }
2573
- catch (err) {
2574
- if (!debounced)
2575
- console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
2576
- }
2577
- });
2604
+ // NOTE: previously this section also had a RoomEvent.ActiveSpeakersChanged
2605
+ // handler that interrupted TTS on any sustained audio activity (~50ms after
2606
+ // mic onset). That fired too eagerly — coughs, paper rustles, the agent's
2607
+ // own TTS bleeding through the mic, and other non-speech sounds tripped it
2608
+ // ~10-15% of the time, leaving the agent silent with no recovery path
2609
+ // (because no STT transcript would follow). Dropped in favor of the
2610
+ // user_state_changed 'speaking' handler below, which is fed by Deepgram
2611
+ // Flux STT's speech-vs-noise classification: slower (~100-300ms) but
2612
+ // confidence-aware. The latency tradeoff is worth eliminating the false
2613
+ // interrupts at the root.
2578
2614
  room.on(RoomEvent.Disconnected, () => {
2579
2615
  console.log('👋 Disconnected from room');
2580
2616
  // Clean up active research and voice queue
@@ -2868,19 +2904,20 @@ async function main() {
2868
2904
  }
2869
2905
  });
2870
2906
  // User state tracking — prevents queue from colliding with server-side VAD.
2871
- // Also a secondary interrupt trigger: when Deepgram STT classifies speech onset
2872
- // it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
2873
- // Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
2874
- // latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
2875
- // case the room-level event drops. interrupt() is idempotent on an already-
2876
- // interrupted SpeechHandle so calling both paths is harmless.
2907
+ // Also the PRIMARY interrupt trigger now that the over-eager ActiveSpeakersChanged
2908
+ // path is gone. Fires when Deepgram Flux STT classifies frames as speech (not noise)
2909
+ // and propagates via agent_activity.onStartOfSpeech → _updateUserState('speaking').
2910
+ // Latency ~100-300ms after mic onset, which is the cost of confidence-aware
2911
+ // detection — vs the prior ActiveSpeakers handler that fired at ~50ms on any audio
2912
+ // activity and tripped ~10-15% false interrupts on coughs, paper rustle, agent's
2913
+ // own TTS bleeding through the mic, etc.
2877
2914
  sess.on('user_state_changed', (ev) => {
2878
2915
  const prev = userState;
2879
2916
  userState = ev.newState;
2880
2917
  console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
2881
2918
  if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
2882
2919
  try {
2883
- console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
2920
+ console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS');
2884
2921
  currentSession?.interrupt();
2885
2922
  }
2886
2923
  catch (err) {
@@ -3889,8 +3926,57 @@ async function main() {
3889
3926
  (process.env.FLY_APP_NAME
3890
3927
  ? `https://${process.env.FLY_APP_NAME}.fly.dev`
3891
3928
  : `http://localhost:${apiPort}`);
3929
+ // Try to mint a LiveKit bot token + construct the frontend-hosted
3930
+ // meeting-bot page URL. The bot page joins the same LiveKit room
3931
+ // as this agent so meeting audio flows through LiveKit directly
3932
+ // (no agent-side WebSocket+WAV pipe). Falls back to the legacy
3933
+ // /meeting-output webpage if no frontend URL is resolvable, so
3934
+ // the old code path keeps working during the migration window.
3935
+ //
3936
+ // Frontend URL resolution (in priority order):
3937
+ // 1. data.frontendBase — the public URL the user's browser is on,
3938
+ // passed through the join_meeting data channel message. Works
3939
+ // automatically for localhost dev + production without any
3940
+ // env var.
3941
+ // 2. OSBORN_FRONTEND_URL — existing convention from sprites.ts
3942
+ // (frontend/src/lib/sprites.ts:241) that injects the public
3943
+ // frontend URL into sandbox env vars. Defense in depth.
3944
+ //
3945
+ // Auth: the endpoint uses LiveKit room-presence as the auth check
3946
+ // — no shared secret needed. The agent must already be in the
3947
+ // requested room (which it is by this point) for the mint to
3948
+ // succeed.
3949
+ let outputPageUrl;
3950
+ const frontendUrl = data.frontendBase
3951
+ || process.env.OSBORN_FRONTEND_URL;
3952
+ if (frontendUrl) {
3953
+ try {
3954
+ const botLkId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
3955
+ const tokenRes = await fetch(`${frontendUrl}/api/meeting-bot-token`, {
3956
+ method: 'POST',
3957
+ headers: { 'Content-Type': 'application/json' },
3958
+ body: JSON.stringify({ botId: botLkId, roomName }),
3959
+ });
3960
+ if (tokenRes.ok) {
3961
+ const { token, url } = await tokenRes.json();
3962
+ const params = new URLSearchParams({ token, url, room: roomName, botId: botLkId });
3963
+ outputPageUrl = `${frontendUrl}/meeting-bot?${params.toString()}`;
3964
+ console.log(`🎫 Meeting-bot token minted for room=${roomName} bot=${botLkId}`);
3965
+ }
3966
+ else {
3967
+ const errText = await tokenRes.text().catch(() => '');
3968
+ console.warn(`⚠️ meeting-bot-token mint failed (HTTP ${tokenRes.status}: ${errText.substring(0, 120)}) — falling back to legacy /meeting-output path`);
3969
+ }
3970
+ }
3971
+ catch (mintErr) {
3972
+ console.warn(`⚠️ meeting-bot-token mint threw — falling back: ${mintErr.message}`);
3973
+ }
3974
+ }
3975
+ else {
3976
+ console.log('ℹ️ No frontend URL (data.frontendBase + OSBORN_FRONTEND_URL both empty) — using legacy /meeting-output path');
3977
+ }
3892
3978
  await sendToFrontend({ type: 'meeting_joining', message: 'Osborn is joining your meeting...' });
3893
- const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase);
3979
+ const botId = await recallJoin.joinMeeting(meetingUrl, webhookBase, { outputPageUrl });
3894
3980
  const sessionId = currentLLM?.sessionId || currentResumeSessionId || 'default';
3895
3981
  recallJoin.registerBot(botId, sessionId);
3896
3982
  activeMeetingBotId = botId;
@@ -36,7 +36,23 @@ export interface TranscriptPayload {
36
36
  export declare class RecallClient extends EventEmitter {
37
37
  #private;
38
38
  constructor(apiKey: string);
39
- joinMeeting(meetingUrl: string, webhookBaseUrl: string, botName?: string): Promise<string>;
39
+ /**
40
+ * Join a meeting via Recall.ai.
41
+ *
42
+ * @param meetingUrl Zoom / Google Meet / Teams URL the bot should dial in to
43
+ * @param webhookBaseUrl Base URL for the agent's HTTP endpoints (transcript webhook)
44
+ * @param opts.outputPageUrl Full URL for the bot's camera/audio page. If provided,
45
+ * replaces the default `${webhookBaseUrl}/meeting-output`.
46
+ * Used to point at the frontend-hosted /meeting-bot page
47
+ * with token + room embedded as query params, so the page
48
+ * connects to LiveKit and audio flows through the same
49
+ * room as the osborn agent (no separate WebSocket+WAV pipe).
50
+ * @param opts.botName Display name of the bot in the meeting
51
+ */
52
+ joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
53
+ outputPageUrl?: string;
54
+ botName?: string;
55
+ }): Promise<string>;
40
56
  leaveMeeting(botId: string): Promise<void>;
41
57
  getBotStatus(botId: string): Promise<string>;
42
58
  handleWebhook(payload: TranscriptPayload): void;
@@ -8,7 +8,22 @@ export class RecallClient extends EventEmitter {
8
8
  super();
9
9
  this.#apiKey = apiKey;
10
10
  }
11
- async joinMeeting(meetingUrl, webhookBaseUrl, botName = 'Osborn') {
11
+ /**
12
+ * Join a meeting via Recall.ai.
13
+ *
14
+ * @param meetingUrl Zoom / Google Meet / Teams URL the bot should dial in to
15
+ * @param webhookBaseUrl Base URL for the agent's HTTP endpoints (transcript webhook)
16
+ * @param opts.outputPageUrl Full URL for the bot's camera/audio page. If provided,
17
+ * replaces the default `${webhookBaseUrl}/meeting-output`.
18
+ * Used to point at the frontend-hosted /meeting-bot page
19
+ * with token + room embedded as query params, so the page
20
+ * connects to LiveKit and audio flows through the same
21
+ * room as the osborn agent (no separate WebSocket+WAV pipe).
22
+ * @param opts.botName Display name of the bot in the meeting
23
+ */
24
+ async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
25
+ const botName = opts?.botName ?? 'Osborn';
26
+ const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
12
27
  // Authoritative structure per https://docs.recall.ai/reference/bot_create
13
28
  // and https://docs.recall.ai/docs/real-time-transcription:
14
29
  //
@@ -49,10 +64,13 @@ export class RecallClient extends EventEmitter {
49
64
  output_media: {
50
65
  camera: {
51
66
  // `kind` (not `type`) — confirmed from prior debugging.
52
- // Output webpage plays TTS audio so meeting participants can hear the agent.
67
+ // The page Recall renders is responsible for joining the same LiveKit
68
+ // room as the osborn agent: meeting audio captured via getUserMedia is
69
+ // published into the room; osborn's TTS audio (already in the room) is
70
+ // played by the page and captured by Recall as the bot's mic output.
53
71
  kind: 'webpage',
54
72
  config: {
55
- url: `${webhookBaseUrl}/meeting-output`,
73
+ url: outputPageUrl,
56
74
  },
57
75
  },
58
76
  },
@@ -63,7 +81,7 @@ export class RecallClient extends EventEmitter {
63
81
  throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
64
82
  }
65
83
  const bot = (await res.json());
66
- console.log(`🤖 Recall.ai bot joined meeting: ${bot.id}`);
84
+ console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
67
85
  return bot.id;
68
86
  }
69
87
  async leaveMeeting(botId) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.38",
3
+ "version": "0.9.40",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {