osborn 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/.claude/settings.local.json +9 -0
  2. package/.claude/skills/markdown-to-pdf/SKILL.md +29 -0
  3. package/.claude/skills/pdf-to-markdown/SKILL.md +28 -0
  4. package/.claude/skills/playwright-browser/SKILL.md +75 -0
  5. package/.claude/skills/youtube-transcript/SKILL.md +24 -0
  6. package/dist/claude-llm.d.ts +29 -1
  7. package/dist/claude-llm.js +334 -78
  8. package/dist/config.d.ts +5 -1
  9. package/dist/config.js +4 -1
  10. package/dist/fast-brain.d.ts +70 -16
  11. package/dist/fast-brain.js +662 -99
  12. package/dist/index-3-2-26-legacy.d.ts +1 -0
  13. package/dist/index-3-2-26-legacy.js +2233 -0
  14. package/dist/index.js +752 -423
  15. package/dist/jsonl-search.d.ts +66 -0
  16. package/dist/jsonl-search.js +274 -0
  17. package/dist/leagcyprompts2.d.ts +0 -0
  18. package/dist/leagcyprompts2.js +573 -0
  19. package/dist/pipeline-direct-llm.d.ts +77 -0
  20. package/dist/pipeline-direct-llm.js +216 -0
  21. package/dist/pipeline-fastbrain.d.ts +45 -0
  22. package/dist/pipeline-fastbrain.js +367 -0
  23. package/dist/prompts-2-25-26.d.ts +0 -0
  24. package/dist/prompts-2-25-26.js +518 -0
  25. package/dist/prompts-3-2-26.d.ts +78 -0
  26. package/dist/prompts-3-2-26.js +1319 -0
  27. package/dist/prompts.d.ts +83 -12
  28. package/dist/prompts.js +1991 -588
  29. package/dist/session-access.d.ts +24 -0
  30. package/dist/session-access.js +74 -0
  31. package/dist/summary-index.d.ts +87 -0
  32. package/dist/summary-index.js +570 -0
  33. package/dist/turn-detector-shim.d.ts +24 -0
  34. package/dist/turn-detector-shim.js +83 -0
  35. package/dist/voice-io.d.ts +9 -3
  36. package/dist/voice-io.js +39 -20
  37. package/package.json +13 -10
@@ -0,0 +1,2233 @@
1
+ // Load environment variables FIRST before any other imports
2
+ import 'dotenv/config';
3
+ import { voice, initializeLogger } from '@livekit/agents';
4
+ import { Room, RoomEvent } from '@livekit/rtc-node';
5
+ import { AccessToken } from 'livekit-server-sdk';
6
+ // Initialize logger before anything else
7
+ initializeLogger({ pretty: true, level: 'info' });
8
+ import { createServer } from 'http';
9
+ import { loadConfig, getMcpServers, getEnabledMcpServerNames, getVoiceMode, getRealtimeConfig, getDirectConfig, listSessions, getMostRecentSessionId, sessionExists, cleanupOrphanedMetadata, getSessionSummary, getConversationHistory, ensureSessionWorkspace, getMcpServerStatusList, buildMcpServersForKeys, listWorkspaceArtifacts, readSessionSpec, listLibraryFiles } from './config.js';
10
+ import { createSTT, createTTS, createVAD, createRealtimeModelFromConfig } from './voice-io.js';
11
+ import { createClaudeLLM } from './claude-llm.js';
12
+ import { createSmitheryProxy, destroySmitheryProxy, parseSmitheryUrl, isSmitheryUrl, SmitheryAuthorizationError } from './smithery-proxy.js';
13
+ import { askHaiku, updateSpecFromJSONL, augmentResearchResult, writeQuestionToSpec, checkOutputAgainstQuestions, contextualizeResearchUpdate, generateProactivePrompt, generateVisualDocument, clearFastBrainHistory } from './fast-brain.js';
14
+ import { DIRECT_MODE_PROMPT, getRealtimeInstructions, getResearchCompleteInjection, getResearchUpdateInjection, getNotificationInjection } from './prompts.js';
15
+ import { MCP_CATALOG } from './config.js';
16
+ import { llm } from '@livekit/agents';
17
+ import { z } from 'zod';
18
+ // ============================================================
19
+ // DUAL MODE VOICE ARCHITECTURE
20
+ // ============================================================
21
+ // DIRECT MODE (default): STT → Claude Agent SDK → TTS
22
+ // - Full coding capabilities via Claude Agent SDK
23
+ // - Permission system flows to frontend
24
+ // - Best for actual coding tasks
25
+ //
26
+ // REALTIME MODE: OpenAI/Gemini native speech-to-speech
27
+ // - Faster response, lower latency
28
+ // - Voice LLM with tool calling (ask_agent, respond_permission)
29
+ // - Routes tasks to Claude agents for execution
30
+ // ============================================================
31
+ // Generate a short, user-friendly room code
32
+ function generateRoomCode() {
33
+ const chars = 'abcdefghjkmnpqrstuvwxyz23456789';
34
+ let code = '';
35
+ for (let i = 0; i < 6; i++) {
36
+ code += chars[Math.floor(Math.random() * chars.length)];
37
+ }
38
+ return code;
39
+ }
40
+ // Parse CLI arguments
41
+ function parseArgs() {
42
+ const args = process.argv.slice(2);
43
+ let roomCode;
44
+ for (let i = 0; i < args.length; i++) {
45
+ if (args[i] === '--room' && args[i + 1]) {
46
+ roomCode = args[i + 1];
47
+ }
48
+ // Short code detection (e.g., `npm run dev abc123`)
49
+ if (!args[i].startsWith('-') && args[i].length >= 4 && args[i].length <= 10 &&
50
+ !['dev', 'start'].includes(args[i])) {
51
+ roomCode = args[i];
52
+ }
53
+ }
54
+ return { roomCode };
55
+ }
56
+ // Global error handlers
57
+ process.on('unhandledRejection', (reason) => {
58
+ const msg = reason?.message || String(reason);
59
+ if (msg.includes('aborted') || msg.includes('AbortError')) {
60
+ console.log('⚠️ LLM request aborted (user interrupted)');
61
+ return;
62
+ }
63
+ // Gemini plugin intentionally supersedes generate_reply calls — safe to suppress
64
+ if (msg.includes('Superseded')) {
65
+ console.log('⚠️ generateReply superseded (expected during concurrent injections)');
66
+ return;
67
+ }
68
+ // OpenAI race: voice queue fired while server-side VAD already created a response
69
+ if (msg.includes('conversation_already_has_active_response') || msg.includes('active_response')) {
70
+ console.log('⚠️ OpenAI active response collision (will retry on next listening state)');
71
+ return;
72
+ }
73
+ // LiveKit SDK internal error after participant disconnect — safe to suppress
74
+ if (msg.includes("reading 'source'") || msg.includes("reading 'type'")) {
75
+ console.log('⚠️ Post-disconnect cleanup error (harmless)');
76
+ return;
77
+ }
78
+ // generateReply timeout — usually from racing concurrent injections
79
+ if (msg.includes('generateReply timed out') || msg.includes('generation_created')) {
80
+ console.log('⚠️ generateReply timed out (concurrent injection race)');
81
+ return;
82
+ }
83
+ console.error('❌ Unhandled Rejection:', msg);
84
+ });
85
+ process.on('uncaughtException', (error) => {
86
+ if (error.message?.includes('aborted') || error.message?.includes('AbortError')) {
87
+ console.log('⚠️ Operation aborted');
88
+ return;
89
+ }
90
+ console.error('❌ Uncaught Exception:', error);
91
+ });
92
+ // ============================================================
93
+ // HTTP API SERVER - Exposes session data to cloud-deployed frontend
94
+ // ============================================================
95
+ function startApiServer(workingDir, port) {
96
+ const server = createServer(async (req, res) => {
97
+ // CORS headers for cloud frontend
98
+ res.setHeader('Access-Control-Allow-Origin', '*');
99
+ res.setHeader('Access-Control-Allow-Methods', 'GET, OPTIONS');
100
+ res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
101
+ if (req.method === 'OPTIONS') {
102
+ res.writeHead(204);
103
+ res.end();
104
+ return;
105
+ }
106
+ const url = new URL(req.url || '/', `http://localhost:${port}`);
107
+ if (req.method === 'GET' && url.pathname === '/sessions') {
108
+ try {
109
+ await cleanupOrphanedMetadata(workingDir);
110
+ const sessions = await listSessions(workingDir);
111
+ const payload = {
112
+ sessions: sessions.map(s => ({
113
+ sessionId: s.sessionId,
114
+ timestamp: s.timestamp.toISOString(),
115
+ lastMessage: s.lastMessage,
116
+ messageCount: s.messageCount,
117
+ })),
118
+ total: sessions.length,
119
+ };
120
+ res.writeHead(200, { 'Content-Type': 'application/json' });
121
+ res.end(JSON.stringify(payload));
122
+ }
123
+ catch (err) {
124
+ console.error('API /sessions error:', err);
125
+ res.writeHead(500, { 'Content-Type': 'application/json' });
126
+ res.end(JSON.stringify({ sessions: [], total: 0, error: 'Failed to list sessions' }));
127
+ }
128
+ return;
129
+ }
130
+ if (req.method === 'GET' && url.pathname === '/health') {
131
+ res.writeHead(200, { 'Content-Type': 'application/json' });
132
+ res.end(JSON.stringify({ status: 'ok', workingDir }));
133
+ return;
134
+ }
135
+ res.writeHead(404, { 'Content-Type': 'application/json' });
136
+ res.end(JSON.stringify({ error: 'Not found' }));
137
+ });
138
+ server.listen(port, () => {
139
+ console.log(`🌐 API server listening on http://localhost:${port}`);
140
+ console.log(` Sessions: http://localhost:${port}/sessions`);
141
+ });
142
+ server.on('error', (err) => {
143
+ if (err.code === 'EADDRINUSE') {
144
+ console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
145
+ startApiServer(workingDir, port + 1);
146
+ }
147
+ else {
148
+ console.error('❌ API server error:', err);
149
+ }
150
+ });
151
+ }
152
+ // ============================================================
153
+ // SESSION CONTEXT HELPERS
154
+ // ============================================================
155
+ /**
156
+ * Build a context briefing string for the realtime agent
157
+ * Loads session conversation history so the model has deep context.
158
+ * Gemini has smaller context limits — cap at 10 exchanges with 500 char content.
159
+ * OpenAI handles full history (30 exchanges, 2000 char content).
160
+ */
161
+ function buildContextBriefing(summary, history, provider) {
162
+ const isGemini = provider === 'gemini';
163
+ // Gemini: last 10 exchanges capped at 500 chars. OpenAI: full history.
164
+ const maxExchanges = isGemini ? 10 : history.length;
165
+ const maxContentLen = isGemini ? 500 : 2000;
166
+ const trimmedHistory = history.slice(-maxExchanges);
167
+ const lines = [
168
+ `Session ID: ${summary.sessionId.substring(0, 8)}`,
169
+ `Total messages: ${summary.messageCount}`,
170
+ '',
171
+ '=== SESSION CONVERSATION HISTORY ==='
172
+ ];
173
+ for (const exchange of trimmedHistory) {
174
+ const content = exchange.content.length > maxContentLen
175
+ ? exchange.content.substring(0, maxContentLen) + '...'
176
+ : exchange.content;
177
+ lines.push(`${exchange.role === 'user' ? 'User' : 'Assistant'}: ${content}`);
178
+ lines.push('');
179
+ }
180
+ return lines.join('\n');
181
+ }
182
+ /**
183
+ * Read spec.md and format it for the realtime voice model.
184
+ * Truncates to avoid bloating the context window.
185
+ * Returns null if spec doesn't exist or session ID isn't available.
186
+ */
187
+ function getSpecForVoiceModel(workingDir, sessionId) {
188
+ if (!sessionId)
189
+ return null;
190
+ const specContent = readSessionSpec(workingDir, sessionId);
191
+ if (!specContent)
192
+ return null;
193
+ const MAX = 3000;
194
+ if (specContent.length <= MAX)
195
+ return specContent;
196
+ const truncated = specContent.substring(0, MAX);
197
+ const lastHeading = truncated.lastIndexOf('\n## ');
198
+ if (lastHeading > MAX * 0.5) {
199
+ return truncated.substring(0, lastHeading) + '\n\n[... truncated — call read_spec for full content]';
200
+ }
201
+ return truncated + '\n\n[... truncated]';
202
+ }
203
+ /**
204
+ * Load full session conversation history into the realtime model's ChatContext.
205
+ * This gives the model persistent memory of what was discussed/researched,
206
+ * enabling deeper follow-up conversations without re-delegating to ask_agent.
207
+ *
208
+ * NOTE: Gemini's Live API doesn't support updateChatCtx (crashes with code 1008).
209
+ * For Gemini, the session resume context is already injected via generateReply({ userInput })
210
+ * which becomes part of the conversation history as model turns.
211
+ */
212
+ function loadSessionHistoryIntoChatCtx(agent, history, provider) {
213
+ if (!agent || history.length === 0)
214
+ return;
215
+ // Skip for Gemini — updateChatCtx triggers unsupported operations on Gemini Live API
216
+ if (provider === 'gemini') {
217
+ console.log(`🧠 Skipping ChatCtx load for Gemini (${history.length} exchanges) — context injected via generateReply`);
218
+ return;
219
+ }
220
+ try {
221
+ const chatCtx = agent.chatCtx.copy();
222
+ // Inject each conversation exchange as a proper chat message
223
+ for (const exchange of history) {
224
+ chatCtx.addMessage({
225
+ role: exchange.role === 'user' ? 'user' : 'assistant',
226
+ content: exchange.content,
227
+ });
228
+ }
229
+ agent.updateChatCtx(chatCtx);
230
+ console.log(`🧠 Loaded ${history.length} conversation exchanges into ChatCtx (${history.reduce((sum, e) => sum + e.content.length, 0)} chars)`);
231
+ }
232
+ catch (err) {
233
+ console.log('⚠️ Failed to load session history into ChatCtx:', err);
234
+ }
235
+ }
236
+ // Main function
237
+ async function main() {
238
+ console.log('\n🤖 Osborn Voice AI Coding Assistant\n');
239
+ // Validate environment
240
+ const livekitUrl = process.env.LIVEKIT_URL;
241
+ const apiKey = process.env.LIVEKIT_API_KEY;
242
+ const apiSecret = process.env.LIVEKIT_API_SECRET;
243
+ if (!livekitUrl || !apiKey || !apiSecret) {
244
+ console.error('❌ Missing required environment variables:');
245
+ if (!livekitUrl)
246
+ console.error(' - LIVEKIT_URL');
247
+ if (!apiKey)
248
+ console.error(' - LIVEKIT_API_KEY');
249
+ if (!apiSecret)
250
+ console.error(' - LIVEKIT_API_SECRET');
251
+ console.error('\nSet these in your .env file or environment.');
252
+ process.exit(1);
253
+ }
254
+ // Parse CLI args
255
+ const cliArgs = parseArgs();
256
+ // Load configuration
257
+ console.log('📁 Loading configuration...');
258
+ const config = loadConfig();
259
+ const mcpServers = getMcpServers(config);
260
+ const enabledMcpNames = getEnabledMcpServerNames(config);
261
+ if (enabledMcpNames.length > 0) {
262
+ console.log(`🔌 Enabled MCP servers: ${enabledMcpNames.join(', ')}`);
263
+ }
264
+ const workingDir = config.workingDirectory || process.cwd();
265
+ console.log(`📂 Working directory: ${workingDir}`);
266
+ console.log(`🔬 Mode: RESEARCH`);
267
+ // Determine voice mode
268
+ const voiceMode = getVoiceMode(config);
269
+ const realtimeConfig = getRealtimeConfig(config);
270
+ const directConfig = getDirectConfig(config);
271
+ if (voiceMode === 'realtime') {
272
+ console.log(`🎙️ REALTIME MODE: ${realtimeConfig.provider} native speech-to-speech`);
273
+ console.log(` Voice: ${realtimeConfig.provider === 'openai' ? realtimeConfig.openaiVoice : realtimeConfig.geminiVoice}`);
274
+ }
275
+ else {
276
+ console.log(`🎯 DIRECT MODE: ${directConfig.stt.provider} STT → Claude Agent SDK → ${directConfig.tts.provider} TTS`);
277
+ console.log(' 🔥 Full coding capabilities!');
278
+ }
279
+ // Determine room code
280
+ const roomCode = cliArgs.roomCode || generateRoomCode();
281
+ const roomName = `osborn-${roomCode}`;
282
+ if (cliArgs.roomCode) {
283
+ console.log(`🔗 Joining room: ${roomCode}`);
284
+ }
285
+ else {
286
+ console.log(`\n✨ Created new room: ${roomCode}`);
287
+ console.log(`\n📋 Share this with the frontend or run:`);
288
+ console.log(` Open: https://osborn.app?room=${roomCode}`);
289
+ console.log(` Or enter code "${roomCode}" in the frontend\n`);
290
+ }
291
+ // Start HTTP API server for frontend session browsing
292
+ const apiPort = parseInt(process.env.OSBORN_API_PORT || '8741', 10);
293
+ startApiServer(workingDir, apiPort);
294
+ // ============================================================
295
+ // Create Access Token for Agent
296
+ // ============================================================
297
+ console.log('🔑 Creating access token...');
298
+ const token = new AccessToken(apiKey, apiSecret, {
299
+ identity: 'osborn-agent',
300
+ name: 'Osborn AI',
301
+ metadata: JSON.stringify({ type: 'agent', version: '0.3.0' }),
302
+ });
303
+ token.addGrant({
304
+ roomJoin: true,
305
+ room: roomName,
306
+ canPublish: true,
307
+ canSubscribe: true,
308
+ canPublishData: true,
309
+ });
310
+ const jwt = await token.toJwt();
311
+ // ============================================================
312
+ // Connect to Room
313
+ // ============================================================
314
+ console.log('📡 Connecting to LiveKit...');
315
+ const room = new Room();
316
+ room.setMaxListeners(50); // Prevent MaxListenersExceeded warnings on reconnect
317
+ // Track state
318
+ let currentSession = null;
319
+ let currentAgent = null; // For updateChatCtx() context injection
320
+ let currentLLM = null;
321
+ let localParticipant = null;
322
+ let agentState = 'initializing';
323
+ let userState = 'listening'; // Track user speech state for queue safety
324
+ let currentVoiceMode = voiceMode; // Track active voice mode for data handlers
325
+ let currentProvider = realtimeConfig.provider; // Track active realtime provider
326
+ // Task deduplication guard - prevents Gemini re-execution loops
327
+ let lastTaskRequest = '';
328
+ let lastTaskTime = 0;
329
+ // Fast brain (ask_haiku) in-flight tracking — prevents ask_agent double-calling
330
+ let haikuInFlight = null;
331
+ // Background research state - tracks async ask_agent execution
332
+ let activeResearch = null;
333
+ // No manual queuing — the Claude SDK handles sequential queries internally
334
+ // ============================================================
335
+ // Unified Voice Injection Queue
336
+ // ============================================================
337
+ // ALL system injections (research updates, completions, notifications, errors)
338
+ // go through this queue. Never call generateReply directly for injections.
339
+ // The queue only drains when the voice model is confirmed 'listening'.
340
+ // After draining, the model transitions to thinking/speaking, and the queue
341
+ // naturally pauses until the next 'listening' state.
342
+ const voiceQueue = [];
343
+ let isProcessingQueue = false;
344
+ function queueVoiceInjection(instructions) {
345
+ voiceQueue.push(instructions);
346
+ console.log(`📥 Voice queue: +1 (total: ${voiceQueue.length}): ${instructions.substring(0, 80)}...`);
347
+ processVoiceQueue();
348
+ }
349
+ function processVoiceQueue() {
350
+ if (voiceQueue.length === 0)
351
+ return;
352
+ if (!currentSession)
353
+ return;
354
+ if (isProcessingQueue) {
355
+ console.log(`⏸️ Voice queue: already processing, ${voiceQueue.length} items waiting`);
356
+ return;
357
+ }
358
+ if (agentState !== 'listening') {
359
+ console.log(`⏸️ Voice queue: ${voiceQueue.length} items waiting (model: ${agentState})`);
360
+ return; // Will be called again when agent_state_changed → 'listening'
361
+ }
362
+ // Don't inject while user is speaking — server-side VAD will auto-create a response
363
+ if (userState === 'speaking') {
364
+ console.log(`⏸️ Voice queue: ${voiceQueue.length} items waiting (user speaking)`);
365
+ return;
366
+ }
367
+ isProcessingQueue = true;
368
+ // Safety timeout: if agent_state_changed never fires (e.g. Gemini state machine hang),
369
+ // clear the guard after 30s so the queue isn't permanently stuck
370
+ setTimeout(() => {
371
+ if (isProcessingQueue) {
372
+ console.log('⚠️ Voice queue: isProcessingQueue stuck for 30s, clearing');
373
+ isProcessingQueue = false;
374
+ if (voiceQueue.length > 0 && agentState === 'listening') {
375
+ processVoiceQueue();
376
+ }
377
+ }
378
+ }, 30000);
379
+ // Batch ALL queued items into one generateReply call
380
+ const items = voiceQueue.splice(0);
381
+ const batchedInstruction = items.length === 1
382
+ ? items[0]
383
+ : items.join('\n\n---\n\n');
384
+ console.log(`📡 Voice queue: processing ${items.length} batched items (${batchedInstruction.length} chars)`);
385
+ try {
386
+ // Skip interrupt for Gemini — disrupts Gemini's state machine, causing it to
387
+ // never transition back to 'listening' (hangs in speaking state indefinitely)
388
+ if (currentProvider !== 'gemini') {
389
+ currentSession.interrupt();
390
+ }
391
+ currentSession.generateReply({
392
+ instructions: batchedInstruction,
393
+ toolChoice: 'none',
394
+ });
395
+ // Model transitions to thinking/speaking after this call.
396
+ // When it returns to 'listening', agent_state_changed triggers processVoiceQueue() again.
397
+ // Also inject into chatCtx as persistent context so the model remembers across turns
398
+ injectIntoChatCtx(batchedInstruction);
399
+ }
400
+ catch (err) {
401
+ console.log('⚠️ Voice queue generateReply failed, dropping items:', err);
402
+ // Do NOT re-queue — re-queuing causes infinite retry cascades
403
+ // The frontend still has the updates via claude_output events
404
+ isProcessingQueue = false;
405
+ }
406
+ // isProcessingQueue is cleared when agent_state_changed fires
407
+ }
408
+ // Inject content into the agent's ChatContext as persistent memory
409
+ // This ensures the realtime model can reference prior research in follow-up questions
410
+ // NOTE: Gemini doesn't support updateChatCtx (crashes with "Operation not implemented" code 1008).
411
+ // For Gemini, generateReply({ instructions }) already injects as model turns, so context persists naturally.
412
+ function injectIntoChatCtx(content) {
413
+ if (!currentAgent)
414
+ return;
415
+ // Skip for Gemini — updateChatCtx triggers unsupported operations on Gemini Live API
416
+ if (currentVoiceMode === 'realtime' && currentProvider === 'gemini')
417
+ return;
418
+ try {
419
+ const chatCtx = currentAgent.chatCtx.copy();
420
+ chatCtx.addMessage({
421
+ role: 'assistant',
422
+ content: content,
423
+ });
424
+ currentAgent.updateChatCtx(chatCtx);
425
+ console.log(`🧠 ChatCtx updated (+${content.length} chars persistent context)`);
426
+ }
427
+ catch (err) {
428
+ console.log('⚠️ ChatCtx injection failed:', err);
429
+ }
430
+ }
431
+ // Extract recent voice conversation turns from the realtime LLM's in-memory ChatContext.
432
+ // Replaces the internal conversationHistory array in fast-brain.ts.
433
+ function getChatHistory(maxTurns = 20) {
434
+ if (!currentAgent)
435
+ return [];
436
+ try {
437
+ const items = currentAgent.chatCtx.items;
438
+ const turns = [];
439
+ for (const item of items) {
440
+ if (item.type !== 'message')
441
+ continue;
442
+ const msg = item;
443
+ if (msg.role !== 'user' && msg.role !== 'assistant')
444
+ continue;
445
+ const text = msg.textContent ?? '';
446
+ if (!text.trim())
447
+ continue;
448
+ turns.push({ role: msg.role, text: text.trim() });
449
+ }
450
+ return turns.slice(-maxTurns);
451
+ }
452
+ catch (err) {
453
+ console.log('⚠️ getChatHistory: failed to read chatCtx:', err);
454
+ return [];
455
+ }
456
+ }
457
+ // Research event batching — debounce rapid-fire tool events into a single voice queue entry
458
+ let researchBatchTimer = null;
459
+ function scheduleResearchBatch() {
460
+ if (researchBatchTimer)
461
+ return; // Already scheduled
462
+ researchBatchTimer = setTimeout(() => {
463
+ researchBatchTimer = null;
464
+ if (!activeResearch || activeResearch.pendingUpdates.length === 0)
465
+ return;
466
+ const updates = activeResearch.pendingUpdates.splice(0);
467
+ const batchText = updates.slice(-10).join('. ');
468
+ console.log(`📡 [research] Batching ${updates.length} events: ${batchText.substring(0, 80)}...`);
469
+ // Send to frontend for visibility
470
+ sendToFrontend({
471
+ type: 'claude_output',
472
+ text: `[Research Progress] ${batchText}`,
473
+ isStreaming: true,
474
+ agentRole: 'research-progress',
475
+ });
476
+ // COMMENTED OUT — voice narration disabled, research progress goes to -frontend logs only
477
+ // // queueVoiceInjection(getResearchUpdateInjection(batchText))
478
+ // Route through fast brain for contextual voice updates (capped at 3 per task)
479
+ if (activeResearch.voiceUpdateCount < 3) {
480
+ const voiceSid = currentLLM?.sessionId;
481
+ if (voiceSid) {
482
+ contextualizeResearchUpdate(workingDir, voiceSid, lastTaskRequest || '', updates, activeResearch.researchLog)
483
+ .then(update => {
484
+ if (update && update !== 'NOTHING' && activeResearch) {
485
+ activeResearch.voiceUpdateCount++;
486
+ queueVoiceInjection(getResearchUpdateInjection(update));
487
+ }
488
+ })
489
+ .catch(() => { }); // Silent fail — updates are optional
490
+ }
491
+ }
492
+ }, 8000); // 8s debounce: reduces voice queue flooding during research
493
+ }
494
+ // Proactive conversational loop — keeps conversation alive during research
495
+ let proactiveTimer = null;
496
+ let proactivePromptHistory = [];
497
+ const PROACTIVE_INTERVAL = 15000; // 15 seconds (offset from 8s batch timer)
498
+ const MAX_PROACTIVE_PROMPTS = 4; // Cap per research task
499
+ function startProactiveLoop(task, sessionId) {
500
+ stopProactiveLoop();
501
+ proactivePromptHistory = [];
502
+ let proactiveCount = 0;
503
+ proactiveTimer = setInterval(async () => {
504
+ if (!activeResearch) {
505
+ stopProactiveLoop();
506
+ return;
507
+ }
508
+ if (proactiveCount >= MAX_PROACTIVE_PROMPTS)
509
+ return;
510
+ if (agentState !== 'listening' || userState === 'speaking')
511
+ return;
512
+ if (researchBatchTimer)
513
+ return; // Don't collide with batch updates
514
+ if (isProcessingQueue)
515
+ return; // Don't collide with voice queue
516
+ try {
517
+ const prompt = await generateProactivePrompt(workingDir, sessionId, task, activeResearch.researchLog, proactivePromptHistory);
518
+ if (prompt && prompt !== 'NOTHING') {
519
+ proactivePromptHistory.push(prompt);
520
+ proactiveCount++;
521
+ queueVoiceInjection(`[PROACTIVE CONTEXT] ${prompt}. Say this naturally to the user. Do NOT call any tools.`);
522
+ }
523
+ }
524
+ catch { } // Silent fail — proactive prompts are optional
525
+ }, PROACTIVE_INTERVAL);
526
+ }
527
+ function stopProactiveLoop() {
528
+ if (proactiveTimer) {
529
+ clearInterval(proactiveTimer);
530
+ proactiveTimer = null;
531
+ }
532
+ proactivePromptHistory = [];
533
+ }
534
+ // Helper to send data to frontend (with size limit handling)
535
+ const MAX_MESSAGE_SIZE = 60000;
536
+ async function sendToFrontend(data) {
537
+ if (!localParticipant) {
538
+ console.log('⚠️ sendToFrontend: no localParticipant!');
539
+ return;
540
+ }
541
+ try {
542
+ const encoder = new TextEncoder();
543
+ let jsonData = JSON.stringify(data);
544
+ // If message is too large, truncate the text content
545
+ if (jsonData.length > MAX_MESSAGE_SIZE) {
546
+ const truncatedData = { ...data };
547
+ if (truncatedData.text && typeof truncatedData.text === 'string') {
548
+ const overhead = JSON.stringify({ ...truncatedData, text: '' }).length;
549
+ const maxTextLength = MAX_MESSAGE_SIZE - overhead - 100;
550
+ truncatedData.text = truncatedData.text.substring(0, maxTextLength) + '\n\n[Message truncated due to size limit]';
551
+ jsonData = JSON.stringify(truncatedData);
552
+ console.log(`⚠️ Message truncated from ${data.text?.length} to ${truncatedData.text.length} chars`);
553
+ }
554
+ }
555
+ const payload = encoder.encode(jsonData);
556
+ await localParticipant.publishData(payload, {
557
+ reliable: true,
558
+ topic: 'osborn-updates',
559
+ });
560
+ console.log(`📤 Sent to frontend: ${data.type} (${payload.length} bytes)`);
561
+ }
562
+ catch (err) {
563
+ console.error('❌ sendToFrontend error:', err);
564
+ }
565
+ }
566
+ // Helper: announce via voice - uses voice queue for realtime, say() for direct
567
+ async function announceViaVoice(text) {
568
+ if (!currentSession)
569
+ return;
570
+ if (currentVoiceMode === 'realtime') {
571
+ queueVoiceInjection(getNotificationInjection(text));
572
+ }
573
+ else {
574
+ try {
575
+ await currentSession.say(text);
576
+ }
577
+ catch (err) {
578
+ console.log('⚠️ Voice announcement failed:', err);
579
+ }
580
+ }
581
+ }
582
+ // Create DIRECT session (STT + Claude Agent SDK + TTS)
583
+ async function createDirectSession(resumeSessionId) {
584
+ console.log('🎯 Creating direct session...');
585
+ const stt = createSTT({ provider: 'deepgram' });
586
+ const tts = createTTS({ provider: 'deepgram', voice: 'aura-asteria-en' });
587
+ const vad = await createVAD();
588
+ // Create Claude LLM wrapper in research mode
589
+ const directLLM = createClaudeLLM({
590
+ workingDirectory: workingDir,
591
+ mcpServers,
592
+ resumeSessionId,
593
+ });
594
+ currentLLM = directLLM;
595
+ // For resumed sessions, eagerly create workspace (we know the real ID)
596
+ if (resumeSessionId) {
597
+ const workspace = ensureSessionWorkspace(workingDir, resumeSessionId);
598
+ console.log(`📁 Session workspace (resumed): ${workspace}`);
599
+ }
600
+ // For new sessions, create workspace when SDK assigns real session ID
601
+ directLLM.events.once('session_id', ({ sessionId }) => {
602
+ const workspace = ensureSessionWorkspace(workingDir, sessionId);
603
+ console.log(`📁 Session workspace created: ${workspace}`);
604
+ });
605
+ // Wire up MCP server changes to frontend
606
+ directLLM.events.on('mcp_servers_changed', (data) => {
607
+ console.log(`🔌 MCP servers changed: ${data.enabledKeys.join(', ') || 'none'}`);
608
+ sendToFrontend({
609
+ type: 'mcp_servers_changed',
610
+ enabledKeys: data.enabledKeys,
611
+ mcpServers: getMcpServerStatusList(config),
612
+ });
613
+ });
614
+ // Wire up events from the Claude SDK wrapper to frontend
615
+ directLLM.events.on('tool_use', (data) => {
616
+ console.log(`🔧 Claude: ${data.name}`);
617
+ sendToFrontend({ type: 'tool_use', tool: data.name, agentRole: 'direct' });
618
+ });
619
+ directLLM.events.on('tool_result', (data) => {
620
+ console.log(`✅ Done: ${data.name}`);
621
+ sendToFrontend({ type: 'tool_use', tool: data.name, status: 'completed', agentRole: 'direct' });
622
+ // Detect research artifact writes (session workspace or legacy research dir)
623
+ if ((data.name === 'Write' || data.name === 'Edit') && data.input?.file_path) {
624
+ const fp = data.input.file_path;
625
+ if (fp.includes('.osborn/sessions/') || fp.includes('.osborn/research/')) {
626
+ sendToFrontend({
627
+ type: 'research_artifact_updated',
628
+ filePath: fp,
629
+ fileName: fp.split('/').pop(),
630
+ });
631
+ }
632
+ }
633
+ });
634
+ // Wire up Claude text output - RAW text goes to frontend for chat bubbles
635
+ directLLM.events.on('assistant_text', (data) => {
636
+ console.log(`💬 Claude text: ${data.text?.substring(0, 60)}...`);
637
+ sendToFrontend({
638
+ type: 'claude_output',
639
+ text: data.text,
640
+ isStreaming: true,
641
+ agentRole: 'direct',
642
+ });
643
+ });
644
+ // Wire up Claude final result - RAW result goes to frontend
645
+ directLLM.events.on('assistant_result', (data) => {
646
+ console.log(`📋 Claude result: ${data.text?.substring(0, 60)}...`);
647
+ sendToFrontend({
648
+ type: 'claude_output',
649
+ text: data.text,
650
+ isStreaming: false,
651
+ isFinal: true,
652
+ agentRole: 'direct',
653
+ });
654
+ });
655
+ // Wire up permission requests - sends to frontend for user approval
656
+ directLLM.events.on('permission_request', (data) => {
657
+ console.log(`⚠️ Permission needed: ${data.toolName}`);
658
+ const toolName = data.toolName;
659
+ const input = data.input || {};
660
+ // Build descriptive message based on tool type
661
+ let description = `I need permission to use ${toolName}.`;
662
+ if (toolName === 'Bash' && input.command) {
663
+ const cmd = String(input.command).substring(0, 60);
664
+ description = `I want to run the command: ${cmd}${String(input.command).length > 60 ? '...' : ''}`;
665
+ }
666
+ else if (toolName === 'Write' && input.file_path) {
667
+ description = `I want to create or overwrite the file: ${input.file_path}`;
668
+ }
669
+ else if (toolName === 'Edit' && input.file_path) {
670
+ description = `I want to edit the file: ${input.file_path}`;
671
+ }
672
+ else if (toolName === 'WebFetch' && input.url) {
673
+ description = `I want to fetch content from: ${input.url}`;
674
+ }
675
+ sendToFrontend({
676
+ type: 'permission_request',
677
+ toolName: data.toolName,
678
+ input: data.input,
679
+ description,
680
+ agentRole: 'direct',
681
+ });
682
+ // Speak the descriptive request so user knows to respond
683
+ if (currentSession) {
684
+ const ttsMessage = `${description} Say yes, no, or always.`;
685
+ currentSession.say?.(ttsMessage).catch(() => { });
686
+ }
687
+ });
688
+ // Wire up session resume failure - notify frontend when SDK creates new session instead
689
+ directLLM.events.on('session_resume_failed', (data) => {
690
+ console.error(`❌ Session resume failed: ${data.requestedSessionId} → ${data.actualSessionId}`);
691
+ sendToFrontend({
692
+ type: 'session_resume_failed',
693
+ requestedSessionId: data.requestedSessionId,
694
+ actualSessionId: data.actualSessionId,
695
+ });
696
+ });
697
+ // Wire up file checkpoint capture - track restore points for file rewind
698
+ directLLM.events.on('checkpoint_captured', (data) => {
699
+ console.log(`📍 Checkpoint: ${data.checkpointId.substring(0, 8)}...`);
700
+ sendToFrontend({
701
+ type: 'checkpoint_captured',
702
+ checkpointId: data.checkpointId,
703
+ });
704
+ });
705
+ // Create the Agent with instructions, STT, LLM, TTS
706
+ const agent = new voice.Agent({
707
+ instructions: DIRECT_MODE_PROMPT,
708
+ stt,
709
+ llm: directLLM,
710
+ tts,
711
+ vad,
712
+ turnDetection: 'vad',
713
+ });
714
+ // Create the session (no longer passes STT/LLM/TTS here)
715
+ const session = new voice.AgentSession({
716
+ turnDetection: 'vad',
717
+ });
718
+ return { session, agent };
719
+ }
720
// ============================================================
// REALTIME MODE - OpenAI/Gemini native speech-to-speech
// ============================================================
// Claude handler for realtime mode tool execution.
// Module-level so the tools built inside createRealtimeSession (ask_agent,
// respond_permission, …) all talk to the same backend Claude instance.
let realtimeClaudeHandler = null;
// Create REALTIME session (OpenAI/Gemini native speech-to-speech)
726
+ async function createRealtimeSession(sessionRealtimeConfig, resumeSessionId) {
727
    // --- Backend Claude setup & event wiring for the realtime session ---
    // Builds the Claude LLM used for research/tool execution, ensures the
    // session workspace exists, and forwards every Claude event of interest
    // to the frontend via sendToFrontend.
    const rtConfig = sessionRealtimeConfig || realtimeConfig;
    console.log(`🎯 Creating realtime session (${rtConfig.provider})...`);
    // Create Claude LLM for tool execution (research tasks)
    realtimeClaudeHandler = createClaudeLLM({
        workingDirectory: workingDir,
        mcpServers,
        resumeSessionId,
    });
    currentLLM = realtimeClaudeHandler;
    // For resumed sessions, eagerly create workspace (we know the real ID)
    if (resumeSessionId) {
        const workspace = ensureSessionWorkspace(workingDir, resumeSessionId);
        console.log(`📁 Session workspace (resumed): ${workspace}`);
    }
    // For new sessions, create workspace when SDK assigns real session ID
    // (once(): the workspace only needs to be created a single time)
    realtimeClaudeHandler.events.once('session_id', ({ sessionId }) => {
        const workspace = ensureSessionWorkspace(workingDir, sessionId);
        console.log(`📁 Session workspace created: ${workspace}`);
    });
    // Wire up MCP server changes to frontend
    realtimeClaudeHandler.events.on('mcp_servers_changed', (data) => {
        console.log(`🔌 MCP servers changed: ${data.enabledKeys.join(', ') || 'none'}`);
        sendToFrontend({
            type: 'mcp_servers_changed',
            enabledKeys: data.enabledKeys,
            mcpServers: getMcpServerStatusList(config),
        });
    });
    // Wire up Claude events to frontend
    realtimeClaudeHandler.events.on('tool_use', (data) => {
        console.log(`🔧 Claude: ${data.name}`);
        sendToFrontend({ type: 'tool_use', tool: data.name, agentRole: 'realtime' });
    });
    realtimeClaudeHandler.events.on('tool_result', (data) => {
        console.log(`✅ Done: ${data.name}`);
        sendToFrontend({ type: 'tool_use', tool: data.name, status: 'completed', agentRole: 'realtime' });
        // Detect research artifact writes (session workspace or legacy research dir)
        if ((data.name === 'Write' || data.name === 'Edit') && data.input?.file_path) {
            const fp = data.input.file_path;
            if (fp.includes('.osborn/sessions/') || fp.includes('.osborn/research/')) {
                sendToFrontend({
                    type: 'research_artifact_updated',
                    filePath: fp,
                    fileName: fp.split('/').pop(),
                });
            }
        }
    });
    realtimeClaudeHandler.events.on('assistant_result', (data) => {
        console.log(`📋 Claude result: ${data.text?.substring(0, 60)}...`);
        sendToFrontend({
            type: 'claude_output',
            text: data.text,
            isStreaming: false,
            isFinal: true,
            agentRole: 'realtime',
        });
    });
    // Stream Claude's research text to frontend as progress updates
    // Skips during active research to avoid duplication with per-task onText handler
    realtimeClaudeHandler.events.on('assistant_text', (data) => {
        if (data.text && data.text.trim()) {
            if (activeResearch)
                return;
            sendToFrontend({
                type: 'claude_output',
                text: data.text,
                isStreaming: true,
                agentRole: 'realtime-agent',
            });
        }
    });
    realtimeClaudeHandler.events.on('permission_request', (data) => {
        console.log(`⚠️ Permission needed: ${data.toolName}`);
        const toolName = data.toolName;
        const input = data.input || {};
        // Build descriptive message based on tool type
        let description = `I need permission to use ${toolName}.`;
        if (toolName === 'Bash' && input.command) {
            const cmd = String(input.command).substring(0, 60);
            description = `I want to run the command: ${cmd}${String(input.command).length > 60 ? '...' : ''}`;
        }
        else if (toolName === 'Write' && input.file_path) {
            description = `I want to create or overwrite the file: ${input.file_path}`;
        }
        else if (toolName === 'Edit' && input.file_path) {
            description = `I want to edit the file: ${input.file_path}`;
        }
        else if (toolName === 'WebFetch' && input.url) {
            description = `I want to fetch content from: ${input.url}`;
        }
        sendToFrontend({
            type: 'permission_request',
            toolName: data.toolName,
            input: data.input,
            description,
            agentRole: 'realtime',
        });
    });
    // Wire up session resume failure for realtime mode
    realtimeClaudeHandler.events.on('session_resume_failed', (data) => {
        console.error(`❌ Session resume failed: ${data.requestedSessionId} → ${data.actualSessionId}`);
        sendToFrontend({
            type: 'session_resume_failed',
            requestedSessionId: data.requestedSessionId,
            actualSessionId: data.actualSessionId,
        });
    });
    // Wire up file checkpoint capture for realtime mode
    realtimeClaudeHandler.events.on('checkpoint_captured', (data) => {
        console.log(`📍 Checkpoint: ${data.checkpointId.substring(0, 8)}...`);
        sendToFrontend({
            type: 'checkpoint_captured',
            checkpointId: data.checkpointId,
        });
    });
843
+ // Extract priority content from research results — preserves URLs, code blocks, and key details
844
+ function extractPriorityContent(result, maxChars = 4000) {
845
+ if (result.length <= maxChars)
846
+ return result;
847
+ // Extract URLs (preserve for voice relay)
848
+ const urlRegex = /https?:\/\/[^\s\)\"\'>\]]+/g;
849
+ const urls = [...new Set(result.match(urlRegex) || [])];
850
+ // Extract code blocks (first 2, up to 400 chars each)
851
+ const codeBlockRegex = /```[\s\S]*?```/g;
852
+ const codeBlocks = [];
853
+ let match;
854
+ while ((match = codeBlockRegex.exec(result)) !== null && codeBlocks.length < 2) {
855
+ const block = match[0].length > 400 ? match[0].substring(0, 397) + '```' : match[0];
856
+ codeBlocks.push(block);
857
+ }
858
+ // Build sections
859
+ const sections = [];
860
+ // Take the first ~2500 chars of narrative (intro + main findings)
861
+ const narrativeEnd = Math.min(result.length, 2500);
862
+ const narrativeTruncated = result.substring(0, narrativeEnd);
863
+ const lastPeriod = narrativeTruncated.lastIndexOf('.');
864
+ const narrative = lastPeriod > narrativeEnd * 0.6
865
+ ? narrativeTruncated.substring(0, lastPeriod + 1)
866
+ : narrativeTruncated;
867
+ sections.push(narrative);
868
+ // Append conclusion (last ~500 chars) if result is long enough
869
+ if (result.length > 3000) {
870
+ const tail = result.substring(result.length - 500);
871
+ const firstPeriod = tail.indexOf('.');
872
+ const conclusion = firstPeriod > 0 ? tail.substring(firstPeriod + 1).trim() : tail.trim();
873
+ if (conclusion.length > 50) {
874
+ sections.push(`\n\n[CONCLUSION]\n${conclusion}`);
875
+ }
876
+ }
877
+ // Append code blocks if not already in the narrative
878
+ if (codeBlocks.length > 0) {
879
+ const codeSection = codeBlocks.filter(cb => !narrative.includes(cb));
880
+ if (codeSection.length > 0) {
881
+ sections.push(`\n\n[CODE EXAMPLES]\n${codeSection.join('\n\n')}`);
882
+ }
883
+ }
884
+ // Append URLs if not already in the narrative
885
+ const newUrls = urls.filter(u => !narrative.includes(u));
886
+ if (newUrls.length > 0) {
887
+ sections.push(`\n\n[LINKS]\n${newUrls.slice(0, 5).join('\n')}`);
888
+ }
889
+ let assembled = sections.join('');
890
+ // Final safety truncation if assembled exceeds maxChars
891
+ if (assembled.length > maxChars) {
892
+ const truncated = assembled.substring(0, maxChars);
893
+ const lp = truncated.lastIndexOf('.');
894
+ assembled = lp > maxChars * 0.7 ? truncated.substring(0, lp + 1) : truncated + '...';
895
+ }
896
+ return assembled;
897
+ }
898
    // Extracted research execution — called by ask_agent, SDK handles queuing internally.
    // Kicks off a background research run on the backend Claude handler and returns
    // immediately so the voice model stays responsive. Progress events are batched
    // into pendingUpdates for voice injection; the final result is augmented via the
    // fast brain and injected into the voice model when it completes.
    // NOTE: mutates shared module state (activeResearch, researchBatchTimer,
    // lastTaskRequest via callers) — statement order here is deliberate.
    function executeResearch(task) {
        sendToFrontend({ type: 'system', text: `Executing: ${task}` });
        // Fire-and-forget: write user question to spec.md BEFORE agent starts
        const questionSid = currentLLM?.sessionId || resumeSessionId;
        if (questionSid) {
            writeQuestionToSpec(workingDir, questionSid, task).catch(err => console.error('❌ writeQuestionToSpec failed:', err));
        }
        // Clean up previous research listeners to avoid duplicate event handlers
        if (activeResearch) {
            activeResearch.cleanup();
            if (researchBatchTimer) {
                clearTimeout(researchBatchTimer);
                researchBatchTimer = null;
            }
        }
        // Set up research log batching — events push to queue for state-driven injection
        const researchLog = [];
        const pendingUpdates = [];
        // Translate each tool invocation into a short human-readable log entry.
        const onToolUse = (data) => {
            const input = data.input || {};
            let entry;
            if (data.name === 'Read' && input.file_path) {
                const fileName = input.file_path.split('/').pop() || input.file_path;
                entry = `Reading ${fileName}`;
            }
            else if (data.name === 'Bash' && input.command) {
                const cmd = input.command.substring(0, 80);
                entry = `Running: ${cmd}`;
            }
            else if (data.name === 'Glob' && input.pattern) {
                entry = `Searching for files matching ${input.pattern}`;
            }
            else if (data.name === 'Grep' && input.pattern) {
                entry = `Searching for "${input.pattern}" in files`;
            }
            else if (data.name === 'WebSearch' && input.query) {
                entry = `Searching the web for "${input.query}"`;
            }
            else if (data.name === 'WebFetch' && input.url) {
                const hostname = input.url.replace(/https?:\/\//, '').split('/')[0];
                entry = `Fetching content from ${hostname}`;
            }
            else if (data.name === 'Write' && input.file_path) {
                const fileName = input.file_path.split('/').pop() || input.file_path;
                entry = `Writing ${fileName}`;
            }
            else if (data.name === 'Edit' && input.file_path) {
                const fileName = input.file_path.split('/').pop() || input.file_path;
                entry = `Editing ${fileName}`;
            }
            else if (data.name.startsWith('mcp__')) {
                // MCP tool names look like mcp__<server>__<action...>
                const parts = data.name.split('__');
                const serverName = parts[1] || 'external';
                const toolAction = parts.slice(2).join(' ') || 'tool';
                entry = `Using ${serverName}: ${toolAction}`;
            }
            else {
                entry = `Using ${data.name}`;
            }
            researchLog.push(entry);
            pendingUpdates.push(entry);
            scheduleResearchBatch();
        };
        const ANSWER_CHECK_THRESHOLD = 300; // chars — only check substantial outputs
        const onToolResult = (data) => {
            // Only log to researchLog for the final summary — don't push to pendingUpdates
            // This prevents redundant "Reading config.ts. Read done." voice updates
            researchLog.push(`${data.name} completed`);
            // Fire-and-forget: check if substantial tool results answer any spec questions
            // Note: PostToolUse emits { name, input, response } — use data.response (not data.result)
            const resultText = typeof data.response === 'string' ? data.response : JSON.stringify(data.response || '');
            if (resultText.length > ANSWER_CHECK_THRESHOLD) {
                const sid = currentLLM?.sessionId || resumeSessionId;
                if (sid)
                    checkOutputAgainstQuestions(workingDir, sid, resultText, 'tool_result').catch(() => { });
            }
            // When AskUserQuestion completes, the user's answer is a decision — track it in spec
            if (data.name === 'AskUserQuestion' && data.response) {
                const sid = currentLLM?.sessionId || resumeSessionId;
                if (sid) {
                    const questionText = JSON.stringify(data.input?.questions || data.input || {});
                    const answerText = typeof data.response === 'string' ? data.response : JSON.stringify(data.response);
                    const specUpdate = `User answered a clarifying question during research.\nQuestion: ${questionText}\nAnswer: ${answerText}\nRecord this as a user decision in spec.md.`;
                    askHaiku(workingDir, sid, specUpdate).catch(err => console.error('❌ Failed to record AskUserQuestion answer in spec:', err));
                    console.log(`📝 AskUserQuestion answer forwarded to fast brain for spec tracking`);
                }
            }
        };
        // Log the first sentence of each streamed reasoning chunk as progress.
        const onText = (data) => {
            if (data.text?.trim()) {
                const text = data.text.trim();
                const preview = text.substring(0, 150);
                const firstSentence = preview.match(/^[^.!?\n]+[.!?]/)?.[0] || preview;
                researchLog.push(firstSentence);
                pendingUpdates.push(firstSentence);
                scheduleResearchBatch();
                // Fire-and-forget: check if substantial agent reasoning answers any spec questions
                if (text.length > ANSWER_CHECK_THRESHOLD) {
                    const sid = currentLLM?.sessionId || resumeSessionId;
                    if (sid)
                        checkOutputAgainstQuestions(workingDir, sid, text, 'assistant_text').catch(() => { });
                }
            }
        };
        realtimeClaudeHandler.events.on('tool_use', onToolUse);
        realtimeClaudeHandler.events.on('tool_result', onToolResult);
        realtimeClaudeHandler.events.on('assistant_text', onText);
        const cleanupListeners = () => {
            realtimeClaudeHandler?.events.off('tool_use', onToolUse);
            realtimeClaudeHandler?.events.off('tool_result', onToolResult);
            realtimeClaudeHandler?.events.off('assistant_text', onText);
        };
        // Track active research — updates drain when model enters 'listening' state
        activeResearch = {
            researchLog,
            pendingUpdates,
            cleanup: cleanupListeners,
            voiceUpdateCount: 0,
        };
        // Start proactive conversational loop
        const proactiveSid = currentLLM?.sessionId || resumeSessionId;
        if (proactiveSid) {
            startProactiveLoop(task, proactiveSid);
        }
        // Run research in the background (non-blocking)
        const researchPromise = (async () => {
            const stream = realtimeClaudeHandler.chat({
                chatCtx: {
                    items: [{ type: 'message', role: 'user', content: [task] }],
                },
            });
            let result = '';
            for await (const chunk of stream) {
                if (chunk.delta?.content) {
                    result += chunk.delta.content;
                }
            }
            return result;
        })();
        // Handle completion asynchronously
        researchPromise.then(async (result) => {
            console.log(`✅ [realtime] Research complete (${result.length} chars)`);
            // Clean up
            cleanupListeners();
            // Send raw result to frontend as a log entry (not assistant_response — that's reserved
            // for the voice model's spoken response, avoiding duplication in chat)
            await sendToFrontend({ type: 'claude_output', text: result, isStreaming: false, agentRole: 'research-result' });
            const resultPreview = result.length > 150
                ? result.substring(0, 150) + '...'
                : result;
            await sendToFrontend({ type: 'task_completed', task, resultPreview });
            // Build enhanced return with research log
            const logSummary = researchLog.length > 0
                ? `\n\n[RESEARCH LOG]\n${researchLog.slice(0, 25).join('\n')}`
                : '';
            // Extract priority content — preserves URLs, code blocks, and key details (4000 char limit)
            const resultForVoice = extractPriorityContent(result);
            const fullResult = (resultForVoice + logSummary) || 'Research completed successfully.';
            // Clear active research and timers before injecting final results
            if (researchBatchTimer) {
                clearTimeout(researchBatchTimer);
                researchBatchTimer = null;
            }
            stopProactiveLoop();
            activeResearch = null;
            // Send final results to frontend for visibility
            await sendToFrontend({
                type: 'claude_output',
                text: `[Research Complete] Injecting findings into voice model (${fullResult.length} chars)`,
                isStreaming: false,
                agentRole: 'research-progress',
            });
            // Route through fast brain for context augmentation before voice injection
            // Fast brain adds spec context but does NOT summarize — passes details through verbatim
            const voiceSid = currentLLM?.sessionId || resumeSessionId;
            console.log(`📡 [realtime] Augmenting results via fast brain (${fullResult.length} chars, agentState: ${agentState})`);
            if (voiceSid) {
                augmentResearchResult(workingDir, voiceSid, task, fullResult)
                    .then(augmented => {
                    queueVoiceInjection(getResearchCompleteInjection(task, augmented));
                })
                    .catch(() => {
                    // Fallback: use result directly if fast brain fails
                    queueVoiceInjection(getResearchCompleteInjection(task, fullResult));
                });
            }
            else {
                queueVoiceInjection(getResearchCompleteInjection(task, fullResult));
            }
            // Inject FULL untruncated result into ChatCtx so voice model can answer
            // follow-up questions ("tell me more", "what were those links?") from memory
            injectIntoChatCtx(`[FULL RESEARCH DETAILS for "${task}"]\n${result}`);
            // Fire-and-forget JSONL-based refinement pass via fast brain
            // Reads FULL untruncated data from JSONL — no content buffer, no truncation
            const postResearchSessionId = currentLLM?.sessionId || resumeSessionId;
            if (postResearchSessionId) {
                updateSpecFromJSONL(workingDir, postResearchSessionId, task, researchLog)
                    .then(updateResult => {
                    if (!updateResult)
                        return;
                    // Notify frontend about spec.md update
                    if (updateResult.spec) {
                        const specPath = `${workingDir}/.osborn/sessions/${postResearchSessionId}/spec.md`;
                        sendToFrontend({
                            type: 'research_artifact_updated',
                            filePath: specPath,
                            fileName: 'spec.md',
                        });
                        const truncated = getSpecForVoiceModel(workingDir, postResearchSessionId);
                        if (truncated) {
                            injectIntoChatCtx(`[UPDATED SESSION SPEC]\n${truncated}`);
                            console.log(`📋 Re-injected spec.md into ChatCtx after fast brain update (${truncated.length} chars)`);
                        }
                    }
                    // Notify frontend about each library file written by the fast brain
                    for (const libFile of updateResult.libraryFiles) {
                        const libPath = `${workingDir}/.osborn/sessions/${postResearchSessionId}/library/${libFile}`;
                        sendToFrontend({
                            type: 'research_artifact_updated',
                            filePath: libPath,
                            fileName: libFile,
                        });
                    }
                });
            }
        }).catch(async (err) => {
            console.error(`❌ [realtime] Research failed:`, err);
            // Clean up
            cleanupListeners();
            if (researchBatchTimer) {
                clearTimeout(researchBatchTimer);
                researchBatchTimer = null;
            }
            stopProactiveLoop();
            activeResearch = null;
            // Queue error notification — will be spoken when model is available
            queueVoiceInjection(`[NOTIFICATION] The research task encountered an error: ${err.message}. Let the user know briefly and ask if they want to try again. Do NOT call any tools.`);
        });
        // Return immediately to unblock the voice model
        return 'Research started. I\'ll relay findings as they come in — you can keep talking to the user while I work.';
    }
1140
    // Create tools for the realtime voice LLM.
    // Each tool is exposed to the speech-to-speech model; descriptions are the
    // model-facing contract and are part of runtime behavior — do not edit casually.
    const askAgentTool = llm.tool({
        description: `Delegate a task to your backend agent (Claude), which has full research, analysis, reasoning, and coding capabilities.

Use for:
- Researching topics, technologies, concepts, or ideas in depth
- Fetching and analyzing web pages, articles, blog posts, YouTube transcripts
- Reading and summarizing documentation, papers, or reference materials
- Exploring and analyzing codebases, configs, architecture
- Comparing options, tools, approaches — with tradeoffs and recommendations
- Running bash commands, testing implementations
- Using MCP tools (GitHub, YouTube, and other external tools)
- Saving findings to the session library and updating the spec
- Any question requiring research, analysis, verification, or deeper reasoning

Reformulate the user's spoken request into a clear, specific task.
The more context you include (topic, constraints, what they want to learn), the better the results.
If the user wants specific details (examples, URLs, comparisons, step-by-step breakdown), mention that in your request.`,
        parameters: z.object({
            request: z.string().describe('The task or question to delegate to the agent'),
        }),
        execute: async ({ request: task }) => {
            console.log(`\n🔨 [realtime] Task: "${task}"`);
            // Guard: if ask_haiku is currently handling a similar question, skip ask_agent
            // This prevents the double-calling pattern where Gemini fires both in rapid succession
            if (haikuInFlight && (Date.now() - haikuInFlight.time) < 8000) {
                console.log(`⏭️ Skipping ask_agent — ask_haiku is already handling: "${haikuInFlight.question.substring(0, 60)}"`);
                return 'The fast brain is already looking into this. Wait for its answer first.';
            }
            // Deduplication guard: prevent re-execution of same task within 10s
            const now = Date.now();
            if (task === lastTaskRequest && (now - lastTaskTime) < 10000) {
                console.log('⏭️ Skipping duplicate task (within 10s window)');
                return 'This task was just completed. The results were already relayed.';
            }
            lastTaskRequest = task;
            lastTaskTime = now;
            return executeResearch(task);
        },
    });
    // Relays the user's spoken yes/no/always to the pending backend permission prompt.
    const respondPermissionTool = llm.tool({
        description: `Respond to a permission request. Call after hearing user's response.`,
        parameters: z.object({
            response: z.enum(['allow', 'deny', 'always_allow']),
        }),
        execute: async ({ response }) => {
            if (!realtimeClaudeHandler?.hasPendingPermission()) {
                return 'No pending permission.';
            }
            const pending = realtimeClaudeHandler.getPendingPermission();
            const allow = response === 'allow' || response === 'always_allow';
            realtimeClaudeHandler.respondToPermission(allow);
            await sendToFrontend({ type: 'permission_response', response, toolName: pending?.toolName });
            return `Permission ${response} for ${pending?.toolName || 'tool'}.`;
        },
    });
    // Returns spec.md (truncated to 4000 chars) plus the list of library files.
    const readSpecTool = llm.tool({
        description: `Read the session spec (spec.md) — shared state between you and your backend agent.
Use when: checking decisions, reading open questions to ask the user, understanding architecture/context, seeing what research has been saved. Updated by your backend agent during research.`,
        parameters: z.object({}),
        execute: async () => {
            const sessionId = currentLLM?.sessionId || resumeSessionId;
            if (!sessionId)
                return 'No session spec yet — session is still initializing.';
            const specContent = readSessionSpec(workingDir, sessionId);
            if (!specContent)
                return 'Spec is empty — no research done yet.';
            const libraryFiles = listLibraryFiles(workingDir, sessionId);
            const libSection = libraryFiles.length > 0
                ? `\n\n[LIBRARY FILES: ${libraryFiles.join(', ')}]`
                : '';
            const MAX = 4000;
            const content = specContent.length > MAX
                ? specContent.substring(0, MAX) + '\n\n[... truncated]'
                : specContent;
            return content + libSection;
        },
    });
    const askHaikuTool = llm.tool({
        description: `Ask your fast brain — a quick knowledge assistant with access to session files and web search (~2 seconds).

Use for:
- Questions answerable from the session spec or research library (much faster than ask_agent)
- Quick web lookups for simple factual questions (definitions, current versions, basic how-to)
- Recording user decisions: "User decided: [decision]. Update the spec."
- Recording user preferences: "User prefers: [preference]. Update the spec."
- Checking what research has been done on a topic
- Reading specific library files for details

Do NOT use for: deep research, code analysis, multi-file codebase exploration, complex investigations → use ask_agent.
If the fast brain responds with NEEDS_DEEPER_RESEARCH, tell the user you need to look deeper, then call ask_agent with the context it provides.`,
        parameters: z.object({
            question: z.string().describe('The question to ask or instruction to execute'),
        }),
        execute: async ({ question }) => {
            const sessionId = currentLLM?.sessionId || resumeSessionId;
            if (!sessionId)
                return 'Session not ready yet. Try ask_agent instead.';
            console.log(`🧠 [fast brain] Question: "${question.substring(0, 80)}..."`);
            // Track in-flight state to prevent ask_agent double-calling
            haikuInFlight = { question, time: Date.now() };
            // Build live research context if the agent is actively researching
            // This is a READ of the existing researchLog array — safe, no race conditions
            let researchContext;
            if (activeResearch && activeResearch.researchLog.length > 0) {
                const recentLog = activeResearch.researchLog.slice(-15);
                researchContext = `Research topic: "${lastTaskRequest || 'unknown'}"\nSteps completed (${activeResearch.researchLog.length} total, showing last ${recentLog.length}):\n${recentLog.join('\n')}`;
            }
            try {
                const chatHistory = getChatHistory(20);
                const answer = await askHaiku(workingDir, sessionId, question, researchContext, chatHistory);
                haikuInFlight = null; // Clear in-flight state
                console.log(`🧠 [fast brain] Answer (${answer.length} chars)`);
                // Notify frontend if the fast brain likely wrote to spec.md
                // (fast brain writes bypass the SDK tool system, so no tool_result event fires)
                if (answer.includes('Written: spec.md') || question.toLowerCase().includes('update the spec') || question.toLowerCase().includes('user decided') || question.toLowerCase().includes('user prefers')) {
                    const specPath = `${workingDir}/.osborn/sessions/${sessionId}/spec.md`;
                    sendToFrontend({
                        type: 'research_artifact_updated',
                        filePath: specPath,
                        fileName: 'spec.md',
                    });
                }
                // If research is active and this was a user decision/direction,
                // also queue it for the agent SDK so it picks up the context
                // when its queue reaches the next query
                if (activeResearch && (question.toLowerCase().includes('user decided') ||
                    question.toLowerCase().includes('user prefers') ||
                    question.toLowerCase().includes('update the spec') ||
                    question.toLowerCase().includes('also check') ||
                    question.toLowerCase().includes('focus on') ||
                    question.toLowerCase().includes('redirect'))) {
                    console.log(`📨 [fast brain] Passing user direction to agent SDK queue: "${question.substring(0, 60)}..."`);
                    // Queue as a lightweight context update — agent reads spec.md
                    // at the start of its next query and will see the updated direction
                    executeResearch(`[USER DIRECTION during active research] ${question}. The user's spec.md has been updated with this. Acknowledge briefly and incorporate into your current research context.`);
                }
                return answer;
            }
            catch (err) {
                haikuInFlight = null; // Clear in-flight state on error
                console.error('❌ Fast brain failed:', err);
                return 'Fast brain lookup failed. Try ask_agent for a deeper search.';
            }
        },
    });
    const generateDocumentTool = llm.tool({
        description: `Generate a visual document (comparison table, Mermaid diagram, structured analysis, summary) from research findings. Saved to the session library as a markdown file.

Use when the user asks for:
- "Compare X and Y" → type: 'comparison' (markdown table with features, pros, cons)
- "Draw a diagram" / "Show the architecture" / "Map out the flow" → type: 'diagram' (Mermaid flowchart/sequence/architecture)
- "Analyze the tradeoffs" / "Break down the options" → type: 'analysis' (structured pros/cons, decision matrix)
- "Summarize what we found" / "Give me an overview document" → type: 'summary' (organized findings with key takeaways)

For actual images (photos, illustrations, screenshots), use ask_agent instead — this tool generates text-based visual documents only.`,
        parameters: z.object({
            request: z.string().describe('What to generate — be specific about the topic and what aspects to cover'),
            type: z.enum(['comparison', 'diagram', 'analysis', 'summary']).describe('Document type'),
        }),
        execute: async ({ request, type }) => {
            const sid = currentLLM?.sessionId || resumeSessionId;
            if (!sid)
                return 'Session not ready yet.';
            console.log(`📊 [generate_document] Type: ${type}, Request: "${request.substring(0, 60)}..."`);
            try {
                const result = await generateVisualDocument(workingDir, sid, request, type);
                if (!result)
                    return 'Could not generate document — not enough research context available.';
                const fullPath = `${workingDir}/.osborn/sessions/${sid}/library/${result.fileName}`;
                sendToFrontend({
                    type: 'research_artifact_updated',
                    filePath: fullPath,
                    fileName: result.fileName,
                });
                return `Generated: ${result.fileName} (${result.content.length} chars) — saved to session library. The document contains a ${type} with the requested information.`;
            }
            catch (err) {
                console.error('❌ Document generation failed:', err);
                return 'Document generation failed. Try asking the research agent for a more detailed analysis.';
            }
        },
    });
1323
    // --- Final assembly: build the realtime model, the Agent, and the session ---
    // Instructions for realtime voice LLM
    const realtimeInstructions = getRealtimeInstructions(workingDir);
    // Create realtime model
    const realtimeModel = createRealtimeModelFromConfig(rtConfig, realtimeInstructions);
    // Create the Agent with realtime model and tools
    const agent = new voice.Agent({
        instructions: realtimeInstructions,
        llm: realtimeModel,
        tools: {
            ask_agent: askAgentTool,
            ask_haiku: askHaikuTool,
            read_spec: readSpecTool,
            generate_document: generateDocumentTool,
            respond_permission: respondPermissionTool,
        },
    });
    // Create the session (realtime mode needs no STT/TTS/VAD — the model is speech-to-speech)
    const session = new voice.AgentSession({});
    return { session, agent };
1342
+ }
1343
+ // ============================================================
1344
+ // Room Event Handlers
1345
+ // ============================================================
1346
// Once the room connection is up, capture the local participant handle so
// later data-channel sends (sendToFrontend) have a sender to publish from.
room.on(RoomEvent.Connected, () => {
    localParticipant = room.localParticipant;
    console.log('✅ Connected to room:', roomName);
});
1350
// Full teardown when the room connection drops: flush queued voice work,
// cancel the research batch timer and proactive loop, abort any in-flight
// research, and drop every per-session handle so a fresh join starts clean.
room.on(RoomEvent.Disconnected, () => {
    console.log('👋 Disconnected from room');
    // Empty the voice-injection queue in place and reset its processing guard.
    voiceQueue.splice(0, voiceQueue.length);
    isProcessingQueue = false;
    if (researchBatchTimer) {
        clearTimeout(researchBatchTimer);
        researchBatchTimer = null;
    }
    stopProactiveLoop();
    if (activeResearch) {
        activeResearch.cleanup();
        activeResearch = null;
    }
    // Release all per-session references together.
    currentSession = null;
    currentAgent = null;
    currentLLM = null;
    clearFastBrainHistory();
});
1369
// Main per-user entry point: when a participant joins, tear down any previous
// session, read the user's choices from participant metadata, create a voice
// session (realtime speech-to-speech or direct Claude SDK), wire all session
// events (with crash auto-recovery), start the session, and greet the user.
room.on(RoomEvent.ParticipantConnected, async (participant) => {
    console.log(`\n👤 User joined: ${participant.identity}`);
    // Clean up any existing session before creating a new one
    voiceQueue.length = 0;
    isProcessingQueue = false;
    if (researchBatchTimer) {
        clearTimeout(researchBatchTimer);
        researchBatchTimer = null;
    }
    stopProactiveLoop();
    clearFastBrainHistory();
    if (activeResearch) {
        activeResearch.cleanup();
        activeResearch = null;
    }
    if (currentSession) {
        console.log('🧹 Cleaning up previous session...');
        // Best-effort close/unwire: failures here must not block the new user.
        try {
            await currentSession.close();
        }
        catch { }
        try {
            currentSession.removeAllListeners();
        }
        catch { }
        currentSession = null;
        currentAgent = null;
        currentLLM = null;
    }
    // Extract voice architecture, provider, and sessionId from participant metadata (sent by frontend)
    // This overrides the config file setting for per-session flexibility
    let sessionVoiceMode = voiceMode; // Default to config
    let sessionRealtimeProvider = realtimeConfig.provider; // Default to config
    let preSelectedSessionId = null;
    try {
        // Metadata is an optional JSON string; absence falls back to config values.
        const metadata = JSON.parse(participant.metadata || '{}');
        console.log(`📋 Participant metadata:`, metadata);
        if (metadata.voiceArch === 'realtime' || metadata.voiceArch === 'direct') {
            sessionVoiceMode = metadata.voiceArch;
            console.log(`🎙️ Using voice mode from frontend: ${sessionVoiceMode}`);
        }
        else if (metadata.voiceArch) {
            console.log(`⚠️ Unknown voiceArch "${metadata.voiceArch}", using config: ${voiceMode}`);
        }
        // Read provider selection from frontend (openai or gemini)
        if (metadata.provider === 'openai' || metadata.provider === 'gemini') {
            sessionRealtimeProvider = metadata.provider;
            console.log(`🎙️ Using provider from frontend: ${sessionRealtimeProvider}`);
        }
        // Read pre-selected session ID from frontend (session browser selection)
        if (metadata.sessionId && typeof metadata.sessionId === 'string' && metadata.sessionId.length > 0) {
            preSelectedSessionId = metadata.sessionId;
            console.log(`📂 Pre-selected session from frontend: ${preSelectedSessionId}`);
        }
    }
    catch (err) {
        console.log('⚠️ Could not parse participant metadata, using config voiceMode:', voiceMode);
    }
    // Sync to outer scope so DataReceived handler can use it
    currentVoiceMode = sessionVoiceMode;
    currentProvider = sessionRealtimeProvider;
    // Resume session ID — only set when resuming an existing session
    const resumeSessionId = preSelectedSessionId || undefined;
    if (resumeSessionId) {
        console.log(`🆔 Resuming session: ${resumeSessionId}`);
    }
    else {
        console.log(`🆔 New session (ID assigned by SDK)`);
    }
    // Create session based on voice mode (from frontend or config)
    let session;
    let agent;
    if (sessionVoiceMode === 'realtime') {
        // Override the config provider with the frontend's selection
        const sessionRealtimeConfig = { ...realtimeConfig, provider: sessionRealtimeProvider };
        console.log(`🎙️ REALTIME MODE: ${sessionRealtimeConfig.provider} native speech-to-speech`);
        const result = await createRealtimeSession(sessionRealtimeConfig, resumeSessionId);
        session = result.session;
        agent = result.agent;
    }
    else {
        console.log(`🎯 DIRECT MODE: Claude Agent SDK with full coding capabilities`);
        const result = await createDirectSession(resumeSessionId);
        session = result.session;
        agent = result.agent;
    }
    currentSession = session;
    currentAgent = agent; // Store for updateChatCtx() context injection
    // ============================================================
    // Session event wiring — extracted into function for auto-recovery
    // ============================================================
    let lastRecoveryTime = 0;
    const MIN_RECOVERY_INTERVAL = 10000; // 10 seconds between recovery attempts
    // Attaches all listeners to a session. Called once for the initial session
    // and again (recursively, from the 'close' handler) for any session that
    // replaces a crashed one.
    // NOTE(review): the `agt` parameter is never read inside this function —
    // it appears to be kept only for call-site symmetry; confirm before removing.
    function wireSessionEvents(sess, agt) {
        // Transcript dedup state (reset per wiring)
        let lastSentUserTranscript = '';
        let lastSentAgentTranscript = '';
        // Forward a user transcript to the frontend, dropping short fragments,
        // exact repeats of the previous send, and common STT noise tokens.
        function sendUserTranscript(transcript, source) {
            if (!transcript || transcript.length < 3)
                return;
            const normalized = transcript.trim().replace(/\s+/g, ' ');
            if (normalized === lastSentUserTranscript)
                return;
            if (normalized === '<noise>' || normalized.toLowerCase() === 'thank you')
                return;
            console.log(`📝 User (${source}): "${transcript.substring(0, 60)}..."`);
            sendToFrontend({ type: 'user_transcript', text: transcript });
            lastSentUserTranscript = normalized;
        }
        // Forward an assistant transcript to the frontend with the same
        // length/dedup filtering as user transcripts.
        function sendAgentTranscript(text, source) {
            if (!text || text.length < 3)
                return;
            const normalized = text.trim().replace(/\s+/g, ' ');
            if (normalized === lastSentAgentTranscript)
                return;
            console.log(`💬 Agent (${source}): "${text.substring(0, 60)}..."`);
            sendToFrontend({ type: 'assistant_response', text });
            lastSentAgentTranscript = normalized;
        }
        // PRIMARY: conversation_item_added is the authoritative source
        sess.on('conversation_item_added', (ev) => {
            // Item content may be an array of strings, an array of {text} parts,
            // a plain string, or carried in item.text — normalize all shapes.
            let text = '';
            if (Array.isArray(ev.item?.content)) {
                text = typeof ev.item.content[0] === 'string'
                    ? ev.item.content.join('\n')
                    : ev.item.content.map((c) => c.text).filter(Boolean).join('\n');
            }
            else if (typeof ev.item?.content === 'string') {
                text = ev.item.content;
            }
            else if (ev.item?.text) {
                text = ev.item.text;
            }
            if (ev.item?.role === 'user' && text) {
                sendUserTranscript(text, 'conv_item');
            }
            else if (ev.item?.role === 'assistant' && text) {
                sendAgentTranscript(text, 'conv_item');
            }
        });
        // FALLBACK: user_speech_committed
        sess.on('user_speech_committed', (ev) => {
            const transcript = ev.transcript || ev.text || '';
            sendUserTranscript(transcript, 'committed');
        });
        // Agent state tracking
        sess.on('agent_state_changed', (ev) => {
            agentState = ev.newState;
            // Clear processing guard when model transitions to any new state
            isProcessingQueue = false;
            console.log(`🤖 State: ${ev.newState}`);
            sendToFrontend({ type: 'agent_state', state: ev.newState });
            // When the model becomes available (listening), process any queued voice injections
            if (ev.newState === 'listening' && voiceQueue.length > 0) {
                setTimeout(() => processVoiceQueue(), 500); // 500ms to let model settle
            }
        });
        // User state tracking — prevents queue from colliding with server-side VAD
        sess.on('user_state_changed', (ev) => {
            userState = ev.newState;
            console.log(`👤 User state: ${ev.newState}`);
        });
        // FALLBACK: playout_completed
        sess.on('playout_completed', (ev) => {
            const message = ev.message || ev.text || ev.content;
            if (message && message.length > 0) {
                sendAgentTranscript(message, 'playout');
            }
        });
        // Error handler
        sess.on('error', (ev) => {
            const msg = ev.error?.message || String(ev.error);
            // OpenAI race: voice queue collided with server-side VAD auto-response
            if (msg.includes('conversation_already_has_active_response') || msg.includes('active_response')) {
                console.log('⚠️ OpenAI active response collision — queue will retry on next listening state');
                return;
            }
            console.error('❌ Session error:', ev.error);
        });
        // Close handler with auto-recovery for Gemini 1008 crashes
        sess.on('close', async (ev) => {
            console.log('🚪 Session closed:', ev.reason);
            // Auto-recover from crashes in realtime mode
            if (ev.reason === 'error' && currentVoiceMode === 'realtime') {
                // Rate-limit recovery so a persistent failure cannot loop forever.
                const now = Date.now();
                if (now - lastRecoveryTime < MIN_RECOVERY_INTERVAL) {
                    console.log('⚠️ Recovery too frequent — skipping to prevent loop');
                    sendToFrontend({ type: 'agent_state', state: 'error' });
                    return;
                }
                lastRecoveryTime = now;
                console.log('🔄 Auto-recovering from session crash...');
                // Clean up dead session
                try {
                    sess.removeAllListeners();
                }
                catch { }
                currentSession = null;
                currentAgent = null;
                // Clear voice queue — stale injections from the crashed session
                voiceQueue.length = 0;
                isProcessingQueue = false;
                if (researchBatchTimer) {
                    clearTimeout(researchBatchTimer);
                    researchBatchTimer = null;
                }
                stopProactiveLoop();
                if (activeResearch) {
                    activeResearch.cleanup();
                    activeResearch = null;
                }
                try {
                    const recoveryConfig = { ...realtimeConfig, provider: currentProvider };
                    // Reuse existing session ID for workspace continuity during recovery
                    // Prefer real SDK session ID, fall back to original resume ID
                    const recoverySessionId = currentLLM?.sessionId || resumeSessionId;
                    const result = await createRealtimeSession(recoveryConfig, recoverySessionId);
                    const newSession = result.session;
                    const newAgent = result.agent;
                    currentSession = newSession;
                    currentAgent = newAgent;
                    // Re-wire event listeners on the new session
                    wireSessionEvents(newSession, newAgent);
                    await newSession.start({ agent: newAgent, room });
                    // Sync state
                    agentState = 'listening';
                    sendToFrontend({ type: 'agent_state', state: 'listening' });
                    // Resume Claude session if one was active
                    if (currentLLM?.sessionId) {
                        currentLLM.setContinueSession(true);
                    }
                    // Inject conversation context into the recovered session
                    const recoveredSessionId = currentLLM?.sessionId || recoverySessionId;
                    if (recoveredSessionId) {
                        try {
                            const summary = await getSessionSummary(recoveredSessionId, workingDir);
                            const conversationHistory = await getConversationHistory(recoveredSessionId, workingDir, 30);
                            if (summary && conversationHistory.length > 0) {
                                const contextBriefing = buildContextBriefing(summary, conversationHistory, currentProvider);
                                queueVoiceInjection(`[SESSION RECOVERED] The voice session crashed and was auto-recovered. Here's the conversation context from before the crash:\n${contextBriefing}\n\nBriefly tell the user the connection was interrupted and you still have context from the conversation. Ask if they can hear you and what they'd like to continue with. Do NOT call any tools.`);
                                console.log('📋 Injected conversation context into recovered session');
                            }
                            else {
                                queueVoiceInjection('[NOTIFICATION] The voice session was briefly interrupted but has been recovered. Ask the user if they can hear you and continue where you left off. Do NOT call any tools.');
                            }
                        }
                        catch (err) {
                            console.log('⚠️ Failed to load conversation context for recovery:', err);
                            queueVoiceInjection('[NOTIFICATION] The voice session was briefly interrupted but has been recovered. Ask the user if they can hear you and continue where you left off. Do NOT call any tools.');
                        }
                    }
                    else {
                        // No session ID — generic notification
                        queueVoiceInjection('[NOTIFICATION] The voice session was briefly interrupted but has been recovered. Ask the user if they can hear you and continue where you left off. Do NOT call any tools.');
                    }
                    console.log('✅ Auto-recovery complete');
                }
                catch (err) {
                    console.error('❌ Auto-recovery failed:', err);
                    sendToFrontend({ type: 'agent_state', state: 'error' });
                }
            }
        });
    }
    // Wire events on the initial session
    wireSessionEvents(session, agent);
    // Start voice session
    console.log('🎬 Starting voice session...');
    try {
        await session.start({ agent, room });
        console.log('✅ Voice session started!');
        console.log('🎤 Ready - speak to begin!\n');
        // Workspace is created later in the session_id event handler (when SDK assigns real ID)
        // Send ready signal with persistent retry
        console.log('💓 Sending agent_ready signal...');
        let readySent = false;
        // NOTE(review): this reports realtimeConfig.provider rather than the
        // frontend-selected sessionRealtimeProvider — confirm this is intentional.
        const provider = sessionVoiceMode === 'realtime' ? realtimeConfig.provider : 'claude';
        // Fetch full session list for startup session browser
        const allSessions = await listSessions(workingDir);
        const recentSessionId = allSessions.length > 0 ? allSessions[0].sessionId : null;
        const hasRecentSession = allSessions.length > 0;
        // Prepare sessions for frontend (up to 50)
        const sessionsForFrontend = allSessions.slice(0, 50).map(s => ({
            sessionId: s.sessionId,
            timestamp: s.timestamp.toISOString(),
            lastMessage: s.lastMessage,
            messageCount: s.messageCount,
        }));
        const sendReady = async () => {
            if (readySent)
                return;
            await sendToFrontend({
                type: 'agent_ready',
                provider,
                voiceMode: sessionVoiceMode,
                hasRecentSession,
                recentSessionId,
                sessions: sessionsForFrontend,
                preSelectedSessionId,
                mcpServers: getMcpServerStatusList(config),
                enabledMcpServers: enabledMcpNames,
            });
        };
        // Retry agent_ready every 2s for up to 20s, or until the user speaks.
        const readyInterval = setInterval(sendReady, 2000);
        await sendReady();
        setTimeout(() => {
            clearInterval(readyInterval);
            console.log('✅ agent_ready retries complete');
        }, 20000);
        // Stop agent_ready retries on user speech
        session.on('input_speech_started', () => {
            readySent = true;
            clearInterval(readyInterval);
        });
        // Greet user via TTS (delayed if resume prompt will be shown)
        // For realtime mode: use generateReply() since there's no standalone TTS
        // For direct mode: use say() which goes through the configured TTS
        const greetViaVoice = async (text) => {
            if (sessionVoiceMode === 'realtime') {
                // Realtime models handle their own speech generation
                await session.generateReply({ userInput: text });
            }
            else {
                await session.say(text);
            }
        };
        if (preSelectedSessionId && sessionExists(preSelectedSessionId, workingDir)) {
            // User pre-selected a session from the session browser — auto-resume immediately
            console.log(`📂 Auto-resuming pre-selected session: ${preSelectedSessionId}`);
            if (currentLLM) {
                currentLLM.setResumeSessionId(preSelectedSessionId);
                console.log(`🔄 Session resume configured: ${preSelectedSessionId}`);
                // Fetch context and greet with it
                const summary = await getSessionSummary(preSelectedSessionId, workingDir);
                const conversationHistory = await getConversationHistory(preSelectedSessionId, workingDir, 30);
                await sendToFrontend({
                    type: 'session_resume_set',
                    sessionId: preSelectedSessionId,
                    success: true,
                });
                // Send existing workspace artifacts to frontend (session-scoped)
                const preArtifacts = listWorkspaceArtifacts(workingDir, preSelectedSessionId);
                if (preArtifacts.length > 0) {
                    console.log(`📁 Sending ${preArtifacts.length} workspace artifacts to frontend`);
                    await sendToFrontend({
                        type: 'session_artifacts',
                        sessionId: preSelectedSessionId,
                        artifacts: preArtifacts.map(a => ({
                            filePath: a.filePath,
                            fileName: a.fileName,
                            type: a.type,
                            updatedAt: a.updatedAt,
                        }))
                    });
                }
                // Load full session history into realtime model's context
                if (summary) {
                    loadSessionHistoryIntoChatCtx(currentAgent, conversationHistory, currentProvider);
                    const contextBriefing = buildContextBriefing(summary, conversationHistory, currentProvider);
                    const specContent = getSpecForVoiceModel(workingDir, preSelectedSessionId);
                    const specSection = specContent
                        ? `\n\n=== SESSION SPEC ===\n${specContent}\n=== END SPEC ===\nCheck "Open Questions" — if any are unanswered, ask the user about them.`
                        : '';
                    try {
                        if (sessionVoiceMode === 'realtime') {
                            const contextPrompt = `[SESSION RESUMED] The user chose to continue a previous research session. Here's the context:\n${contextBriefing}${specSection}\n\nBriefly acknowledge the previous session. If there are open questions in the spec, ask the most important one. Otherwise ask what they'd like to continue with.`;
                            await session.generateReply({ instructions: contextPrompt });
                        }
                        else {
                            await session.say("Welcome back! Ready to continue our previous conversation.");
                        }
                    }
                    catch (err) {
                        console.log('⚠️ Pre-selected session greeting failed:', err);
                    }
                }
            }
        }
        else if (!preSelectedSessionId && hasRecentSession) {
            // No pre-selected session but sessions exist — defer greeting for session gate
            console.log('⏳ Deferring greeting until session gate is completed');
        }
        else {
            // No sessions at all (or new session chosen) — greet as new user
            try {
                console.log('👋 Sending greeting...');
                await greetViaVoice("The user just connected for the first time. Briefly greet them as Osborn and ask what they're working on.");
                console.log('✅ Greeting sent');
            }
            catch (err) {
                console.log('⚠️ Greeting failed:', err);
            }
        }
    }
    catch (err) {
        console.error('❌ Failed to start session:', err);
    }
});
1767
// A participant left the room. Unwire and drop the per-user session state so
// the next join starts clean; the worker itself stays alive waiting for users.
room.on(RoomEvent.ParticipantDisconnected, (participant) => {
    console.log(`👋 User left: ${participant.identity}`);
    if (currentSession) {
        currentSession.removeAllListeners();
        currentSession = null;
        // Consistency fix: every other teardown path (room disconnect,
        // pre-connect cleanup, crash recovery) resets currentSession,
        // currentAgent and currentLLM together. Leaving currentAgent set here
        // retained a stale agent bound to the torn-down session.
        currentAgent = null;
        currentLLM = null;
    }
    // NOTE(review): unlike the ParticipantConnected cleanup path, the session
    // is not close()d here — confirm whether the SDK reclaims it on its own.
    console.log('⏳ Waiting for new user...\n');
});
1776
+ room.on(RoomEvent.DataReceived, async (payload, participant, kind, topic) => {
1777
+ if (topic !== 'user-input')
1778
+ return;
1779
+ try {
1780
+ const data = JSON.parse(new TextDecoder().decode(payload));
1781
+ console.log('📨 Data:', data.type);
1782
+ if (data.type === 'permission_response') {
1783
+ // Handle permission response for direct mode
1784
+ if (currentLLM && currentLLM.hasPendingPermission?.()) {
1785
+ const allow = data.response === 'allow' || data.response === 'always_allow';
1786
+ currentLLM.respondToPermission(allow);
1787
+ console.log(`✅ Permission: ${data.response}`);
1788
+ }
1789
+ }
1790
+ else if (data.type === 'user_text' && currentSession) {
1791
+ console.log(`📝 Text: "${data.content}"`);
1792
+ // Skip interrupt for Gemini — disrupts state machine (hangs in speaking state)
1793
+ if (currentProvider !== 'gemini') {
1794
+ currentSession.interrupt();
1795
+ }
1796
+ await currentSession.generateReply({ userInput: data.content });
1797
+ }
1798
+ // ============================================================
1799
+ // SESSION MANAGEMENT HANDLERS
1800
+ // ============================================================
1801
+ else if (data.type === 'list_sessions') {
1802
+ // List available sessions for this project
1803
+ console.log('📋 Listing available sessions...');
1804
+ try {
1805
+ // Clean up orphaned metadata entries before listing
1806
+ await cleanupOrphanedMetadata(workingDir);
1807
+ const sessions = await listSessions(workingDir);
1808
+ await sendToFrontend({
1809
+ type: 'sessions_list',
1810
+ sessions: sessions.map(s => ({
1811
+ sessionId: s.sessionId,
1812
+ timestamp: s.timestamp.toISOString(),
1813
+ lastMessage: s.lastMessage,
1814
+ messageCount: s.messageCount,
1815
+ })),
1816
+ count: sessions.length,
1817
+ });
1818
+ }
1819
+ catch (err) {
1820
+ console.error('Failed to list sessions:', err);
1821
+ await sendToFrontend({
1822
+ type: 'sessions_list',
1823
+ sessions: [],
1824
+ count: 0,
1825
+ error: 'Failed to list sessions',
1826
+ });
1827
+ }
1828
+ }
1829
+ else if (data.type === 'resume_session' && currentLLM) {
1830
+ // Lightweight: set resume ID and send artifacts to frontend only
1831
+ // Context injection (generateReply) happens in session_selected handler
1832
+ // to avoid double generateReply calls that cause timeouts
1833
+ const sessionId = data.sessionId;
1834
+ if (sessionId && sessionExists(sessionId, workingDir)) {
1835
+ currentLLM.setResumeSessionId(sessionId);
1836
+ console.log(`🔄 Will resume session: ${sessionId}`);
1837
+ await sendToFrontend({
1838
+ type: 'session_resume_set',
1839
+ sessionId,
1840
+ success: true,
1841
+ });
1842
+ // Send existing session artifacts to frontend (session-scoped)
1843
+ const artifacts = listWorkspaceArtifacts(workingDir, sessionId);
1844
+ if (artifacts.length > 0) {
1845
+ console.log(`📁 Sending ${artifacts.length} session artifacts to frontend`);
1846
+ await sendToFrontend({
1847
+ type: 'session_artifacts',
1848
+ sessionId,
1849
+ artifacts: artifacts.map(a => ({
1850
+ filePath: a.filePath,
1851
+ fileName: a.fileName,
1852
+ type: a.type,
1853
+ updatedAt: a.updatedAt,
1854
+ }))
1855
+ });
1856
+ }
1857
+ }
1858
+ else {
1859
+ console.error(`❌ Session not found: ${sessionId}`);
1860
+ await sendToFrontend({
1861
+ type: 'session_resume_set',
1862
+ sessionId,
1863
+ success: false,
1864
+ error: 'Session not found',
1865
+ });
1866
+ }
1867
+ }
1868
+ else if (data.type === 'continue_session' && currentLLM) {
1869
+ const recentId = await getMostRecentSessionId(workingDir);
1870
+ if (recentId) {
1871
+ currentLLM.setResumeSessionId(recentId);
1872
+ console.log(`🔄 Continuing most recent session: ${recentId}`);
1873
+ const summary = await getSessionSummary(recentId, workingDir);
1874
+ const conversationHistory = await getConversationHistory(recentId, workingDir, 30);
1875
+ await sendToFrontend({
1876
+ type: 'session_resume_set',
1877
+ sessionId: recentId,
1878
+ success: true,
1879
+ });
1880
+ // Send existing session artifacts to frontend (session-scoped)
1881
+ const artifacts = listWorkspaceArtifacts(workingDir, recentId);
1882
+ if (artifacts.length > 0) {
1883
+ console.log(`📁 Sending ${artifacts.length} session artifacts to frontend`);
1884
+ await sendToFrontend({
1885
+ type: 'session_artifacts',
1886
+ sessionId: recentId,
1887
+ artifacts: artifacts.map(a => ({
1888
+ filePath: a.filePath,
1889
+ fileName: a.fileName,
1890
+ type: a.type,
1891
+ updatedAt: a.updatedAt,
1892
+ }))
1893
+ });
1894
+ }
1895
+ if (currentSession && summary) {
1896
+ loadSessionHistoryIntoChatCtx(currentAgent, conversationHistory, currentProvider);
1897
+ const contextBriefing = buildContextBriefing(summary, conversationHistory, currentProvider);
1898
+ const specContent = getSpecForVoiceModel(workingDir, recentId);
1899
+ const specSection = specContent
1900
+ ? `\n\n=== SESSION SPEC ===\n${specContent}\n=== END SPEC ===\nCheck "Open Questions" — if any are unanswered, ask the user about them.`
1901
+ : '';
1902
+ console.log('📋 Injecting session context into voice agent...');
1903
+ try {
1904
+ if (currentVoiceMode === 'realtime') {
1905
+ const contextPrompt = `[SESSION RESUMED] The user chose to continue their most recent research session. Here's the context:\n${contextBriefing}${specSection}\n\nBriefly acknowledge the previous session. If there are open questions in the spec, ask the most important one. Otherwise ask what they'd like to continue with.`;
1906
+ await currentSession.generateReply({ instructions: contextPrompt });
1907
+ }
1908
+ else {
1909
+ await currentSession.say("Continuing where we left off.");
1910
+ }
1911
+ }
1912
+ catch (err) {
1913
+ console.log('⚠️ Context injection failed:', err);
1914
+ }
1915
+ }
1916
+ }
1917
+ else {
1918
+ console.log('📋 No previous sessions found - starting fresh');
1919
+ await sendToFrontend({
1920
+ type: 'session_resume_set',
1921
+ sessionId: null,
1922
+ success: false,
1923
+ error: 'No previous sessions found',
1924
+ });
1925
+ }
1926
+ }
1927
+ else if (data.type === 'switch_session' && currentLLM) {
1928
+ // Switch to a different session mid-conversation
1929
+ const sessionId = data.sessionId;
1930
+ if (sessionId && sessionExists(sessionId, workingDir)) {
1931
+ // Step 1: Get FULL context summary with conversation history
1932
+ const summary = await getSessionSummary(sessionId, workingDir);
1933
+ const conversationHistory = await getConversationHistory(sessionId, workingDir, 30);
1934
+ // Step 2: Reset LLM state and configure for new session
1935
+ currentLLM.resetForSessionSwitch();
1936
+ currentLLM.setResumeSessionId(sessionId);
1937
+ clearFastBrainHistory();
1938
+ console.log(`🔄 Switched to session: ${sessionId}`);
1939
+ // Step 3: Send full context to frontend (including conversation history)
1940
+ await sendToFrontend({
1941
+ type: 'session_switched',
1942
+ sessionId,
1943
+ success: true,
1944
+ summary,
1945
+ conversationHistory,
1946
+ });
1947
+ // Step 3.5: Send existing session artifacts to frontend (session-scoped)
1948
+ const switchArtifacts = listWorkspaceArtifacts(workingDir, sessionId);
1949
+ if (switchArtifacts.length > 0) {
1950
+ console.log(`📁 Sending ${switchArtifacts.length} session artifacts to frontend`);
1951
+ await sendToFrontend({
1952
+ type: 'session_artifacts',
1953
+ sessionId,
1954
+ artifacts: switchArtifacts.map(a => ({
1955
+ filePath: a.filePath,
1956
+ fileName: a.fileName,
1957
+ type: a.type,
1958
+ updatedAt: a.updatedAt,
1959
+ }))
1960
+ });
1961
+ }
1962
+ // Step 4: Voice agent acknowledges context
1963
+ if (currentSession && summary) {
1964
+ loadSessionHistoryIntoChatCtx(currentAgent, conversationHistory, currentProvider);
1965
+ const contextBriefing = buildContextBriefing(summary, conversationHistory, currentProvider);
1966
+ try {
1967
+ if (currentVoiceMode === 'realtime') {
1968
+ const contextPrompt = `[SESSION SWITCHED] The user switched to a different research session. Here's the context:\n${contextBriefing}\n\nBriefly acknowledge the switch and summarize what was being worked on.`;
1969
+ await currentSession.generateReply({ instructions: contextPrompt });
1970
+ }
1971
+ else {
1972
+ const acknowledgment = summary.lastMessages.length > 0
1973
+ ? `I've switched to your previous session. You were working on: ${summary.lastMessages[summary.lastMessages.length - 1]?.substring(0, 100)}`
1974
+ : `Switched to previous session with ${summary.messageCount} messages. What would you like to continue with?`;
1975
+ await currentSession.say(acknowledgment);
1976
+ }
1977
+ }
1978
+ catch (err) {
1979
+ console.log('⚠️ Switch acknowledgment failed:', err);
1980
+ }
1981
+ }
1982
+ }
1983
+ else {
1984
+ await sendToFrontend({
1985
+ type: 'session_switched',
1986
+ sessionId,
1987
+ success: false,
1988
+ error: 'Session not found',
1989
+ });
1990
+ }
1991
+ }
1992
+ else if (data.type === 'get_current_session' && currentLLM) {
1993
+ // Get current session ID
1994
+ await sendToFrontend({
1995
+ type: 'current_session',
1996
+ sessionId: currentLLM.sessionId,
1997
+ isResumingSession: currentLLM.isResumingSession,
1998
+ });
1999
+ }
2000
+ else if (data.type === 'get_session_artifacts') {
2001
+ const sessionId = data.sessionId;
2002
+ if (sessionId) {
2003
+ const artifacts = listWorkspaceArtifacts(workingDir, sessionId);
2004
+ console.log(`📁 Sending ${artifacts.length} session artifacts for ${sessionId.substring(0, 8)}`);
2005
+ await sendToFrontend({
2006
+ type: 'session_artifacts',
2007
+ sessionId,
2008
+ artifacts: artifacts.map(a => ({
2009
+ filePath: a.filePath,
2010
+ fileName: a.fileName,
2011
+ type: a.type,
2012
+ updatedAt: a.updatedAt,
2013
+ }))
2014
+ });
2015
+ }
2016
+ }
2017
+ // ============================================================
2018
+ // SESSION GATE HANDLER (initial session selection before voice)
2019
+ // ============================================================
2020
+ else if (data.type === 'get_plan_file') {
2021
+ const filePath = data.filePath;
2022
+ if (filePath && filePath.includes('.claude/plans/')) {
2023
+ try {
2024
+ const fs = await import('fs');
2025
+ const content = fs.readFileSync(filePath, 'utf-8');
2026
+ await sendToFrontend({ type: 'plan_file_content', filePath, content, fileName: filePath.split('/').pop() });
2027
+ }
2028
+ catch (err) {
2029
+ await sendToFrontend({ type: 'plan_file_content', filePath, content: '', error: err.message });
2030
+ }
2031
+ }
2032
+ }
2033
+ else if (data.type === 'get_research_artifact') {
2034
+ const filePath = data.filePath;
2035
+ if (filePath && (filePath.includes('.osborn/sessions/') || filePath.includes('.osborn/research/'))) {
2036
+ try {
2037
+ const fs = await import('fs');
2038
+ const fileName = filePath.split('/').pop() || '';
2039
+ const ext = fileName.split('.').pop()?.toLowerCase() || '';
2040
+ const isImage = ['png', 'jpg', 'jpeg', 'gif', 'webp'].includes(ext);
2041
+ if (isImage) {
2042
+ const base64 = fs.readFileSync(filePath, 'base64');
2043
+ await sendToFrontend({ type: 'research_artifact_content', filePath, content: base64, fileName, isImage: true, mimeType: `image/${ext}` });
2044
+ }
2045
+ else {
2046
+ const content = fs.readFileSync(filePath, 'utf-8');
2047
+ await sendToFrontend({ type: 'research_artifact_content', filePath, content, fileName, isImage: false });
2048
+ }
2049
+ }
2050
+ catch (err) {
2051
+ await sendToFrontend({ type: 'research_artifact_content', filePath, content: '', error: err.message });
2052
+ }
2053
+ }
2054
+ }
2055
+ // ============================================================
2056
+ // MCP SERVER TOGGLE HANDLERS
2057
+ // ============================================================
2058
+ else if (data.type === 'mcp_toggle' && currentLLM) {
2059
+ const serverKey = data.serverKey;
2060
+ const enabled = data.enabled;
2061
+ console.log(`🔌 MCP toggle: ${serverKey} → ${enabled ? 'ON' : 'OFF'}`);
2062
+ if (enabled) {
2063
+ try {
2064
+ // Check if this is a Smithery HTTP server — use proxy to bypass SDK bug
2065
+ const catalogEntry = MCP_CATALOG.find(e => e.serverKey === serverKey);
2066
+ const isSmitheryServer = catalogEntry?.url && isSmitheryUrl(catalogEntry.url);
2067
+ if (isSmitheryServer && catalogEntry?.url) {
2068
+ // Smithery cloud server: use in-process proxy (bypasses SDK HTTP bug #18296)
2069
+ const parsed = parseSmitheryUrl(catalogEntry.url);
2070
+ if (parsed) {
2071
+ const proxyConfig = await createSmitheryProxy({
2072
+ name: serverKey,
2073
+ namespace: parsed.namespace,
2074
+ connectionId: parsed.connectionId,
2075
+ });
2076
+ currentLLM.enableMcpServer(serverKey, proxyConfig);
2077
+ await announceViaVoice(`${serverKey} tools enabled.`);
2078
+ }
2079
+ else {
2080
+ throw new Error(`Could not parse Smithery URL: ${catalogEntry.url}`);
2081
+ }
2082
+ }
2083
+ else {
2084
+ // Non-Smithery server: use standard config (stdio or direct http)
2085
+ const serverConfigs = buildMcpServersForKeys(config, [serverKey]);
2086
+ const serverConfig = serverConfigs[serverKey];
2087
+ if (serverConfig) {
2088
+ currentLLM.enableMcpServer(serverKey, serverConfig);
2089
+ await announceViaVoice(`${serverKey} tools enabled.`);
2090
+ }
2091
+ else {
2092
+ throw new Error('Server configuration not found');
2093
+ }
2094
+ }
2095
+ }
2096
+ catch (err) {
2097
+ const errorMsg = err instanceof SmitheryAuthorizationError
2098
+ ? `OAuth required: ${err.authorizationUrl}`
2099
+ : err.message;
2100
+ console.error(`❌ MCP toggle failed for ${serverKey}: ${errorMsg}`);
2101
+ await sendToFrontend({
2102
+ type: 'mcp_toggle_result',
2103
+ serverKey,
2104
+ success: false,
2105
+ error: errorMsg,
2106
+ });
2107
+ }
2108
+ }
2109
+ else {
2110
+ await destroySmitheryProxy(serverKey); // Clean up proxy if exists
2111
+ currentLLM.disableMcpServer(serverKey);
2112
+ await announceViaVoice(`${serverKey} tools disabled.`);
2113
+ }
2114
+ // Send updated status back
2115
+ await sendToFrontend({
2116
+ type: 'mcp_toggle_result',
2117
+ serverKey,
2118
+ enabled,
2119
+ success: true,
2120
+ mcpServers: getMcpServerStatusList(config),
2121
+ enabledKeys: currentLLM.getEnabledMcpServerKeys(),
2122
+ });
2123
+ }
2124
+ else if (data.type === 'get_mcp_status') {
2125
+ // Frontend requesting current MCP status
2126
+ const statusList = getMcpServerStatusList(config);
2127
+ const enabledKeys = currentLLM?.getEnabledMcpServerKeys() || [];
2128
+ // Merge runtime enabled state into status list
2129
+ const mergedStatus = statusList.map(s => ({
2130
+ ...s,
2131
+ enabled: enabledKeys.includes(s.serverKey),
2132
+ }));
2133
+ await sendToFrontend({
2134
+ type: 'mcp_status',
2135
+ mcpServers: mergedStatus,
2136
+ enabledKeys,
2137
+ });
2138
+ }
2139
+ else if (data.type === 'session_selected') {
2140
+ const sessionId = data.sessionId;
2141
+ console.log(`🚪 Session gate completed: ${sessionId ? `resume ${sessionId}` : 'fresh start'}`);
2142
+ if (sessionId && currentLLM && sessionExists(sessionId, workingDir)) {
2143
+ // Resume the selected session
2144
+ currentLLM.setResumeSessionId(sessionId);
2145
+ console.log(`🔄 Resuming session: ${sessionId}`);
2146
+ // Fetch context and greet with it
2147
+ const summary = await getSessionSummary(sessionId, workingDir);
2148
+ const conversationHistory = await getConversationHistory(sessionId, workingDir, 30);
2149
+ await sendToFrontend({
2150
+ type: 'session_resume_set',
2151
+ sessionId,
2152
+ success: true,
2153
+ });
2154
+ // Send existing session artifacts to frontend (session-scoped)
2155
+ const gateArtifacts = listWorkspaceArtifacts(workingDir, sessionId);
2156
+ if (gateArtifacts.length > 0) {
2157
+ console.log(`📁 Sending ${gateArtifacts.length} session artifacts to frontend`);
2158
+ await sendToFrontend({
2159
+ type: 'session_artifacts',
2160
+ sessionId,
2161
+ artifacts: gateArtifacts.map(a => ({
2162
+ filePath: a.filePath,
2163
+ fileName: a.fileName,
2164
+ type: a.type,
2165
+ updatedAt: a.updatedAt,
2166
+ }))
2167
+ });
2168
+ }
2169
+ // Load full session history and greet with context
2170
+ if (currentSession && summary) {
2171
+ loadSessionHistoryIntoChatCtx(currentAgent, conversationHistory, currentProvider);
2172
+ const contextBriefing = buildContextBriefing(summary, conversationHistory, currentProvider);
2173
+ const specContent = getSpecForVoiceModel(workingDir, sessionId);
2174
+ const specSection = specContent
2175
+ ? `\n\n=== SESSION SPEC ===\n${specContent}\n=== END SPEC ===\nCheck "Open Questions" — if any are unanswered, ask the user about them.`
2176
+ : '';
2177
+ try {
2178
+ if (currentVoiceMode === 'realtime') {
2179
+ const contextPrompt = `[SESSION RESUMED] The user chose to continue a previous research session. Here's the context:\n${contextBriefing}${specSection}\n\nBriefly acknowledge the previous session. If there are open questions in the spec, ask the most important one. Otherwise ask what they'd like to continue with.`;
2180
+ await currentSession.generateReply({ instructions: contextPrompt });
2181
+ }
2182
+ else {
2183
+ await currentSession.say("Welcome back! Ready to continue our previous conversation.");
2184
+ }
2185
+ }
2186
+ catch (err) {
2187
+ console.log('⚠️ Session gate greeting failed:', err);
2188
+ }
2189
+ }
2190
+ }
2191
+ else {
2192
+ // Fresh start - just greet normally
2193
+ console.log('🆕 Starting fresh session');
2194
+ if (currentSession) {
2195
+ try {
2196
+ if (currentVoiceMode === 'realtime') {
2197
+ await currentSession.generateReply({ userInput: "The user just connected and chose to start a fresh session. Briefly greet them as Osborn and ask what they're working on." });
2198
+ }
2199
+ else {
2200
+ await currentSession.say("Hey! I'm Osborn. What are you working on?");
2201
+ }
2202
+ }
2203
+ catch (err) {
2204
+ console.log('⚠️ Fresh session greeting failed:', err);
2205
+ }
2206
+ }
2207
+ }
2208
+ }
2209
+ }
2210
+ catch { }
2211
+ });
2212
// ============================================================
// Connect to Room
// ============================================================
try {
    const connectOptions = {
        autoSubscribe: true,
        dynacast: true,
    };
    await room.connect(livekitUrl, jwt, connectOptions);
    localParticipant = room.localParticipant;
    console.log('✅ Connected to room:', roomName);
    console.log('\n⏳ Waiting for user to connect...');
    console.log(`   Room: ${roomCode}\n`);
    // Park on a never-resolving promise — the event handlers registered above
    // drive all further work, so the process must simply stay alive.
    await new Promise(() => { });
}
catch (connectErr) {
    console.error('❌ Failed to connect:', connectErr);
    process.exit(1);
}
2231
+ }
2232
// Run: launch the agent and report any top-level failure on stderr.
main().catch((err) => console.error(err));