osborn 0.1.2 → 0.1.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -3,9 +3,10 @@ import { EventEmitter } from 'events';
3
3
  interface ClaudeHandlerOptions {
4
4
  workingDirectory?: string;
5
5
  allowedTools?: string[];
6
- permissionMode?: 'default' | 'acceptEdits' | 'bypassPermissions';
6
+ permissionMode?: 'default' | 'acceptEdits' | 'bypassPermissions' | 'plan';
7
7
  mcpServers?: Record<string, McpServerConfig>;
8
8
  requireAllPermissions?: boolean;
9
+ agentRole?: 'plan' | 'execute';
9
10
  }
10
11
  export type { McpServerConfig };
11
12
  export interface PermissionRequestEvent {
@@ -32,7 +33,18 @@ export declare class ClaudeHandler extends EventEmitter {
32
33
  private toolStartTimes;
33
34
  private alwaysAllowedTools;
34
35
  private static readonly ALL_TOOLS;
36
+ private static readonly PLAN_TOOLS;
37
+ private static readonly EXECUTE_TOOLS;
38
+ private agentRole;
35
39
  constructor(options?: ClaudeHandlerOptions);
40
+ /**
41
+ * Get the agent's role
42
+ */
43
+ getRole(): 'plan' | 'execute';
44
+ /**
45
+ * Check if this is a plan-mode agent
46
+ */
47
+ isPlanMode(): boolean;
36
48
  /**
37
49
  * Generate human-readable description for a tool call
38
50
  */
@@ -54,22 +54,57 @@ export class ClaudeHandler extends EventEmitter {
54
54
  // LSP (Language Server Protocol)
55
55
  'LSP',
56
56
  ];
57
+ // Plan mode tools - read-only, research, context gathering
58
+ static PLAN_TOOLS = [
59
+ 'Read', // View file contents
60
+ 'Glob', // File pattern matching
61
+ 'Grep', // Content searching
62
+ 'Bash', // Read-only bash (ls, git status, git log, etc.)
63
+ 'Task', // Research agents
64
+ 'WebFetch', // Web content analysis
65
+ 'WebSearch', // Internet searching
66
+ 'LSP', // Code intelligence (go to definition, references)
67
+ ];
68
+ // Execute mode tools - full access
69
+ static EXECUTE_TOOLS = ClaudeHandler.ALL_TOOLS;
70
+ agentRole;
57
71
  constructor(options = {}) {
58
72
  super();
73
+ // Set agent role
74
+ this.agentRole = options.agentRole || (options.permissionMode === 'plan' ? 'plan' : 'execute');
75
+ // For plan mode, restrict to read-only tools
76
+ const isPlanMode = options.permissionMode === 'plan';
77
+ const defaultTools = isPlanMode ? ClaudeHandler.PLAN_TOOLS : ClaudeHandler.ALL_TOOLS;
59
78
  this.options = {
60
79
  workingDirectory: options.workingDirectory || process.cwd(),
61
- allowedTools: options.allowedTools || ClaudeHandler.ALL_TOOLS,
62
- permissionMode: options.permissionMode || 'default',
80
+ allowedTools: options.allowedTools || defaultTools,
81
+ // Plan mode uses 'default' permission mode but with restricted tools
82
+ permissionMode: isPlanMode ? 'default' : (options.permissionMode || 'default'),
63
83
  mcpServers: options.mcpServers,
64
- // By default, require permission for ALL tools
65
- requireAllPermissions: options.requireAllPermissions ?? true,
84
+ // Plan mode doesn't require permissions (read-only is safe)
85
+ // Execute mode requires permissions for safety
86
+ requireAllPermissions: isPlanMode ? false : (options.requireAllPermissions ?? true),
66
87
  };
88
+ const roleEmoji = this.agentRole === 'plan' ? 'šŸ“‹' : 'šŸ”Ø';
89
+ console.log(`${roleEmoji} Agent role: ${this.agentRole.toUpperCase()}`);
67
90
  console.log(`šŸ”§ Allowed tools: ${this.options.allowedTools?.join(', ')}`);
68
- console.log(`šŸ” Require all permissions: ${this.options.requireAllPermissions}`);
91
+ console.log(`šŸ” Require permissions: ${this.options.requireAllPermissions}`);
69
92
  if (this.options.mcpServers) {
70
93
  console.log(`šŸ”Œ MCP servers: ${Object.keys(this.options.mcpServers).join(', ')}`);
71
94
  }
72
95
  }
96
+ /**
97
+ * Get the agent's role
98
+ */
99
+ getRole() {
100
+ return this.agentRole;
101
+ }
102
+ /**
103
+ * Check if this is a plan-mode agent
104
+ */
105
+ isPlanMode() {
106
+ return this.agentRole === 'plan';
107
+ }
73
108
  /**
74
109
  * Generate human-readable description for a tool call
75
110
  */
package/dist/index.d.ts CHANGED
@@ -1,3 +1 @@
1
1
  import 'dotenv/config';
2
- declare const _default: import("@livekit/agents").Agent;
3
- export default _default;
package/dist/index.js CHANGED
@@ -1,337 +1,449 @@
1
- import { ServerOptions, cli, defineAgent, llm, voice } from '@livekit/agents';
1
+ import { llm, voice, initializeLogger } from '@livekit/agents';
2
2
  import * as openai from '@livekit/agents-plugin-openai';
3
3
  import * as google from '@livekit/agents-plugin-google';
4
+ import { Room, RoomEvent } from '@livekit/rtc-node';
5
+ import { AccessToken } from 'livekit-server-sdk';
4
6
  import { z } from 'zod';
5
- import { fileURLToPath } from 'url';
6
7
  import 'dotenv/config';
8
+ // Initialize logger before anything else
9
+ initializeLogger({ pretty: true, level: 'info' });
7
10
  import { ClaudeHandler } from './claude-handler.js';
8
11
  import { CodexHandler } from './codex-handler.js';
9
12
  import { loadConfig, getMcpServers, getEnabledMcpServerNames } from './config.js';
10
- // Parse CLI arguments for room code
13
+ // Generate a short, user-friendly room code
14
+ function generateRoomCode() {
15
+ const chars = 'abcdefghjkmnpqrstuvwxyz23456789';
16
+ let code = '';
17
+ for (let i = 0; i < 6; i++) {
18
+ code += chars[Math.floor(Math.random() * chars.length)];
19
+ }
20
+ return code;
21
+ }
22
+ // Parse CLI arguments
11
23
  function parseArgs() {
12
24
  const args = process.argv.slice(2);
13
25
  let roomCode;
26
+ let provider;
14
27
  for (let i = 0; i < args.length; i++) {
15
28
  if (args[i] === '--room' && args[i + 1]) {
16
29
  roomCode = args[i + 1];
17
30
  }
31
+ if (args[i] === '--provider' && args[i + 1]) {
32
+ provider = args[i + 1];
33
+ }
34
+ // Short code detection (e.g., `npm run dev abc123`)
35
+ if (!args[i].startsWith('-') && args[i].length >= 4 && args[i].length <= 10 &&
36
+ !['dev', 'start'].includes(args[i])) {
37
+ roomCode = args[i];
38
+ }
18
39
  }
19
- return { roomCode };
40
+ return { roomCode, provider };
20
41
  }
21
- const cliArgs = parseArgs();
22
- if (cliArgs.roomCode) {
23
- console.log(`šŸ”— Room code provided: ${cliArgs.roomCode}`);
24
- }
25
- // Global error handlers to catch silent failures
26
- process.on('unhandledRejection', (reason, promise) => {
42
+ // Global error handlers
43
+ process.on('unhandledRejection', (reason) => {
27
44
  console.error('āŒ Unhandled Rejection:', reason);
28
45
  });
29
46
  process.on('uncaughtException', (error) => {
30
47
  console.error('āŒ Uncaught Exception:', error);
31
48
  });
32
- // Default provider (can be overridden by participant metadata)
33
- const DEFAULT_PROVIDER = process.env.LLM_PROVIDER || 'openai';
34
- // Debug mode
35
- const DEBUG = process.env.DEBUG_LIVEKIT === 'true';
36
- if (DEBUG) {
37
- console.log('šŸ› Debug logging enabled');
38
- }
39
- console.log(`šŸ¤– Default LLM Provider: ${DEFAULT_PROVIDER}`);
40
- // Load configuration from ~/.osborn/config.yaml
41
- console.log('šŸ“ Loading configuration...');
42
- const config = loadConfig();
43
- const mcpServers = getMcpServers(config);
44
- const enabledMcpNames = getEnabledMcpServerNames(config);
45
- if (enabledMcpNames.length > 0) {
46
- console.log(`šŸ”Œ Enabled MCP servers: ${enabledMcpNames.join(', ')}`);
47
- }
48
- // Pre-initialize Claude handler at module load (before any connections)
49
- console.log('šŸ”„ Pre-initializing Claude Code...');
50
- const workingDir = config.workingDirectory || process.cwd();
51
- const claude = new ClaudeHandler({
52
- workingDirectory: workingDir,
53
- permissionMode: 'default', // Ask for permission on dangerous tools (Bash, Write, Edit)
54
- mcpServers: Object.keys(mcpServers).length > 0 ? mcpServers : undefined,
55
- });
56
- console.log(`šŸ“‚ Working directory: ${workingDir}`);
57
- // Listen for permission requests from Claude
58
- claude.on('permission_request', (req) => {
59
- console.log(`\nāš ļø PERMISSION REQUIRED āš ļø`);
60
- console.log(`šŸ”§ Tool: ${req.toolName}`);
61
- console.log(`šŸ“ Action: ${req.description}`);
62
- console.log(`ā³ Waiting for user response (say: allow, deny, or always allow)...`);
63
- // Send to frontend for UI display
64
- sendToFrontend({
65
- type: 'permission_request',
66
- toolName: req.toolName,
67
- description: req.description,
49
+ // Main function
50
+ async function main() {
51
+ console.log('\nšŸ¤– Osborn Voice AI Coding Assistant\n');
52
+ // Validate environment
53
+ const livekitUrl = process.env.LIVEKIT_URL;
54
+ const apiKey = process.env.LIVEKIT_API_KEY;
55
+ const apiSecret = process.env.LIVEKIT_API_SECRET;
56
+ if (!livekitUrl || !apiKey || !apiSecret) {
57
+ console.error('āŒ Missing required environment variables:');
58
+ if (!livekitUrl)
59
+ console.error(' - LIVEKIT_URL');
60
+ if (!apiKey)
61
+ console.error(' - LIVEKIT_API_KEY');
62
+ if (!apiSecret)
63
+ console.error(' - LIVEKIT_API_SECRET');
64
+ console.error('\nSet these in your .env file or environment.');
65
+ process.exit(1);
66
+ }
67
+ // Parse CLI args
68
+ const cliArgs = parseArgs();
69
+ // Load configuration
70
+ console.log('šŸ“ Loading configuration...');
71
+ const config = loadConfig();
72
+ const mcpServers = getMcpServers(config);
73
+ const enabledMcpNames = getEnabledMcpServerNames(config);
74
+ if (enabledMcpNames.length > 0) {
75
+ console.log(`šŸ”Œ Enabled MCP servers: ${enabledMcpNames.join(', ')}`);
76
+ }
77
+ const workingDir = config.workingDirectory || process.cwd();
78
+ console.log(`šŸ“‚ Working directory: ${workingDir}`);
79
+ // Determine room code
80
+ const roomCode = cliArgs.roomCode || generateRoomCode();
81
+ const roomName = `osborn-${roomCode}`;
82
+ if (cliArgs.roomCode) {
83
+ console.log(`šŸ”— Joining room: ${roomCode}`);
84
+ }
85
+ else {
86
+ console.log(`\n✨ Created new room: ${roomCode}`);
87
+ console.log(`\nšŸ“‹ Share this with the frontend or run:`);
88
+ console.log(` Open: https://osborn.app?room=${roomCode}`);
89
+ console.log(` Or enter code "${roomCode}" in the frontend\n`);
90
+ }
91
+ // Default provider
92
+ const defaultProvider = cliArgs.provider || process.env.LLM_PROVIDER || 'openai';
93
+ console.log(`šŸŽÆ Default voice provider: ${defaultProvider}`);
94
+ // ============================================================
95
+ // Initialize Claude Agents (Dual Architecture)
96
+ // ============================================================
97
+ console.log('\nšŸ”„ Initializing Claude agents...');
98
+ // Plan Agent - Read-only, research
99
+ const planAgent = {
100
+ id: 1,
101
+ role: 'plan',
102
+ handler: new ClaudeHandler({
103
+ workingDirectory: workingDir,
104
+ permissionMode: 'plan',
105
+ agentRole: 'plan',
106
+ mcpServers: Object.keys(mcpServers).length > 0 ? mcpServers : undefined,
107
+ }),
108
+ busy: false,
109
+ currentTask: null,
110
+ context: [],
111
+ };
112
+ // Execute Agent - Full access
113
+ const executeAgent = {
114
+ id: 2,
115
+ role: 'execute',
116
+ handler: new ClaudeHandler({
117
+ workingDirectory: workingDir,
118
+ permissionMode: 'default',
119
+ agentRole: 'execute',
120
+ mcpServers: Object.keys(mcpServers).length > 0 ? mcpServers : undefined,
121
+ }),
122
+ busy: false,
123
+ currentTask: null,
124
+ context: [],
125
+ };
126
+ const agentPool = [planAgent, executeAgent];
127
+ // Smart routing
128
+ function routeTask(task) {
129
+ const taskLower = task.toLowerCase();
130
+ const executeKeywords = [
131
+ 'create', 'make', 'build', 'implement', 'add', 'write',
132
+ 'fix', 'update', 'change', 'modify', 'edit', 'refactor',
133
+ 'delete', 'remove', 'run', 'execute', 'install', 'deploy',
134
+ 'commit', 'push', 'test', 'debug', 'start', 'stop',
135
+ ];
136
+ for (const keyword of executeKeywords) {
137
+ if (taskLower.includes(keyword)) {
138
+ if (executeAgent.busy && !planAgent.busy) {
139
+ return planAgent;
140
+ }
141
+ return executeAgent;
142
+ }
143
+ }
144
+ return planAgent.busy ? executeAgent : planAgent;
145
+ }
146
+ // ============================================================
147
+ // Create Access Token for Agent
148
+ // ============================================================
149
+ console.log('šŸ”‘ Creating access token...');
150
+ const token = new AccessToken(apiKey, apiSecret, {
151
+ identity: 'osborn-agent',
152
+ name: 'Osborn AI',
153
+ metadata: JSON.stringify({ type: 'agent', version: '0.1.5' }),
68
154
  });
69
- });
70
- // Pre-warm Claude immediately on server start
71
- claude.run('Respond with just: ready')
72
- .then(() => console.log('āœ… Claude pre-warmed and ready!'))
73
- .catch((err) => console.log('āš ļø Pre-warm failed:', err.message));
74
- // Track job context and session for data channel
75
- let jobContext = null;
76
- let currentSession = null;
77
- // Track the current coding handler (can be Claude or Codex)
78
- let currentCodingAgent = 'claude';
79
- let codexHandler = null;
80
- // Helper to cleanup previous session before starting new one
81
- async function cleanupSession() {
82
- if (currentSession) {
83
- console.log('🧹 Cleaning up previous session...');
155
+ token.addGrant({
156
+ roomJoin: true,
157
+ room: roomName,
158
+ canPublish: true,
159
+ canSubscribe: true,
160
+ canPublishData: true,
161
+ });
162
+ const jwt = await token.toJwt();
163
+ // ============================================================
164
+ // Connect to Room Directly
165
+ // ============================================================
166
+ console.log('šŸ“” Connecting to LiveKit...');
167
+ const room = new Room();
168
+ // Track state
169
+ let currentSession = null;
170
+ let currentProvider = defaultProvider;
171
+ let currentCodingAgent = 'claude';
172
+ let codexHandler = null;
173
+ let localParticipant = null;
174
+ let agentState = 'initializing';
175
+ // Speech queue
176
+ const speechQueue = [];
177
+ let isSpeaking = false;
178
+ // Helper to send data to frontend
179
+ async function sendToFrontend(data) {
180
+ if (!localParticipant) {
181
+ console.log('āš ļø sendToFrontend: no localParticipant!');
182
+ return;
183
+ }
84
184
  try {
85
- currentSession.removeAllListeners();
86
- // Close session gracefully if method exists
87
- if (typeof currentSession.close === 'function') {
88
- await currentSession.close();
89
- }
185
+ const encoder = new TextEncoder();
186
+ const payload = encoder.encode(JSON.stringify(data));
187
+ await localParticipant.publishData(payload, {
188
+ reliable: true,
189
+ topic: 'osborn-updates',
190
+ });
191
+ console.log(`šŸ“¤ Sent to frontend: ${data.type}`);
90
192
  }
91
193
  catch (err) {
92
- console.log('āš ļø Session cleanup error (non-fatal):', err.message);
194
+ console.error('āŒ sendToFrontend error:', err);
93
195
  }
94
- currentSession = null;
95
- }
96
- }
97
- // Helper to send data to frontend
98
- async function sendToFrontend(data) {
99
- if (!jobContext)
100
- return;
101
- try {
102
- const encoder = new TextEncoder();
103
- const payload = encoder.encode(JSON.stringify(data));
104
- await jobContext.room.localParticipant?.publishData(payload, {
105
- reliable: true,
106
- topic: 'osborn-updates',
107
- });
108
- }
109
- catch (err) {
110
- // Ignore send errors
111
196
  }
112
- }
113
- // Define the run_code tool (works with both Claude and Codex)
114
- const runCodeTool = llm.tool({
115
- description: `Execute coding tasks using the coding agent. Use for:
116
- - Files: read, write, create, edit, list, search
117
- - Directories: current directory, list contents
118
- - Code: fix bugs, refactor, explain, review
119
- - Terminal: run commands, install packages, git
120
- - Project: analyze codebase, make changes
121
- - Web: search the web for information`,
122
- parameters: z.object({
123
- task: z.string().describe('The coding task to execute'),
124
- }),
125
- execute: async ({ task }) => {
126
- const agentName = currentCodingAgent === 'claude' ? 'Claude Code' : 'OpenAI Codex';
127
- console.log(`\nšŸ”Ø ${agentName}: "${task}"`);
128
- await sendToFrontend({ type: 'system', text: `Working on: ${task}` });
129
- try {
130
- let result;
131
- if (currentCodingAgent === 'codex' && codexHandler) {
132
- result = await codexHandler.run(task);
197
+ // Process speech queue
198
+ async function processSpeechQueue() {
199
+ if (isSpeaking || speechQueue.length === 0 || !currentSession)
200
+ return;
201
+ if (agentState !== 'listening')
202
+ return;
203
+ if (currentProvider === 'gemini') {
204
+ // Gemini doesn't support generateReply
205
+ while (speechQueue.length > 0) {
206
+ console.log(`šŸ”Š [Would say] ${speechQueue.shift()}`);
133
207
  }
134
- else {
135
- result = await claude.run(task);
136
- }
137
- console.log(`āœ… Done: ${result.length} chars`);
138
- await sendToFrontend({ type: 'assistant_response', text: result });
139
- return result;
208
+ return;
140
209
  }
141
- catch (err) {
142
- console.error('āŒ Error:', err);
143
- return `Error: ${err.message}`;
210
+ isSpeaking = true;
211
+ const message = speechQueue.shift();
212
+ try {
213
+ await Promise.race([
214
+ currentSession.generateReply({ userInput: message }),
215
+ new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
216
+ ]);
144
217
  }
145
- },
146
- });
147
- // Define the permission response tool
148
- const respondPermissionTool = llm.tool({
149
- description: `Respond to a pending permission request from Claude Code.
150
- Use this ONLY when there is a pending permission request.
151
- Call this after hearing the user's response to a permission prompt.`,
152
- parameters: z.object({
153
- response: z.enum(['allow', 'deny', 'always_allow']).describe('The user response: "allow" for one-time approval, "deny" to reject, "always_allow" to permanently allow this tool type'),
154
- }),
155
- execute: async ({ response }) => {
156
- if (!claude.hasPendingPermission()) {
157
- return 'No pending permission request.';
218
+ catch {
219
+ // Ignore speech errors
220
+ }
221
+ finally {
222
+ isSpeaking = false;
223
+ if (speechQueue.length > 0) {
224
+ setTimeout(processSpeechQueue, 500);
225
+ }
158
226
  }
159
- const pending = claude.getPendingPermission();
160
- claude.respondToPermission(response);
161
- await sendToFrontend({
162
- type: 'permission_response',
163
- response,
164
- toolName: pending?.toolName
227
+ }
228
+ // Setup agent event handlers
229
+ agentPool.forEach(slot => {
230
+ slot.handler.on('permission_request', (req) => {
231
+ console.log(`\nāš ļø [${slot.role}] PERMISSION: ${req.toolName}`);
232
+ sendToFrontend({
233
+ type: 'permission_request',
234
+ toolName: req.toolName,
235
+ description: req.description,
236
+ agentId: slot.id,
237
+ });
238
+ speechQueue.push(`[Tell user] I need permission to ${req.description}. Say yes, no, or always allow.`);
239
+ processSpeechQueue();
165
240
  });
166
- return `Permission ${response} for ${pending?.toolName || 'tool'}.`;
167
- },
168
- });
169
- // Agent instructions - dynamically includes available tools
170
- const OSBORN_INSTRUCTIONS = `You are Osborn, a voice-enabled AI assistant with coding superpowers.
171
- Keep responses under 50 words. Sound natural and human.
241
+ slot.handler.on('tool_use', (tool) => {
242
+ console.log(`šŸ”§ [${slot.role}] Using: ${tool.name}`);
243
+ });
244
+ slot.handler.on('error', (err) => {
245
+ console.error(`āŒ [${slot.role}] Error:`, err);
246
+ });
247
+ });
248
+ // Define tools for voice LLM
249
+ const runCodeTool = llm.tool({
250
+ description: `Execute ANY coding task by delegating to Claude agents. YOU MUST USE THIS for:
251
+ - Reading files ("read package.json", "show me the code")
252
+ - Writing/editing files ("fix this bug", "add a function")
253
+ - Running commands ("run npm test", "git status")
254
+ - Searching code ("find where X is defined")
255
+ - Explaining code ("what does this function do")
172
256
 
173
- AVAILABLE CAPABILITIES via run_code tool:
174
- - Read, Write, Edit, MultiEdit files
175
- - Glob (find files by pattern), Grep (search content)
176
- - Bash (run terminal commands)
177
- - WebSearch (search the web), WebFetch (fetch URLs)
178
- - NotebookEdit (edit Jupyter notebooks)
179
- - Task (delegate complex tasks), TodoWrite (track tasks)
180
- - LSP (code intelligence - go to definition, find references)
257
+ You DON'T need permission to use this - it routes to the right agent automatically.
258
+ Plan Agent = reading/research. Execute Agent = writing (will ask user for permission).`,
259
+ parameters: z.object({
260
+ task: z.string().describe('The coding task to execute'),
261
+ }),
262
+ execute: async ({ task }) => {
263
+ const slot = routeTask(task);
264
+ console.log(`\nšŸ”Ø [${slot.role}] Task: "${task}"`);
265
+ await sendToFrontend({ type: 'system', text: `${slot.role} agent: ${task}` });
266
+ slot.busy = true;
267
+ slot.currentTask = task;
268
+ sharedContext.currentFocus = task.substring(0, 50);
269
+ try {
270
+ let result;
271
+ if (currentCodingAgent === 'codex' && codexHandler) {
272
+ result = await codexHandler.run(task);
273
+ }
274
+ else {
275
+ const contextPrefix = slot.context.length > 0
276
+ ? `Context: ${slot.context.slice(-3).join(' | ')}\n\nTask: `
277
+ : '';
278
+ result = await slot.handler.run(contextPrefix + task);
279
+ }
280
+ slot.context.push(`${task.substring(0, 50)} → Done`);
281
+ if (slot.context.length > 10)
282
+ slot.context.shift();
283
+ // Update shared context
284
+ sharedContext.addAction(`${slot.role}: ${task.substring(0, 30)}`);
285
+ // Extract file references from result
286
+ const fileMatches = result.match(/(?:\/[\w\-\.\/]+|src\/[\w\-\.\/]+|\.\/[\w\-\.\/]+)/g);
287
+ if (fileMatches) {
288
+ fileMatches.slice(0, 3).forEach(f => sharedContext.addFile(f));
289
+ }
290
+ console.log(`āœ… [${slot.role}] Done`);
291
+ await sendToFrontend({ type: 'assistant_response', text: result });
292
+ // Return a concise summary for the voice LLM
293
+ const summary = result.length > 500
294
+ ? result.substring(0, 500) + '... [truncated for voice]'
295
+ : result;
296
+ return summary;
297
+ }
298
+ catch (err) {
299
+ return `Error: ${err.message}`;
300
+ }
301
+ finally {
302
+ slot.busy = false;
303
+ slot.currentTask = null;
304
+ }
305
+ },
306
+ });
307
+ const respondPermissionTool = llm.tool({
308
+ description: `Respond to a permission request. Call after hearing user's response.`,
309
+ parameters: z.object({
310
+ response: z.enum(['allow', 'deny', 'always_allow']),
311
+ }),
312
+ execute: async ({ response }) => {
313
+ const slot = agentPool.find(s => s.handler.hasPendingPermission());
314
+ if (!slot)
315
+ return 'No pending permission.';
316
+ const pending = slot.handler.getPendingPermission();
317
+ slot.handler.respondToPermission(response);
318
+ await sendToFrontend({ type: 'permission_response', response, toolName: pending?.toolName });
319
+ return `Permission ${response} for ${pending?.toolName || 'tool'}.`;
320
+ },
321
+ });
322
+ // Shared context that both voice and coding agents contribute to
323
+ const sharedContext = {
324
+ recentActions: [],
325
+ discoveredFiles: [],
326
+ currentFocus: null,
327
+ addAction(action) {
328
+ this.recentActions.push(action);
329
+ if (this.recentActions.length > 5)
330
+ this.recentActions.shift();
331
+ },
332
+ addFile(file) {
333
+ if (!this.discoveredFiles.includes(file)) {
334
+ this.discoveredFiles.push(file);
335
+ if (this.discoveredFiles.length > 10)
336
+ this.discoveredFiles.shift();
337
+ }
338
+ },
339
+ getContextSummary() {
340
+ const parts = [];
341
+ if (this.currentFocus)
342
+ parts.push(`Focus: ${this.currentFocus}`);
343
+ if (this.recentActions.length)
344
+ parts.push(`Recent: ${this.recentActions.slice(-3).join(', ')}`);
345
+ if (this.discoveredFiles.length)
346
+ parts.push(`Files: ${this.discoveredFiles.slice(-5).join(', ')}`);
347
+ return parts.join(' | ');
348
+ }
349
+ };
350
+ // Dynamic instructions with working directory context
351
+ const getInstructions = () => `You are Osborn, a voice AI coding assistant.
181
352
 
182
- WHEN TO USE run_code:
183
- - File operations (read, write, create, edit, list, find)
184
- - Code tasks (fix, refactor, explain, review, debug)
185
- - Terminal commands (run, install, test, build, git)
186
- - Web searches (look up documentation, APIs, errors)
187
- - Project analysis (understand codebase, find patterns)
353
+ WORKING DIRECTORY: ${workingDir}
188
354
 
189
- WHEN TO RESPOND DIRECTLY:
190
- - Greetings and small talk
191
- - General knowledge questions
192
- - Clarifying what the user wants
355
+ STYLE: Keep responses SHORT (under 70 words). Sound natural. Say "Got it" when given a task.
193
356
 
194
- PERMISSION HANDLING:
195
- When the coding agent needs permission, you MUST:
196
- 1. Tell the user: "[Agent] wants to [action]. Allow, deny, or always allow?"
197
- 2. When they respond, call respond_permission with their choice
357
+ CAPABILITIES (via run_code tool):
358
+ - Read/write/edit files, search codebase
359
+ - Run terminal commands (npm, git, etc)
360
+ - Fix bugs, refactor, explain code
361
+ - Search web/docs for solutions
198
362
 
199
- Be conversational and helpful. Ask follow-up questions when needed.`;
200
- // Voice assistant with tools
201
- class OsbornAssistant extends voice.Agent {
202
- constructor() {
203
- super({
204
- instructions: OSBORN_INSTRUCTIONS,
205
- tools: {
206
- run_code: runCodeTool,
207
- respond_permission: respondPermissionTool,
208
- },
209
- });
210
- }
211
- }
212
- // Create the appropriate model based on provider
213
- function createModel(provider) {
214
- if (provider === 'gemini') {
215
- console.log('šŸ“± Using Gemini Live API');
216
- console.log('šŸ”‘ GOOGLE_API_KEY:', process.env.GOOGLE_API_KEY ? 'set' : 'NOT SET');
217
- // From official docs: https://docs.livekit.io/agents/models/realtime/plugins/gemini/
218
- // Package v1.0.31 uses google.beta.realtime (not google.realtime yet)
219
- const model = new google.beta.realtime.RealtimeModel({
220
- // model: 'gemini-2.5-flash-native-audio-preview-12-2025', // From official docs
221
- model: 'gemini-3.5-flash-latest', // From official docs
222
- voice: 'Puck',
223
- instructions: OSBORN_INSTRUCTIONS,
224
- });
225
- // console.log('āœ… Gemini model created with gemini-2.5-flash-native-audio-preview-12-2025')
226
- console.log('āœ… Gemini model created with gemini-3.5-flash-latest');
227
- return model;
228
- }
229
- else {
230
- console.log('šŸ“± Using OpenAI Realtime API');
231
- console.log('šŸ”‘ OPENAI_API_KEY:', process.env.OPENAI_API_KEY ? 'set' : 'NOT SET');
232
- const model = new openai.realtime.RealtimeModel({
233
- voice: 'alloy',
234
- });
235
- console.log('āœ… OpenAI model created');
236
- return model;
237
- }
238
- }
239
- // Helper to get provider from participant metadata
240
- function getProviderFromParticipant(metadata) {
241
- if (!metadata)
242
- return DEFAULT_PROVIDER;
243
- try {
244
- const data = JSON.parse(metadata);
245
- return data.provider || DEFAULT_PROVIDER;
246
- }
247
- catch {
248
- return DEFAULT_PROVIDER;
249
- }
250
- }
251
- // Helper to get coding agent from participant metadata
252
- function getCodingAgentFromParticipant(metadata) {
253
- if (!metadata)
254
- return 'claude';
255
- try {
256
- const data = JSON.parse(metadata);
257
- return data.codingAgent || 'claude';
363
+ TWO AGENTS AVAILABLE:
364
+ - Plan Agent: Research, explore, read files (fast, no permissions needed)
365
+ - Execute Agent: Write code, make changes (asks permission for writes)
366
+
367
+ ${sharedContext.getContextSummary() ? `CONTEXT: ${sharedContext.getContextSummary()}` : ''}
368
+
369
+ PERMISSIONS: When you hear permission request, tell user what needs permission and ask "allow, deny, or always allow?" Then call respond_permission.`;
370
+ const INSTRUCTIONS = getInstructions();
371
+ // Voice agent class
372
+ class OsbornVoiceAgent extends voice.Agent {
373
+ constructor() {
374
+ super({
375
+ instructions: INSTRUCTIONS,
376
+ tools: {
377
+ run_code: runCodeTool,
378
+ respond_permission: respondPermissionTool,
379
+ },
380
+ });
381
+ }
258
382
  }
259
- catch {
260
- return 'claude';
383
+ // Create voice model
384
+ function createModel(provider) {
385
+ if (provider === 'gemini') {
386
+ console.log('šŸ“± Using Gemini Live API');
387
+ return new google.beta.realtime.RealtimeModel({
388
+ model: 'gemini-2.5-flash-native-audio-preview-12-2025',
389
+ voice: 'Puck',
390
+ instructions: INSTRUCTIONS,
391
+ });
392
+ }
393
+ else {
394
+ console.log('šŸ“± Using OpenAI Realtime API');
395
+ return new openai.realtime.RealtimeModel({
396
+ voice: 'alloy',
397
+ });
398
+ }
261
399
  }
262
- }
263
- export default defineAgent({
264
- entry: async (ctx) => {
265
- console.log('šŸš€ Agent starting for room:', ctx.room.name);
266
- // If room code was provided via CLI, validate room name
267
- if (cliArgs.roomCode) {
268
- const expectedRoom = `osborn-${cliArgs.roomCode}`;
269
- if (ctx.room.name !== expectedRoom) {
270
- console.log(`ā­ļø Skipping room ${ctx.room.name} (waiting for ${expectedRoom})`);
271
- return; // Don't handle this room
400
+ // ============================================================
401
+ // Room Event Handlers
402
+ // ============================================================
403
+ room.on(RoomEvent.Connected, () => {
404
+ console.log('āœ… Connected to room:', roomName);
405
+ localParticipant = room.localParticipant;
406
+ });
407
+ room.on(RoomEvent.Disconnected, () => {
408
+ console.log('šŸ‘‹ Disconnected from room');
409
+ currentSession = null;
410
+ });
411
+ room.on(RoomEvent.ParticipantConnected, async (participant) => {
412
+ console.log(`\nšŸ‘¤ User joined: ${participant.identity}`);
413
+ // Get provider from participant metadata
414
+ let provider = defaultProvider;
415
+ let codingAgent = 'claude';
416
+ if (participant.metadata) {
417
+ try {
418
+ const meta = JSON.parse(participant.metadata);
419
+ provider = meta.provider || defaultProvider;
420
+ codingAgent = meta.codingAgent || 'claude';
272
421
  }
273
- console.log(`āœ… Room matches expected: ${expectedRoom}`);
422
+ catch { }
274
423
  }
275
- jobContext = ctx;
276
- // Claude verbose logging
277
- claude.on('tool_use', (tool) => {
278
- console.log(`\nšŸ”§ Claude Tool Started: ${tool.name}`);
279
- if (tool.input) {
280
- const inputStr = JSON.stringify(tool.input).substring(0, 200);
281
- console.log(` Input: ${inputStr}${inputStr.length >= 200 ? '...' : ''}`);
282
- }
283
- });
284
- claude.on('tool_result', (result) => {
285
- console.log(`āœ… Claude Tool Completed: ${result.name || 'unknown'}`);
286
- });
287
- claude.on('text', (text) => {
288
- if (text.length > 0) {
289
- console.log(`šŸ’¬ Claude says: ${text.substring(0, 100)}${text.length > 100 ? '...' : ''}`);
290
- }
291
- });
292
- claude.on('error', (err) => {
293
- console.error(`āŒ Claude Error:`, err);
294
- });
295
- // Connect FIRST so we can wait for participants
296
- console.log('šŸ“” Connecting to room...');
297
- await ctx.connect();
298
- console.log('āœ… Connected to room');
299
- // Wait for a participant to join using LiveKit's built-in method
300
- console.log('ā³ Waiting for participant...');
301
- const participant = await ctx.waitForParticipant();
302
- console.log('šŸ‘¤ Participant joined:', participant.identity);
303
- console.log('šŸ“‹ Participant metadata:', participant.metadata);
304
- const provider = getProviderFromParticipant(participant.metadata);
305
- const codingAgent = getCodingAgentFromParticipant(participant.metadata);
306
- console.log(`šŸŽÆ User selected provider: ${provider}`);
307
- console.log(`šŸ”§ User selected coding agent: ${codingAgent}`);
308
- // Set the current coding agent and initialize if needed
424
+ currentProvider = provider;
309
425
  currentCodingAgent = codingAgent;
426
+ console.log(`šŸŽÆ Provider: ${provider}, Agent: ${codingAgent}`);
310
427
  if (codingAgent === 'codex') {
311
- console.log('šŸ”§ Initializing Codex handler...');
312
- codexHandler = new CodexHandler({
313
- workingDirectory: workingDir,
314
- });
315
- console.log('āœ… Codex handler ready');
428
+ codexHandler = new CodexHandler({ workingDirectory: workingDir });
316
429
  }
317
- // Create model based on user's choice
430
+ // Create voice session
318
431
  const model = createModel(provider);
319
- // Clean up any previous session before creating new one
320
- await cleanupSession();
321
- const session = new voice.AgentSession({
322
- llm: model,
323
- });
432
+ const session = new voice.AgentSession({ llm: model });
324
433
  currentSession = session;
325
- // Add session event listeners for debugging
326
- // Using string literals as AgentSessionEventTypes is not directly exported
327
- session.on('user_state_changed', (ev) => {
328
- console.log(`šŸ‘¤ User state: ${ev.oldState} → ${ev.newState}`);
329
- });
434
+ // Session events
330
435
  session.on('agent_state_changed', (ev) => {
331
- console.log(`šŸ¤– Agent state: ${ev.oldState} → ${ev.newState}`);
436
+ agentState = ev.newState;
437
+ console.log(`šŸ¤– State: ${ev.newState}`);
438
+ if (ev.newState === 'listening' && speechQueue.length > 0) {
439
+ processSpeechQueue();
440
+ }
332
441
  });
333
442
  session.on('user_input_transcribed', (ev) => {
334
- console.log(`šŸ“ Transcribed: "${ev.transcript}" (final: ${ev.isFinal})`);
443
+ console.log(`šŸ“ User: "${ev.transcript}"`);
444
+ });
445
+ session.on('user_state_changed', (ev) => {
446
+ console.log(`šŸ‘¤ User state: ${ev.oldState} → ${ev.newState}`);
335
447
  });
336
448
  session.on('error', (ev) => {
337
449
  console.error('āŒ Session error:', ev.error);
@@ -339,77 +451,108 @@ export default defineAgent({
339
451
  session.on('close', (ev) => {
340
452
  console.log('🚪 Session closed:', ev.reason);
341
453
  });
342
- ctx.room.on('trackSubscribed', (track, publication, p) => {
343
- console.log(`📥 Track subscribed: ${track.kind} from ${p.identity}`);
344
- });
345
- ctx.room.on('participantDisconnected', async (p) => {
346
- console.log(`👋 Participant disconnected: ${p.identity}`);
347
- // Clean up session when user disconnects to prepare for next connection
348
- await cleanupSession();
349
- });
350
- // Listen for data channel messages from frontend
351
- ctx.room.on('dataReceived', async (payload, participant, kind, topic) => {
352
- if (topic === 'user-input') {
454
+ // Start voice session
455
+ console.log('🎬 Starting voice session...');
456
+ const agent = new OsbornVoiceAgent();
457
+ try {
458
+ await session.start({
459
+ agent,
460
+ room,
461
+ });
462
+ console.log('✅ Voice session started!');
463
+ console.log('🎤 Ready - speak to begin!\n');
464
+ // Send ready signal with persistent retry (frontend might not be subscribed yet)
465
+ console.log('💓 Sending agent_ready signal...');
466
+ let readySent = false;
467
+ const sendReady = async () => {
468
+ if (readySent)
469
+ return;
470
+ await sendToFrontend({ type: 'agent_ready', provider, codingAgent });
471
+ };
472
+ // Keep sending every 2 seconds for 20 seconds total
473
+ const readyInterval = setInterval(sendReady, 2000);
474
+ await sendReady();
475
+ setTimeout(() => {
476
+ clearInterval(readyInterval);
477
+ console.log('✅ agent_ready retries complete');
478
+ }, 20000);
479
+ // Mark as sent when user first speaks (no need to keep sending)
480
+ session.on('input_speech_started', () => {
481
+ readySent = true;
482
+ clearInterval(readyInterval);
483
+ });
484
+ console.log('✅ agent_ready sent (with retries scheduled)');
485
+ // Greet user (OpenAI only)
486
+ if (provider !== 'gemini') {
353
487
  try {
354
- const data = JSON.parse(new TextDecoder().decode(payload));
355
- console.log(`📨 Received from frontend:`, data);
356
- if (data.type === 'permission_response') {
357
- // Handle permission response from UI
358
- if (claude.hasPendingPermission()) {
359
- claude.respondToPermission(data.response);
360
- console.log(`✅ Permission ${data.response} from UI`);
361
- }
362
- }
363
- else if (data.type === 'user_text') {
364
- // Handle text input from frontend
365
- console.log(`📝 Text input: "${data.content}"`);
366
- // Inject text into the session as user input
367
- if (currentSession) {
368
- try {
369
- // Interrupt any current speech first
370
- currentSession.interrupt();
371
- // Generate a reply to the text input
372
- await currentSession.generateReply({
373
- userInput: data.content,
374
- });
375
- console.log(`✅ Injected text to session`);
376
- }
377
- catch (err) {
378
- console.error(`❌ Failed to inject text:`, err);
379
- }
380
- }
381
- }
488
+ await session.generateReply({
489
+ userInput: '[Greet the user: "Hey, I\'m Osborn. What are you working on?"]'
490
+ });
382
491
  }
383
- catch (e) {
384
- // Not JSON, ignore
492
+ catch {
493
+ console.log('⚠️ Greeting skipped');
385
494
  }
386
495
  }
496
+ }
497
+ catch (err) {
498
+ console.error('❌ Failed to start session:', err);
499
+ }
500
+ });
501
+ room.on(RoomEvent.ParticipantDisconnected, (participant) => {
502
+ console.log(`👋 User left: ${participant.identity}`);
503
+ if (currentSession) {
504
+ currentSession.removeAllListeners();
505
+ currentSession = null;
506
+ }
507
+ console.log('⏳ Waiting for new user...\n');
508
+ });
509
+ room.on(RoomEvent.DataReceived, async (payload, participant, kind, topic) => {
510
+ if (topic !== 'user-input')
511
+ return;
512
+ try {
513
+ const data = JSON.parse(new TextDecoder().decode(payload));
514
+ console.log('📨 Data:', data.type);
515
+ if (data.type === 'permission_response') {
516
+ const slot = agentPool.find(s => s.handler.hasPendingPermission());
517
+ if (slot) {
518
+ slot.handler.respondToPermission(data.response);
519
+ console.log(`✅ Permission: ${data.response}`);
520
+ }
521
+ }
522
+ else if (data.type === 'user_text' && currentSession) {
523
+ console.log(`📝 Text: "${data.content}"`);
524
+ currentSession.interrupt();
525
+ await currentSession.generateReply({ userInput: data.content });
526
+ }
527
+ }
528
+ catch { }
529
+ });
530
+ // ============================================================
531
+ // Connect to Room
532
+ // ============================================================
533
+ try {
534
+ await room.connect(livekitUrl, jwt, {
535
+ autoSubscribe: true,
536
+ dynacast: true,
387
537
  });
388
- // Create the agent
389
- const agent = new OsbornAssistant();
390
- // Start session
391
- console.log('🎬 Starting voice session...');
392
- const startTime = Date.now();
393
- await session.start({
394
- agent,
395
- room: ctx.room,
396
- });
397
- console.log(`✅ Session started in ${Date.now() - startTime}ms with ${provider.toUpperCase()} + Claude tools`);
398
- console.log('🎤 Ready for voice input! Speak to start.');
399
- },
400
- });
401
- // Configure server options
402
- const serverOptions = {
403
- agent: fileURLToPath(import.meta.url),
404
- };
405
- // If room code is provided, filter to only handle that room
406
- if (cliArgs.roomCode) {
407
- const targetRoom = `osborn-${cliArgs.roomCode}`;
408
- console.log(`🎯 Filtering for room: ${targetRoom}`);
409
- // The agent will be dispatched to rooms matching this pattern
410
- serverOptions.workerOptions = {
411
- // Note: Room filtering is handled by LiveKit dispatch
412
- // For local development, we validate the room in the entry function
413
- };
538
+ // Set localParticipant immediately after connection
539
+ localParticipant = room.localParticipant;
540
+ console.log('✅ Connected to room:', roomName);
541
+ console.log('\n⏳ Waiting for user to connect...');
542
+ console.log(` Room: ${roomCode}\n`);
543
+ // Warm up agents in background
544
+ console.log('🔥 Warming up agents...');
545
+ Promise.all([
546
+ planAgent.handler.run('ready').then(() => console.log('✅ Plan agent ready')),
547
+ executeAgent.handler.run('ready').then(() => console.log('✅ Execute agent ready')),
548
+ ]).catch(() => { });
549
+ // Keep process alive
550
+ await new Promise(() => { });
551
+ }
552
+ catch (err) {
553
+ console.error('❌ Failed to connect:', err);
554
+ process.exit(1);
555
+ }
414
556
  }
415
- cli.runApp(new ServerOptions(serverOptions));
557
+ // Run
558
+ main().catch(console.error);
package/package.json CHANGED
@@ -1,15 +1,16 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.1.2",
3
+ "version": "0.1.6",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "osborn": "./bin/cli.js"
8
8
  },
9
9
  "scripts": {
10
- "dev": "tsx src/index.ts dev",
11
- "start": "tsx src/index.ts start",
12
- "build": "tsc"
10
+ "dev": "tsx src/index.ts",
11
+ "start": "tsx src/index.ts",
12
+ "build": "tsc",
13
+ "room": "tsx src/index.ts --room"
13
14
  },
14
15
  "keywords": [
15
16
  "voice",
@@ -30,8 +31,10 @@
30
31
  "@livekit/agents": "^1.0.0",
31
32
  "@livekit/agents-plugin-google": "^1.0.0",
32
33
  "@livekit/agents-plugin-openai": "^1.0.0",
34
+ "@livekit/rtc-node": "^0.13.22",
33
35
  "@openai/codex-sdk": "^0.77.0",
34
36
  "dotenv": "^16.4.0",
37
+ "livekit-server-sdk": "^2.15.0",
35
38
  "tsx": "^4.0.0",
36
39
  "yaml": "^2.3.0",
37
40
  "zod": "^3.23.0"