osborn 0.1.6 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Voice I/O Module
3
+ * Handles STT (Speech-to-Text), TTS (Text-to-Speech), and Realtime model creation
4
+ *
5
+ * Supports two modes:
6
+ * - Direct mode: STT (Deepgram) → Claude Agent SDK → TTS (Deepgram)
7
+ * - Realtime mode: OpenAI/Gemini native speech-to-speech models
8
+ */
9
+ import * as deepgram from '@livekit/agents-plugin-deepgram';
10
+ import * as google from '@livekit/agents-plugin-google';
11
+ import * as openai from '@livekit/agents-plugin-openai';
12
+ import * as silero from '@livekit/agents-plugin-silero';
13
+ /**
14
+ * Create STT (Speech-to-Text) instance based on config
15
+ * Note: Gemini STT is not available in Node.js, using Deepgram as default
16
+ */
17
/**
 * Build a Speech-to-Text instance for the configured provider.
 *
 * @param {{ provider: string, model?: string, language?: string }} config
 * @returns STT instance from the matching LiveKit plugin
 * @throws {Error} when `config.provider` is not a known STT provider
 */
export function createSTT(config) {
  const { provider } = config;
  if (provider === 'deepgram') {
    return new deepgram.STT({
      model: config.model || 'nova-3',
      language: config.language || 'en',
    });
  }
  if (provider === 'groq-whisper') {
    // Groq-hosted Whisper via the OpenAI plugin's Groq factory.
    return openai.STT.withGroq({
      model: config.model || 'whisper-large-v3-turbo',
    });
  }
  if (provider === 'openai-whisper') {
    return new openai.STT({
      model: config.model || 'whisper-1',
    });
  }
  throw new Error(`Unknown STT provider: ${provider}`);
}
36
+ /**
37
+ * Create TTS (Text-to-Speech) instance based on config
38
+ * Using Gemini TTS as default (cheaper, good quality)
39
+ */
40
/**
 * Build a Text-to-Speech instance for the configured provider.
 *
 * @param {{ provider: string, model?: string, voice?: string }} config
 * @returns TTS instance from the matching LiveKit plugin
 * @throws {Error} when `config.provider` is not a known TTS provider
 */
export function createTTS(config) {
  let instance;
  if (config.provider === 'gemini') {
    // Gemini TTS lives under the google plugin's beta namespace.
    instance = new google.beta.TTS({
      model: config.model || 'gemini-2.5-flash-preview-tts',
      voice: config.voice || 'apollo',
    });
  } else if (config.provider === 'openai') {
    instance = new openai.TTS({
      voice: config.voice || 'alloy',
      model: config.model || 'tts-1',
    });
  } else if (config.provider === 'deepgram') {
    instance = new deepgram.TTS({
      model: config.model || 'aura-asteria-en',
    });
  } else {
    throw new Error(`Unknown TTS provider: ${config.provider}`);
  }
  // Raise the EventEmitter listener cap: active conversations attach many
  // concurrent listeners, and the default of 10 triggers leak warnings.
  if (instance && typeof instance.setMaxListeners === 'function') {
    instance.setMaxListeners(50);
  }
  return instance;
}
71
+ /**
72
+ * Create VAD (Voice Activity Detection) for turn detection
73
+ *
74
+ * Tuned to prevent:
75
+ * - "Audio file is too short" errors from STT (OpenAI requires >= 0.1s)
76
+ * - Split sentences when user pauses briefly mid-speech
77
+ * - False triggers from ambient noise
78
+ */
79
/**
 * Load a Silero VAD (Voice Activity Detection) model for turn detection.
 *
 * Thresholds are tuned to avoid:
 * - "Audio file is too short" STT errors (OpenAI requires >= 0.1s of audio)
 * - splitting a sentence when the speaker pauses briefly
 * - false triggers from ambient noise
 *
 * @returns {Promise} resolves to the loaded silero VAD instance
 */
export async function createVAD() {
  const vadOptions = {
    // Require 0.5s of continuous speech before triggering, so short
    // noises and stray sounds don't start a turn.
    minSpeechDuration: 0.5,
    // Treat speech as finished only after 1.2s of silence; raised from
    // 0.8s so natural mid-sentence pauses don't split utterances.
    minSilenceDuration: 1.2,
    // Pad 0.2s onto the front of each speech chunk for cleaner audio.
    prefixPaddingDuration: 0.2,
    // 0.65 (vs the 0.5 default) makes detection less sensitive to
    // quiet sounds, reducing false positives.
    activationThreshold: 0.65,
  };
  return silero.VAD.load(vadOptions);
}
95
+ /**
96
+ * Default voice I/O configuration
97
+ * Uses Deepgram STT (fast, accurate) + Deepgram TTS (fast, good)
98
+ */
99
/**
 * Default voice I/O configuration for direct (non-realtime) mode.
 * Deepgram for both STT (fast, accurate) and TTS (fast, good quality);
 * consumed by createSTT/createTTS when the caller supplies no overrides.
 */
export const DEFAULT_VOICE_IO_CONFIG = {
  stt: {
    provider: 'deepgram',
    model: 'nova-3',
    language: 'en',
  },
  tts: {
    provider: 'deepgram',
    // NOTE(review): createTTS reads `model` for deepgram, not `voice` —
    // 'aura-asteria-en' here lands in `voice`, so createTTS falls back to
    // its own 'aura-asteria-en' model default; confirm intended shape.
    voice: 'aura-asteria-en',
  },
};
110
+ /**
111
+ * Create Realtime Model for native speech-to-speech
112
+ * Supports OpenAI Realtime API and Gemini Live API
113
+ *
114
+ * Note: Instructions are passed to voice.Agent, not to the RealtimeModel
115
+ */
116
/**
 * Build a native speech-to-speech realtime model.
 * Supports the OpenAI Realtime API and the Gemini Live API.
 *
 * Note: for OpenAI, instructions are passed to voice.Agent rather than
 * to the RealtimeModel; Gemini accepts them at the model level.
 *
 * @param {{ provider?: string, instructions?: string,
 *           openaiModel?: string, openaiVoice?: string,
 *           geminiModel?: string, geminiVoice?: string }} config
 * @returns RealtimeModel instance for the selected provider
 */
export function createRealtimeModel(config) {
  if (config.provider !== 'gemini') {
    console.log('📱 Using OpenAI Realtime API');
    // OpenAI RealtimeModel - instructions go to voice.Agent instead
    return new openai.realtime.RealtimeModel({
      model: config.openaiModel || 'gpt-4o-realtime-preview',
      voice: config.openaiVoice || 'alloy',
    });
  }
  console.log('📱 Using Gemini Live API (realtime)');
  // Note: 12-2025 model has a known bug causing code 1008 crashes during user interruptions
  // with tool calls. No newer model available yet — auto-recovery in index.ts handles this.
  return new google.beta.realtime.RealtimeModel({
    model: config.geminiModel || 'gemini-2.5-flash-native-audio-preview-12-2025',
    voice: config.geminiVoice || 'Puck',
    // Gemini supports instructions at model level
    instructions: config.instructions,
    // Enable transcription so we get text of what the agent says
    inputAudioTranscription: {},
    outputAudioTranscription: {},
  });
}
140
+ /**
141
+ * Create realtime model from config
142
+ */
143
/**
 * Adapter: translate a realtime section of the app config into the
 * argument shape createRealtimeModel expects.
 *
 * @param {object} realtimeConfig - provider/model/voice settings
 * @param {string} instructions - system instructions for the agent
 * @returns RealtimeModel instance (see createRealtimeModel)
 */
export function createRealtimeModelFromConfig(realtimeConfig, instructions) {
  const modelConfig = {
    // Default to OpenAI when no provider is specified.
    provider: realtimeConfig.provider || 'openai',
    openaiModel: realtimeConfig.openaiModel,
    openaiVoice: realtimeConfig.openaiVoice,
    geminiModel: realtimeConfig.geminiModel,
    geminiVoice: realtimeConfig.geminiVoice,
    instructions,
  };
  return createRealtimeModel(modelConfig);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.1.6",
3
+ "version": "0.5.3",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -28,20 +28,31 @@
28
28
  "license": "MIT",
29
29
  "dependencies": {
30
30
  "@anthropic-ai/claude-agent-sdk": "^0.1.74",
31
- "@livekit/agents": "^1.0.0",
32
- "@livekit/agents-plugin-google": "^1.0.0",
33
- "@livekit/agents-plugin-openai": "^1.0.0",
34
- "@livekit/rtc-node": "^0.13.22",
31
+ "@anthropic-ai/sdk": "^0.52.0",
32
+ "@google/genai": "^1.0.0",
33
+ "@livekit/agents": "^1.0.45",
34
+ "@livekit/agents-plugin-deepgram": "^1.0.45",
35
+ "@livekit/agents-plugin-elevenlabs": "^1.0.45",
36
+ "@livekit/agents-plugin-google": "^1.0.45",
37
+ "@livekit/agents-plugin-openai": "^1.0.45",
38
+ "@livekit/agents-plugin-silero": "^1.0.45",
39
+ "@livekit/rtc-node": "^0.13.24",
40
+ "@modelcontextprotocol/sdk": "^1.26.0",
35
41
  "@openai/codex-sdk": "^0.77.0",
42
+ "@smithery/api": "^0.48.0",
36
43
  "dotenv": "^16.4.0",
37
44
  "livekit-server-sdk": "^2.15.0",
45
+ "node-pty": "^1.1.0",
38
46
  "tsx": "^4.0.0",
47
+ "ws": "^8.19.0",
39
48
  "yaml": "^2.3.0",
40
49
  "zod": "^3.23.0"
41
50
  },
42
51
  "devDependencies": {
43
52
  "@types/node": "^20.0.0",
44
- "typescript": "^5.0.0"
53
+ "@vitest/coverage-v8": "^4.0.18",
54
+ "typescript": "^5.0.0",
55
+ "vitest": "^4.0.18"
45
56
  },
46
57
  "engines": {
47
58
  "node": ">=18.0.0"