osborn 0.1.6 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Voice I/O Module
3
+ * Handles STT (Speech-to-Text), TTS (Text-to-Speech), and Realtime model creation
4
+ *
5
+ * Supports two modes:
6
+ * - Direct mode: STT (Deepgram) → Claude Agent SDK → TTS (Deepgram)
7
+ * - Realtime mode: OpenAI/Gemini native speech-to-speech models
8
+ */
9
+ import * as deepgram from '@livekit/agents-plugin-deepgram';
10
+ import * as google from '@livekit/agents-plugin-google';
11
+ import * as openai from '@livekit/agents-plugin-openai';
12
+ import * as silero from '@livekit/agents-plugin-silero';
13
+ /**
14
+ * Create STT (Speech-to-Text) instance based on config
15
+ * Note: Gemini STT is not available in Node.js, using Deepgram as default
16
+ */
17
/**
 * Build a Speech-to-Text instance for the configured provider.
 *
 * @param {{ provider: string, model?: string, language?: string }} config
 * @returns STT instance from the matching LiveKit plugin
 * @throws {Error} when `config.provider` is not a known STT provider
 */
export function createSTT(config) {
  const { provider } = config;
  if (provider === 'deepgram') {
    return new deepgram.STT({
      model: config.model || 'nova-3',
      language: config.language || 'en',
    });
  }
  if (provider === 'groq-whisper') {
    // Groq-hosted Whisper via the OpenAI plugin's Groq factory.
    return openai.STT.withGroq({
      model: config.model || 'whisper-large-v3-turbo',
    });
  }
  if (provider === 'openai-whisper') {
    return new openai.STT({
      model: config.model || 'whisper-1',
    });
  }
  throw new Error(`Unknown STT provider: ${provider}`);
}
36
+ /**
37
+ * Create TTS (Text-to-Speech) instance based on config
38
+ * Using Gemini TTS as default (cheaper, good quality)
39
+ */
40
/**
 * Build a Text-to-Speech instance for the configured provider.
 *
 * @param {{ provider: string, model?: string, voice?: string }} config
 * @returns TTS instance from the matching LiveKit plugin
 * @throws {Error} when `config.provider` is not a known TTS provider
 */
export function createTTS(config) {
  let instance;
  if (config.provider === 'gemini') {
    // Gemini TTS lives under the google plugin's beta namespace.
    instance = new google.beta.TTS({
      model: config.model || 'gemini-2.5-flash-preview-tts',
      voice: config.voice || 'apollo',
    });
  } else if (config.provider === 'openai') {
    instance = new openai.TTS({
      voice: config.voice || 'alloy',
      model: config.model || 'tts-1',
    });
  } else if (config.provider === 'deepgram') {
    instance = new deepgram.TTS({
      model: config.model || 'aura-asteria-en',
    });
  } else {
    throw new Error(`Unknown TTS provider: ${config.provider}`);
  }
  // Raise the EventEmitter listener cap: active conversations attach many
  // concurrent listeners, and the default of 10 triggers leak warnings.
  if (instance && typeof instance.setMaxListeners === 'function') {
    instance.setMaxListeners(50);
  }
  return instance;
}
71
+ /**
72
+ * Create VAD (Voice Activity Detection) for turn detection
73
+ *
74
+ * Tuned to prevent:
75
+ * - "Audio file is too short" errors from STT (OpenAI requires >= 0.1s)
76
+ * - Split sentences when user pauses briefly mid-speech
77
+ * - False triggers from ambient noise
78
+ */
79
/**
 * Load a Silero VAD (Voice Activity Detection) model for turn detection.
 *
 * Thresholds are tuned to avoid:
 * - "Audio file is too short" STT errors (OpenAI requires >= 0.1s of audio)
 * - splitting a sentence when the speaker pauses briefly
 * - false triggers from ambient noise
 *
 * @returns {Promise} resolves to the loaded silero VAD instance
 */
export async function createVAD() {
  const vadOptions = {
    // Require 0.5s of continuous speech before triggering, so short
    // noises and stray sounds don't start a turn.
    minSpeechDuration: 0.5,
    // Treat speech as finished only after 1.2s of silence; raised from
    // 0.8s so natural mid-sentence pauses don't split utterances.
    minSilenceDuration: 1.2,
    // Pad 0.2s onto the front of each speech chunk for cleaner audio.
    prefixPaddingDuration: 0.2,
    // 0.65 (vs the 0.5 default) makes detection less sensitive to
    // quiet sounds, reducing false positives.
    activationThreshold: 0.65,
  };
  return silero.VAD.load(vadOptions);
}
95
+ /**
96
+ * Default voice I/O configuration
97
+ * Uses Deepgram STT (fast, accurate) + Deepgram TTS (fast, good)
98
+ */
99
/**
 * Default voice I/O configuration for direct (non-realtime) mode.
 * Deepgram for both STT (fast, accurate) and TTS (fast, good quality);
 * consumed by createSTT/createTTS when the caller supplies no overrides.
 */
export const DEFAULT_VOICE_IO_CONFIG = {
  stt: {
    provider: 'deepgram',
    model: 'nova-3',
    language: 'en',
  },
  tts: {
    provider: 'deepgram',
    // NOTE(review): createTTS reads `model` for deepgram, not `voice` —
    // 'aura-asteria-en' here lands in `voice`, so createTTS falls back to
    // its own 'aura-asteria-en' model default; confirm intended shape.
    voice: 'aura-asteria-en',
  },
};
110
+ /**
111
+ * Create Realtime Model for native speech-to-speech
112
+ * Supports OpenAI Realtime API and Gemini Live API
113
+ *
114
+ * Note: Instructions are passed to voice.Agent, not to the RealtimeModel
115
+ */
116
/**
 * Build a native speech-to-speech realtime model.
 * Supports the OpenAI Realtime API and the Gemini Live API.
 *
 * Note: for OpenAI, instructions are passed to voice.Agent rather than
 * to the RealtimeModel; Gemini accepts them at the model level.
 *
 * @param {{ provider?: string, instructions?: string,
 *           openaiModel?: string, openaiVoice?: string,
 *           geminiModel?: string, geminiVoice?: string }} config
 * @returns RealtimeModel instance for the selected provider
 */
export function createRealtimeModel(config) {
  if (config.provider !== 'gemini') {
    console.log('📱 Using OpenAI Realtime API');
    // OpenAI RealtimeModel - instructions go to voice.Agent instead
    return new openai.realtime.RealtimeModel({
      model: config.openaiModel || 'gpt-4o-realtime-preview',
      voice: config.openaiVoice || 'alloy',
    });
  }
  console.log('📱 Using Gemini Live API (realtime)');
  // Note: 12-2025 model has a known bug causing code 1008 crashes during user interruptions
  // with tool calls. No newer model available yet — auto-recovery in index.ts handles this.
  return new google.beta.realtime.RealtimeModel({
    model: config.geminiModel || 'gemini-2.5-flash-native-audio-preview-12-2025',
    voice: config.geminiVoice || 'Puck',
    // Gemini supports instructions at model level
    instructions: config.instructions,
    // Enable transcription so we get text of what the agent says
    inputAudioTranscription: {},
    outputAudioTranscription: {},
  });
}
140
+ /**
141
+ * Create realtime model from config
142
+ */
143
/**
 * Adapter: translate a realtime section of the app config into the
 * argument shape createRealtimeModel expects.
 *
 * @param {object} realtimeConfig - provider/model/voice settings
 * @param {string} instructions - system instructions for the agent
 * @returns RealtimeModel instance (see createRealtimeModel)
 */
export function createRealtimeModelFromConfig(realtimeConfig, instructions) {
  const modelConfig = {
    // Default to OpenAI when no provider is specified.
    provider: realtimeConfig.provider || 'openai',
    openaiModel: realtimeConfig.openaiModel,
    openaiVoice: realtimeConfig.openaiVoice,
    geminiModel: realtimeConfig.geminiModel,
    geminiVoice: realtimeConfig.geminiVoice,
    instructions,
  };
  return createRealtimeModel(modelConfig);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.1.6",
3
+ "version": "0.5.3",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -28,20 +28,31 @@
28
28
  "license": "MIT",
29
29
  "dependencies": {
30
30
  "@anthropic-ai/claude-agent-sdk": "^0.1.74",
31
- "@livekit/agents": "^1.0.0",
32
- "@livekit/agents-plugin-google": "^1.0.0",
33
- "@livekit/agents-plugin-openai": "^1.0.0",
34
- "@livekit/rtc-node": "^0.13.22",
31
+ "@anthropic-ai/sdk": "^0.52.0",
32
+ "@google/genai": "^1.0.0",
33
+ "@livekit/agents": "^1.0.45",
34
+ "@livekit/agents-plugin-deepgram": "^1.0.45",
35
+ "@livekit/agents-plugin-elevenlabs": "^1.0.45",
36
+ "@livekit/agents-plugin-google": "^1.0.45",
37
+ "@livekit/agents-plugin-openai": "^1.0.45",
38
+ "@livekit/agents-plugin-silero": "^1.0.45",
39
+ "@livekit/rtc-node": "^0.13.24",
40
+ "@modelcontextprotocol/sdk": "^1.26.0",
35
41
  "@openai/codex-sdk": "^0.77.0",
42
+ "@smithery/api": "^0.48.0",
36
43
  "dotenv": "^16.4.0",
37
44
  "livekit-server-sdk": "^2.15.0",
45
+ "node-pty": "^1.1.0",
38
46
  "tsx": "^4.0.0",
47
+ "ws": "^8.19.0",
39
48
  "yaml": "^2.3.0",
40
49
  "zod": "^3.23.0"
41
50
  },
42
51
  "devDependencies": {
43
52
  "@types/node": "^20.0.0",
44
- "typescript": "^5.0.0"
53
+ "@vitest/coverage-v8": "^4.0.18",
54
+ "typescript": "^5.0.0",
55
+ "vitest": "^4.0.18"
45
56
  },
46
57
  "engines": {
47
58
  "node": ">=18.0.0"