osborn 0.5.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/.claude/settings.local.json +9 -0
  2. package/.claude/skills/markdown-to-pdf/SKILL.md +29 -0
  3. package/.claude/skills/pdf-to-markdown/SKILL.md +28 -0
  4. package/.claude/skills/playwright-browser/SKILL.md +90 -0
  5. package/.claude/skills/shadcn/SKILL.md +232 -0
  6. package/.claude/skills/shadcn/image.png +0 -0
  7. package/.claude/skills/youtube-transcript/SKILL.md +24 -0
  8. package/.dockerignore +13 -0
  9. package/Dockerfile +103 -0
  10. package/deploy.sh +70 -0
  11. package/dist/claude-auth.d.ts +60 -0
  12. package/dist/claude-auth.js +334 -0
  13. package/dist/claude-llm.d.ts +51 -2
  14. package/dist/claude-llm.js +619 -86
  15. package/dist/config.d.ts +5 -1
  16. package/dist/config.js +4 -1
  17. package/dist/fast-brain.d.ts +70 -16
  18. package/dist/fast-brain.js +662 -99
  19. package/dist/index-3-2-26-legacy.d.ts +1 -0
  20. package/dist/index-3-2-26-legacy.js +2233 -0
  21. package/dist/index.js +979 -429
  22. package/dist/jsonl-search.d.ts +66 -0
  23. package/dist/jsonl-search.js +274 -0
  24. package/dist/leagcyprompts2.d.ts +0 -0
  25. package/dist/leagcyprompts2.js +573 -0
  26. package/dist/pipeline-direct-llm.d.ts +77 -0
  27. package/dist/pipeline-direct-llm.js +221 -0
  28. package/dist/pipeline-fastbrain.d.ts +45 -0
  29. package/dist/pipeline-fastbrain.js +373 -0
  30. package/dist/prompts-2-25-26.d.ts +0 -0
  31. package/dist/prompts-2-25-26.js +518 -0
  32. package/dist/prompts-3-2-26.d.ts +78 -0
  33. package/dist/prompts-3-2-26.js +1319 -0
  34. package/dist/prompts.d.ts +83 -12
  35. package/dist/prompts.js +2064 -587
  36. package/dist/recall-client.d.ts +33 -0
  37. package/dist/recall-client.js +101 -0
  38. package/dist/session-access.d.ts +24 -0
  39. package/dist/session-access.js +74 -0
  40. package/dist/summary-index.d.ts +87 -0
  41. package/dist/summary-index.js +570 -0
  42. package/dist/turn-detector-shim.d.ts +24 -0
  43. package/dist/turn-detector-shim.js +83 -0
  44. package/dist/voice-io.d.ts +15 -5
  45. package/dist/voice-io.js +52 -20
  46. package/fly.toml +30 -0
  47. package/package.json +18 -13
package/dist/voice-io.d.ts CHANGED
@@ -12,12 +12,16 @@ import * as openai from '@livekit/agents-plugin-openai';
12
12
  import * as silero from '@livekit/agents-plugin-silero';
13
13
  import type { RealtimeConfig } from './config.js';
14
14
  export interface STTConfig {
15
- provider: 'deepgram' | 'groq-whisper' | 'openai-whisper';
15
+ provider: 'deepgram' | 'deepgram-flux' | 'groq-whisper' | 'openai-whisper';
16
16
  model?: string;
17
17
  language?: string;
18
+ /** Deepgram Flux: end-of-turn confidence threshold (0.0-1.0; createSTT falls back to 0.85 when unset) */
19
+ eotThreshold?: number;
20
+ /** Deepgram Flux: max ms to wait before forcing turn end (default 3000) */
21
+ eotTimeoutMs?: number;
18
22
  }
19
23
  export interface TTSConfig {
20
- provider: 'gemini' | 'openai' | 'elevenlabs' | 'deepgram';
24
+ provider: 'gemini' | 'openai' | 'elevenlabs' | 'deepgram' | 'groq-orpheus';
21
25
  voice?: string;
22
26
  model?: string;
23
27
  }
@@ -29,7 +33,7 @@ export interface VoiceIOConfig {
29
33
  * Create STT (Speech-to-Text) instance based on config
30
34
  * Note: Gemini STT is not available in Node.js, using Deepgram as default
31
35
  */
32
- export declare function createSTT(config: STTConfig): deepgram.STT | openai.STT;
36
+ export declare function createSTT(config: STTConfig): deepgram.STT | deepgram.STTv2 | openai.STT;
33
37
  /**
34
38
  * Create TTS (Text-to-Speech) instance based on config
35
39
  * Using Gemini TTS as default (cheaper, good quality)
@@ -45,15 +49,21 @@ export declare function createTTS(config: TTSConfig): any;
45
49
  */
46
50
  export declare function createVAD(): Promise<silero.VAD>;
47
51
  /**
48
- * Default voice I/O configuration
52
+ * Default voice I/O configuration (used by realtime mode fallback)
49
53
  * Uses Deepgram STT (fast, accurate) + Deepgram TTS (fast, good)
50
54
  */
51
55
  export declare const DEFAULT_VOICE_IO_CONFIG: VoiceIOConfig;
56
+ /**
57
+ * Direct mode voice config — centralized here for easy provider swapping.
58
+ * To switch providers: comment out the active line, uncomment the alternative.
59
+ */
60
+ export declare const DIRECT_MODE_STT: STTConfig;
61
+ export declare const DIRECT_MODE_TTS: TTSConfig;
52
62
  export interface RealtimeModelConfig {
53
63
  provider: 'openai' | 'gemini';
54
64
  openaiVoice?: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
55
65
  openaiModel?: string;
56
- geminiVoice?: 'Puck' | 'Charon' | 'Kore' | 'Fenrir' | 'Aoede';
66
+ geminiVoice?: 'Charon' | 'Puck' | 'Kore' | 'Fenrir' | 'Aoede';
57
67
  geminiModel?: string;
58
68
  instructions?: string;
59
69
  }
package/dist/voice-io.js CHANGED
@@ -20,6 +20,19 @@ export function createSTT(config) {
20
20
  return new deepgram.STT({
21
21
  model: (config.model || 'nova-3'),
22
22
  language: config.language || 'en',
23
+ endpointing: 550, // Wait 550ms of silence before committing final transcript (default 25ms causes mid-sentence fragments)
24
+ });
25
+ case 'deepgram-flux':
26
+ // Deepgram Flux (V2 API) — semantic turn detection via ML model
27
+ // Uses TurnInfo events: StartOfTurn, Update, EagerEndOfTurn, TurnResumed, EndOfTurn
28
+ // TurnResumed prevents premature commits when user pauses mid-sentence
29
+ // All processing is server-side (Deepgram), zero local CPU
30
+ console.log(`🎙️ Using Deepgram Flux STT (semantic turn detection, eotThreshold=${config.eotThreshold ?? 0.85}, eotTimeoutMs=${config.eotTimeoutMs ?? 3000})`);
31
+ return new deepgram.STTv2({
32
+ model: config.model || 'flux-general-en',
33
+ language: config.language || 'en',
34
+ eotThreshold: config.eotThreshold ?? 0.85, // 0.0-1.0 — lower = more aggressive turn detection (more false positives), higher = more lenient (more false negatives)
35
+ eotTimeoutMs: config.eotTimeoutMs ?? 3000, // Max ms to wait before forcing end of turn — helps when user pauses for a long time mid-sentence (default 3000ms)
23
36
  });
24
37
  case 'groq-whisper':
25
38
  return openai.STT.withGroq({
@@ -55,7 +68,17 @@ export function createTTS(config) {
55
68
  break;
56
69
  case 'deepgram':
57
70
  tts = new deepgram.TTS({
58
- model: (config.model || 'aura-asteria-en'),
71
+ model: (config.model || 'aura-2-asteria-en'),
72
+ });
73
+ break;
74
+ case 'groq-orpheus':
75
+ // Groq Orpheus TTS via OpenAI-compatible API ($22/M chars)
76
+ // Voices: autumn, diana, hannah, austin, daniel, troy
77
+ tts = new openai.TTS({
78
+ model: config.model || 'canopylabs/orpheus-v1-english',
79
+ voice: config.voice || 'autumn',
80
+ apiKey: process.env.GROQ_API_KEY,
81
+ baseURL: 'https://api.groq.com/openai/v1',
59
82
  });
60
83
  break;
61
84
  default:
@@ -64,7 +87,7 @@ export function createTTS(config) {
64
87
  // Increase max listeners to prevent memory leak warnings
65
88
  // TTS instances can have many concurrent listeners during active conversations
66
89
  if (tts && typeof tts.setMaxListeners === 'function') {
67
- tts.setMaxListeners(50);
90
+ tts.setMaxListeners(100);
68
91
  }
69
92
  return tts;
70
93
  }
@@ -77,23 +100,17 @@ export function createTTS(config) {
77
100
  * - False triggers from ambient noise
78
101
  */
79
102
  export async function createVAD() {
103
+ // VAD now only handles interruption detection — turn detection moved to Deepgram STT (server-side).
104
+ // Lighter settings = less local CPU from ONNX inference.
80
105
  return silero.VAD.load({
81
- // Minimum 0.5s speech before triggering - prevents noise/short sounds
82
- // Higher value = more complete utterances before processing
83
- minSpeechDuration: 0.5,
84
- // Wait 1.2s of silence before considering speech "done"
85
- // Allows natural pauses mid-sentence without triggering STT
86
- // (increased from 0.8s to reduce sentence splitting)
87
- minSilenceDuration: 1.2,
88
- // Add 0.2s padding to start of speech chunks for cleaner audio
89
- prefixPaddingDuration: 0.2,
90
- // Higher threshold = less sensitive to quiet sounds/noise
91
- // Default is 0.5, using 0.65 to reduce false positives
92
- activationThreshold: 0.65,
106
+ minSpeechDuration: 0.4, // 400ms quick interruption detection
107
+ minSilenceDuration: 1.2, // wait 1200ms of silence before treating speech as finished
108
+ prefixPaddingDuration: 0.1,
109
+ activationThreshold: 0.85, // stricter than the 0.5 default: VAD is used for interruption detection only
93
110
  });
94
111
  }
95
112
  /**
96
- * Default voice I/O configuration
113
+ * Default voice I/O configuration (used by realtime mode fallback)
97
114
  * Uses Deepgram STT (fast, accurate) + Deepgram TTS (fast, good)
98
115
  */
99
116
  export const DEFAULT_VOICE_IO_CONFIG = {
@@ -104,9 +121,25 @@ export const DEFAULT_VOICE_IO_CONFIG = {
104
121
  },
105
122
  tts: {
106
123
  provider: 'deepgram',
107
- voice: 'aura-asteria-en',
124
+ voice: 'aura-2-asteria-en',
108
125
  },
109
126
  };
127
+ /**
128
+ * Direct mode voice config — centralized here for easy provider swapping.
129
+ * To switch providers: comment out the active line, uncomment the alternative.
130
+ */
131
+ export const DIRECT_MODE_STT = {
132
+ // provider: 'groq-whisper', model: 'whisper-large-v3-turbo', // Batch — needs VAD
133
+ // provider: 'openai-whisper', model: 'whisper-1', // Batch — needs VAD
134
+ // provider: 'deepgram', model: 'nova-3', language: 'en', // Streaming, silence-based endpointing
135
+ provider: 'deepgram-flux', model: 'flux-general-en', language: 'en', // Streaming, ML-based turn detection (requires Deepgram V2 access)
136
+ };
137
+ export const DIRECT_MODE_TTS = {
138
+ // provider: 'deepgram', model: 'aura-2-asteria-en', // WebSocket-based: handles TTS abort cleanly (no unrecoverable crash on interruption)
139
+ // provider: 'gemini', model: 'gemini-2.5-flash-preview-tts', voice: 'apollo',
140
+ provider: 'openai', model: 'tts-1', voice: 'fable', // HTTP streaming: throws APIUserAbortError on interrupt → unrecoverable session crash
141
+ // provider: 'groq-orpheus', model: 'canopylabs/orpheus-v1-english', voice: 'autumn', // $22/M chars — voices: autumn, diana, hannah, austin, daniel, troy
142
+ };
110
143
  /**
111
144
  * Create Realtime Model for native speech-to-speech
112
145
  * Supports OpenAI Realtime API and Gemini Live API
@@ -116,11 +149,10 @@ export const DEFAULT_VOICE_IO_CONFIG = {
116
149
  export function createRealtimeModel(config) {
117
150
  if (config.provider === 'gemini') {
118
151
  console.log('📱 Using Gemini Live API (realtime)');
119
- // Note: 12-2025 model has a known bug causing code 1008 crashes during user interruptions
120
- // with tool calls. No newer model available yet — auto-recovery in index.ts handles this.
152
+ // Using 'latest' alias — 12-2025 had a known 1008 crash bug during interruptions + tool calls
121
153
  return new google.beta.realtime.RealtimeModel({
122
- model: config.geminiModel || 'gemini-2.5-flash-native-audio-preview-12-2025',
123
- voice: config.geminiVoice || 'Puck',
154
+ model: config.geminiModel || 'gemini-2.5-flash-native-audio-latest',
155
+ voice: config.geminiVoice || 'Charon',
124
156
  // Gemini supports instructions at model level
125
157
  instructions: config.instructions,
126
158
  // Enable transcription so we get text of what the agent says
package/fly.toml ADDED
@@ -0,0 +1,30 @@
1
+ # Osborn Agent — Fly.io Configuration
2
+ # Change app name per user: osborn-agent-<username>
3
+ # Deploy: fly deploy
4
+
5
+ app = "osborn-agent"
6
+ primary_region = "ewr"
7
+
8
+ [build]
9
+
10
+ [env]
11
+ NODE_ENV = "production"
12
+ PORT = "8741"
13
+ HOST = "0.0.0.0"
14
+ OSBORN_CWD = "/workspace"
15
+
16
+ [http_service]
17
+ internal_port = 8741
18
+ force_https = true
19
+ auto_stop_machines = "off"
20
+ auto_start_machines = true
21
+ min_machines_running = 1
22
+
23
+ [mounts]
24
+ source = "workspace"
25
+ destination = "/workspace"
26
+
27
+ [[vm]]
28
+ memory = "1gb"
29
+ cpu_kind = "shared"
30
+ cpus = 1
package/package.json CHANGED
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.5.3",
3
+ "version": "0.8.0",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
7
- "osborn": "./bin/cli.js"
7
+ "osborn": "bin/cli.js"
8
8
  },
9
9
  "scripts": {
10
10
  "dev": "tsx src/index.ts",
@@ -22,31 +22,36 @@
22
22
  ],
23
23
  "repository": {
24
24
  "type": "git",
25
- "url": "https://github.com/Erriccc/osborn.git"
25
+ "url": "git+https://github.com/Erriccc/osborn.git"
26
26
  },
27
27
  "author": "Osborn Ojure",
28
28
  "license": "MIT",
29
29
  "dependencies": {
30
- "@anthropic-ai/claude-agent-sdk": "^0.1.74",
31
- "@anthropic-ai/sdk": "^0.52.0",
30
+ "@anthropic-ai/claude-agent-sdk": "^0.2.91",
31
+ "@anthropic-ai/sdk": "^0.80.0",
32
32
  "@google/genai": "^1.0.0",
33
- "@livekit/agents": "^1.0.45",
34
- "@livekit/agents-plugin-deepgram": "^1.0.45",
35
- "@livekit/agents-plugin-elevenlabs": "^1.0.45",
36
- "@livekit/agents-plugin-google": "^1.0.45",
37
- "@livekit/agents-plugin-openai": "^1.0.45",
38
- "@livekit/agents-plugin-silero": "^1.0.45",
33
+ "@livekit/agents": "^1.2.1",
34
+ "@livekit/agents-plugin-deepgram": "^1.2.1",
35
+ "@livekit/agents-plugin-elevenlabs": "^1.2.1",
36
+ "@livekit/agents-plugin-google": "^1.2.1",
37
+ "@livekit/agents-plugin-livekit": "^1.2.1",
38
+ "@livekit/agents-plugin-openai": "^1.2.1",
39
+ "@livekit/agents-plugin-silero": "^1.2.1",
39
40
  "@livekit/rtc-node": "^0.13.24",
40
- "@modelcontextprotocol/sdk": "^1.26.0",
41
+ "@modelcontextprotocol/sdk": "^1.29.0",
41
42
  "@openai/codex-sdk": "^0.77.0",
42
43
  "@smithery/api": "^0.48.0",
44
+ "@types/diff": "^8.0.0",
45
+ "@vscode/ripgrep": "^1.17.1",
46
+ "diff": "^8.0.4",
43
47
  "dotenv": "^16.4.0",
44
48
  "livekit-server-sdk": "^2.15.0",
49
+ "minisearch": "^7.2.0",
45
50
  "node-pty": "^1.1.0",
46
51
  "tsx": "^4.0.0",
47
52
  "ws": "^8.19.0",
48
53
  "yaml": "^2.3.0",
49
- "zod": "^3.23.0"
54
+ "zod": "^4.3.6"
50
55
  },
51
56
  "devDependencies": {
52
57
  "@types/node": "^20.0.0",