osborn 0.9.32 → 0.9.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,6 +10,7 @@ initializeLogger({ pretty: true, level: 'info' });
10
10
  import { setMaxListeners } from 'node:events';
11
11
  setMaxListeners(50);
12
12
  import { createServer } from 'http';
13
+ import { WebSocket, WebSocketServer } from 'ws';
13
14
  import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
14
15
  import { dirname, join } from 'node:path';
15
16
  import { fileURLToPath } from 'node:url';
@@ -146,6 +147,73 @@ process.on('uncaughtException', (error) => {
146
147
  // ============================================================
147
148
  // Module-level room code so the HTTP server can expose it via GET /room-code
148
149
  let currentRoomCode = null;
150
// Meeting output WebSocket — module-level so both startApiServer and main() can access it
let meetingOutputWs = null;
/**
 * Best-effort push of a JSON message to the connected meeting output page.
 *
 * The message is dropped when no meeting socket is registered or the socket
 * is not in the OPEN state. Serialization and send failures are logged as
 * warnings and never thrown, so callers can fire-and-forget.
 *
 * @param {unknown} msg - JSON-serializable payload, e.g. `{ type: 'speak', text }`.
 * @returns {boolean} `true` if the payload was handed to the socket, `false` if it was dropped.
 */
function sendToMeetingOutput(msg) {
    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN) {
        return false;
    }
    try {
        // JSON.stringify can throw (cyclic structures, BigInt) and send() can
        // throw if the socket closes underneath us; neither may reach the caller.
        meetingOutputWs.send(JSON.stringify(msg));
        return true;
    }
    catch (err) {
        console.warn('⚠️ Meeting output send failed:', err instanceof Error ? err.message : err);
        return false;
    }
}
160
/**
 * Encode chunks of 16-bit PCM samples as one WAV (RIFF) buffer.
 *
 * Writes the 44-byte header (format tag 1 = PCM, 16 bits per sample) followed
 * by every sample in little-endian order. Samples are written straight from
 * the chunks into the output buffer, so no intermediate concatenated copy is
 * made.
 *
 * @param {ArrayLike<number>[]} chunks - Sample chunks, in playback order.
 * @param {number} sampleRate - Samples per second, written to the header.
 * @param {number} numChannels - Channel count, written to the header.
 * @returns {Buffer} Header plus sample data.
 */
function encodePcm16Wav(chunks, sampleRate, numChannels) {
    const HEADER_BYTES = 44;
    const BYTES_PER_SAMPLE = 2;
    const totalSamples = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const dataBytes = totalSamples * BYTES_PER_SAMPLE;
    const wav = Buffer.alloc(HEADER_BYTES + dataBytes);
    wav.write('RIFF', 0);
    wav.writeUInt32LE(36 + dataBytes, 4); // RIFF size: everything after the 8-byte RIFF preamble
    wav.write('WAVE', 8);
    wav.write('fmt ', 12);
    wav.writeUInt32LE(16, 16); // size of the fmt chunk body
    wav.writeUInt16LE(1, 20); // format tag 1 = uncompressed PCM
    wav.writeUInt16LE(numChannels, 22);
    wav.writeUInt32LE(sampleRate, 24);
    wav.writeUInt32LE(sampleRate * numChannels * BYTES_PER_SAMPLE, 28); // byte rate
    wav.writeUInt16LE(numChannels * BYTES_PER_SAMPLE, 32); // block align
    wav.writeUInt16LE(16, 34); // bits per sample
    wav.write('data', 36);
    wav.writeUInt32LE(dataBytes, 40);
    // DataView.setInt16 wraps values to 16 bits the same way an Int16Array
    // store does, so out-of-range inputs behave as they did when samples were
    // first copied into an Int16Array.
    const view = new DataView(wav.buffer, wav.byteOffset, wav.byteLength);
    let offset = HEADER_BYTES;
    for (const chunk of chunks) {
        for (let i = 0; i < chunk.length; i++) {
            view.setInt16(offset, chunk[i], true);
            offset += BYTES_PER_SAMPLE;
        }
    }
    return wav;
}
/**
 * Synthesize `text` with a TTS instance built from `ttsConfig`, WAV-encode the
 * collected audio frames, and push the result to the meeting output socket as
 * one binary message.
 *
 * Does nothing when the meeting socket is missing or not open. The socket is
 * re-checked after synthesis because it may have closed while audio was being
 * generated. The TTS instance is always closed, and a failing close is logged
 * rather than thrown so it cannot mask an error from synthesis itself.
 *
 * @param {string} text - Text to speak.
 * @param {object} ttsConfig - Passed unchanged to `createTTS`.
 * @returns {Promise<void>}
 */
async function synthesizeForMeeting(text, ttsConfig) {
    const isOutputOpen = () => Boolean(meetingOutputWs) && meetingOutputWs.readyState === WebSocket.OPEN;
    if (!isOutputOpen())
        return;
    const ttsInstance = createTTS(ttsConfig);
    try {
        const chunks = [];
        // Defaults apply until a frame reports its own format.
        let sampleRate = 24000;
        let numChannels = 1;
        for await (const event of ttsInstance.synthesize(text)) {
            if (event === Symbol.for('END_OF_STREAM'))
                break;
            const frame = event?.frame;
            if (frame?.data) {
                // NOTE(review): assumes frame.data holds 16-bit PCM samples — confirm against the TTS provider.
                chunks.push(frame.data);
                sampleRate = frame.sampleRate ?? sampleRate;
                numChannels = frame.numChannels ?? numChannels;
            }
        }
        if (chunks.length === 0)
            return;
        const wav = encodePcm16Wav(chunks, sampleRate, numChannels);
        if (isOutputOpen()) {
            meetingOutputWs.send(wav);
            console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
        }
    }
    finally {
        try {
            // Awaiting inside try/catch also covers a close() that returns a
            // non-promise, where chaining .catch() would itself throw.
            await ttsInstance.close();
        }
        catch (err) {
            console.warn('⚠️ Meeting TTS close failed:', err instanceof Error ? err.message : err);
        }
    }
}
149
217
  function startApiServer(workingDir, port) {
150
218
  const server = createServer(async (req, res) => {
151
219
  // CORS headers for cloud frontend
@@ -891,6 +959,32 @@ function startApiServer(workingDir, port) {
891
959
  };
892
960
  cleanStaleUploadDirs();
893
961
  setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
962
+ // ============================================================
963
+ // Meeting Output WebSocket — /meeting-audio
964
+ // ============================================================
965
+ // Recall's headless browser opens meeting-output.html which connects here.
966
+ // We push: JSON { type: 'speak', text } for display, and WAV-encoded audio as binary frames for playback.
967
+ const meetingOutputWss = new WebSocketServer({ noServer: true });
968
+ meetingOutputWss.on('connection', (ws) => {
969
+ console.log('📺 Meeting output browser connected');
970
+ meetingOutputWs = ws;
971
+ ws.on('close', () => {
972
+ console.log('📺 Meeting output browser disconnected');
973
+ if (meetingOutputWs === ws)
974
+ meetingOutputWs = null;
975
+ });
976
+ });
977
+ server.on('upgrade', (req, socket, head) => {
978
+ const url = new URL(req.url || '/', `http://localhost:${port}`);
979
+ if (url.pathname === '/meeting-audio') {
980
+ meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
981
+ meetingOutputWss.emit('connection', ws, req);
982
+ });
983
+ }
984
+ else {
985
+ socket.destroy();
986
+ }
987
+ });
894
988
  server.on('error', (err) => {
895
989
  if (err.code === 'EADDRINUSE') {
896
990
  console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
@@ -1112,6 +1206,7 @@ async function main() {
1112
1206
  // session-only path (no user prefix).
1113
1207
  let currentUserId = '';
1114
1208
  let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
1209
+ // meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
1115
1210
  // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
1116
1211
  // Updated by resume_session, session_selected, continue_session, switch_session handlers
1117
1212
  let currentResumeSessionId;
@@ -1728,6 +1823,17 @@ async function main() {
1728
1823
  }
1729
1824
  const sayId = Date.now(); // simple ID to correlate start/end logs
1730
1825
  console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
1826
+ // Forward spoken text + audio to meeting output page when bot is in a meeting.
1827
+ // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
1828
+ // so voice/provider stays consistent — no separate hardcoded provider.
1829
+ // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
1830
+ // Recall captures the browser page's audio output and injects it into the meeting.
1831
+ if (activeMeetingBotId) {
1832
+ sendToMeetingOutput({ type: 'speak', text: data.text });
1833
+ if (meetingOutputWs) {
1834
+ synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1835
+ }
1836
+ }
1731
1837
  try {
1732
1838
  const handle = currentSession.say(data.text);
1733
1839
  if (handle && typeof handle.addDoneCallback === 'function') {
@@ -2975,10 +3081,27 @@ async function main() {
2975
3081
  clearInterval(readyInterval);
2976
3082
  console.log('✅ agent_ready retries complete');
2977
3083
  }, 20000);
2978
- // Stop agent_ready retries on user speech
3084
+ // Stop agent_ready retries on user speech, and interrupt agent TTS at VAD onset.
3085
+ // Previously the interrupt only fired when STT committed a full transcript (chat()
3086
+ // call), which let the agent talk over the user for the full utterance. Firing it
3087
+ // here cuts TTS the moment VAD detects speech.
3088
+ // Realtime providers (OpenAI/Gemini) handle interruption server-side via their own
3089
+ // VAD — calling interrupt() manually for Gemini specifically crashes its state
3090
+ // machine (code 1008, hangs in 'speaking'), so skip those.
2979
3091
  session.on('input_speech_started', () => {
2980
3092
  readySent = true;
2981
3093
  clearInterval(readyInterval);
3094
+ if (agentState !== 'speaking')
3095
+ return;
3096
+ if (sessionVoiceMode === 'realtime')
3097
+ return;
3098
+ try {
3099
+ console.log('🎤 VAD onset → interrupting agent TTS');
3100
+ currentSession?.interrupt();
3101
+ }
3102
+ catch (err) {
3103
+ console.warn('⚠️ VAD-onset interrupt failed:', err instanceof Error ? err.message : err);
3104
+ }
2982
3105
  });
2983
3106
  // Greet user via TTS (delayed if resume prompt will be shown)
2984
3107
  // For realtime mode: use generateReply() since there's no standalone TTS
@@ -1,18 +1,113 @@
1
1
  <!DOCTYPE html>
2
2
  <html>
3
- <head><title>Osborn Meeting Output</title></head>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Osborn</title>
6
+ <style>
7
+ * { margin: 0; padding: 0; box-sizing: border-box; }
8
+ body {
9
+ background: #0a0a0f;
10
+ color: #ffffff;
11
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
12
+ width: 100vw;
13
+ height: 100vh;
14
+ display: flex;
15
+ flex-direction: column;
16
+ align-items: center;
17
+ justify-content: center;
18
+ overflow: hidden;
19
+ }
20
+ #header {
21
+ position: absolute;
22
+ top: 24px;
23
+ left: 50%;
24
+ transform: translateX(-50%);
25
+ display: flex;
26
+ align-items: center;
27
+ gap: 10px;
28
+ }
29
+ #dot {
30
+ width: 8px;
31
+ height: 8px;
32
+ border-radius: 50%;
33
+ background: #333;
34
+ transition: background 0.3s;
35
+ }
36
+ #dot.speaking { background: #4ade80; box-shadow: 0 0 8px #4ade80; }
37
+ #dot.connected { background: #3b82f6; }
38
+ #name {
39
+ font-size: 13px;
40
+ font-weight: 600;
41
+ letter-spacing: 0.1em;
42
+ text-transform: uppercase;
43
+ color: #555;
44
+ }
45
+ #speech {
46
+ max-width: 80%;
47
+ text-align: center;
48
+ font-size: 28px;
49
+ font-weight: 400;
50
+ line-height: 1.4;
51
+ color: #f0f0f0;
52
+ opacity: 0;
53
+ transition: opacity 0.4s ease;
54
+ min-height: 2em;
55
+ }
56
+ #speech.visible { opacity: 1; }
57
+ #idle {
58
+ font-size: 14px;
59
+ color: #2a2a2a;
60
+ margin-top: 16px;
61
+ transition: opacity 0.4s;
62
+ }
63
+ #idle.hidden { opacity: 0; }
64
+ </style>
65
+ </head>
4
66
  <body>
5
- <script>
6
- const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
67
+ <div id="header">
68
+ <div id="dot"></div>
69
+ <div id="name">Osborn</div>
70
+ </div>
71
+ <div id="speech"></div>
72
+ <div id="idle">Listening…</div>
73
+ <script>
74
+ const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
75
+ const speechEl = document.getElementById('speech')
76
+ const dotEl = document.getElementById('dot')
77
+ const idleEl = document.getElementById('idle')
78
+
79
+ // Persistent AudioContext — created ONCE at startup per Recall's own demo pattern
80
+ let audioCtx = null
81
+ let clearTimer = null
82
+
83
// Create the page's single shared AudioContext and try to resume it.
//
// Never rejects: if the AudioContext constructor is unavailable or resume()
// is refused, the error is logged and the returned promise still resolves, so
// code chained on initAudio() (such as opening the WebSocket) always runs.
// When construction fails, audioCtx keeps its previous value.
async function initAudio() {
  try {
    const AudioContextCtor = window.AudioContext || window.webkitAudioContext
    audioCtx = new AudioContextCtor()
    // Recall's headless Chrome may start AudioContext suspended — resume immediately.
    // Their own voice-agent-demo does this at connect time without waiting for user gesture.
    if (audioCtx.state === 'suspended') {
      await audioCtx.resume()
    }
  } catch (e) {
    console.error('Audio init error:', e)
  }
}
7
91
 
8
- function connect() {
9
- const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
10
- const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${botId}`)
92
// Display `text` as the current caption, mark the indicator dot as speaking,
// hide the idle hint, and (re)start the countdown that clears the caption.
function showSpeech(text) {
  // A caption may already be on screen; cancel its pending clear first.
  if (clearTimer) {
    clearTimeout(clearTimer)
    clearTimer = null
  }

  speechEl.textContent = text
  speechEl.classList.add('visible')
  dotEl.className = 'speaking'
  idleEl.classList.add('hidden')

  const visibleForMs = 6000
  clearTimer = setTimeout(clearSpeech, visibleForMs)
}
100
+
101
// Fade the caption out, set the indicator dot back to its connected state,
// show the idle hint again, and forget the pending clear timer.
function clearSpeech() {
  const { classList: captionClasses } = speechEl
  captionClasses.remove('visible')
  dotEl.className = 'connected'
  const { classList: idleClasses } = idleEl
  idleClasses.remove('hidden')
  clearTimer = null
}
11
107
 
12
- ws.onmessage = async (event) => {
108
+ async function playAudio(arrayBuffer) {
109
+ if (!audioCtx) return
13
110
  try {
14
- const audioCtx = new (window.AudioContext || window.webkitAudioContext)()
15
- const arrayBuffer = await event.data.arrayBuffer()
16
111
  const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer)
17
112
  const source = audioCtx.createBufferSource()
18
113
  source.buffer = audioBuffer
@@ -23,10 +118,40 @@
23
118
  }
24
119
  }
25
120
 
26
- ws.onclose = () => setTimeout(connect, 1000)
27
- }
121
// Open the /meeting-audio WebSocket on the page's own host and wire up its handlers.
//
// - Binary messages (ArrayBuffer) are passed to playAudio().
// - Text messages are parsed as JSON: { type: 'speak', text } shows a caption,
//   { type: 'clear' } hides it. Anything that fails to parse is logged.
// - When the socket closes, the indicator dot is reset and a new connection
//   attempt is scheduled 1.5 s later.
function connect() {
  const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
  // botId comes from the page's query string already decoded; encode it again so
  // characters such as '&', '=' or '#' cannot corrupt the WebSocket URL.
  const ws = new WebSocket(`${scheme}//${window.location.host}/meeting-audio?bot_id=${encodeURIComponent(botId)}`)
  ws.binaryType = 'arraybuffer'

  ws.onopen = () => {
    console.log('Connected to meeting-audio WebSocket')
    dotEl.className = 'connected'
  }

  ws.onmessage = async (event) => {
    if (event.data instanceof ArrayBuffer) {
      await playAudio(event.data)
      return
    }
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'speak' && msg.text) {
        showSpeech(msg.text)
      } else if (msg.type === 'clear') {
        clearSpeech()
      }
    } catch (e) {
      console.error('Parse error:', e)
    }
  }

  ws.onclose = () => {
    dotEl.className = ''
    setTimeout(connect, 1500)
  }
}
28
153
 
29
- connect()
30
- </script>
154
+ initAudio().then(() => connect())
155
+ </script>
31
156
  </body>
32
157
  </html>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.32",
3
+ "version": "0.9.34",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {