npm - osborn - Versions diffs - 0.9.32 → 0.9.34 - Mend

osborn 0.9.32 → 0.9.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js +124 -1
package/dist/meeting-output.html +138 -13
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -10,6 +10,7 @@ initializeLogger({ pretty: true, level: 'info' });
 import { setMaxListeners } from 'node:events';
 setMaxListeners(50);
 import { createServer } from 'http';
+import { WebSocket, WebSocketServer } from 'ws';
 import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
 import { dirname, join } from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -146,6 +147,73 @@ process.on('uncaughtException', (error) => {
 // ============================================================
 // Module-level room code so the HTTP server can expose it via GET /room-code
 let currentRoomCode = null;
+// Meeting output WebSocket — module-level so both startApiServer and main() can access it.
+// Holds the currently tracked meeting-output page socket, or null when none is connected.
+let meetingOutputWs = null;
+/**
+ * Best-effort push of a JSON control message to the meeting output page.
+ * Does nothing when no page is connected or its socket is not OPEN.
+ * Never throws: a serialization or send failure is logged and the message is dropped,
+ * so a broken meeting page cannot disrupt the caller.
+ * @param {object} msg - JSON-serializable payload, e.g. { type: 'speak', text }.
+ */
+function sendToMeetingOutput(msg) {
+    const ws = meetingOutputWs;
+    if (!ws || ws.readyState !== WebSocket.OPEN)
+        return;
+    try {
+        ws.send(JSON.stringify(msg));
+    }
+    catch (err) {
+        // Still best-effort, but no longer silent — a dropped caption is now visible in the logs.
+        console.warn('⚠️ Meeting output send failed:', err instanceof Error ? err.message : err);
+    }
+}
+// Speaks `text` through a TTS instance built from the caller-supplied ttsConfig, wraps the
+// resulting PCM in a 16-bit WAV container, and pushes it to the meeting browser as a single
+// binary WebSocket message. The provider comes entirely from ttsConfig — nothing is hardcoded.
+//
+// Returns without sending when the meeting socket is not open (checked before synthesis and
+// again before sending) or when the stream produced no audio frames. Errors thrown while
+// creating the TTS or synthesizing propagate to the caller; once created, the TTS instance
+// is closed on every path and close() rejections are ignored.
+async function synthesizeForMeeting(text, ttsConfig) {
+    const meetingSocketOpen = () => meetingOutputWs?.readyState === WebSocket.OPEN;
+    if (!meetingSocketOpen())
+        return;
+    const ttsInstance = createTTS(ttsConfig);
+    try {
+        const frames = [];
+        // Format defaults, overridden by whatever the frames report (the last frame wins).
+        let sampleRate = 24000;
+        let numChannels = 1;
+        const endOfStream = Symbol.for('END_OF_STREAM');
+        for await (const event of ttsInstance.synthesize(text)) {
+            if (event === endOfStream)
+                break;
+            const frame = event?.frame;
+            if (!frame?.data)
+                continue;
+            frames.push(frame.data);
+            sampleRate = frame.sampleRate ?? sampleRate;
+            numChannels = frame.numChannels ?? numChannels;
+        }
+        if (frames.length === 0)
+            return;
+        // Concatenate every frame into one contiguous block of 16-bit samples.
+        let totalSamples = 0;
+        for (const data of frames)
+            totalSamples += data.length;
+        const pcm = new Int16Array(totalSamples);
+        let cursor = 0;
+        for (const data of frames) {
+            pcm.set(data, cursor);
+            cursor += data.length;
+        }
+        // 44-byte WAV header followed by little-endian PCM samples.
+        const headerBytes = 44;
+        const bytesPerSample = 2;
+        const dataBytes = pcm.length * bytesPerSample;
+        const blockAlign = numChannels * bytesPerSample;
+        const wav = Buffer.alloc(headerBytes + dataBytes);
+        wav.write('RIFF', 0);
+        wav.writeUInt32LE(36 + dataBytes, 4); // RIFF chunk size = total size - 8
+        wav.write('WAVE', 8);
+        wav.write('fmt ', 12);
+        wav.writeUInt32LE(16, 16); // fmt chunk length
+        wav.writeUInt16LE(1, 20); // audio format 1 = PCM
+        wav.writeUInt16LE(numChannels, 22);
+        wav.writeUInt32LE(sampleRate, 24);
+        wav.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
+        wav.writeUInt16LE(blockAlign, 32);
+        wav.writeUInt16LE(bytesPerSample * 8, 34); // bits per sample
+        wav.write('data', 36);
+        wav.writeUInt32LE(dataBytes, 40);
+        pcm.forEach((sample, index) => {
+            wav.writeInt16LE(sample, headerBytes + index * bytesPerSample);
+        });
+        // The socket may have closed or been replaced while synthesis was running.
+        if (!meetingSocketOpen())
+            return;
+        meetingOutputWs.send(wav);
+        console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
+    }
+    finally {
+        await ttsInstance.close().catch(() => { });
+    }
+}
 function startApiServer(workingDir, port) {
     const server = createServer(async (req, res) => {
         // CORS headers for cloud frontend
@@ -891,6 +959,32 @@ function startApiServer(workingDir, port) {
     };
     cleanStaleUploadDirs();
     setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
+    // ============================================================
+    // Meeting Output WebSocket — /meeting-audio
+    // ============================================================
+    // Recall's headless browser opens meeting-output.html which connects here.
+    // We push: JSON { type: 'speak', text } for display, and binary WAV frames for audio.
+    // Only the most recent connection is tracked in the module-level meetingOutputWs.
+    const meetingOutputWss = new WebSocketServer({ noServer: true });
+    meetingOutputWss.on('connection', (ws) => {
+        console.log('📺 Meeting output browser connected');
+        meetingOutputWs = ws;
+        // Without an 'error' listener, a socket-level error is emitted as an unhandled
+        // 'error' event and surfaces as an uncaught exception.
+        ws.on('error', (err) => {
+            console.warn('⚠️ Meeting output socket error:', err instanceof Error ? err.message : err);
+        });
+        ws.on('close', () => {
+            console.log('📺 Meeting output browser disconnected');
+            // Only clear the slot if a newer connection has not already replaced this one.
+            if (meetingOutputWs === ws)
+                meetingOutputWs = null;
+        });
+    });
+    server.on('upgrade', (req, socket, head) => {
+        let pathname;
+        try {
+            pathname = new URL(req.url || '/', `http://localhost:${port}`).pathname;
+        }
+        catch {
+            // A malformed request target (e.g. "//") makes the URL constructor throw;
+            // reject the upgrade instead of letting the exception escape the handler.
+            socket.destroy();
+            return;
+        }
+        if (pathname === '/meeting-audio') {
+            meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
+                meetingOutputWss.emit('connection', ws, req);
+            });
+        }
+        else {
+            // No other WebSocket endpoints are served on this port.
+            socket.destroy();
+        }
+    });
     server.on('error', (err) => {
         if (err.code === 'EADDRINUSE') {
             console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
@@ -1112,6 +1206,7 @@ async function main() {
     // session-only path (no user prefix).
     let currentUserId = '';
     let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
+    // meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
     // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
     // Updated by resume_session, session_selected, continue_session, switch_session handlers
     let currentResumeSessionId;
@@ -1728,6 +1823,17 @@ async function main() {
             }
             const sayId = Date.now(); // simple ID to correlate start/end logs
             console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
+            // Forward spoken text + audio to meeting output page when bot is in a meeting.
+            // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
+            // so voice/provider stays consistent — no separate hardcoded provider.
+            // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
+            // Recall captures the browser page's audio output and injects it into the meeting.
+            if (activeMeetingBotId) {
+                sendToMeetingOutput({ type: 'speak', text: data.text });
+                if (meetingOutputWs) {
+                    synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
+                }
+            }
             try {
                 const handle = currentSession.say(data.text);
                 if (handle && typeof handle.addDoneCallback === 'function') {
@@ -2975,10 +3081,27 @@ async function main() {
                 clearInterval(readyInterval);
                 console.log('✅ agent_ready retries complete');
             }, 20000);
-            // Stop agent_ready retries on user speech
+            // Stop agent_ready retries on user speech, and interrupt agent TTS at VAD onset.
+            // Previously the interrupt only fired when STT committed a full transcript (chat()
+            // call), which let the agent talk over the user for the full utterance. Firing it
+            // here cuts TTS the moment VAD detects speech.
+            // Realtime providers (OpenAI/Gemini) handle interruption server-side via their own
+            // VAD — calling interrupt() manually for Gemini specifically crashes its state
+            // machine (code 1008, hangs in 'speaking'), so skip those.
             session.on('input_speech_started', () => {
                 readySent = true;
                 clearInterval(readyInterval);
+                if (agentState !== 'speaking')
+                    return;
+                if (sessionVoiceMode === 'realtime')
+                    return;
+                try {
+                    console.log('🎤 VAD onset → interrupting agent TTS');
+                    currentSession?.interrupt();
+                }
+                catch (err) {
+                    console.warn('⚠️ VAD-onset interrupt failed:', err instanceof Error ? err.message : err);
+                }
             });
             // Greet user via TTS (delayed if resume prompt will be shown)
             // For realtime mode: use generateReply() since there's no standalone TTS

package/dist/meeting-output.html CHANGED Viewed

@@ -1,18 +1,113 @@
 <!DOCTYPE html>
 <html>
-<head><title>Osborn Meeting Output</title></head>
+<head>
+  <meta charset="utf-8">
+  <title>Osborn</title>
+  <style>
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+    body {
+      background: #0a0a0f;
+      color: #ffffff;
+      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+      width: 100vw;
+      height: 100vh;
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      justify-content: center;
+      overflow: hidden;
+    }
+    #header {
+      position: absolute;
+      top: 24px;
+      left: 50%;
+      transform: translateX(-50%);
+      display: flex;
+      align-items: center;
+      gap: 10px;
+    }
+    #dot {
+      width: 8px;
+      height: 8px;
+      border-radius: 50%;
+      background: #333;
+      transition: background 0.3s;
+    }
+    #dot.speaking { background: #4ade80; box-shadow: 0 0 8px #4ade80; }
+    #dot.connected { background: #3b82f6; }
+    #name {
+      font-size: 13px;
+      font-weight: 600;
+      letter-spacing: 0.1em;
+      text-transform: uppercase;
+      color: #555;
+    }
+    #speech {
+      max-width: 80%;
+      text-align: center;
+      font-size: 28px;
+      font-weight: 400;
+      line-height: 1.4;
+      color: #f0f0f0;
+      opacity: 0;
+      transition: opacity 0.4s ease;
+      min-height: 2em;
+    }
+    #speech.visible { opacity: 1; }
+    #idle {
+      font-size: 14px;
+      color: #2a2a2a;
+      margin-top: 16px;
+      transition: opacity 0.4s;
+    }
+    #idle.hidden { opacity: 0; }
+  </style>
+</head>
 <body>
-<script>
-  const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
+  <div id="header">
+    <div id="dot"></div>
+    <div id="name">Osborn</div>
+  </div>
+  <div id="speech"></div>
+  <div id="idle">Listening…</div>
+  <script>
+    const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
+    const speechEl = document.getElementById('speech')
+    const dotEl = document.getElementById('dot')
+    const idleEl = document.getElementById('idle')
+    // Persistent AudioContext — created ONCE at startup per Recall's own demo pattern
+    let audioCtx = null
+    // Handle of the pending caption auto-clear timeout, or null when none is scheduled.
+    let clearTimer = null
+    // Creates the shared AudioContext and asks it to start. Always resolves, so the
+    // WebSocket connection (and the caption display) still comes up when audio is unavailable.
+    async function initAudio() {
+      try {
+        audioCtx = new (window.AudioContext || window.webkitAudioContext)()
+      } catch (e) {
+        // No usable AudioContext: leave audioCtx null so audio is skipped, but keep going.
+        console.error('AudioContext init error:', e)
+        audioCtx = null
+        return
+      }
+      // Recall's headless Chrome may start AudioContext suspended — resume immediately.
+      // Their own voice-agent-demo does this at connect time without waiting for user gesture.
+      // resume() is deliberately not awaited: while the context is not allowed to start its
+      // promise stays pending, which would otherwise hold up connect() indefinitely.
+      if (audioCtx.state === 'suspended') {
+        audioCtx.resume().catch((e) => console.error('AudioContext resume error:', e))
+      }
+    }
-  function connect() {
-    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
-    const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${botId}`)
+    // Shows `text` as the current caption, switches the status dot to its "speaking"
+    // look, hides the idle hint, and (re)starts the 6-second auto-clear timer.
+    function showSpeech(text) {
+      if (clearTimer) {
+        clearTimeout(clearTimer)
+      }
+      speechEl.textContent = text
+      idleEl.classList.add('hidden')
+      speechEl.classList.add('visible')
+      dotEl.className = 'speaking'
+      clearTimer = setTimeout(clearSpeech, 6000)
+    }
+    // Hides the caption and restores the idle hint. Runs both from the auto-clear timer
+    // and from an explicit { type: 'clear' } message.
+    function clearSpeech() {
+      // Cancel any pending auto-clear; otherwise a stale timer left over from an explicit
+      // clear could fire later and wipe a newer caption early.
+      if (clearTimer) {
+        clearTimeout(clearTimer)
+        clearTimer = null
+      }
+      speechEl.classList.remove('visible')
+      // Only step the dot down from "speaking"; if the socket has closed in the meantime
+      // the dot was already reset and must not be shown as connected.
+      if (dotEl.className === 'speaking') {
+        dotEl.className = 'connected'
+      }
+      idleEl.classList.remove('hidden')
+    }
-    ws.onmessage = async (event) => {
+    async function playAudio(arrayBuffer) {
+      if (!audioCtx) return
       try {
-        const audioCtx = new (window.AudioContext || window.webkitAudioContext)()
-        const arrayBuffer = await event.data.arrayBuffer()
         const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer)
         const source = audioCtx.createBufferSource()
         source.buffer = audioBuffer
@@ -23,10 +118,40 @@
       }
     }
-    ws.onclose = () => setTimeout(connect, 1000)
-  }
+    // Opens the /meeting-audio WebSocket on the page's own host and reconnects 1.5 s after
+    // every close. Binary messages are played as audio; text messages are JSON commands:
+    // { type: 'speak', text } shows a caption, { type: 'clear' } hides it.
+    function connect() {
+      const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
+      // botId was read (already decoded) from the page's query string, so it must be
+      // re-encoded before being placed into a new query string.
+      const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${encodeURIComponent(botId)}`)
+      // Receive binary frames as ArrayBuffer so they can go straight to the audio decoder.
+      ws.binaryType = 'arraybuffer'
+      ws.onopen = () => {
+        console.log('Connected to meeting-audio WebSocket')
+        dotEl.className = 'connected'
+      }
+      ws.onmessage = async (event) => {
+        if (event.data instanceof ArrayBuffer) {
+          await playAudio(event.data)
+        } else {
+          try {
+            const msg = JSON.parse(event.data)
+            if (msg.type === 'speak' && msg.text) {
+              showSpeech(msg.text)
+            } else if (msg.type === 'clear') {
+              clearSpeech()
+            }
+          } catch (e) {
+            console.error('Parse error:', e)
+          }
+        }
+      }
+      ws.onclose = () => {
+        // Reset the dot to its default (disconnected) look, then retry.
+        dotEl.className = ''
+        setTimeout(connect, 1500)
+      }
+    }
-  connect()
-</script>
+    initAudio().then(() => connect())
+  </script>
 </body>
 </html>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.32",
+  "version": "0.9.34",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {