npm - osborn - Versions diffs - 0.9.31 → 0.9.33 - Mend

osborn 0.9.31 → 0.9.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/.claude/skills/user-context/SKILL.md +8 -3
package/dist/index.js +106 -0
package/dist/meeting-output.html +138 -13
package/dist/recall-client.d.ts +26 -11
package/dist/recall-client.js +32 -19
package/package.json +1 -1

package/.claude/skills/user-context/SKILL.md CHANGED Viewed

@@ -17,12 +17,17 @@ Do not batch — capture as they happen.
 </what-to-do>
 <trigger-phrases>
-- "grill me"
+This skill is a META operation — about building a model of the USER THEMSELVES,
+not about any subject matter in the current conversation. The trigger phrases below
+are intentionally specific so they cannot be confused with domain requests in any session.
+- "update user context"
 - "learn my language"
-- "update my context"
+- "start context interview"
+- "grill me on my language"
 - "learn how I talk"
 - "standardise my language"
-- "update user context"
+- "update my context"
 </trigger-phrases>
 <supporting-info>

package/dist/index.js CHANGED Viewed

@@ -10,6 +10,7 @@ initializeLogger({ pretty: true, level: 'info' });
 import { setMaxListeners } from 'node:events';
 setMaxListeners(50);
 import { createServer } from 'http';
+import { WebSocket, WebSocketServer } from 'ws';
 import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
 import { dirname, join } from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -146,6 +147,73 @@ process.on('uncaughtException', (error) => {
 // ============================================================
 // Module-level room code so the HTTP server can expose it via GET /room-code
 let currentRoomCode = null;
+// Meeting output WebSocket — module-level so both startApiServer and main() can access it
+let meetingOutputWs = null;
+function sendToMeetingOutput(msg) {
+    if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
+        try {
+            meetingOutputWs.send(JSON.stringify(msg));
+        }
+        catch { }
+    }
+}
+// Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
+// Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
+async function synthesizeForMeeting(text, ttsConfig) {
+    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN)
+        return;
+    const ttsInstance = createTTS(ttsConfig);
+    try {
+        const chunks = [];
+        let sampleRate = 24000;
+        let numChannels = 1;
+        const stream = ttsInstance.synthesize(text);
+        for await (const event of stream) {
+            if (event === Symbol.for('END_OF_STREAM'))
+                break;
+            const e = event;
+            if (e?.frame?.data) {
+                chunks.push(e.frame.data);
+                sampleRate = e.frame.sampleRate ?? sampleRate;
+                numChannels = e.frame.numChannels ?? numChannels;
+            }
+        }
+        if (chunks.length === 0)
+            return;
+        const totalSamples = chunks.reduce((s, c) => s + c.length, 0);
+        const pcm = new Int16Array(totalSamples);
+        let offset = 0;
+        for (const c of chunks) {
+            pcm.set(c, offset);
+            offset += c.length;
+        }
+        // WAV header (44 bytes) + PCM data
+        const dataBytes = pcm.length * 2;
+        const wav = Buffer.alloc(44 + dataBytes);
+        wav.write('RIFF', 0);
+        wav.writeUInt32LE(36 + dataBytes, 4);
+        wav.write('WAVE', 8);
+        wav.write('fmt ', 12);
+        wav.writeUInt32LE(16, 16);
+        wav.writeUInt16LE(1, 20);
+        wav.writeUInt16LE(numChannels, 22);
+        wav.writeUInt32LE(sampleRate, 24);
+        wav.writeUInt32LE(sampleRate * numChannels * 2, 28);
+        wav.writeUInt16LE(numChannels * 2, 32);
+        wav.writeUInt16LE(16, 34);
+        wav.write('data', 36);
+        wav.writeUInt32LE(dataBytes, 40);
+        for (let i = 0; i < pcm.length; i++)
+            wav.writeInt16LE(pcm[i], 44 + i * 2);
+        if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
+            meetingOutputWs.send(wav);
+            console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
+        }
+    }
+    finally {
+        await ttsInstance.close().catch(() => { });
+    }
+}
 function startApiServer(workingDir, port) {
     const server = createServer(async (req, res) => {
         // CORS headers for cloud frontend
@@ -891,6 +959,32 @@ function startApiServer(workingDir, port) {
     };
     cleanStaleUploadDirs();
     setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
+    // ============================================================
+    // Meeting Output WebSocket — /meeting-audio
+    // ============================================================
+    // Recall's headless browser opens meeting-output.html which connects here.
+    // We push: JSON { type: 'speak', text } for display, and WAV-encoded audio as binary frames.
+    const meetingOutputWss = new WebSocketServer({ noServer: true });
+    meetingOutputWss.on('connection', (ws) => {
+        console.log('📺 Meeting output browser connected');
+        meetingOutputWs = ws;
+        ws.on('close', () => {
+            console.log('📺 Meeting output browser disconnected');
+            if (meetingOutputWs === ws)
+                meetingOutputWs = null;
+        });
+    });
+    server.on('upgrade', (req, socket, head) => {
+        const url = new URL(req.url || '/', `http://localhost:${port}`);
+        if (url.pathname === '/meeting-audio') {
+            meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
+                meetingOutputWss.emit('connection', ws, req);
+            });
+        }
+        else {
+            socket.destroy();
+        }
+    });
     server.on('error', (err) => {
         if (err.code === 'EADDRINUSE') {
             console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
@@ -1112,6 +1206,7 @@ async function main() {
     // session-only path (no user prefix).
     let currentUserId = '';
     let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
+    // meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
     // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
     // Updated by resume_session, session_selected, continue_session, switch_session handlers
     let currentResumeSessionId;
@@ -1728,6 +1823,17 @@ async function main() {
             }
             const sayId = Date.now(); // simple ID to correlate start/end logs
             console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
+            // Forward spoken text + audio to meeting output page when bot is in a meeting.
+            // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
+            // so voice/provider stays consistent — no separate hardcoded provider.
+            // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
+            // Recall captures the browser page's audio output and injects it into the meeting.
+            if (activeMeetingBotId) {
+                sendToMeetingOutput({ type: 'speak', text: data.text });
+                if (meetingOutputWs) {
+                    synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
+                }
+            }
             try {
                 const handle = currentSession.say(data.text);
                 if (handle && typeof handle.addDoneCallback === 'function') {

package/dist/meeting-output.html CHANGED Viewed

@@ -1,18 +1,113 @@
 <!DOCTYPE html>
 <html>
-<head><title>Osborn Meeting Output</title></head>
+<head>
+  <meta charset="utf-8">
+  <title>Osborn</title>
+  <style>
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+    body {
+      background: #0a0a0f;
+      color: #ffffff;
+      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+      width: 100vw;
+      height: 100vh;
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      justify-content: center;
+      overflow: hidden;
+    }
+    #header {
+      position: absolute;
+      top: 24px;
+      left: 50%;
+      transform: translateX(-50%);
+      display: flex;
+      align-items: center;
+      gap: 10px;
+    }
+    #dot {
+      width: 8px;
+      height: 8px;
+      border-radius: 50%;
+      background: #333;
+      transition: background 0.3s;
+    }
+    #dot.speaking { background: #4ade80; box-shadow: 0 0 8px #4ade80; }
+    #dot.connected { background: #3b82f6; }
+    #name {
+      font-size: 13px;
+      font-weight: 600;
+      letter-spacing: 0.1em;
+      text-transform: uppercase;
+      color: #555;
+    }
+    #speech {
+      max-width: 80%;
+      text-align: center;
+      font-size: 28px;
+      font-weight: 400;
+      line-height: 1.4;
+      color: #f0f0f0;
+      opacity: 0;
+      transition: opacity 0.4s ease;
+      min-height: 2em;
+    }
+    #speech.visible { opacity: 1; }
+    #idle {
+      font-size: 14px;
+      color: #2a2a2a;
+      margin-top: 16px;
+      transition: opacity 0.4s;
+    }
+    #idle.hidden { opacity: 0; }
+  </style>
+</head>
 <body>
-<script>
-  const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
+  <div id="header">
+    <div id="dot"></div>
+    <div id="name">Osborn</div>
+  </div>
+  <div id="speech"></div>
+  <div id="idle">Listening…</div>
+  <script>
+    const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
+    const speechEl = document.getElementById('speech')
+    const dotEl = document.getElementById('dot')
+    const idleEl = document.getElementById('idle')
+    // Persistent AudioContext — created ONCE at startup per Recall's own demo pattern
+    let audioCtx = null
+    let clearTimer = null
+    async function initAudio() {
+      audioCtx = new (window.AudioContext || window.webkitAudioContext)()
+      // Recall's headless Chrome may start AudioContext suspended — resume immediately.
+      // Their own voice-agent-demo does this at connect time without waiting for user gesture.
+      if (audioCtx.state === 'suspended') {
+        await audioCtx.resume()
+      }
+    }
-  function connect() {
-    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
-    const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${botId}`)
+    function showSpeech(text) {
+      if (clearTimer) { clearTimeout(clearTimer); clearTimer = null }
+      speechEl.textContent = text
+      speechEl.classList.add('visible')
+      dotEl.className = 'speaking'
+      idleEl.classList.add('hidden')
+      clearTimer = setTimeout(clearSpeech, 6000)
+    }
+    function clearSpeech() {
+      speechEl.classList.remove('visible')
+      dotEl.className = 'connected'
+      idleEl.classList.remove('hidden')
+      clearTimer = null
+    }
-    ws.onmessage = async (event) => {
+    async function playAudio(arrayBuffer) {
+      if (!audioCtx) return
       try {
-        const audioCtx = new (window.AudioContext || window.webkitAudioContext)()
-        const arrayBuffer = await event.data.arrayBuffer()
         const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer)
         const source = audioCtx.createBufferSource()
         source.buffer = audioBuffer
@@ -23,10 +118,40 @@
       }
     }
-    ws.onclose = () => setTimeout(connect, 1000)
-  }
+    function connect() {
+      const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
+      const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${botId}`)
+      ws.binaryType = 'arraybuffer'
+      ws.onopen = () => {
+        console.log('Connected to meeting-audio WebSocket')
+        dotEl.className = 'connected'
+      }
+      ws.onmessage = async (event) => {
+        if (event.data instanceof ArrayBuffer) {
+          await playAudio(event.data)
+        } else {
+          try {
+            const msg = JSON.parse(event.data)
+            if (msg.type === 'speak' && msg.text) {
+              showSpeech(msg.text)
+            } else if (msg.type === 'clear') {
+              clearSpeech()
+            }
+          } catch (e) {
+            console.error('Parse error:', e)
+          }
+        }
+      }
+      ws.onclose = () => {
+        dotEl.className = ''
+        setTimeout(connect, 1500)
+      }
+    }
-  connect()
-</script>
+    initAudio().then(() => connect())
+  </script>
 </body>
 </html>

package/dist/recall-client.d.ts CHANGED Viewed

@@ -4,18 +4,33 @@ export interface RecallBot {
     meeting_url: string;
     status: string;
 }
-export interface TranscriptWord {
-    text: string;
-    start_time: number;
-    end_time: number;
-}
 export interface TranscriptPayload {
-    bot_id: string;
-    transcript: {
-        speaker: string;
-        words: TranscriptWord[];
-        is_final: boolean;
-        language?: string;
+    /** Webhook event name; RecallClient.handleWebhook only processes 'transcript.data'. */
+    event: string;
+    /** Event envelope: the transcript itself sits in the nested `data`, next to bot/recording ids. */
+    data: {
+        data: {
+            /** Words of the utterance; their `text` values are joined with spaces by the client. */
+            words: Array<{
+                text: string;
+                start_timestamp?: {
+                    relative?: number;
+                };
+                end_timestamp?: {
+                    relative?: number;
+                };
+            }>;
+            language_code?: string;
+            /** Speaker of the utterance; `name` is used as the transcript speaker label. */
+            participant?: {
+                id: number;
+                name: string;
+                is_host?: boolean;
+                platform?: string;
+            };
+        };
+        /** Bot that produced the event; `id` is reported as the transcript's botId. */
+        bot?: {
+            id: string;
+        };
+        recording?: {
+            id: string;
+        };
     };
 }
 export declare class RecallClient extends EventEmitter {

package/dist/recall-client.js CHANGED Viewed

@@ -9,6 +9,17 @@ export class RecallClient extends EventEmitter {
         this.#apiKey = apiKey;
     }
     async joinMeeting(meetingUrl, webhookBaseUrl, botName = 'Osborn') {
+        // Authoritative structure per https://docs.recall.ai/reference/bot_create
+        // and https://docs.recall.ai/docs/real-time-transcription:
+        //
+        //   recording_config.transcript.provider  — transcription provider config
+        //   recording_config.realtime_endpoints   — webhook/websocket delivery
+        //
+        // IMPORTANT:
+        //   - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
+        //   - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
+        //   - `transcription_options` does NOT exist — use `transcript.provider`
+        //   - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
         const res = await fetch(`${RECALL_BASE_URL}/bot`, {
             method: 'POST',
             headers: {
@@ -19,24 +30,26 @@ export class RecallClient extends EventEmitter {
                 meeting_url: meetingUrl,
                 bot_name: botName,
                 recording_config: {
-                    // Field names must match Recall API exactly (no underscore in realtime_endpoints).
-                    // real_time_endpoints was silently ignored — API uses realtime_endpoints.
+                    transcript: {
+                        provider: {
+                            // recallai_streaming is built-in — no external API key needed,
+                            // low-latency, works across all meeting platforms.
+                            recallai_streaming: {
+                                mode: 'prioritize_low_latency',
+                                language_code: 'en',
+                            },
+                        },
+                    },
                     realtime_endpoints: [{
                             type: 'webhook',
-                            config: {
-                                url: `${webhookBaseUrl}/webhook/recall`,
-                                events: ['transcript.data'],
-                            },
+                            url: `${webhookBaseUrl}/webhook/recall`,
+                            events: ['transcript.data'],
                         }],
-                    transcription_options: {
-                        provider: 'assembly_ai',
-                        mode: 'prioritize_low_latency',
-                    },
                 },
                 output_media: {
                     camera: {
-                        // Recall API expects `kind` (not `type`); the wrong key arrives as null and
-                        // gets rejected as "Invalid choice null. Expected 'webpage' or 'default'."
+                        // `kind` (not `type`) — confirmed from prior debugging.
+                        // Output webpage plays TTS audio so meeting participants can hear the agent.
                         kind: 'webpage',
                         config: {
                             url: `${webhookBaseUrl}/meeting-output`,
@@ -69,16 +82,16 @@ export class RecallClient extends EventEmitter {
         return bot.status_changes?.at(-1)?.code ?? 'unknown';
     }
     handleWebhook(payload) {
-        if (!payload.transcript?.is_final)
+        // Convert a webhook payload into a 'transcript' event emitted on this client.
+        // Payloads for any other event, and payloads whose words join to an empty
+        // string, are dropped without emitting anything.
+        // Only process final transcripts (transcript.data), skip partials
+        if (payload.event !== 'transcript.data')
             return;
-        const text = payload.transcript.words.map(w => w.text).join(' ').trim();
+        // A missing words array is treated as empty, so the empty-text guard below handles it.
+        const words = payload.data?.data?.words ?? [];
+        const text = words.map(w => w.text).join(' ').trim();
         if (!text)
             return;
-        this.emit('transcript', {
-            botId: payload.bot_id,
-            speaker: payload.transcript.speaker,
-            text,
-        });
+        // A missing participant name or bot id falls back to a placeholder string instead of undefined.
+        const speaker = payload.data?.data?.participant?.name ?? 'Unknown';
+        const botId = payload.data?.bot?.id ?? 'unknown';
+        this.emit('transcript', { botId, speaker, text });
     }
     registerBot(botId, sessionId) {
         this.#activeBots.set(botId, sessionId);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.31",
+  "version": "0.9.33",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {