npm - osborn - Versions diffs - 0.9.42 → 0.9.44 - Mend

osborn 0.9.42 → 0.9.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/.claude/skills/meetings/SKILL.md +145 -0
package/dist/index.js +111 -527
package/dist/meeting-transcript-poller.d.ts +60 -0
package/dist/meeting-transcript-poller.js +112 -0
package/dist/recall-client.d.ts +50 -2
package/dist/recall-client.js +54 -64
package/package.json +2 -2

package/dist/meeting-transcript-poller.d.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
+ * messages.
+ *
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
+ * never speaks in the meeting — it's a silent note-taker.
+ *
+ * Lifecycle:
+ *   const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
+ *   poller.start()
+ *   ...
+ *   poller.stop()  // on leave_meeting / disconnect / session switch
+ *
+ * Dedup strategy:
+ *   Each turn carries a `start_timestamp.relative` on its first word (seconds
+ *   since recording start). We track the highest cursor we've forwarded and
+ *   only send turns with a strictly greater first-word timestamp. This means
+ *   re-fetches don't double-deliver, and partial transcripts that get refined
+ *   later don't re-trigger LLM processing of already-handled turns.
+ *
+ * Error handling:
+ *   Transient fetch errors are logged + skipped (poll continues on next tick).
+ *   No backoff — Recall's transcript endpoint is stable enough that, at a 30s
+ *   cadence, a transient failure or slow start recovers on the next cycle.
+ */
+import type { RecallClient, TranscriptTurn } from './recall-client.js';
+export interface MeetingTranscriptPollerOptions {
+    botId: string; // Recall bot id; passed to `recall.getTranscript()` on every poll
+    recall: RecallClient; // client used to fetch the transcript
+    /** Called with the de-duplicated turns that are new since the last delivery; never called with an empty batch. */
+    onTurns: (chunk: {
+        botId: string;
+        turns: TranscriptTurn[];
+        formatted: string;
+    }) => void | Promise<void>;
+    /** Polling interval in milliseconds. Default 30_000 (30s) — matches the user's stated cadence. */
+    intervalMs?: number;
+    /** Optional error callback, invoked with the Error whenever a poll tick fails. */
+    onError?: (err: Error) => void;
+}
+export declare class MeetingTranscriptPoller {
+    #private;
+    constructor(opts: MeetingTranscriptPollerOptions);
+    start(): void; // polls once immediately, then every `intervalMs`; no-op while a timer is already active
+    stop(): void; // clears the interval timer and blocks further polls; repeated calls are no-ops
+}
+/**
+ * Format an array of turns into a single string for LLM consumption.
+ *
+ * Each turn becomes one line, `<Speaker>: <text>`, where the speaker is
+ * `speaker`, else `participant.name`, else 'Unknown'.
+ *
+ * Whitespace-only words and zero-content turns are dropped. Returns empty
+ * string if nothing meaningful is in the batch.
+ */
+export declare function formatTurns(turns: TranscriptTurn[]): string;

package/dist/meeting-transcript-poller.js ADDED Viewed

@@ -0,0 +1,112 @@
+/**
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
+ * messages.
+ *
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
+ * never speaks in the meeting — it's a silent note-taker.
+ *
+ * Lifecycle:
+ *   const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
+ *   poller.start()
+ *   ...
+ *   poller.stop()  // on leave_meeting / disconnect / session switch
+ *
+ * Dedup strategy:
+ *   Each turn carries a `start_timestamp.relative` on its first word (seconds
+ *   since recording start). We track the highest cursor we've forwarded and
+ *   only send turns with a strictly greater first-word timestamp. This means
+ *   re-fetches don't double-deliver, and partial transcripts that get refined
+ *   later don't re-trigger LLM processing of already-handled turns.
+ *
+ * Error handling:
+ *   Transient fetch errors are logged + skipped (poll continues on next tick).
+ *   No backoff — Recall's transcript endpoint is stable enough that, at a 30s
+ *   cadence, a transient failure or slow start recovers on the next cycle.
+ */
+export class MeetingTranscriptPoller {
+    #opts;
+    #timer = null;
+    #cursor = -Infinity; // highest first-word.start_timestamp.relative we've forwarded
+    #inFlight = false; // prevent overlapping polls if one cycle runs long
+    #stopped = false;
+    constructor(opts) {
+        this.#opts = opts;
+    }
+    start() {
+        if (this.#timer)
+            return;
+        const interval = this.#opts.intervalMs ?? 30_000;
+        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
+        // Fire once immediately so the LLM sees the meeting started, then on interval.
+        void this.#tick();
+        this.#timer = setInterval(() => void this.#tick(), interval);
+    }
+    stop() {
+        if (this.#stopped)
+            return;
+        this.#stopped = true;
+        if (this.#timer) {
+            clearInterval(this.#timer);
+            this.#timer = null;
+        }
+        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
+    }
+    async #tick() {
+        if (this.#inFlight || this.#stopped)
+            return;
+        this.#inFlight = true;
+        try {
+            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
+            const fresh = all.filter(t => {
+                const firstWordTs = t.words?.[0]?.start_timestamp?.relative;
+                return typeof firstWordTs === 'number' && firstWordTs > this.#cursor;
+            });
+            if (fresh.length === 0)
+                return;
+            // Advance cursor to highest seen first-word ts (across all returned turns,
+            // not just the fresh ones — guards against Recall returning a paged subset).
+            for (const t of all) {
+                const ts = t.words?.[0]?.start_timestamp?.relative;
+                if (typeof ts === 'number' && ts > this.#cursor)
+                    this.#cursor = ts;
+            }
+            const formatted = formatTurns(fresh);
+            if (!formatted)
+                return; // pure-whitespace fresh batch — skip
+            console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${this.#cursor.toFixed(1)}s, chars=${formatted.length}`);
+            await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
+        }
+        catch (err) {
+            const e = err instanceof Error ? err : new Error(String(err));
+            this.#opts.onError?.(e);
+            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
+        }
+        finally {
+            this.#inFlight = false;
+        }
+    }
+}
+/**
+ * Format an array of turns into a single string for LLM consumption.
+ *
+ * Each turn becomes:
+ *   <Speaker>: <text>
+ *
+ * Whitespace-only words and zero-content turns are dropped. Returns empty
+ * string if nothing meaningful is in the batch.
+ */
+export function formatTurns(turns) {
+    const lines = [];
+    for (const t of turns) {
+        const speaker = t.speaker || t.participant?.name || 'Unknown';
+        const text = (t.words ?? []).map(w => w.text).join(' ').replace(/\s+/g, ' ').trim();
+        if (!text)
+            continue;
+        lines.push(`${speaker}: ${text}`);
+    }
+    return lines.join('\n');
+}

package/dist/recall-client.d.ts CHANGED Viewed

@@ -4,6 +4,36 @@ export interface RecallBot {
     meeting_url: string;
     status: string;
 }
+/**
+ * One transcript turn = one speaker's continuous utterance.
+ * Element shape of the JSON array served at the transcript `download_url`.
+ *
+ * Per Recall docs each turn contains:
+ *   - speaker: participant name (or 'Unknown')
+ *   - words: array of { text, start_timestamp.relative, end_timestamp.relative }
+ *   - The `start_timestamp.relative` (seconds since recording start) on the
+ *     FIRST word is the turn's start; we use this as the dedup cursor.
+ */
+export interface TranscriptTurn {
+    speaker?: string;
+    participant?: {
+        id?: number;
+        name?: string;
+        is_host?: boolean;
+    };
+    words: Array<{
+        text: string;
+        start_timestamp?: {
+            relative?: number;
+            absolute?: string;
+        };
+        end_timestamp?: {
+            relative?: number;
+            absolute?: string;
+        };
+    }>;
+    language?: string;
+}
 export interface TranscriptPayload {
     event: string;
     data: {
@@ -49,10 +79,28 @@ export declare class RecallClient extends EventEmitter {
      *                            room as the osborn agent (no separate WebSocket+WAV pipe).
      * @param opts.botName     Display name of the bot in the meeting
      */
-    joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
-        outputPageUrl?: string;
+    joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
         botName?: string;
     }): Promise<string>;
+    /**
+     * Fetch the bot's current transcript. Returns the array of "transcript turns"
+     * (each turn = one speaker's utterance), expected in start-time order as served by Recall; not re-sorted here.
+     *
+     * Verified 2026-05-22 against the real us-west-2 API: there is NO simple
+     * `GET /bot/{id}/transcript` convenience endpoint. The actual chain is:
+     *
+     *   1. GET /api/v1/bot/{bot_id}
+     *   2. recordings[0].media_shortcuts.transcript.data.download_url   (S3 signed URL)
+     *   3. GET that URL  →  JSON array of TranscriptTurn objects
+     *
+     * The S3 URL is pre-signed and expires (~6h). Re-fetch step 1 each poll;
+     * don't cache the URL. Rejects with an Error if either GET returns a non-OK status.
+     *
+     * If `recordings[0]` doesn't exist yet (bot still joining or pre-recording),
+     * returns []. Caller (MeetingTranscriptPoller) treats that as "no new turns
+     * yet" and waits for the next tick.
+     */
+    getTranscript(botId: string): Promise<TranscriptTurn[]>;
     leaveMeeting(botId: string): Promise<void>;
     getBotStatus(botId: string): Promise<string>;
     handleWebhook(payload: TranscriptPayload): void;

package/dist/recall-client.js CHANGED Viewed

@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
      *                            room as the osborn agent (no separate WebSocket+WAV pipe).
      * @param opts.botName     Display name of the bot in the meeting
      */
-    async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
+    async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
         const botName = opts?.botName ?? 'Osborn';
-        const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
-        // Authoritative structure per https://docs.recall.ai/reference/bot_create
-        // and https://docs.recall.ai/docs/real-time-transcription:
+        // ARCHITECTURE (post-2026-05-22 polling redesign):
+        //   The bot joins by name only — visible in the meeting participant list as
+        //   "Osborn" but with no audio output and no avatar. We do NOT configure any
+        //   `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
+        //   the agent polls Recall's REST transcript API every ~30s
+        //   (see MeetingTranscriptPoller) and feeds new turns into the LLM as
+        //   `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
+        //   LLM not to respond out loud to those messages, only to take notes.
         //
-        //   recording_config.transcript.provider  — transcription provider config
-        //   recording_config.realtime_endpoints   — webhook/websocket delivery
-        //
-        // IMPORTANT:
-        //   - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
-        //   - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
-        //   - `transcription_options` does NOT exist — use `transcript.provider`
-        //   - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
-        //
-        // ARCHITECTURE (post-2026-05-22 redesign):
-        //   Input (meeting → osborn): Recall's documented WebSocket audio protocol.
-        //     `audio_separate_raw` config + websocket realtime endpoint streams
-        //     per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
-        //     /meeting-audio-in WS handler. Bot's own audio is excluded by default
-        //     → zero possibility of feedback loop, no echo cancellation needed.
-        //   Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
-        //     page subscribes to osborn's LiveKit audio track and plays it via
-        //     track.attach(); Recall captures the page's audio output and injects
-        //     into the meeting.
-        //   Webhook transcripts (transcript.data): retained as a SECONDARY signal —
-        //     the agent index.ts handler for this event currently logs but does NOT
-        //     forward to the LLM (intentionally disabled). The Deepgram WS path
-        //     above is the LLM input.
-        const httpBase = webhookBaseUrl.replace(/\/$/, '');
-        const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
+        //   We DO keep `recording_config.transcript.provider.recallai_streaming` so
+        //   Recall actually transcribes the meeting — the REST endpoint we poll
+        //   requires this to be configured, otherwise transcripts are empty.
         const res = await fetch(`${RECALL_BASE_URL}/bot`, {
             method: 'POST',
             headers: {
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
                 recording_config: {
                     transcript: {
                         provider: {
-                            // recallai_streaming is built-in — no external API key needed,
-                            // low-latency, works across all meeting platforms.
-                            // Kept for the secondary webhook signal (display / future use);
-                            // LLM input now comes from the Deepgram WS pipe below.
                             recallai_streaming: {
                                 mode: 'prioritize_low_latency',
                                 language_code: 'en',
                             },
                         },
                     },
-                    // Per-participant raw PCM audio stream. Bot's own audio is excluded
-                    // (we don't set include_bot_in_recording.audio:true).
-                    audio_separate_raw: {},
-                    realtime_endpoints: [
-                        {
-                            // Transcript webhook (secondary signal; LLM forwarding disabled).
-                            type: 'webhook',
-                            url: `${httpBase}/webhook/recall`,
-                            events: ['transcript.data'],
-                        },
-                        {
-                            // Per-participant PCM audio → agent's Deepgram STT pipe.
-                            type: 'websocket',
-                            url: `${wsBase}/meeting-audio-in`,
-                            events: ['audio_separate_raw.data'],
-                        },
-                    ],
-                },
-                output_media: {
-                    camera: {
-                        // `kind` (not `type`) — confirmed from prior debugging.
-                        // The page Recall renders connects to LiveKit and plays osborn's
-                        // TTS audio via track.attach(); Recall captures the page audio.
-                        // The page does NOT call getUserMedia anymore — input now comes
-                        // from the audio_separate_raw WebSocket above.
-                        kind: 'webpage',
-                        config: {
-                            url: outputPageUrl,
-                        },
-                    },
                 },
             }),
         });
@@ -112,9 +61,50 @@ export class RecallClient extends EventEmitter {
             throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
         }
         const bot = (await res.json());
-        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
+        console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
         return bot.id;
     }
+    /**
+     * Fetch the bot's current transcript. Returns an array of "transcript turns"
+     * (each turn = one speaker's utterance) sorted by start time.
+     *
+     * Verified 2026-05-22 against the real us-west-2 API: there is NO simple
+     * `GET /bot/{id}/transcript` convenience endpoint. The actual chain is:
+     *
+     *   1. GET /api/v1/bot/{bot_id}
+     *   2. recordings[0].media_shortcuts.transcript.data.download_url   (S3 signed URL)
+     *   3. GET that URL  →  JSON array of TranscriptTurn objects
+     *
+     * The S3 URL is pre-signed and expires (~6h). Re-fetch step 1 each poll;
+     * don't cache the URL.
+     *
+     * If `recordings[0]` doesn't exist yet (bot still joining or pre-recording),
+     * returns []. Caller (MeetingTranscriptPoller) treats that as "no new turns
+     * yet" and waits for the next tick.
+     */
+    async getTranscript(botId) {
+        const botRes = await fetch(`${RECALL_BASE_URL}/bot/${botId}`, {
+            headers: { 'Authorization': `Token ${this.#apiKey}` },
+        });
+        if (!botRes.ok) {
+            const err = await botRes.text().catch(() => '');
+            throw new Error(`Recall.ai bot fetch failed: ${botRes.status} ${err.substring(0, 200)}`);
+        }
+        const bot = await botRes.json();
+        const downloadUrl = bot.recordings?.[0]?.media_shortcuts?.transcript?.data?.download_url;
+        if (!downloadUrl) {
+            // Recording / transcript not ready yet — pre-call, just-joined, or
+            // recording_done event hasn't fired. Empty result is expected here.
+            return [];
+        }
+        const txRes = await fetch(downloadUrl);
+        if (!txRes.ok) {
+            const err = await txRes.text().catch(() => '');
+            throw new Error(`Recall.ai transcript download failed: ${txRes.status} ${err.substring(0, 200)}`);
+        }
+        const turns = await txRes.json();
+        return Array.isArray(turns) ? turns : [];
+    }
     async leaveMeeting(botId) {
         await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
             method: 'POST',

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "osborn",
-  "version": "0.9.42",
+  "version": "0.9.44",
   "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
   "type": "module",
   "bin": {
@@ -11,7 +11,7 @@
     "dev:logged": "tsx scripts/dev-logged.ts",
     "review": "tsx scripts/review.ts",
     "start": "tsx src/index.ts",
-    "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts && cp src/meeting-output.html dist/",
+    "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
     "room": "tsx src/index.ts --room",
     "prepublishOnly": "npm run build"
   },