osborn 0.9.42 → 0.9.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ /**
2
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
3
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
4
+ * messages.
5
+ *
6
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
7
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
8
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
9
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
10
+ * never speaks in the meeting — it's a silent note-taker.
11
+ *
12
+ * Lifecycle:
13
+ * const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
14
+ * poller.start()
15
+ * ...
16
+ * poller.stop() // on leave_meeting / disconnect / session switch
17
+ *
18
+ * Dedup strategy:
19
+ * Each turn carries a `start_timestamp.relative` on its first word (seconds
20
+ * since recording start). We track the highest cursor we've forwarded and
21
+ * only send turns with a strictly greater first-word timestamp. This means
22
+ * re-fetches don't double-deliver, and partial transcripts that get refined
23
+ * later don't re-trigger LLM processing of already-handled turns.
24
+ *
25
+ * Error handling:
26
+ * Transient fetch errors are logged + skipped (poll continues on next tick).
27
+ * No backoff — a failed poll is simply retried on the next tick, so at the
28
+ * default 30s cadence transient failures (including a slow start) self-recover within one cycle.
29
+ */
30
+ import type { RecallClient, TranscriptTurn } from './recall-client.js';
31
+ export interface MeetingTranscriptPollerOptions {
32
+ botId: string; // Recall bot id; passed to `recall.getTranscript` and echoed back in every `onTurns` chunk
33
+ recall: RecallClient; // client whose `getTranscript(botId)` is called on each tick
34
+ /**
+ * Called with the de-duplicated turns found by a poll: `turns` holds the new
+ * turns and `formatted` is the same batch rendered by `formatTurns`. It is not
+ * called when a poll finds nothing new or the batch formats to an empty string.
+ * A returned promise is awaited before the next poll may run.
+ */
35
+ onTurns: (chunk: {
36
+ botId: string;
37
+ turns: TranscriptTurn[];
38
+ formatted: string;
39
+ }) => void | Promise<void>;
40
+ /** Poll interval in milliseconds. Defaults to 30_000 (30s) when omitted. */
41
+ intervalMs?: number;
42
+ /** Optional callback invoked with the error whenever a poll tick fails (the failure is also logged via console.warn). */
43
+ onError?: (err: Error) => void;
44
+ }
45
+ export declare class MeetingTranscriptPoller {
46
+ #private;
47
+ constructor(opts: MeetingTranscriptPollerOptions); // stores the options only; polling begins on start()
48
+ start(): void; // polls once immediately, then every `intervalMs`; does nothing if a timer is already running
49
+ stop(): void; // cancels the interval timer; repeated calls are no-ops
50
+ }
51
+ /**
52
+ * Format an array of turns into a single string for LLM consumption.
53
+ *
54
+ * Each turn becomes one line:
55
+ * <Speaker>: <text>
56
+ *
57
+ * The speaker label is `speaker`, falling back to `participant.name` and then
+ * to 'Unknown'. A turn's words are joined with single spaces and runs of
+ * whitespace are collapsed, so whitespace-only words and zero-content turns
58
+ * are dropped.
+ *
+ * @param turns Transcript turns to render.
+ * @returns The lines joined with '\n', or an empty string if nothing
+ * meaningful is in the batch.
59
+ */
60
+ export declare function formatTurns(turns: TranscriptTurn[]): string;
@@ -0,0 +1,112 @@
1
+ /**
2
+ * MeetingTranscriptPoller — pulls Recall.ai meeting transcripts on a fixed
3
+ * interval and forwards new turns to the LLM as tagged `[MEETING — <botId>]:`
4
+ * messages.
5
+ *
6
+ * This replaces the older LiveKit/WebSocket audio pipeline that streamed raw
7
+ * PCM from Recall into a LiveKit room. The polling architecture is simpler
8
+ * (no parallel STT, no audio pipeline, no participant juggling), survives
9
+ * agent restarts (Recall keeps the transcript on its side), and the LLM
10
+ * never speaks in the meeting — it's a silent note-taker.
11
+ *
12
+ * Lifecycle:
13
+ * const poller = new MeetingTranscriptPoller({ botId, recall, onTurns, intervalMs })
14
+ * poller.start()
15
+ * ...
16
+ * poller.stop() // on leave_meeting / disconnect / session switch
17
+ *
18
+ * Dedup strategy:
19
+ * Each turn carries a `start_timestamp.relative` on its first word (seconds
20
+ * since recording start). We track the highest cursor we've forwarded and
21
+ * only send turns with a strictly greater first-word timestamp. This means
22
+ * re-fetches don't double-deliver, and partial transcripts that get refined
23
+ * later don't re-trigger LLM processing of already-handled turns.
24
+ *
25
+ * Error handling:
26
+ * Transient fetch errors are logged + skipped (poll continues on next tick).
27
+ * No backoff — a failed poll is simply retried on the next tick, so at the
28
+ * default 30s cadence transient failures (including a slow start) self-recover within one cycle.
29
+ */
30
export class MeetingTranscriptPoller {
    /** Caller-supplied options: botId, recall client, onTurns/onError callbacks, intervalMs. */
    #opts;
    /** Handle returned by setInterval while polling is active; null otherwise. */
    #timer = null;
    /** Highest first-word `start_timestamp.relative` that has been handed to `onTurns`. */
    #cursor = -Infinity;
    /** True while a tick is awaiting the fetch or the callback; overlapping ticks are skipped. */
    #inFlight = false;
    /** Set by stop(); makes pending and in-flight ticks bail out without delivering. */
    #stopped = false;

    /**
     * @param opts Poller options. Nothing is fetched until start() is called.
     */
    constructor(opts) {
        this.#opts = opts;
    }

    /**
     * Begin polling: one poll immediately, then one every `intervalMs`
     * (default 30s). Does nothing if a timer is already running. May be called
     * again after stop() to resume from the current cursor.
     */
    start() {
        if (this.#timer)
            return;
        // Clear the stop flag so a stopped poller can be restarted. Without this
        // the new interval would only fire ticks that return early, and stop()
        // (already flagged as stopped) would never clear it.
        this.#stopped = false;
        const requested = this.#opts.intervalMs ?? 30_000;
        // setInterval coerces 0 / negative / NaN delays to ~1ms, which would hammer
        // the API; fall back to the default cadence for such values.
        const interval = Number.isFinite(requested) && requested > 0 ? requested : 30_000;
        console.log(`📓 MeetingTranscriptPoller starting for bot=${this.#opts.botId.substring(0, 8)} (every ${Math.round(interval / 1000)}s)`);
        // Fire once immediately so the LLM sees the meeting started, then on interval.
        void this.#tick();
        this.#timer = setInterval(() => void this.#tick(), interval);
    }

    /**
     * Stop polling and cancel the interval timer. Repeated calls are no-ops.
     * A tick that is already in flight finishes its fetch but delivers nothing.
     */
    stop() {
        if (this.#stopped)
            return;
        this.#stopped = true;
        if (this.#timer) {
            clearInterval(this.#timer);
            this.#timer = null;
        }
        console.log(`📓 MeetingTranscriptPoller stopped for bot=${this.#opts.botId.substring(0, 8)}`);
    }

    /**
     * One poll cycle: fetch the full transcript, keep the turns whose first-word
     * timestamp is beyond the cursor, and hand them to `onTurns`.
     *
     * Turns without a finite numeric first-word timestamp are never forwarded,
     * because they cannot be de-duplicated. Errors are reported through `onError`
     * and console.warn and never propagate to the timer callback.
     */
    async #tick() {
        if (this.#inFlight || this.#stopped)
            return;
        this.#inFlight = true;
        try {
            const all = await this.#opts.recall.getTranscript(this.#opts.botId);
            // stop() may have been called while the fetch was pending; do not
            // deliver turns on behalf of a poller the caller already shut down.
            if (this.#stopped)
                return;
            const fresh = [];
            let nextCursor = this.#cursor;
            for (const turn of Array.isArray(all) ? all : []) {
                const ts = turn?.words?.[0]?.start_timestamp?.relative;
                // A non-finite timestamp (NaN / Infinity) would poison the cursor.
                if (typeof ts !== 'number' || !Number.isFinite(ts))
                    continue;
                if (ts > this.#cursor)
                    fresh.push(turn);
                if (ts > nextCursor)
                    nextCursor = ts;
            }
            if (fresh.length === 0)
                return;
            const formatted = formatTurns(fresh);
            if (!formatted) {
                // Pure-whitespace fresh batch — nothing to deliver, but mark it
                // as handled so it is not re-examined on every tick.
                this.#cursor = nextCursor;
                return;
            }
            console.log(`📓 MeetingTranscriptPoller: ${fresh.length} new turn(s), cursor=${nextCursor.toFixed(1)}s, chars=${formatted.length}`);
            await this.#opts.onTurns({ botId: this.#opts.botId, turns: fresh, formatted });
            // Commit only after the callback succeeded: if onTurns throws or
            // rejects, the same turns are offered again on the next tick instead
            // of being silently lost.
            this.#cursor = nextCursor;
        }
        catch (err) {
            const e = err instanceof Error ? err : new Error(String(err));
            try {
                this.#opts.onError?.(e);
            }
            catch (hookErr) {
                // A faulty error hook must not turn into an unhandled rejection
                // (ticks are fired with `void`, so nobody awaits them).
                console.warn(`⚠️ MeetingTranscriptPoller onError handler threw: ${hookErr instanceof Error ? hookErr.message : String(hookErr)}`);
            }
            console.warn(`⚠️ MeetingTranscriptPoller tick failed: ${e.message}`);
        }
        finally {
            this.#inFlight = false;
        }
    }
}
93
/**
 * Format an array of turns into a single string for LLM consumption.
 *
 * Each turn becomes one line:
 *   <Speaker>: <text>
 *
 * The speaker label is `turn.speaker`, falling back to `turn.participant.name`
 * and then to 'Unknown'; a blank (whitespace-only) label counts as missing.
 * A turn's words are joined with single spaces and runs of whitespace are
 * collapsed. Words with no `text`, whitespace-only words and turns with no
 * content are dropped.
 *
 * @param turns Transcript turns to render. Null entries and turns whose
 *   `words` is not an array are skipped rather than throwing.
 * @returns The lines joined with '\n', or an empty string if nothing
 *   meaningful is in the batch.
 */
export function formatTurns(turns) {
    if (!Array.isArray(turns))
        return '';
    // Collapse whitespace runs (including newlines) so one turn is always one line.
    const squash = (value) => (typeof value === 'string' ? value.replace(/\s+/g, ' ').trim() : '');
    const lines = [];
    for (const turn of turns) {
        const words = Array.isArray(turn?.words) ? turn.words : [];
        // A word without `text` must contribute nothing; joining it directly
        // would put the literal "undefined"/"null" into the LLM input.
        const text = squash(words.map(w => (w?.text == null ? '' : String(w.text))).join(' '));
        if (!text)
            continue;
        const speaker = squash(turn.speaker) || squash(turn.participant?.name) || 'Unknown';
        lines.push(`${speaker}: ${text}`);
    }
    return lines.join('\n');
}
@@ -4,6 +4,36 @@ export interface RecallBot {
4
4
  meeting_url: string;
5
5
  status: string;
6
6
  }
7
+ /**
8
+ * One transcript turn = one speaker's continuous utterance.
9
+ * Element shape of the JSON array downloaded from the bot's transcript
+ * `download_url` (see `RecallClient.getTranscript`); there is no direct
+ * GET /api/v1/bot/{bot_id}/transcript endpoint.
10
+ *
11
+ * Per Recall docs each turn contains:
12
+ * - speaker: participant name (consumers such as `formatTurns` fall back to
+ * `participant.name`, then 'Unknown', when it is absent)
13
+ * - words: array of { text, start_timestamp.relative, end_timestamp.relative }
14
+ * - The `start_timestamp.relative` (seconds since recording start) on the
15
+ * FIRST word is the turn's start; we use this as the dedup cursor.
16
+ */
17
+ export interface TranscriptTurn {
18
+ speaker?: string;
19
+ participant?: {
20
+ id?: number;
21
+ name?: string; // used as the speaker label when `speaker` is missing
22
+ is_host?: boolean;
23
+ };
24
+ words: Array<{
25
+ text: string;
26
+ start_timestamp?: {
27
+ relative?: number; // seconds since recording start; on words[0] this is the dedup cursor value
28
+ absolute?: string;
29
+ };
30
+ end_timestamp?: {
31
+ relative?: number;
32
+ absolute?: string;
33
+ };
34
+ }>;
35
+ language?: string;
36
+ }
7
37
  export interface TranscriptPayload {
8
38
  event: string;
9
39
  data: {
@@ -49,10 +79,28 @@ export declare class RecallClient extends EventEmitter {
49
79
  * room as the osborn agent (no separate WebSocket+WAV pipe).
50
80
  * @param opts.botName Display name of the bot in the meeting
51
81
  */
52
- joinMeeting(meetingUrl: string, webhookBaseUrl: string, opts?: {
53
- outputPageUrl?: string;
82
+ joinMeeting(meetingUrl: string, _webhookBaseUrl: string, opts?: {
54
83
  botName?: string;
55
84
  }): Promise<string>;
85
+ /**
86
+ * Fetch the bot's current transcript. Returns an array of "transcript turns"
87
+ * (each turn = one speaker's utterance) sorted by start time.
88
+ *
89
+ * Verified 2026-05-22 against the real us-west-2 API: there is NO simple
90
+ * `GET /bot/{id}/transcript` convenience endpoint. The actual chain is:
91
+ *
92
+ * 1. GET /api/v1/bot/{bot_id}
93
+ * 2. recordings[0].media_shortcuts.transcript.data.download_url (S3 signed URL)
94
+ * 3. GET that URL → JSON array of TranscriptTurn objects
95
+ *
96
+ * The S3 URL is pre-signed and expires (~6h). Re-fetch step 1 each poll;
97
+ * don't cache the URL.
98
+ *
99
+ * If `recordings[0]` doesn't exist yet (bot still joining or pre-recording),
100
+ * returns []. Caller (MeetingTranscriptPoller) treats that as "no new turns
101
+ * yet" and waits for the next tick.
102
+ */
103
+ getTranscript(botId: string): Promise<TranscriptTurn[]>;
56
104
  leaveMeeting(botId: string): Promise<void>;
57
105
  getBotStatus(botId: string): Promise<string>;
58
106
  handleWebhook(payload: TranscriptPayload): void;
@@ -21,37 +21,20 @@ export class RecallClient extends EventEmitter {
21
21
  * room as the osborn agent (no separate WebSocket+WAV pipe).
22
22
  * @param opts.botName Display name of the bot in the meeting
23
23
  */
24
- async joinMeeting(meetingUrl, webhookBaseUrl, opts) {
24
+ async joinMeeting(meetingUrl, _webhookBaseUrl, opts) {
25
25
  const botName = opts?.botName ?? 'Osborn';
26
- const outputPageUrl = opts?.outputPageUrl ?? `${webhookBaseUrl}/meeting-output`;
27
- // Authoritative structure per https://docs.recall.ai/reference/bot_create
28
- // and https://docs.recall.ai/docs/real-time-transcription:
26
+ // ARCHITECTURE (post-2026-05-22 polling redesign):
27
+ // The bot joins by name only — visible in the meeting participant list as
28
+ // "Osborn" but with no audio output and no avatar. We do NOT configure any
29
+ // `output_media`, `audio_separate_raw`, or `realtime_endpoints` — instead
30
+ // the agent polls Recall's REST transcript API every ~30s
31
+ // (see MeetingTranscriptPoller) and feeds new turns into the LLM as
32
+ // `[MEETING — <botId>]:` tagged messages. The meetings skill teaches the
33
+ // LLM not to respond out loud to those messages, only to take notes.
29
34
  //
30
- // recording_config.transcript.provider transcription provider config
31
- // recording_config.realtime_endpoints webhook/websocket delivery
32
- //
33
- // IMPORTANT:
34
- // - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
35
- // - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
36
- // - `transcription_options` does NOT exist — use `transcript.provider`
37
- // - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
38
- //
39
- // ARCHITECTURE (post-2026-05-22 redesign):
40
- // Input (meeting → osborn): Recall's documented WebSocket audio protocol.
41
- // `audio_separate_raw` config + websocket realtime endpoint streams
42
- // per-participant PCM (S16LE 16kHz mono, base64 in JSON) to the agent's
43
- // /meeting-audio-in WS handler. Bot's own audio is excluded by default
44
- // → zero possibility of feedback loop, no echo cancellation needed.
45
- // Output (osborn → meeting): webpage output_media (LiveKit-on-page). Bot
46
- // page subscribes to osborn's LiveKit audio track and plays it via
47
- // track.attach(); Recall captures the page's audio output and injects
48
- // into the meeting.
49
- // Webhook transcripts (transcript.data): retained as a SECONDARY signal —
50
- // the agent index.ts handler for this event currently logs but does NOT
51
- // forward to the LLM (intentionally disabled). The Deepgram WS path
52
- // above is the LLM input.
53
- const httpBase = webhookBaseUrl.replace(/\/$/, '');
54
- const wsBase = httpBase.replace(/^https?:\/\//, m => m === 'https://' ? 'wss://' : 'ws://');
35
+ // We DO keep `recording_config.transcript.provider.recallai_streaming` so
36
+ // Recall actually transcribes the meeting — the REST endpoint we poll
37
+ // requires this to be configured, otherwise transcripts are empty.
55
38
  const res = await fetch(`${RECALL_BASE_URL}/bot`, {
56
39
  method: 'POST',
57
40
  headers: {
@@ -64,46 +47,12 @@ export class RecallClient extends EventEmitter {
64
47
  recording_config: {
65
48
  transcript: {
66
49
  provider: {
67
- // recallai_streaming is built-in — no external API key needed,
68
- // low-latency, works across all meeting platforms.
69
- // Kept for the secondary webhook signal (display / future use);
70
- // LLM input now comes from the Deepgram WS pipe below.
71
50
  recallai_streaming: {
72
51
  mode: 'prioritize_low_latency',
73
52
  language_code: 'en',
74
53
  },
75
54
  },
76
55
  },
77
- // Per-participant raw PCM audio stream. Bot's own audio is excluded
78
- // (we don't set include_bot_in_recording.audio:true).
79
- audio_separate_raw: {},
80
- realtime_endpoints: [
81
- {
82
- // Transcript webhook (secondary signal; LLM forwarding disabled).
83
- type: 'webhook',
84
- url: `${httpBase}/webhook/recall`,
85
- events: ['transcript.data'],
86
- },
87
- {
88
- // Per-participant PCM audio → agent's Deepgram STT pipe.
89
- type: 'websocket',
90
- url: `${wsBase}/meeting-audio-in`,
91
- events: ['audio_separate_raw.data'],
92
- },
93
- ],
94
- },
95
- output_media: {
96
- camera: {
97
- // `kind` (not `type`) — confirmed from prior debugging.
98
- // The page Recall renders connects to LiveKit and plays osborn's
99
- // TTS audio via track.attach(); Recall captures the page audio.
100
- // The page does NOT call getUserMedia anymore — input now comes
101
- // from the audio_separate_raw WebSocket above.
102
- kind: 'webpage',
103
- config: {
104
- url: outputPageUrl,
105
- },
106
- },
107
56
  },
108
57
  }),
109
58
  });
@@ -112,9 +61,50 @@ export class RecallClient extends EventEmitter {
112
61
  throw new Error(`Recall.ai join failed: ${res.status} ${err}`);
113
62
  }
114
63
  const bot = (await res.json());
115
- console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (output page: ${outputPageUrl})`);
64
+ console.log(`🤖 Recall.ai bot joined meeting: ${bot.id} (polling-only, no audio pipeline)`);
116
65
  return bot.id;
117
66
  }
67
+ /**
68
+ * Fetch the bot's current transcript. Returns an array of "transcript turns"
69
+ * (each turn = one speaker's utterance) sorted by start time.
70
+ *
71
+ * Verified 2026-05-22 against the real us-west-2 API: there is NO simple
72
+ * `GET /bot/{id}/transcript` convenience endpoint. The actual chain is:
73
+ *
74
+ * 1. GET /api/v1/bot/{bot_id}
75
+ * 2. recordings[0].media_shortcuts.transcript.data.download_url (S3 signed URL)
76
+ * 3. GET that URL → JSON array of TranscriptTurn objects
77
+ *
78
+ * The S3 URL is pre-signed and expires (~6h). Re-fetch step 1 each poll;
79
+ * don't cache the URL.
80
+ *
81
+ * If `recordings[0]` doesn't exist yet (bot still joining or pre-recording),
82
+ * returns []. Caller (MeetingTranscriptPoller) treats that as "no new turns
83
+ * yet" and waits for the next tick.
84
+ */
85
+ async getTranscript(botId) {
86
+ const botRes = await fetch(`${RECALL_BASE_URL}/bot/${botId}`, {
87
+ headers: { 'Authorization': `Token ${this.#apiKey}` },
88
+ });
89
+ if (!botRes.ok) {
90
+ const err = await botRes.text().catch(() => '');
91
+ throw new Error(`Recall.ai bot fetch failed: ${botRes.status} ${err.substring(0, 200)}`);
92
+ }
93
+ const bot = await botRes.json();
94
+ const downloadUrl = bot.recordings?.[0]?.media_shortcuts?.transcript?.data?.download_url;
95
+ if (!downloadUrl) {
96
+ // Recording / transcript not ready yet — pre-call, just-joined, or
97
+ // recording_done event hasn't fired. Empty result is expected here.
98
+ return [];
99
+ }
100
+ const txRes = await fetch(downloadUrl);
101
+ if (!txRes.ok) {
102
+ const err = await txRes.text().catch(() => '');
103
+ throw new Error(`Recall.ai transcript download failed: ${txRes.status} ${err.substring(0, 200)}`);
104
+ }
105
+ const turns = await txRes.json();
106
+ return Array.isArray(turns) ? turns : [];
107
+ }
118
108
  async leaveMeeting(botId) {
119
109
  await fetch(`${RECALL_BASE_URL}/bot/${botId}/leave_call`, {
120
110
  method: 'POST',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.42",
3
+ "version": "0.9.44",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {
@@ -11,7 +11,7 @@
11
11
  "dev:logged": "tsx scripts/dev-logged.ts",
12
12
  "review": "tsx scripts/review.ts",
13
13
  "start": "tsx src/index.ts",
14
- "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts && cp src/meeting-output.html dist/",
14
+ "build": "tsc && rm -rf dist/prompts && cp -r src/prompts dist/prompts",
15
15
  "room": "tsx src/index.ts --room",
16
16
  "prepublishOnly": "npm run build"
17
17
  },