osborn 0.9.31 → 0.9.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,12 +17,17 @@ Do not batch — capture as they happen.
17
17
  </what-to-do>
18
18
 
19
19
  <trigger-phrases>
20
- - "grill me"
20
+ This skill is a META operation — about building a model of the USER THEMSELVES,
21
+ not about any subject matter in the current conversation. The trigger phrases below
22
+ are intentionally specific so they cannot be confused with domain requests in any session.
23
+
24
+ - "update user context"
21
25
  - "learn my language"
22
- - "update my context"
26
+ - "start context interview"
27
+ - "grill me on my language"
23
28
  - "learn how I talk"
24
29
  - "standardise my language"
25
- - "update user context"
30
+ - "update my context"
26
31
  </trigger-phrases>
27
32
 
28
33
  <supporting-info>
package/dist/index.js CHANGED
@@ -10,6 +10,7 @@ initializeLogger({ pretty: true, level: 'info' });
10
10
  import { setMaxListeners } from 'node:events';
11
11
  setMaxListeners(50);
12
12
  import { createServer } from 'http';
13
+ import { WebSocket, WebSocketServer } from 'ws';
13
14
  import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
14
15
  import { dirname, join } from 'node:path';
15
16
  import { fileURLToPath } from 'node:url';
@@ -146,6 +147,73 @@ process.on('uncaughtException', (error) => {
146
147
  // ============================================================
147
148
  // Module-level room code so the HTTP server can expose it via GET /room-code
148
149
  let currentRoomCode = null;
150
+ // Meeting output WebSocket — module-level so both startApiServer and main() can access it
151
+ let meetingOutputWs = null;
152
/**
 * Best-effort push of a JSON control message to the meeting output page.
 *
 * Does nothing when no meeting browser is connected or its socket is not OPEN.
 * Serialization and send failures are logged and never propagate to the caller,
 * so a broken meeting page cannot interrupt the live voice session.
 *
 * @param {unknown} msg - JSON-serializable payload, e.g. `{ type: 'speak', text }`.
 * @returns {void}
 */
function sendToMeetingOutput(msg) {
    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN) {
        return;
    }
    try {
        meetingOutputWs.send(JSON.stringify(msg));
    }
    catch (err) {
        // Still best-effort, but leave a trace instead of swallowing silently.
        console.warn('⚠️ Meeting output send failed:', err);
    }
}
160
/**
 * Concatenate 16-bit PCM chunks and wrap them in a canonical 44-byte WAV header.
 *
 * @private
 * @param {ArrayLike<number>[]} chunks - PCM sample chunks (copied into an Int16Array).
 * @param {number} sampleRate - Samples per second written to the header.
 * @param {number} numChannels - Channel count written to the header.
 * @returns {Buffer} Complete little-endian RIFF/WAVE file contents.
 */
function encodePcm16ToWav(chunks, sampleRate, numChannels) {
    const totalSamples = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const pcm = new Int16Array(totalSamples);
    let offset = 0;
    for (const chunk of chunks) {
        pcm.set(chunk, offset);
        offset += chunk.length;
    }
    const headerBytes = 44;
    const bytesPerSample = 2;
    const dataBytes = pcm.length * bytesPerSample;
    const wav = Buffer.alloc(headerBytes + dataBytes);
    wav.write('RIFF', 0);
    wav.writeUInt32LE(36 + dataBytes, 4); // RIFF chunk size = file size - 8
    wav.write('WAVE', 8);
    wav.write('fmt ', 12);
    wav.writeUInt32LE(16, 16); // fmt chunk size
    wav.writeUInt16LE(1, 20); // audio format 1 = PCM
    wav.writeUInt16LE(numChannels, 22);
    wav.writeUInt32LE(sampleRate, 24);
    wav.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28); // byte rate
    wav.writeUInt16LE(numChannels * bytesPerSample, 32); // block align
    wav.writeUInt16LE(16, 34); // bits per sample
    wav.write('data', 36);
    wav.writeUInt32LE(dataBytes, 40);
    // Written sample by sample so the output is little-endian on any host.
    for (let i = 0; i < pcm.length; i++) {
        wav.writeInt16LE(pcm[i], headerBytes + i * bytesPerSample);
    }
    return wav;
}

/**
 * Synthesize `text` with the configured TTS provider, WAV-encode the audio and
 * push it to the meeting output page as one binary WebSocket message.
 *
 * Uses the same `ttsConfig` as the live voice session, so there is no separate
 * hardcoded provider. Returns without synthesizing when no meeting browser is
 * connected, and sends nothing when the stream yields no audio frames. The TTS
 * instance is always closed, and close failures are ignored.
 *
 * @param {string} text - Text to speak.
 * @param {object} ttsConfig - Configuration passed to `createTTS`.
 * @returns {Promise<void>}
 */
async function synthesizeForMeeting(text, ttsConfig) {
    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN) {
        return;
    }
    const ttsInstance = createTTS(ttsConfig);
    try {
        const endOfStream = Symbol.for('END_OF_STREAM');
        const chunks = [];
        // Defaults used until a frame reports its own format.
        let sampleRate = 24000;
        let numChannels = 1;
        for await (const event of ttsInstance.synthesize(text)) {
            if (event === endOfStream) {
                break;
            }
            const frame = event?.frame;
            if (frame?.data) {
                chunks.push(frame.data);
                sampleRate = frame.sampleRate ?? sampleRate;
                numChannels = frame.numChannels ?? numChannels;
            }
        }
        if (chunks.length === 0) {
            return;
        }
        const wav = encodePcm16ToWav(chunks, sampleRate, numChannels);
        // Re-check: the browser may have disconnected while we were synthesizing.
        if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
            meetingOutputWs.send(wav);
            console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
        }
    }
    finally {
        // Awaited inside try/catch so that a synchronous close() (no promise to
        // chain `.catch` on) or a throwing close() cannot mask the real error.
        try {
            await ttsInstance.close();
        }
        catch {
            // Cleanup is best-effort.
        }
    }
}
149
217
  function startApiServer(workingDir, port) {
150
218
  const server = createServer(async (req, res) => {
151
219
  // CORS headers for cloud frontend
@@ -891,6 +959,32 @@ function startApiServer(workingDir, port) {
891
959
  };
892
960
  cleanStaleUploadDirs();
893
961
  setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
962
+ // ============================================================
963
+ // Meeting Output WebSocket — /meeting-audio
964
+ // ============================================================
965
+ // Recall's headless browser opens meeting-output.html which connects here.
966
+ // We push: JSON { type: 'speak', text } for display, and WAV-encoded audio as binary frames.
967
+ const meetingOutputWss = new WebSocketServer({ noServer: true });
968
+ meetingOutputWss.on('connection', (ws) => {
969
+ console.log('📺 Meeting output browser connected');
970
+ meetingOutputWs = ws;
971
+ ws.on('close', () => {
972
+ console.log('📺 Meeting output browser disconnected');
973
+ if (meetingOutputWs === ws)
974
+ meetingOutputWs = null;
975
+ });
976
+ });
977
+ server.on('upgrade', (req, socket, head) => {
978
+ const url = new URL(req.url || '/', `http://localhost:${port}`);
979
+ if (url.pathname === '/meeting-audio') {
980
+ meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
981
+ meetingOutputWss.emit('connection', ws, req);
982
+ });
983
+ }
984
+ else {
985
+ socket.destroy();
986
+ }
987
+ });
894
988
  server.on('error', (err) => {
895
989
  if (err.code === 'EADDRINUSE') {
896
990
  console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
@@ -1112,6 +1206,7 @@ async function main() {
1112
1206
  // session-only path (no user prefix).
1113
1207
  let currentUserId = '';
1114
1208
  let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
1209
+ // meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
1115
1210
  // Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
1116
1211
  // Updated by resume_session, session_selected, continue_session, switch_session handlers
1117
1212
  let currentResumeSessionId;
@@ -1728,6 +1823,17 @@ async function main() {
1728
1823
  }
1729
1824
  const sayId = Date.now(); // simple ID to correlate start/end logs
1730
1825
  console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
1826
+ // Forward spoken text + audio to meeting output page when bot is in a meeting.
1827
+ // Text appears immediately; audio uses the same configured TTS (directConfig.tts)
1828
+ // so voice/provider stays consistent — no separate hardcoded provider.
1829
+ // PCM frames are WAV-encoded and pushed as binary WebSocket frames.
1830
+ // Recall captures the browser page's audio output and injects it into the meeting.
1831
+ if (activeMeetingBotId) {
1832
+ sendToMeetingOutput({ type: 'speak', text: data.text });
1833
+ if (meetingOutputWs) {
1834
+ synthesizeForMeeting(data.text, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
1835
+ }
1836
+ }
1731
1837
  try {
1732
1838
  const handle = currentSession.say(data.text);
1733
1839
  if (handle && typeof handle.addDoneCallback === 'function') {
@@ -1,18 +1,113 @@
1
1
  <!DOCTYPE html>
2
2
  <html>
3
- <head><title>Osborn Meeting Output</title></head>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Osborn</title>
6
+ <style>
7
+ * { margin: 0; padding: 0; box-sizing: border-box; }
8
+ body {
9
+ background: #0a0a0f;
10
+ color: #ffffff;
11
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
12
+ width: 100vw;
13
+ height: 100vh;
14
+ display: flex;
15
+ flex-direction: column;
16
+ align-items: center;
17
+ justify-content: center;
18
+ overflow: hidden;
19
+ }
20
+ #header {
21
+ position: absolute;
22
+ top: 24px;
23
+ left: 50%;
24
+ transform: translateX(-50%);
25
+ display: flex;
26
+ align-items: center;
27
+ gap: 10px;
28
+ }
29
+ #dot {
30
+ width: 8px;
31
+ height: 8px;
32
+ border-radius: 50%;
33
+ background: #333;
34
+ transition: background 0.3s;
35
+ }
36
+ #dot.speaking { background: #4ade80; box-shadow: 0 0 8px #4ade80; }
37
+ #dot.connected { background: #3b82f6; }
38
+ #name {
39
+ font-size: 13px;
40
+ font-weight: 600;
41
+ letter-spacing: 0.1em;
42
+ text-transform: uppercase;
43
+ color: #555;
44
+ }
45
+ #speech {
46
+ max-width: 80%;
47
+ text-align: center;
48
+ font-size: 28px;
49
+ font-weight: 400;
50
+ line-height: 1.4;
51
+ color: #f0f0f0;
52
+ opacity: 0;
53
+ transition: opacity 0.4s ease;
54
+ min-height: 2em;
55
+ }
56
+ #speech.visible { opacity: 1; }
57
+ #idle {
58
+ font-size: 14px;
59
+ color: #2a2a2a;
60
+ margin-top: 16px;
61
+ transition: opacity 0.4s;
62
+ }
63
+ #idle.hidden { opacity: 0; }
64
+ </style>
65
+ </head>
4
66
  <body>
5
- <script>
6
- const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
67
+ <div id="header">
68
+ <div id="dot"></div>
69
+ <div id="name">Osborn</div>
70
+ </div>
71
+ <div id="speech"></div>
72
+ <div id="idle">Listening…</div>
73
+ <script>
74
+ const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
75
+ const speechEl = document.getElementById('speech')
76
+ const dotEl = document.getElementById('dot')
77
+ const idleEl = document.getElementById('idle')
78
+
79
+ // Persistent AudioContext — created ONCE at startup per Recall's own demo pattern
80
+ let audioCtx = null
81
+ let clearTimer = null
82
+
83
/**
 * Create the page's single persistent AudioContext and try to start it.
 *
 * Never rejects: when Web Audio is unavailable or the context cannot be
 * created or resumed, the problem is logged and the page carries on, so the
 * caption WebSocket still connects. `audioCtx` stays null only if no context
 * could be created.
 *
 * @returns {Promise<void>}
 */
async function initAudio() {
  const AudioContextCtor = window.AudioContext || window.webkitAudioContext
  if (!AudioContextCtor) {
    console.warn('AudioContext unavailable — meeting audio disabled')
    return
  }
  try {
    audioCtx = new AudioContextCtor()
    // Recall's headless Chrome may start the context suspended — resume right
    // away instead of waiting for a user gesture that will never come.
    if (audioCtx.state === 'suspended') {
      await audioCtx.resume()
    }
  } catch (e) {
    console.error('Audio init error:', e)
  }
}
7
91
 
8
- function connect() {
9
- const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
10
- const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?bot_id=${botId}`)
92
/**
 * Display an utterance as the on-screen caption and mark the bot as speaking.
 * The caption is cleared automatically 6 seconds after the latest call.
 *
 * @param {string} text - Caption text; assigned via textContent.
 */
function showSpeech(text) {
  // Cancel any countdown still pending from a previous utterance.
  if (clearTimer) clearTimeout(clearTimer)

  idleEl.classList.add('hidden')
  dotEl.className = 'speaking'
  speechEl.textContent = text
  speechEl.classList.add('visible')

  clearTimer = setTimeout(clearSpeech, 6000)
}
100
+
101
/**
 * Hide the caption and return the page to its idle look: the status dot goes
 * back to 'connected', the idle hint is shown again, and the pending
 * auto-clear handle is forgotten.
 */
function clearSpeech() {
  clearTimer = null
  idleEl.classList.remove('hidden')
  speechEl.classList.remove('visible')
  dotEl.className = 'connected'
}
11
107
 
12
- ws.onmessage = async (event) => {
108
+ async function playAudio(arrayBuffer) {
109
+ if (!audioCtx) return
13
110
  try {
14
- const audioCtx = new (window.AudioContext || window.webkitAudioContext)()
15
- const arrayBuffer = await event.data.arrayBuffer()
16
111
  const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer)
17
112
  const source = audioCtx.createBufferSource()
18
113
  source.buffer = audioBuffer
@@ -23,10 +118,40 @@
23
118
  }
24
119
  }
25
120
 
26
- ws.onclose = () => setTimeout(connect, 1000)
27
- }
121
/**
 * Open the /meeting-audio WebSocket on the page's own host and wire handlers:
 * binary messages are played as audio, JSON text messages drive the caption
 * ('speak' shows text, 'clear' hides it). Reconnects 1.5 s after a close.
 */
function connect() {
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
  // Encode the bot id: a raw '#', '&' or space would corrupt the query string
  // or produce a URL the WebSocket constructor rejects.
  const query = `bot_id=${encodeURIComponent(botId)}`
  const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?${query}`)
  // Receive binary frames as ArrayBuffer so they can be decoded directly.
  ws.binaryType = 'arraybuffer'

  ws.onopen = () => {
    console.log('Connected to meeting-audio WebSocket')
    dotEl.className = 'connected'
  }

  ws.onmessage = async (event) => {
    if (event.data instanceof ArrayBuffer) {
      await playAudio(event.data)
      return
    }
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'speak' && msg.text) {
        showSpeech(msg.text)
      } else if (msg.type === 'clear') {
        clearSpeech()
      }
    } catch (e) {
      console.error('Parse error:', e)
    }
  }

  ws.onclose = () => {
    // Drop the status colour while disconnected, then retry.
    dotEl.className = ''
    setTimeout(connect, 1500)
  }
}
28
153
 
29
- connect()
30
- </script>
154
+ initAudio().then(() => connect())
155
+ </script>
31
156
  </body>
32
157
  </html>
@@ -4,18 +4,33 @@ export interface RecallBot {
4
4
  meeting_url: string;
5
5
  status: string;
6
6
  }
7
- export interface TranscriptWord {
8
- text: string;
9
- start_time: number;
10
- end_time: number;
11
- }
12
7
  export interface TranscriptPayload {
13
- bot_id: string;
14
- transcript: {
15
- speaker: string;
16
- words: TranscriptWord[];
17
- is_final: boolean;
18
- language?: string;
8
+ event: string;
9
+ data: {
10
+ data: {
11
+ words: Array<{
12
+ text: string;
13
+ start_timestamp?: {
14
+ relative?: number;
15
+ };
16
+ end_timestamp?: {
17
+ relative?: number;
18
+ };
19
+ }>;
20
+ language_code?: string;
21
+ participant?: {
22
+ id: number;
23
+ name: string;
24
+ is_host?: boolean;
25
+ platform?: string;
26
+ };
27
+ };
28
+ bot?: {
29
+ id: string;
30
+ };
31
+ recording?: {
32
+ id: string;
33
+ };
19
34
  };
20
35
  }
21
36
  export declare class RecallClient extends EventEmitter {
@@ -9,6 +9,17 @@ export class RecallClient extends EventEmitter {
9
9
  this.#apiKey = apiKey;
10
10
  }
11
11
  async joinMeeting(meetingUrl, webhookBaseUrl, botName = 'Osborn') {
12
+ // Authoritative structure per https://docs.recall.ai/reference/bot_create
13
+ // and https://docs.recall.ai/docs/real-time-transcription:
14
+ //
15
+ // recording_config.transcript.provider — transcription provider config
16
+ // recording_config.realtime_endpoints — webhook/websocket delivery
17
+ //
18
+ // IMPORTANT:
19
+ // - Field is `realtime_endpoints` (NOT `real_time_endpoints`)
20
+ // - `url` and `events` are flat on the endpoint object (NOT nested under `config`)
21
+ // - `transcription_options` does NOT exist — use `transcript.provider`
22
+ // - Both transcript.provider AND realtime_endpoints must be set, or no events delivered
12
23
  const res = await fetch(`${RECALL_BASE_URL}/bot`, {
13
24
  method: 'POST',
14
25
  headers: {
@@ -19,24 +30,26 @@ export class RecallClient extends EventEmitter {
19
30
  meeting_url: meetingUrl,
20
31
  bot_name: botName,
21
32
  recording_config: {
22
- // Field names must match Recall API exactly (no underscore in realtime_endpoints).
23
- // real_time_endpoints was silently ignored — API uses realtime_endpoints.
33
+ transcript: {
34
+ provider: {
35
+ // recallai_streaming is built-in — no external API key needed,
36
+ // low-latency, works across all meeting platforms.
37
+ recallai_streaming: {
38
+ mode: 'prioritize_low_latency',
39
+ language_code: 'en',
40
+ },
41
+ },
42
+ },
24
43
  realtime_endpoints: [{
25
44
  type: 'webhook',
26
- config: {
27
- url: `${webhookBaseUrl}/webhook/recall`,
28
- events: ['transcript.data'],
29
- },
45
+ url: `${webhookBaseUrl}/webhook/recall`,
46
+ events: ['transcript.data'],
30
47
  }],
31
- transcription_options: {
32
- provider: 'assembly_ai',
33
- mode: 'prioritize_low_latency',
34
- },
35
48
  },
36
49
  output_media: {
37
50
  camera: {
38
- // Recall API expects `kind` (not `type`); the wrong key arrives as null and
39
- // gets rejected as "Invalid choice null. Expected 'webpage' or 'default'."
51
+ // `kind` (not `type`) confirmed from prior debugging.
52
+ // Output webpage plays TTS audio so meeting participants can hear the agent.
40
53
  kind: 'webpage',
41
54
  config: {
42
55
  url: `${webhookBaseUrl}/meeting-output`,
@@ -69,16 +82,16 @@ export class RecallClient extends EventEmitter {
69
82
  return bot.status_changes?.at(-1)?.code ?? 'unknown';
70
83
  }
71
84
  handleWebhook(payload) {
72
- if (!payload.transcript?.is_final)
85
+ // Only process final transcripts (transcript.data), skip partials
86
+ if (payload.event !== 'transcript.data')
73
87
  return;
74
- const text = payload.transcript.words.map(w => w.text).join(' ').trim();
88
+ const words = payload.data?.data?.words ?? [];
89
+ const text = words.map(w => w.text).join(' ').trim();
75
90
  if (!text)
76
91
  return;
77
- this.emit('transcript', {
78
- botId: payload.bot_id,
79
- speaker: payload.transcript.speaker,
80
- text,
81
- });
92
+ const speaker = payload.data?.data?.participant?.name ?? 'Unknown';
93
+ const botId = payload.data?.bot?.id ?? 'unknown';
94
+ this.emit('transcript', { botId, speaker, text });
82
95
  }
83
96
  registerBot(botId, sessionId) {
84
97
  this.#activeBots.set(botId, sessionId);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "osborn",
3
- "version": "0.9.31",
3
+ "version": "0.9.33",
4
4
  "description": "Voice AI coding assistant - local agent that connects to Osborn frontend",
5
5
  "type": "module",
6
6
  "bin": {