osborn 0.9.32 → 0.9.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +124 -1
- package/dist/meeting-output.html +138 -13
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -10,6 +10,7 @@ initializeLogger({ pretty: true, level: 'info' });
|
|
|
10
10
|
import { setMaxListeners } from 'node:events';
|
|
11
11
|
setMaxListeners(50);
|
|
12
12
|
import { createServer } from 'http';
|
|
13
|
+
import { WebSocket, WebSocketServer } from 'ws';
|
|
13
14
|
import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, mkdtempSync, cpSync, rmSync, statSync, createWriteStream } from 'node:fs';
|
|
14
15
|
import { dirname, join } from 'node:path';
|
|
15
16
|
import { fileURLToPath } from 'node:url';
|
|
@@ -146,6 +147,73 @@ process.on('uncaughtException', (error) => {
|
|
|
146
147
|
// ============================================================
|
|
147
148
|
// Module-level room code so the HTTP server can expose it via GET /room-code
|
|
148
149
|
let currentRoomCode = null;
|
|
150
|
+
// Meeting output WebSocket — module-level so both startApiServer and main() can access it
|
|
151
|
+
let meetingOutputWs = null;
|
|
152
|
+
/**
 * Send a JSON control message to the meeting output page, if one is connected.
 *
 * Best-effort: the message is dropped when no socket is open, and a failed
 * serialisation or send is logged instead of thrown, so callers never need to
 * guard this call.
 *
 * @param {unknown} msg - JSON-serialisable payload, e.g. `{ type: 'speak', text }`.
 * @returns {void}
 */
function sendToMeetingOutput(msg) {
    // Snapshot the module-level socket so the check and the send use the same object.
    const socket = meetingOutputWs;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
        return;
    }
    try {
        socket.send(JSON.stringify(msg));
    }
    catch (err) {
        // JSON.stringify throws on cyclic values/BigInt and send() can throw on a
        // socket that is closing; report it rather than hiding the failure.
        console.warn('⚠️ Meeting output send failed:', err instanceof Error ? err.message : err);
    }
}
|
|
160
|
+
// Synthesize text using the configured TTS provider, WAV-encode, and push to meeting browser.
|
|
161
|
+
// Uses the same ttsConfig as the live voice session — no separate hardcoded provider.
|
|
162
|
+
/**
 * Wrap 16-bit PCM sample chunks in a canonical 44-byte RIFF/WAVE header.
 *
 * @param {ArrayLike<number>[]} chunks - Sample chunks, concatenated in order.
 *   Assumes each chunk holds signed 16-bit samples (interleaved when
 *   multi-channel) — TODO confirm against the TTS provider's frame format.
 * @param {number} sampleRate - Samples per second, written to the header.
 * @param {number} numChannels - Channel count, written to the header.
 * @returns {Buffer} Complete WAV file contents (header + little-endian samples).
 */
function encodePcm16Wav(chunks, sampleRate, numChannels) {
    const BYTES_PER_SAMPLE = 2;
    const HEADER_BYTES = 44;
    // Concatenate through an Int16Array so values are coerced to int16 before encoding.
    const totalSamples = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const pcm = new Int16Array(totalSamples);
    let offset = 0;
    for (const chunk of chunks) {
        pcm.set(chunk, offset);
        offset += chunk.length;
    }
    const dataBytes = pcm.length * BYTES_PER_SAMPLE;
    const wav = Buffer.alloc(HEADER_BYTES + dataBytes);
    wav.write('RIFF', 0);
    wav.writeUInt32LE(36 + dataBytes, 4); // RIFF chunk size = file size - 8
    wav.write('WAVE', 8);
    wav.write('fmt ', 12);
    wav.writeUInt32LE(16, 16); // fmt chunk size
    wav.writeUInt16LE(1, 20); // audio format 1 = PCM
    wav.writeUInt16LE(numChannels, 22);
    wav.writeUInt32LE(sampleRate, 24);
    wav.writeUInt32LE(sampleRate * numChannels * BYTES_PER_SAMPLE, 28); // byte rate
    wav.writeUInt16LE(numChannels * BYTES_PER_SAMPLE, 32); // block align
    wav.writeUInt16LE(16, 34); // bits per sample
    wav.write('data', 36);
    wav.writeUInt32LE(dataBytes, 40);
    // Written sample-by-sample so the output is little-endian on any host.
    for (let i = 0; i < pcm.length; i++) {
        wav.writeInt16LE(pcm[i], HEADER_BYTES + i * BYTES_PER_SAMPLE);
    }
    return wav;
}

/**
 * Synthesize `text` with the configured TTS provider, WAV-encode the audio and
 * push it to the meeting output page as one binary WebSocket frame.
 *
 * Does nothing when no meeting output socket is open, when `text` is a blank
 * string, or when the provider yields no audio frames. The TTS instance is
 * always closed, whether synthesis succeeds or fails.
 *
 * @param {string} text - Text to speak.
 * @param {unknown} ttsConfig - Passed straight to `createTTS`; the same config
 *   the live voice session uses, so no provider is hardcoded here.
 * @returns {Promise<void>} Rejects if synthesis or the send throws.
 */
async function synthesizeForMeeting(text, ttsConfig) {
    if (!meetingOutputWs || meetingOutputWs.readyState !== WebSocket.OPEN) {
        return;
    }
    // Blank text produces no audio; skip creating a TTS instance for it.
    if (typeof text === 'string' && text.trim() === '') {
        return;
    }
    const ttsInstance = createTTS(ttsConfig);
    try {
        const chunks = [];
        // Defaults used until a frame reports its own format.
        let sampleRate = 24000;
        let numChannels = 1;
        for await (const event of ttsInstance.synthesize(text)) {
            if (event === Symbol.for('END_OF_STREAM')) {
                break;
            }
            const frame = event?.frame;
            if (frame?.data) {
                chunks.push(frame.data);
                sampleRate = frame.sampleRate ?? sampleRate;
                numChannels = frame.numChannels ?? numChannels;
            }
        }
        if (chunks.length === 0) {
            return;
        }
        const wav = encodePcm16Wav(chunks, sampleRate, numChannels);
        // Re-check: the socket may have closed (or been replaced) while synthesizing.
        if (meetingOutputWs && meetingOutputWs.readyState === WebSocket.OPEN) {
            meetingOutputWs.send(wav);
            console.log(`📺 Meeting audio sent (${wav.byteLength} bytes, ${sampleRate}Hz)`);
        }
    }
    finally {
        // Best-effort cleanup; a failed close must not mask the synthesis result.
        await ttsInstance.close().catch(() => { });
    }
}
|
|
149
217
|
function startApiServer(workingDir, port) {
|
|
150
218
|
const server = createServer(async (req, res) => {
|
|
151
219
|
// CORS headers for cloud frontend
|
|
@@ -891,6 +959,32 @@ function startApiServer(workingDir, port) {
|
|
|
891
959
|
};
|
|
892
960
|
cleanStaleUploadDirs();
|
|
893
961
|
setInterval(cleanStaleUploadDirs, 10 * 60 * 1000);
|
|
962
|
+
// ============================================================
// Meeting Output WebSocket — /meeting-audio
// ============================================================
// Recall's headless browser opens meeting-output.html which connects here.
// We push: JSON { type: 'speak', text } text frames for display, and
// WAV-encoded audio as binary frames for playback.
const meetingOutputWss = new WebSocketServer({ noServer: true });
meetingOutputWss.on('connection', (ws) => {
    console.log('📺 Meeting output browser connected');
    // Only the most recent connection receives output.
    meetingOutputWs = ws;
    // An 'error' event with no listener is thrown by EventEmitter; log it instead.
    ws.on('error', (err) => {
        console.warn('⚠️ Meeting output socket error:', err instanceof Error ? err.message : err);
    });
    ws.on('close', () => {
        console.log('📺 Meeting output browser disconnected');
        // Do not clear the reference if a newer connection already replaced this one.
        if (meetingOutputWs === ws)
            meetingOutputWs = null;
    });
});
server.on('upgrade', (req, socket, head) => {
    let pathname;
    try {
        pathname = new URL(req.url || '/', `http://localhost:${port}`).pathname;
    }
    catch {
        // The URL constructor throws on a malformed request target; refuse the
        // upgrade rather than letting the exception escape the event handler.
        socket.destroy();
        return;
    }
    // /meeting-audio is the only WebSocket endpoint; reject every other path.
    if (pathname !== '/meeting-audio') {
        socket.destroy();
        return;
    }
    meetingOutputWss.handleUpgrade(req, socket, head, (ws) => {
        meetingOutputWss.emit('connection', ws, req);
    });
});
|
|
894
988
|
server.on('error', (err) => {
|
|
895
989
|
if (err.code === 'EADDRINUSE') {
|
|
896
990
|
console.warn(`⚠️ API port ${port} in use, trying ${port + 1}...`);
|
|
@@ -1112,6 +1206,7 @@ async function main() {
|
|
|
1112
1206
|
// session-only path (no user prefix).
|
|
1113
1207
|
let currentUserId = '';
|
|
1114
1208
|
let activeMeetingBotId = null; // Recall.ai bot ID if in a meeting
|
|
1209
|
+
// meetingOutputWs is module-level (see top of file) — shared between startApiServer and main()
|
|
1115
1210
|
// Track the active resume session ID across scopes (ParticipantConnected + DataReceived)
|
|
1116
1211
|
// Updated by resume_session, session_selected, continue_session, switch_session handlers
|
|
1117
1212
|
let currentResumeSessionId;
|
|
@@ -1728,6 +1823,17 @@ async function main() {
|
|
|
1728
1823
|
}
|
|
1729
1824
|
const sayId = Date.now(); // simple ID to correlate start/end logs
|
|
1730
1825
|
console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
|
|
1826
|
+
// While the bot is in a meeting, mirror what is spoken onto the meeting output page.
// The caption is sent first so it shows immediately; the audio follows once it has
// been synthesized with the session's own TTS config (directConfig.tts), keeping the
// voice/provider consistent with the live session.
// The audio is WAV-encoded and delivered as a binary WebSocket frame; Recall captures
// the page's audio output and injects it into the meeting.
if (activeMeetingBotId) {
    const spokenText = data.text;
    sendToMeetingOutput({ type: 'speak', text: spokenText });
    if (meetingOutputWs) {
        // Fire-and-forget: a TTS failure here must not block the live session.say below.
        synthesizeForMeeting(spokenText, directConfig.tts).catch((err) => console.warn('⚠️ Meeting TTS error:', err));
    }
}
|
|
1731
1837
|
try {
|
|
1732
1838
|
const handle = currentSession.say(data.text);
|
|
1733
1839
|
if (handle && typeof handle.addDoneCallback === 'function') {
|
|
@@ -2975,10 +3081,27 @@ async function main() {
|
|
|
2975
3081
|
clearInterval(readyInterval);
|
|
2976
3082
|
console.log('✅ agent_ready retries complete');
|
|
2977
3083
|
}, 20000);
|
|
2978
|
-
// Stop agent_ready retries on user speech
|
|
3084
|
+
// Stop agent_ready retries on user speech, and interrupt agent TTS at VAD onset.
|
|
3085
|
+
// Previously the interrupt only fired when STT committed a full transcript (chat()
|
|
3086
|
+
// call), which let the agent talk over the user for the full utterance. Firing it
|
|
3087
|
+
// here cuts TTS the moment VAD detects speech.
|
|
3088
|
+
// Realtime providers (OpenAI/Gemini) handle interruption server-side via their own
|
|
3089
|
+
// VAD — calling interrupt() manually for Gemini specifically crashes its state
|
|
3090
|
+
// machine (code 1008, hangs in 'speaking'), so skip those.
|
|
2979
3091
|
session.on('input_speech_started', () => {
    // User speech shows the frontend is live, so stop the agent_ready retries.
    readySent = true;
    clearInterval(readyInterval);
    // Interrupt manually only while the agent is speaking, and never in realtime mode.
    const shouldInterrupt = agentState === 'speaking' && sessionVoiceMode !== 'realtime';
    if (shouldInterrupt) {
        try {
            console.log('🎤 VAD onset → interrupting agent TTS');
            currentSession?.interrupt();
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : err;
            console.warn('⚠️ VAD-onset interrupt failed:', reason);
        }
    }
});
|
|
2983
3106
|
// Greet user via TTS (delayed if resume prompt will be shown)
|
|
2984
3107
|
// For realtime mode: use generateReply() since there's no standalone TTS
|
package/dist/meeting-output.html
CHANGED
|
@@ -1,18 +1,113 @@
|
|
|
1
1
|
<!DOCTYPE html>
|
|
2
2
|
<html>
|
|
3
|
-
<head
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8">
|
|
5
|
+
<title>Osborn</title>
|
|
6
|
+
<style>
|
|
7
|
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
8
|
+
body {
|
|
9
|
+
background: #0a0a0f;
|
|
10
|
+
color: #ffffff;
|
|
11
|
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
|
12
|
+
width: 100vw;
|
|
13
|
+
height: 100vh;
|
|
14
|
+
display: flex;
|
|
15
|
+
flex-direction: column;
|
|
16
|
+
align-items: center;
|
|
17
|
+
justify-content: center;
|
|
18
|
+
overflow: hidden;
|
|
19
|
+
}
|
|
20
|
+
#header {
|
|
21
|
+
position: absolute;
|
|
22
|
+
top: 24px;
|
|
23
|
+
left: 50%;
|
|
24
|
+
transform: translateX(-50%);
|
|
25
|
+
display: flex;
|
|
26
|
+
align-items: center;
|
|
27
|
+
gap: 10px;
|
|
28
|
+
}
|
|
29
|
+
#dot {
|
|
30
|
+
width: 8px;
|
|
31
|
+
height: 8px;
|
|
32
|
+
border-radius: 50%;
|
|
33
|
+
background: #333;
|
|
34
|
+
transition: background 0.3s;
|
|
35
|
+
}
|
|
36
|
+
#dot.speaking { background: #4ade80; box-shadow: 0 0 8px #4ade80; }
|
|
37
|
+
#dot.connected { background: #3b82f6; }
|
|
38
|
+
#name {
|
|
39
|
+
font-size: 13px;
|
|
40
|
+
font-weight: 600;
|
|
41
|
+
letter-spacing: 0.1em;
|
|
42
|
+
text-transform: uppercase;
|
|
43
|
+
color: #555;
|
|
44
|
+
}
|
|
45
|
+
#speech {
|
|
46
|
+
max-width: 80%;
|
|
47
|
+
text-align: center;
|
|
48
|
+
font-size: 28px;
|
|
49
|
+
font-weight: 400;
|
|
50
|
+
line-height: 1.4;
|
|
51
|
+
color: #f0f0f0;
|
|
52
|
+
opacity: 0;
|
|
53
|
+
transition: opacity 0.4s ease;
|
|
54
|
+
min-height: 2em;
|
|
55
|
+
}
|
|
56
|
+
#speech.visible { opacity: 1; }
|
|
57
|
+
#idle {
|
|
58
|
+
font-size: 14px;
|
|
59
|
+
color: #2a2a2a;
|
|
60
|
+
margin-top: 16px;
|
|
61
|
+
transition: opacity 0.4s;
|
|
62
|
+
}
|
|
63
|
+
#idle.hidden { opacity: 0; }
|
|
64
|
+
</style>
|
|
65
|
+
</head>
|
|
4
66
|
<body>
|
|
5
|
-
<
|
|
6
|
-
|
|
67
|
+
<div id="header">
|
|
68
|
+
<div id="dot"></div>
|
|
69
|
+
<div id="name">Osborn</div>
|
|
70
|
+
</div>
|
|
71
|
+
<div id="speech"></div>
|
|
72
|
+
<div id="idle">Listening…</div>
|
|
73
|
+
<script>
|
|
74
|
+
const botId = new URLSearchParams(window.location.search).get('bot_id') || 'unknown'
|
|
75
|
+
const speechEl = document.getElementById('speech')
|
|
76
|
+
const dotEl = document.getElementById('dot')
|
|
77
|
+
const idleEl = document.getElementById('idle')
|
|
78
|
+
|
|
79
|
+
// Persistent AudioContext — created ONCE at startup per Recall's own demo pattern
|
|
80
|
+
let audioCtx = null
|
|
81
|
+
let clearTimer = null
|
|
82
|
+
|
|
83
|
+
/**
 * Create the page's single persistent AudioContext and try to resume it.
 *
 * Never rejects: when the Web Audio API is missing, or the context cannot be
 * created or resumed, the problem is logged and the promise still resolves, so
 * the caller can go on to open the WebSocket and at least show captions.
 *
 * @returns {Promise<void>}
 */
async function initAudio() {
  const AudioContextCtor = window.AudioContext || window.webkitAudioContext
  if (!AudioContextCtor) {
    console.warn('Web Audio API unavailable; audio playback disabled')
    return
  }
  try {
    audioCtx = new AudioContextCtor()
    // Recall's headless Chrome may start the context suspended — resume immediately,
    // without waiting for a user gesture.
    if (audioCtx.state === 'suspended') {
      await audioCtx.resume()
    }
  } catch (err) {
    console.warn('AudioContext init failed:', err)
  }
}
|
|
7
91
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
92
|
+
/**
 * Show a caption and switch the page into its "speaking" state.
 *
 * Any pending auto-clear is cancelled first, so each new caption gets its own
 * full hold time before `clearSpeech` runs.
 *
 * @param {string} text - Caption to display (assigned via textContent, never parsed as HTML).
 * @param {number} [holdMs=6000] - Milliseconds the caption stays visible before auto-clearing.
 */
function showSpeech(text, holdMs = 6000) {
  if (clearTimer) {
    clearTimeout(clearTimer)
    clearTimer = null
  }
  speechEl.textContent = text
  speechEl.classList.add('visible')
  dotEl.className = 'speaking'
  idleEl.classList.add('hidden')
  clearTimer = setTimeout(clearSpeech, holdMs)
}
|
|
100
|
+
|
|
101
|
+
/**
 * Hide the caption and return the page to its idle "connected" state.
 *
 * Runs either from the auto-clear timer or from an explicit `clear` message.
 * A pending auto-clear timer is cancelled here: otherwise it would still fire
 * later and could hide a caption shown after this call.
 */
function clearSpeech() {
  if (clearTimer) {
    clearTimeout(clearTimer)
    clearTimer = null
  }
  speechEl.classList.remove('visible')
  dotEl.className = 'connected'
  idleEl.classList.remove('hidden')
}
|
|
11
107
|
|
|
12
|
-
|
|
108
|
+
async function playAudio(arrayBuffer) {
|
|
109
|
+
if (!audioCtx) return
|
|
13
110
|
try {
|
|
14
|
-
const audioCtx = new (window.AudioContext || window.webkitAudioContext)()
|
|
15
|
-
const arrayBuffer = await event.data.arrayBuffer()
|
|
16
111
|
const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer)
|
|
17
112
|
const source = audioCtx.createBufferSource()
|
|
18
113
|
source.buffer = audioBuffer
|
|
@@ -23,10 +118,40 @@
|
|
|
23
118
|
}
|
|
24
119
|
}
|
|
25
120
|
|
|
26
|
-
|
|
27
|
-
|
|
121
|
+
/**
 * Open the /meeting-audio WebSocket on the page's own host and wire up its handlers.
 *
 * Binary frames are passed to `playAudio`; text frames are parsed as JSON and
 * handled as `{ type: 'speak', text }` or `{ type: 'clear' }`. When the socket
 * closes, a reconnect is scheduled after 1.5 s.
 */
function connect() {
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
  // botId comes from the page's query string, so encode it before putting it back in a URL.
  const query = `bot_id=${encodeURIComponent(botId)}`
  const ws = new WebSocket(`${protocol}//${window.location.host}/meeting-audio?${query}`)
  // Receive binary frames as ArrayBuffer so they can be handed straight to playAudio.
  ws.binaryType = 'arraybuffer'

  ws.onopen = () => {
    console.log('Connected to meeting-audio WebSocket')
    dotEl.className = 'connected'
  }

  ws.onmessage = async (event) => {
    if (event.data instanceof ArrayBuffer) {
      await playAudio(event.data)
      return
    }
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'speak' && msg.text) {
        showSpeech(msg.text)
      } else if (msg.type === 'clear') {
        clearSpeech()
      }
    } catch (e) {
      console.error('Parse error:', e)
    }
  }

  ws.onerror = () => {
    // Reconnection is handled by onclose below; this only records the error.
    console.warn('meeting-audio WebSocket error')
  }

  ws.onclose = () => {
    dotEl.className = ''
    setTimeout(connect, 1500)
  }
}
|
|
28
153
|
|
|
29
|
-
|
|
30
|
-
</script>
|
|
154
|
+
initAudio().then(() => connect())
|
|
155
|
+
</script>
|
|
31
156
|
</body>
|
|
32
157
|
</html>
|