@conversionpros/aiva 1.0.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/bin/aiva.js +26 -14
  2. package/lib/bluebubbles.js +145 -0
  3. package/lib/config-gen.js +253 -0
  4. package/lib/constants.js +72 -0
  5. package/lib/launch-agent.js +112 -0
  6. package/lib/prerequisites.js +236 -0
  7. package/lib/process.js +59 -145
  8. package/lib/setup.js +224 -194
  9. package/lib/validate.js +194 -0
  10. package/package.json +9 -34
  11. package/auto-deploy.js +0 -190
  12. package/cli-sync.js +0 -126
  13. package/d2a-prompt-template.txt +0 -106
  14. package/diagnostics-api.js +0 -304
  15. package/docs/ara-dedup-fix-scope.md +0 -112
  16. package/docs/ara-fix-round2-scope.md +0 -61
  17. package/docs/ara-greeting-fix-scope.md +0 -70
  18. package/docs/calendar-date-fix-scope.md +0 -28
  19. package/docs/getting-started.md +0 -115
  20. package/docs/network-architecture-rollout-scope.md +0 -43
  21. package/docs/scope-google-oauth-integration.md +0 -351
  22. package/docs/settings-page-scope.md +0 -50
  23. package/docs/xai-imagine-scope.md +0 -116
  24. package/docs/xai-voice-integration-scope.md +0 -115
  25. package/docs/xai-voice-tools-scope.md +0 -165
  26. package/email-router.js +0 -512
  27. package/follow-up-handler.js +0 -606
  28. package/gateway-monitor.js +0 -158
  29. package/google-email.js +0 -379
  30. package/google-oauth.js +0 -310
  31. package/grok-imagine.js +0 -97
  32. package/health-reporter.js +0 -287
  33. package/invisible-prefix-base.txt +0 -206
  34. package/invisible-prefix-owner.txt +0 -26
  35. package/invisible-prefix-slim.txt +0 -10
  36. package/invisible-prefix.txt +0 -43
  37. package/knowledge-base.js +0 -472
  38. package/lib/cli.js +0 -19
  39. package/lib/server.js +0 -42
  40. package/meta-capi.js +0 -206
  41. package/meta-leads.js +0 -411
  42. package/notion-oauth.js +0 -323
  43. package/public/agent-config.html +0 -241
  44. package/public/aiva-avatar-anime.png +0 -0
  45. package/public/css/docs.css.bak +0 -688
  46. package/public/css/onboarding.css +0 -543
  47. package/public/diagrams/claude-subscription-pool.html +0 -329
  48. package/public/diagrams/claude-subscription-pool.png +0 -0
  49. package/public/docs-icon.png +0 -0
  50. package/public/escalation.html +0 -237
  51. package/public/group-config.html +0 -300
  52. package/public/icon-192.png +0 -0
  53. package/public/icon-512.png +0 -0
  54. package/public/icons/agents.svg +0 -1
  55. package/public/icons/attach.svg +0 -1
  56. package/public/icons/characters.svg +0 -1
  57. package/public/icons/chat.svg +0 -1
  58. package/public/icons/docs.svg +0 -1
  59. package/public/icons/heartbeat.svg +0 -1
  60. package/public/icons/messages.svg +0 -1
  61. package/public/icons/mic.svg +0 -1
  62. package/public/icons/notes.svg +0 -1
  63. package/public/icons/settings.svg +0 -1
  64. package/public/icons/tasks.svg +0 -1
  65. package/public/images/onboarding/p0-communication-layer.png +0 -0
  66. package/public/images/onboarding/p0-infinite-surface.png +0 -0
  67. package/public/images/onboarding/p0-learning-model.png +0 -0
  68. package/public/images/onboarding/p0-meet-aiva.png +0 -0
  69. package/public/images/onboarding/p4-contact-intelligence.png +0 -0
  70. package/public/images/onboarding/p4-context-compounds.png +0 -0
  71. package/public/images/onboarding/p4-message-router.png +0 -0
  72. package/public/images/onboarding/p4-per-contact-rules.png +0 -0
  73. package/public/images/onboarding/p4-send-messages.png +0 -0
  74. package/public/images/onboarding/p6-be-precise.png +0 -0
  75. package/public/images/onboarding/p6-review-escalations.png +0 -0
  76. package/public/images/onboarding/p6-voice-input.png +0 -0
  77. package/public/images/onboarding/p7-completion.png +0 -0
  78. package/public/index.html +0 -11594
  79. package/public/js/onboarding.js +0 -699
  80. package/public/manifest.json +0 -24
  81. package/public/messages-v2.html +0 -2824
  82. package/public/permission-approve.html.bak +0 -107
  83. package/public/permissions.html +0 -150
  84. package/public/styles/design-system.css +0 -68
  85. package/router-db.js +0 -604
  86. package/router-utils.js +0 -28
  87. package/router-v2/adapters/imessage.js +0 -191
  88. package/router-v2/adapters/quo.js +0 -82
  89. package/router-v2/adapters/whatsapp.js +0 -192
  90. package/router-v2/contact-manager.js +0 -234
  91. package/router-v2/conversation-engine.js +0 -498
  92. package/router-v2/data/knowledge-base.json +0 -176
  93. package/router-v2/data/router-v2.db +0 -0
  94. package/router-v2/data/router-v2.db-shm +0 -0
  95. package/router-v2/data/router-v2.db-wal +0 -0
  96. package/router-v2/data/router.db +0 -0
  97. package/router-v2/db.js +0 -457
  98. package/router-v2/escalation-bridge.js +0 -540
  99. package/router-v2/follow-up-engine.js +0 -347
  100. package/router-v2/index.js +0 -441
  101. package/router-v2/ingestion.js +0 -213
  102. package/router-v2/knowledge-base.js +0 -231
  103. package/router-v2/lead-qualifier.js +0 -152
  104. package/router-v2/learning-loop.js +0 -202
  105. package/router-v2/outbound-sender.js +0 -160
  106. package/router-v2/package.json +0 -13
  107. package/router-v2/permission-gate.js +0 -86
  108. package/router-v2/playbook.js +0 -177
  109. package/router-v2/prompts/base.js +0 -52
  110. package/router-v2/prompts/first-contact.js +0 -38
  111. package/router-v2/prompts/lead-qualification.js +0 -37
  112. package/router-v2/prompts/scheduling.js +0 -72
  113. package/router-v2/prompts/style-overrides.js +0 -22
  114. package/router-v2/scheduler.js +0 -301
  115. package/router-v2/scripts/migrate-v1-to-v2.js +0 -215
  116. package/router-v2/scripts/seed-faq.js +0 -67
  117. package/router-v2/seed-knowledge-base.js +0 -39
  118. package/router-v2/utils/ai.js +0 -129
  119. package/router-v2/utils/phone.js +0 -52
  120. package/router-v2/utils/response-validator.js +0 -98
  121. package/router-v2/utils/sanitize.js +0 -222
  122. package/router.js +0 -5005
  123. package/routes/google-calendar.js +0 -186
  124. package/scripts/deploy.sh +0 -62
  125. package/scripts/macos-calendar.sh +0 -232
  126. package/scripts/onboard-device.sh +0 -466
  127. package/server.js +0 -5131
  128. package/start.sh +0 -24
  129. package/templates/AGENTS.md +0 -548
  130. package/templates/IDENTITY.md +0 -15
  131. package/templates/docs-agents.html +0 -132
  132. package/templates/docs-app.html +0 -130
  133. package/templates/docs-home.html +0 -83
  134. package/templates/docs-imessage.html +0 -121
  135. package/templates/docs-tasks.html +0 -123
  136. package/templates/docs-tips.html +0 -175
  137. package/templates/getting-started.html +0 -809
  138. package/templates/invisible-prefix-base.txt +0 -171
  139. package/templates/invisible-prefix-owner.txt +0 -282
  140. package/templates/invisible-prefix.txt +0 -338
  141. package/templates/manifest.json +0 -61
  142. package/templates/memory-org/clients.md +0 -7
  143. package/templates/memory-org/credentials.md +0 -9
  144. package/templates/memory-org/devices.md +0 -7
  145. package/templates/updates.html +0 -464
  146. package/tts-proxy.js +0 -96
  147. package/voice-call-local.js +0 -731
  148. package/voice-call.js +0 -732
  149. package/wa-listener.js +0 -354
@@ -1,731 +0,0 @@
1
- /**
2
- * Local Voice Call Module for AIVA — Routes through Main Agent + Local TTS
3
- *
4
- * Architecture:
5
- * 1. Client sends PCM16 audio chunks
6
- * 2. Server accumulates and transcribes via OpenAI Whisper
7
- * 3. Transcription is sent to OpenClaw main agent (claude-sonnet-4-5)
8
- * 4. Agent's text response is converted to speech via local TTS server
9
- * 5. Audio is streamed back to client
10
- */
11
-
12
- const WebSocket = require('ws');
13
- const fs = require('fs');
14
- const path = require('path');
15
- const { exec: execCb, execSync } = require('child_process');
16
- const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
17
-
18
- // Configuration
19
- const OPENCLAW_API = 'http://127.0.0.1:18789/v1/chat/completions';
20
- const OPENCLAW_AUTH = loadOpenClawAuth();
21
- const LOCAL_TTS_API = 'http://127.0.0.1:3851/tts';
22
- const CONTEXT_API = 'http://localhost:3847/api/context/voice';
23
- // STT: Local Whisper CLI (free, no API key needed)
24
- // LLM: OpenClaw proxy (free, existing subscription)
25
- // TTS: Kokoro TTS (free, local)
26
-
27
- // Active call sessions
28
- const activeCalls = new Map();
29
-
30
/**
 * Read the OpenClaw gateway password from ~/.openclaw/openclaw.json.
 *
 * @returns {string} The gateway auth password, or '' when the config is
 *   missing, unreadable, or does not contain a password.
 */
function loadOpenClawAuth() {
  // FIX: guard HOME explicitly — path.join(undefined, ...) throws a
  // TypeError that used to be logged misleadingly as a config-read failure.
  const home = process.env.HOME;
  if (!home) {
    console.error('[voice-call-local] Failed to load OpenClaw auth: HOME is not set');
    return '';
  }
  try {
    const configPath = path.join(home, '.openclaw', 'openclaw.json');
    const config = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
    return config.gateway?.auth?.password || '';
  } catch (e) {
    console.error('[voice-call-local] Failed to load OpenClaw auth:', e.message);
    return '';
  }
}
40
-
41
/**
 * Fetch live assistant context (tasks, chat, calendar) for the system prompt.
 *
 * @param {string} [url=CONTEXT_API] - Endpoint to query. Previously the
 *   endpoint was hard-coded; parameterizing it (with the same default) is
 *   backward-compatible and lets callers/tests target another instance.
 * @returns {Promise<object|null>} Parsed JSON context, or null on any
 *   network/HTTP/parse failure — context is optional for callers.
 */
async function fetchContext(url = CONTEXT_API) {
  try {
    const res = await fetch(url);
    if (!res.ok) throw new Error(`Context API ${res.status}`);
    return await res.json();
  } catch (e) {
    // Best-effort: log and degrade to a context-free prompt.
    console.error('[voice-call-local] Failed to fetch context:', e.message);
    return null;
  }
}
51
-
52
/**
 * Build the voice-call system prompt, optionally enriched with live context.
 *
 * @param {object|null} ctx - Context from fetchContext(); may contain
 *   activeTasks[], recentChat[], calendar[]. Field shapes are assumed from
 *   usage here (title/status, from/text, title|summary + time|start) —
 *   confirm against the context API.
 * @returns {string} Full system prompt text.
 */
function buildSystemPrompt(ctx) {
  const now = new Date();
  const timeStr = now.toLocaleString('en-US', {
    timeZone: 'America/Los_Angeles',
    weekday: 'long',
    year: 'numeric',
    month: 'long',
    day: 'numeric',
    hour: 'numeric',
    minute: '2-digit',
    hour12: true
  });

  let prompt = `You are AIVA (AI-VA), Brandon Burgan's AI assistant. This is a voice conversation through the AIVA app.

CURRENT DATE AND TIME: ${timeStr} (Pacific Time)

VOICE CONVERSATION RULES (CRITICAL):
- Keep responses SHORT and CONVERSATIONAL (1-3 sentences max)
- Use contractions and casual language
- NO markdown, bullet points, or formatting — everything is spoken aloud
- Start with a brief greeting then WAIT for Brandon to tell you what he needs
- Don't volunteer information (calendar, tasks) unless he asks
- Be helpful and concise

TASK RULES:
- Unless explicitly asked, don't mention finished/done tasks
- When listing tasks, default to non-done tasks only

PERSONALITY:
You are warm, efficient, and proactive. You understand Brandon's needs before he asks.`;

  // Append dynamic context if available
  if (ctx) {
    if (ctx.activeTasks?.length) {
      const tasks = ctx.activeTasks.slice(0, 10).map(t => `- ${t.title} (${t.status})`).join('\n');
      prompt += `\n\nACTIVE TASKS (reference only — do NOT mention unless asked):\n${tasks}`;
    }
    if (ctx.recentChat?.length) {
      const recent = ctx.recentChat.slice(-10).map(m => `${m.from}: ${m.text?.slice(0, 100)}`).join('\n');
      prompt += `\n\nRECENT CHAT (reference only):\n${recent}`;
    }
    const calArray = Array.isArray(ctx.calendar) ? ctx.calendar : [];
    if (calArray.length) {
      const nowTime = new Date();
      const futureEvents = calArray.filter(e => {
        const eventTime = e.time || e.start || '';
        if (!eventTime) return true;
        // BUG FIX: `new Date(bad)` never throws — it yields Invalid Date,
        // and `InvalidDate > nowTime` is false, so events with unparseable
        // times were silently dropped. The empty-time branch above shows the
        // intent is to KEEP events we cannot date; check explicitly.
        const eventDate = new Date(eventTime);
        if (Number.isNaN(eventDate.getTime())) return true;
        return eventDate > nowTime;
      });
      if (futureEvents.length) {
        const events = futureEvents.slice(0, 5).map(e => `- ${e.title || e.summary} (${e.time || e.start || ''})`).join('\n');
        prompt += `\n\nUPCOMING CALENDAR (reference only — do NOT mention unless asked):\n${events}`;
      }
    }
  }

  return prompt;
}
114
-
115
// Wrap raw PCM16 samples (24kHz, 16-bit, mono) in a minimal RIFF/WAV header.
function pcm16ToWav(audioBuffer) {
  const sampleRate = 24000;
  const numChannels = 1;
  const bitsPerSample = 16;
  const dataSize = audioBuffer.length;
  const wavHeader = Buffer.alloc(44);
  wavHeader.write('RIFF', 0);
  wavHeader.writeUInt32LE(36 + dataSize, 4);
  wavHeader.write('WAVE', 8);
  wavHeader.write('fmt ', 12);
  wavHeader.writeUInt32LE(16, 16);                                  // fmt chunk size
  wavHeader.writeUInt16LE(1, 20);                                   // PCM format
  wavHeader.writeUInt16LE(numChannels, 22);
  wavHeader.writeUInt32LE(sampleRate, 24);
  wavHeader.writeUInt32LE(sampleRate * numChannels * bitsPerSample / 8, 28); // byte rate
  wavHeader.writeUInt16LE(numChannels * bitsPerSample / 8, 32);     // block align
  wavHeader.writeUInt16LE(bitsPerSample, 34);
  wavHeader.write('data', 36);
  wavHeader.writeUInt32LE(dataSize, 40);
  return Buffer.concat([wavHeader, audioBuffer]);
}

// Delete a file if it exists; best-effort so cleanup never masks the real error.
function safeUnlink(file) {
  try { if (fs.existsSync(file)) fs.unlinkSync(file); } catch { /* best effort */ }
}

/**
 * Transcribe an audio buffer using the local Whisper CLI (free, no API key).
 *
 * @param {Buffer} audioBuffer - Raw PCM16 audio; assumed 24kHz mono (matches
 *   the client capture format — confirm against the frontend).
 * @returns {Promise<string>} Transcript text ('' when Whisper produced none).
 * @throws When the whisper invocation fails. Temp files are now cleaned up
 *   on ALL paths — the original leaked the converted WAV in /tmp on error.
 */
async function transcribeAudio(audioBuffer) {
  const tempPath = path.join('/tmp', `voice-${Date.now()}.wav`);
  const convertedPath = tempPath.replace('.wav', '-converted.wav');
  fs.writeFileSync(tempPath, pcm16ToWav(audioBuffer));

  try {
    // Resample to 16kHz mono PCM for Whisper; if ffmpeg is unavailable or
    // fails, fall back to the raw 24kHz WAV.
    try {
      execSync(`ffmpeg -y -i "${tempPath}" -ar 16000 -ac 1 -acodec pcm_s16le "${convertedPath}" 2>/dev/null`);
      fs.unlinkSync(tempPath);
    } catch (e) {
      fs.renameSync(tempPath, convertedPath);
    }

    const result = execSync(
      `whisper "${convertedPath}" --model tiny --language en --output_format txt --output_dir /tmp 2>/dev/null`,
      { timeout: 30000 }
    ).toString().trim();

    // Whisper writes <basename>.txt into the output dir; some builds print
    // the transcript to stdout instead.
    const txtPath = convertedPath.replace('.wav', '.txt');
    let transcript = '';
    if (fs.existsSync(txtPath)) {
      transcript = fs.readFileSync(txtPath, 'utf-8').trim();
      fs.unlinkSync(txtPath);
    } else {
      transcript = result;
    }

    // Remove any other whisper output formats it may have emitted.
    for (const ext of ['.vtt', '.srt', '.tsv', '.json']) {
      safeUnlink(convertedPath.replace('.wav', ext));
    }

    return transcript;
  } catch (e) {
    console.error('[voice-call-local] Transcription error:', e.message);
    throw e;
  } finally {
    // FIX: original only removed tempPath on error, leaking the converted
    // WAV whenever whisper failed; clean both unconditionally.
    safeUnlink(tempPath);
    safeUnlink(convertedPath);
  }
}
188
-
189
/**
 * Remove tool/command markup from an LLM reply so it is safe to speak aloud.
 *
 * Detection and stripping cover code fences, inline backticks, warning/tool
 * emoji lines, and bare `Exec:`/`curl` command lines. FIX: the original
 * detected `Exec:` and `curl ` but never removed them, leaking shell
 * commands into TTS output.
 *
 * @param {string} text - Candidate response text.
 * @returns {string} Cleaned text, or a short apology when stripping left
 *   nothing speakable.
 */
function stripToolMarkup(text) {
  if (text.includes('Exec:') || text.includes('```') || text.includes('curl ') || text.includes('🛠️')) {
    console.warn('[voice-call-local] Stripping tool markup from response');
    text = text
      .replace(/⚠️.*$/gm, '')
      .replace(/🛠️.*$/gm, '')
      .replace(/```[\s\S]*?```/g, '')
      .replace(/`[^`]+`/g, '')
      // FIX: also drop the command lines that triggered detection.
      .replace(/^\s*Exec:.*$/gm, '')
      .replace(/^\s*curl .*$/gm, '')
      .trim();
    if (!text || text.length < 5) {
      text = "Hey! Sorry, I had a hiccup there. What were you saying?";
    }
  }
  return text;
}
199
-
200
/**
 * Stream LLM response with sentence-level TTS chunking.
 *
 * Sends the conversation to the OpenClaw proxy; on the first attempt the
 * response is streamed (SSE) and each completed sentence is synthesized via
 * generateSpeech() and handed to onSentence(text, audioBase64). Retries fall
 * back to a single non-streaming completion.
 *
 * @param {Array<object>} messages - OpenAI-format chat history.
 * @param {string} systemPrompt - Base system prompt; voice-only rules are appended.
 * @param {object} [callbacks]
 * @param {function} [callbacks.onSentence] - (text, audioBase64) per sentence.
 * @param {function} [callbacks.onDone] - Invoked once after a successful attempt.
 * @returns {Promise<string>} The full assembled response text.
 * @throws {Error} When all proxy attempts fail.
 */
async function queryMainAgentStreaming(messages, systemPrompt, { onSentence, onDone } = {}) {
  const basePayload = {
    model: 'claude-sonnet-4-5',
    messages: [
      { role: 'system', content: systemPrompt + '\n\nCRITICAL: You are in a VOICE conversation. Respond with plain spoken text ONLY. Do NOT use any tools, commands, code blocks, markdown, or special formatting. Just talk naturally.' },
      ...messages
    ],
    max_tokens: 500,
    temperature: 0.7,
    user: 'voice-call-brandon'
  };

  const delays = [0, 3000, 8000]; // backoff before each attempt
  for (let attempt = 0; attempt < delays.length; attempt++) {
    if (delays[attempt] > 0) await new Promise(r => setTimeout(r, delays[attempt]));
    const useStream = attempt === 0; // fallback to non-streaming on retries
    try {
      const payload = { ...basePayload, ...(useStream ? { stream: true } : {}) };
      const response = await fetch(OPENCLAW_API, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENCLAW_AUTH}`,
          'x-openclaw-agent-id': 'voice',
          'x-openclaw-session-key': 'agent:voice:openai-user:voice-call-brandon'
        },
        body: JSON.stringify(payload)
      });
      if (!response.ok) {
        const err = await response.text();
        console.error(`[voice-call-local] Proxy attempt ${attempt+1} failed: ${response.status} ${err.substring(0, 100)}`);
        continue;
      }

      if (!useStream) {
        // Non-streaming fallback: one completion, one TTS call for the whole reply.
        const result = await response.json();
        let text = stripToolMarkup(result.choices?.[0]?.message?.content || '');
        if (!text || text.includes('No response from OpenClaw')) continue;
        console.log('[voice-call-local] Got response via OpenClaw proxy (non-streaming fallback)');
        if (onSentence) {
          try {
            const audio = await generateSpeech(text);
            onSentence(text, audio.toString('base64'));
          } catch (e) { console.error('[voice-call-local] TTS error in fallback:', e.message); }
        }
        if (onDone) onDone();
        return text;
      }

      // Streaming path: parse SSE, chunking deltas into sentences for TTS.
      console.log('[voice-call-local] Streaming response from proxy...');
      let fullText = '';
      let sentenceBuffer = '';
      const ttsPromises = []; // in-flight TTS, awaited before returning

      // Kick off TTS for one cleaned sentence. TTS failures are logged but
      // never abort the stream. (Hoisted from two duplicated inline copies.)
      const dispatchSentence = (raw, errLabel) => {
        const cleaned = stripToolMarkup(raw.trim());
        if (cleaned.length < 2 || !onSentence) return;
        ttsPromises.push((async () => {
          try {
            const audio = await generateSpeech(cleaned);
            onSentence(cleaned, audio.toString('base64'));
          } catch (e) { console.error(`[voice-call-local] ${errLabel}:`, e.message); }
        })());
      };

      // FIX: this was declared `async` yet awaited nothing, and its promise
      // was discarded in the 'data' handler (a misleading floating promise).
      // It is now plainly synchronous; TTS work is tracked via ttsPromises.
      const processCompleteSentences = (flush = false) => {
        // Find sentence boundaries (.!? followed by whitespace).
        const sentenceEndRegex = /([.!?])\s+/g;
        let match;
        let lastEnd = 0;
        const sentences = [];

        while ((match = sentenceEndRegex.exec(sentenceBuffer)) !== null) {
          sentences.push(sentenceBuffer.substring(lastEnd, match.index + 1));
          lastEnd = match.index + match[0].length;
        }

        if (sentences.length > 0) {
          sentenceBuffer = sentenceBuffer.substring(lastEnd);
          for (const sentence of sentences) dispatchSentence(sentence, 'TTS chunk error');
        }

        // On flush, speak whatever partial sentence remains.
        if (flush && sentenceBuffer.trim().length > 1) {
          const rest = sentenceBuffer;
          sentenceBuffer = '';
          dispatchSentence(rest, 'TTS flush error');
        }
      };

      // Consume the SSE body (node-fetch exposes a Node readable stream).
      const reader = response.body;
      let remainder = '';

      await new Promise((resolve, reject) => {
        reader.on('data', (chunk) => {
          const text = remainder + chunk.toString();
          const lines = text.split('\n');
          remainder = lines.pop() || '';

          for (const line of lines) {
            if (!line.startsWith('data: ')) continue;
            const data = line.slice(6).trim();
            if (data === '[DONE]') continue;
            try {
              const parsed = JSON.parse(data);
              const delta = parsed.choices?.[0]?.delta?.content || '';
              if (delta) {
                fullText += delta;
                sentenceBuffer += delta;
                processCompleteSentences(false);
              }
            } catch (e) { /* ignore parse errors on SSE lines */ }
          }
        });
        reader.on('end', resolve);
        reader.on('error', reject);
      });

      // Flush the trailing partial sentence, then wait for all TTS chunks.
      processCompleteSentences(true);
      await Promise.all(ttsPromises);

      fullText = stripToolMarkup(fullText);
      if (!fullText || fullText.includes('No response from OpenClaw')) continue;

      console.log('[voice-call-local] Streaming response complete:', fullText.substring(0, 100));
      if (onDone) onDone();
      return fullText;

    } catch (e) {
      console.error(`[voice-call-local] Proxy attempt ${attempt+1} error:`, e.message);
    }
  }

  throw new Error('All proxy attempts failed');
}
353
-
354
/**
 * Legacy non-streaming wrapper kept for compatibility: runs a voice query
 * with no sentence callbacks and resolves to the full reply text.
 *
 * @param {Array<object>} messages - OpenAI-format chat history.
 * @param {string} systemPrompt - Base system prompt.
 * @returns {Promise<string>} Full response text.
 */
async function queryMainAgent(messages, systemPrompt) {
  const fullText = await queryMainAgentStreaming(messages, systemPrompt);
  return fullText;
}
358
-
359
/**
 * Apply a short linear fade-in to a WAV buffer to prevent an audible
 * pop/click at the start of TTS playback.
 *
 * @param {Buffer} audioBuffer - 16-bit PCM WAV with a canonical 44-byte header.
 * @param {number} [durationMs=20] - Fade length in milliseconds.
 * @param {number} [sampleRate=24000] - Audio sample rate. Previously
 *   hard-coded to Kokoro's 24kHz; now overridable (backward-compatible) for
 *   other TTS engines.
 * @returns {Buffer} A faded copy; the caller's buffer is not mutated.
 */
function addFadeIn(audioBuffer, durationMs = 20, sampleRate = 24000) {
  const headerSize = 44; // canonical WAV header length — samples start after it
  const fadeSamples = Math.floor(sampleRate * durationMs / 1000);
  const buf = Buffer.from(audioBuffer); // copy — never mutate the input

  // Ramp each 16-bit sample by gain 0.0 → 1.0 across the fade window; the
  // bounds check guards truncated buffers shorter than the fade.
  for (let i = 0; i < fadeSamples && (headerSize + i * 2 + 1) < buf.length; i++) {
    const offset = headerSize + i * 2;
    const sample = buf.readInt16LE(offset);
    const gain = i / fadeSamples;
    buf.writeInt16LE(Math.round(sample * gain), offset);
  }
  return buf;
}
374
-
375
/**
 * Synthesize speech for `text` via the local TTS server and apply a short
 * fade-in to the resulting audio.
 *
 * @param {string} text - Plain text to speak.
 * @returns {Promise<Buffer>} WAV audio buffer with fade-in applied.
 * @throws When the TTS server responds with an error or is unreachable
 *   (the failure is logged, then rethrown for the caller to handle).
 */
async function generateSpeech(text) {
  try {
    const payload = JSON.stringify({ text, rate: 200 });
    const response = await fetch(LOCAL_TTS_API, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload
    });

    if (response.ok) {
      const arrayBuffer = await response.arrayBuffer();
      return addFadeIn(Buffer.from(arrayBuffer));
    }

    const error = await response.text();
    throw new Error(`TTS error: ${error}`);
  } catch (e) {
    console.error('[voice-call-local] TTS error:', e.message);
    throw e;
  }
}
399
-
400
- function setupVoiceCall(io) {
401
- io.on('connection', (socket) => {
402
-
403
- socket.on('voice-call-start', async (data) => {
404
- console.log('[voice-call-local] Call started by:', data?.userId || 'unknown');
405
-
406
- // Fetch context for system prompt
407
- const ctx = await fetchContext().catch(() => null);
408
- const systemPrompt = buildSystemPrompt(ctx);
409
-
410
- const callSession = {
411
- id: Date.now().toString(),
412
- userId: data?.userId || 'brandon',
413
- startedAt: new Date().toISOString(),
414
- transcript: [], // Full conversation history
415
- messages: [], // OpenAI-format messages for context
416
- systemPrompt,
417
- audioBuffer: null, // Accumulator for incoming audio chunks
418
- isRecording: false,
419
- processingResponse: false
420
- };
421
-
422
- activeCalls.set(socket.id, callSession);
423
-
424
- // Auto-start recording (frontend sends continuous audio stream)
425
- callSession.isRecording = true;
426
- callSession.audioBuffer = Buffer.alloc(0);
427
-
428
- // Set up silence detection — process audio after 2s of silence
429
- callSession.silenceTimer = null;
430
- callSession.lastAudioTime = Date.now();
431
-
432
- // Send ready signal
433
- socket.emit('voice-call-ready', { callId: callSession.id });
434
- socket.emit('voice-call-status', { status: 'listening' });
435
- console.log('[voice-call-local] Call ready:', callSession.id);
436
- });
437
-
438
- // User starts speaking (push-to-talk or VAD trigger)
439
- socket.on('voice-call-start-recording', () => {
440
- const session = activeCalls.get(socket.id);
441
- if (!session) return;
442
-
443
- session.isRecording = true;
444
- session.audioBuffer = Buffer.alloc(0);
445
- socket.emit('voice-call-status', { status: 'recording' });
446
- console.log('[voice-call-local] Recording started');
447
- });
448
-
449
- // Receive audio chunks (continuous stream with silence detection)
450
- let chunkCount = 0;
451
- socket.on('voice-call-audio-chunk', (data) => {
452
- const session = activeCalls.get(socket.id);
453
- if (!session || !session.isRecording || session.processingResponse) return;
454
-
455
- if (data?.audio) {
456
- const chunk = Buffer.from(data.audio, 'base64');
457
-
458
- // Check if this chunk has meaningful audio (not silence)
459
- const samples = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
460
- let rms = 0;
461
- for (let i = 0; i < samples.length; i++) rms += samples[i] * samples[i];
462
- rms = Math.sqrt(rms / samples.length);
463
-
464
- // Barge-in: user speaks while agent is playing TTS
465
- if (session.isPlaying && rms > 1000) {
466
- console.log('[voice-call-local] Barge-in detected! Stopping playback.');
467
- session.isPlaying = false;
468
- session.audioBuffer = Buffer.alloc(0);
469
- session.hasVoice = true;
470
- session.lastAudioTime = Date.now();
471
- if (session.silenceTimer) { clearTimeout(session.silenceTimer); session.silenceTimer = null; }
472
- socket.emit('voice-call-barge-in', {});
473
- }
474
-
475
- // Don't accumulate audio while TTS is playing (unless barge-in just triggered above)
476
- if (session.isPlaying) return;
477
-
478
- session.audioBuffer = Buffer.concat([session.audioBuffer, chunk]);
479
- chunkCount++;
480
- if (chunkCount % 50 === 1) console.log(`[voice-call-local] Chunk #${chunkCount}, buffer: ${session.audioBuffer.length} bytes`);
481
-
482
- if (chunkCount % 50 === 1) console.log(`[voice-call-local] RMS: ${Math.round(rms)}, hasVoice: ${!!session.hasVoice}`);
483
- if (rms > 1000) { // Voice detected
484
- session.lastAudioTime = Date.now();
485
- if (!session.hasVoice) {
486
- session.hasVoice = true;
487
- console.log('[voice-call-local] Voice detected! RMS:', Math.round(rms));
488
- socket.emit('voice-call-status', { status: 'recording' });
489
- }
490
- }
491
-
492
- // If voice is active again, cancel the silence timer (they're still talking)
493
- if (rms > 1000) {
494
- if (session.silenceTimer) { clearTimeout(session.silenceTimer); session.silenceTimer = null; }
495
- }
496
-
497
- // After voice was detected, if we're now in silence, start the timer (only once)
498
- if (session.hasVoice && rms <= 1000 && !session.silenceTimer) {
499
- session.silenceTimer = setTimeout(async () => {
500
- session.silenceTimer = null; // Clear reference so new timer can start
501
- if (!session.isRecording || session.processingResponse) return;
502
- if (session.audioBuffer.length < 16000) return; // Too short
503
-
504
- session.processingResponse = true;
505
- session.hasVoice = false;
506
- console.log('[voice-call-local] Silence detected, processing...');
507
- socket.emit('voice-call-status', { status: 'processing' });
508
-
509
- try {
510
- console.log('[voice-call-local] Transcribing audio...', session.audioBuffer.length, 'bytes');
511
- const transcript = await transcribeAudio(session.audioBuffer);
512
-
513
- if (!transcript) {
514
- socket.emit('voice-call-status', { status: 'listening' });
515
- session.audioBuffer = Buffer.alloc(0);
516
- session.processingResponse = false;
517
- return;
518
- }
519
-
520
- // Filter Whisper hallucinations
521
- const hallucinations = ['thank you', 'thanks for watching', 'thank you for watching', 'bye', 'you', 'the end', "i'm sorry", 'goodbye', 'well congratulations', 'thanks', 'okay'];
522
- const normalized = transcript.toLowerCase().replace(/[.,!?]/g, '').trim();
523
- if ((hallucinations.includes(normalized) || normalized.length < 3) && session.audioBuffer.length < 144000) {
524
- console.log('[voice-call-local] Filtered likely hallucination:', transcript);
525
- session.audioBuffer = Buffer.alloc(0);
526
- session.processingResponse = false;
527
- socket.emit('voice-call-status', { status: 'listening' });
528
- return;
529
- }
530
-
531
- console.log('[voice-call-local] User said:', transcript);
532
- socket.emit('voice-call-user-transcript', { text: transcript });
533
-
534
- // Get AI response
535
- session.messages.push({ role: 'user', content: transcript });
536
- session.transcript.push({ role: 'user', text: transcript, ts: new Date().toISOString() });
537
-
538
- session.isPlaying = true;
539
- const aiResponse = await queryMainAgentStreaming(session.messages, session.systemPrompt, {
540
- onSentence: (text, audioBase64) => {
541
- socket.emit('voice-call-transcript-delta', { text });
542
- socket.emit('voice-call-audio-delta', { audio: audioBase64 });
543
- },
544
- onDone: () => { session.isPlaying = false; socket.emit('voice-call-response-done', {}); }
545
- });
546
- console.log('[voice-call-local] Agent response:', aiResponse?.substring(0, 100));
547
-
548
- if (aiResponse) {
549
- session.messages.push({ role: 'assistant', content: aiResponse });
550
- session.transcript.push({ role: 'assistant', text: aiResponse, ts: new Date().toISOString() });
551
- }
552
- } catch (e) {
553
- console.error('[voice-call-local] Processing error:', e.message);
554
- socket.emit('voice-call-error', { error: e.message });
555
- }
556
-
557
- session.audioBuffer = Buffer.alloc(0);
558
- session.processingResponse = false;
559
- socket.emit('voice-call-status', { status: 'listening' });
560
- }, 1500); // Allow natural pauses between sentences
561
- }
562
- }
563
- });
564
-
565
- // User stops speaking
566
- socket.on('voice-call-stop-recording', async () => {
567
- const session = activeCalls.get(socket.id);
568
- if (!session || !session.isRecording) return;
569
-
570
- session.isRecording = false;
571
- socket.emit('voice-call-status', { status: 'processing' });
572
-
573
- try {
574
- // Transcribe audio
575
- console.log('[voice-call-local] Transcribing audio...');
576
- const transcript = await transcribeAudio(session.audioBuffer);
577
-
578
- if (!transcript) {
579
- socket.emit('voice-call-error', { error: 'Could not understand audio' });
580
- socket.emit('voice-call-status', { status: 'listening' });
581
- return;
582
- }
583
-
584
- // Filter Whisper hallucinations
585
- const hallucinations2 = ['thank you', 'thanks for watching', 'thank you for watching', 'bye', 'you', 'the end', "i'm sorry", 'goodbye', 'well congratulations', 'thanks', 'okay'];
586
- const normalized2 = transcript.toLowerCase().replace(/[.,!?]/g, '').trim();
587
- if ((hallucinations2.includes(normalized2) || normalized2.length < 3) && session.audioBuffer.length < 144000) {
588
- console.log('[voice-call-local] Filtered likely hallucination:', transcript);
589
- session.audioBuffer = Buffer.alloc(0);
590
- session.processingResponse = false;
591
- socket.emit('voice-call-status', { status: 'listening' });
592
- return;
593
- }
594
-
595
- console.log('[voice-call-local] User said:', transcript);
596
- session.transcript.push({ role: 'user', text: transcript });
597
- session.messages.push({ role: 'user', content: transcript });
598
- socket.emit('voice-call-user-transcript', { text: transcript });
599
-
600
- // Query main agent with streaming
601
- console.log('[voice-call-local] Querying main agent (streaming)...');
602
- session.processingResponse = true;
603
- socket.emit('voice-call-status', { status: 'thinking' });
604
-
605
- let firstChunkSent = false;
606
- session.isPlaying = true;
607
- const responseText = await queryMainAgentStreaming(session.messages, session.systemPrompt, {
608
- onSentence: (text, audioBase64) => {
609
- if (!firstChunkSent) {
610
- socket.emit('voice-call-status', { status: 'speaking' });
611
- firstChunkSent = true;
612
- }
613
- socket.emit('voice-call-transcript-delta', { text });
614
- socket.emit('voice-call-audio-delta', { audio: audioBase64 });
615
- // Also emit legacy event for compatibility
616
- socket.emit('voice-call-audio-response', { audio: audioBase64, text });
617
- },
618
- onDone: () => { session.isPlaying = false; socket.emit('voice-call-response-done', { text: '' }); }
619
- });
620
-
621
- if (!responseText) {
622
- throw new Error('No response from agent');
623
- }
624
-
625
- console.log('[voice-call-local] Agent responded:', responseText.substring(0, 100));
626
- session.transcript.push({ role: 'assistant', text: responseText });
627
- session.messages.push({ role: 'assistant', content: responseText });
628
-
629
- console.log('[voice-call-local] Response sent (streamed)');
630
- socket.emit('voice-call-status', { status: 'listening' });
631
- session.processingResponse = false;
632
-
633
- } catch (e) {
634
- console.error('[voice-call-local] Processing error:', e.message);
635
- socket.emit('voice-call-error', { error: e.message });
636
- socket.emit('voice-call-status', { status: 'listening' });
637
- session.processingResponse = false;
638
- }
639
- });
640
-
641
// Explicit hang-up from the client: finalize the active session, if one exists.
socket.on('voice-call-end', () => {
  const callSession = activeCalls.get(socket.id);
  if (callSession) {
    cleanupCall(socket.id, callSession);
  }
});
646
-
647
// Transport dropped mid-call: treat it like a hang-up so the transcript is
// still persisted and the session entry is removed.
socket.on('disconnect', () => {
  const callSession = activeCalls.get(socket.id);
  if (!callSession) return;
  console.log('[voice-call-local] Socket disconnected, cleaning up call');
  cleanupCall(socket.id, callSession);
});
654
- });
655
-
656
/**
 * Finalize a voice-call session: persist the transcript to disk, notify the
 * main agent over the local tools endpoint, and drop the session from
 * `activeCalls`.
 *
 * Fix: the original computed `new Date(session.startedAt).getTime()` without
 * validating it; a missing or unparsable `startedAt` made the duration `NaN`
 * and the wake message read "NaNm NaNs". Non-finite start times now fall back
 * to a zero-length duration.
 *
 * @param {string} socketId - Socket.IO socket id keying `activeCalls`.
 * @param {{ transcript: Array<{role: string, text: string}>, startedAt?: string }} session
 *   Session record; only `transcript` and `startedAt` are read here.
 */
function cleanupCall(socketId, session) {
  console.log(`[voice-call-local] Call ended (${session.transcript.length} turns)`);

  // Save transcript if there was conversation
  if (session.transcript.length > 0) {
    try {
      const logDir = path.join(process.env.HOME || '', '.openclaw', 'workspace', 'memory', 'call-logs');
      fs.mkdirSync(logDir, { recursive: true });
      const ts = new Date().toISOString().replace(/[:.]/g, '-');
      const pendingFile = path.join(logDir, `pending_local_${ts}.json`);

      // Calculate duration; guard against a missing/invalid startedAt so the
      // wake message never reads "NaNm NaNs".
      const startTime = new Date(session.startedAt).getTime();
      const endTime = Date.now();
      const durationSeconds = Number.isFinite(startTime)
        ? Math.round((endTime - startTime) / 1000)
        : 0;
      const durationMin = Math.floor(durationSeconds / 60);
      const durationSec = durationSeconds % 60;

      const logData = {
        type: 'local-voice-call',
        timestamp: new Date().toISOString(),
        duration: durationSeconds,
        transcript: session.transcript,
      };

      fs.writeFileSync(pendingFile, JSON.stringify(logData, null, 2));
      console.log(`[voice-call-local] Transcript saved: ${pendingFile}`);

      // Fire wake hook to main agent with full transcript
      const readableTranscript = session.transcript.map(t =>
        `${t.role === 'user' ? 'Brandon' : 'AIVA'}: ${t.text}`
      ).join('\n');

      const wakeText = `[VOICE-CALL-COMPLETE] Voice call with Brandon just ended. Duration: ${durationMin}m ${durationSec}s.\n\nFull transcript:\n${readableTranscript}\n\nProcess any action items from this conversation immediately. If Brandon asked for something to be done, delegate it. If there are insights about Brandon's preferences or decisions, log them to memory.`;

      // Send transcript directly to main agent via sessions_send
      const invokeData = JSON.stringify({
        tool: 'sessions_send',
        args: {
          sessionKey: 'agent:main:main',
          message: wakeText
        }
      });

      const invokeReq = require('http').request({
        hostname: '127.0.0.1',
        port: 18789,
        path: '/tools/invoke',
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENCLAW_AUTH}`
        }
      }, (res) => {
        // Drain the response body so the socket is released; only the status
        // code is logged.
        let body = '';
        res.on('data', d => body += d);
        res.on('end', () => {
          console.log(`[voice-call-local] sessions_send result: ${res.statusCode}`);
        });
      });
      invokeReq.on('error', e => console.error('[voice-call-local] sessions_send failed:', e.message));
      invokeReq.write(invokeData);
      invokeReq.end();

    } catch (e) {
      // Best-effort persistence: never let transcript saving crash the server.
      console.error('[voice-call-local] Failed to save transcript:', e.message);
    }
  }

  activeCalls.delete(socketId);
}
727
-
728
- console.log('[voice-call-local] Voice call handler initialized (Local TTS + Main Agent)');
729
- }
730
-
731
- module.exports = { setupVoiceCall };