@conversionpros/aiva 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. package/README.md +148 -0
  2. package/auto-deploy.js +190 -0
  3. package/bin/aiva.js +81 -0
  4. package/cli-sync.js +126 -0
  5. package/d2a-prompt-template.txt +106 -0
  6. package/diagnostics-api.js +304 -0
  7. package/docs/ara-dedup-fix-scope.md +112 -0
  8. package/docs/ara-fix-round2-scope.md +61 -0
  9. package/docs/ara-greeting-fix-scope.md +70 -0
  10. package/docs/calendar-date-fix-scope.md +28 -0
  11. package/docs/getting-started.md +115 -0
  12. package/docs/network-architecture-rollout-scope.md +43 -0
  13. package/docs/scope-google-oauth-integration.md +351 -0
  14. package/docs/settings-page-scope.md +50 -0
  15. package/docs/xai-imagine-scope.md +116 -0
  16. package/docs/xai-voice-integration-scope.md +115 -0
  17. package/docs/xai-voice-tools-scope.md +165 -0
  18. package/email-router.js +512 -0
  19. package/follow-up-handler.js +606 -0
  20. package/gateway-monitor.js +158 -0
  21. package/google-email.js +379 -0
  22. package/google-oauth.js +310 -0
  23. package/grok-imagine.js +97 -0
  24. package/health-reporter.js +287 -0
  25. package/invisible-prefix-base.txt +206 -0
  26. package/invisible-prefix-owner.txt +26 -0
  27. package/invisible-prefix-slim.txt +10 -0
  28. package/invisible-prefix.txt +43 -0
  29. package/knowledge-base.js +472 -0
  30. package/lib/cli.js +19 -0
  31. package/lib/config.js +124 -0
  32. package/lib/health.js +57 -0
  33. package/lib/process.js +207 -0
  34. package/lib/server.js +42 -0
  35. package/lib/setup.js +472 -0
  36. package/meta-capi.js +206 -0
  37. package/meta-leads.js +411 -0
  38. package/notion-oauth.js +323 -0
  39. package/package.json +61 -0
  40. package/public/agent-config.html +241 -0
  41. package/public/aiva-avatar-anime.png +0 -0
  42. package/public/css/docs.css.bak +688 -0
  43. package/public/css/onboarding.css +543 -0
  44. package/public/diagrams/claude-subscription-pool.html +329 -0
  45. package/public/diagrams/claude-subscription-pool.png +0 -0
  46. package/public/docs-icon.png +0 -0
  47. package/public/escalation.html +237 -0
  48. package/public/group-config.html +300 -0
  49. package/public/icon-192.png +0 -0
  50. package/public/icon-512.png +0 -0
  51. package/public/icons/agents.svg +1 -0
  52. package/public/icons/attach.svg +1 -0
  53. package/public/icons/characters.svg +1 -0
  54. package/public/icons/chat.svg +1 -0
  55. package/public/icons/docs.svg +1 -0
  56. package/public/icons/heartbeat.svg +1 -0
  57. package/public/icons/messages.svg +1 -0
  58. package/public/icons/mic.svg +1 -0
  59. package/public/icons/notes.svg +1 -0
  60. package/public/icons/settings.svg +1 -0
  61. package/public/icons/tasks.svg +1 -0
  62. package/public/images/onboarding/p0-communication-layer.png +0 -0
  63. package/public/images/onboarding/p0-infinite-surface.png +0 -0
  64. package/public/images/onboarding/p0-learning-model.png +0 -0
  65. package/public/images/onboarding/p0-meet-aiva.png +0 -0
  66. package/public/images/onboarding/p4-contact-intelligence.png +0 -0
  67. package/public/images/onboarding/p4-context-compounds.png +0 -0
  68. package/public/images/onboarding/p4-message-router.png +0 -0
  69. package/public/images/onboarding/p4-per-contact-rules.png +0 -0
  70. package/public/images/onboarding/p4-send-messages.png +0 -0
  71. package/public/images/onboarding/p6-be-precise.png +0 -0
  72. package/public/images/onboarding/p6-review-escalations.png +0 -0
  73. package/public/images/onboarding/p6-voice-input.png +0 -0
  74. package/public/images/onboarding/p7-completion.png +0 -0
  75. package/public/index.html +11594 -0
  76. package/public/js/onboarding.js +699 -0
  77. package/public/manifest.json +24 -0
  78. package/public/messages-v2.html +2824 -0
  79. package/public/permission-approve.html.bak +107 -0
  80. package/public/permissions.html +150 -0
  81. package/public/styles/design-system.css +68 -0
  82. package/router-db.js +604 -0
  83. package/router-utils.js +28 -0
  84. package/router-v2/adapters/imessage.js +191 -0
  85. package/router-v2/adapters/quo.js +82 -0
  86. package/router-v2/adapters/whatsapp.js +192 -0
  87. package/router-v2/contact-manager.js +234 -0
  88. package/router-v2/conversation-engine.js +498 -0
  89. package/router-v2/data/knowledge-base.json +176 -0
  90. package/router-v2/data/router-v2.db +0 -0
  91. package/router-v2/data/router-v2.db-shm +0 -0
  92. package/router-v2/data/router-v2.db-wal +0 -0
  93. package/router-v2/data/router.db +0 -0
  94. package/router-v2/db.js +457 -0
  95. package/router-v2/escalation-bridge.js +540 -0
  96. package/router-v2/follow-up-engine.js +347 -0
  97. package/router-v2/index.js +441 -0
  98. package/router-v2/ingestion.js +213 -0
  99. package/router-v2/knowledge-base.js +231 -0
  100. package/router-v2/lead-qualifier.js +152 -0
  101. package/router-v2/learning-loop.js +202 -0
  102. package/router-v2/outbound-sender.js +160 -0
  103. package/router-v2/package.json +13 -0
  104. package/router-v2/permission-gate.js +86 -0
  105. package/router-v2/playbook.js +177 -0
  106. package/router-v2/prompts/base.js +52 -0
  107. package/router-v2/prompts/first-contact.js +38 -0
  108. package/router-v2/prompts/lead-qualification.js +37 -0
  109. package/router-v2/prompts/scheduling.js +72 -0
  110. package/router-v2/prompts/style-overrides.js +22 -0
  111. package/router-v2/scheduler.js +301 -0
  112. package/router-v2/scripts/migrate-v1-to-v2.js +215 -0
  113. package/router-v2/scripts/seed-faq.js +67 -0
  114. package/router-v2/seed-knowledge-base.js +39 -0
  115. package/router-v2/utils/ai.js +129 -0
  116. package/router-v2/utils/phone.js +52 -0
  117. package/router-v2/utils/response-validator.js +98 -0
  118. package/router-v2/utils/sanitize.js +222 -0
  119. package/router.js +5005 -0
  120. package/routes/google-calendar.js +186 -0
  121. package/scripts/deploy.sh +62 -0
  122. package/scripts/macos-calendar.sh +232 -0
  123. package/scripts/onboard-device.sh +466 -0
  124. package/server.js +5131 -0
  125. package/start.sh +24 -0
  126. package/templates/AGENTS.md +548 -0
  127. package/templates/IDENTITY.md +15 -0
  128. package/templates/docs-agents.html +132 -0
  129. package/templates/docs-app.html +130 -0
  130. package/templates/docs-home.html +83 -0
  131. package/templates/docs-imessage.html +121 -0
  132. package/templates/docs-tasks.html +123 -0
  133. package/templates/docs-tips.html +175 -0
  134. package/templates/getting-started.html +809 -0
  135. package/templates/invisible-prefix-base.txt +171 -0
  136. package/templates/invisible-prefix-owner.txt +282 -0
  137. package/templates/invisible-prefix.txt +338 -0
  138. package/templates/manifest.json +61 -0
  139. package/templates/memory-org/clients.md +7 -0
  140. package/templates/memory-org/credentials.md +9 -0
  141. package/templates/memory-org/devices.md +7 -0
  142. package/templates/updates.html +464 -0
  143. package/templates/workspace/AGENTS.md.tmpl +161 -0
  144. package/templates/workspace/HEARTBEAT.md.tmpl +17 -0
  145. package/templates/workspace/IDENTITY.md.tmpl +15 -0
  146. package/templates/workspace/MEMORY.md.tmpl +16 -0
  147. package/templates/workspace/SOUL.md.tmpl +51 -0
  148. package/templates/workspace/USER.md.tmpl +25 -0
  149. package/tts-proxy.js +96 -0
  150. package/voice-call-local.js +731 -0
  151. package/voice-call.js +732 -0
  152. package/wa-listener.js +354 -0
@@ -0,0 +1,731 @@
1
+ /**
2
+ * Local Voice Call Module for AIVA — Routes through Main Agent + Local TTS
3
+ *
4
+ * Architecture:
5
+ * 1. Client sends PCM16 audio chunks
6
+ * 2. Server accumulates and transcribes via OpenAI Whisper
7
+ * 3. Transcription is sent to OpenClaw main agent (claude-sonnet-4-5)
8
+ * 4. Agent's text response is converted to speech via local TTS server
9
+ * 5. Audio is streamed back to client
10
+ */
11
+
12
+ const WebSocket = require('ws');
13
+ const fs = require('fs');
14
+ const path = require('path');
15
+ const { exec: execCb, execSync } = require('child_process');
16
+ const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
17
+
18
+ // Configuration
19
+ const OPENCLAW_API = 'http://127.0.0.1:18789/v1/chat/completions';
20
+ const OPENCLAW_AUTH = loadOpenClawAuth();
21
+ const LOCAL_TTS_API = 'http://127.0.0.1:3851/tts';
22
+ const CONTEXT_API = 'http://localhost:3847/api/context/voice';
23
+ // STT: Local Whisper CLI (free, no API key needed)
24
+ // LLM: OpenClaw proxy (free, existing subscription)
25
+ // TTS: Kokoro TTS (free, local)
26
+
27
+ // Active call sessions
28
+ const activeCalls = new Map();
29
+
30
// Reads the OpenClaw gateway password from ~/.openclaw/openclaw.json.
// Returns '' when the file is missing, unreadable, malformed, or when no
// password is configured (path resolution stays inside the try so a missing
// HOME env var is also handled gracefully).
function loadOpenClawAuth() {
  try {
    const configFile = path.join(process.env.HOME, '.openclaw', 'openclaw.json');
    const parsed = JSON.parse(fs.readFileSync(configFile, 'utf-8'));
    return parsed.gateway?.auth?.password || '';
  } catch (err) {
    console.error('[voice-call-local] Failed to load OpenClaw auth:', err.message);
    return '';
  }
}
40
+
41
// Pulls dynamic voice context (tasks, chat, calendar) from the local context
// API. Resolves to the parsed JSON payload, or null on any network or HTTP
// failure — callers treat missing context as optional.
async function fetchContext() {
  try {
    const response = await fetch(CONTEXT_API);
    if (response.ok) {
      return await response.json();
    }
    throw new Error(`Context API ${response.status}`);
  } catch (err) {
    console.error('[voice-call-local] Failed to fetch context:', err.message);
    return null;
  }
}
51
+
52
/**
 * Build the system prompt for a voice call.
 *
 * Starts from a fixed voice-conversation persona/rules block stamped with the
 * current Pacific time, then appends optional reference-only context sections.
 *
 * @param {?object} ctx - Context from the local context API (or null). Reads
 *   ctx.activeTasks ([{title, status}]), ctx.recentChat ([{from, text}]), and
 *   ctx.calendar ([{title|summary, time|start}]). Missing/empty sections are
 *   simply omitted.
 * @returns {string} The assembled system prompt.
 */
function buildSystemPrompt(ctx) {
  const now = new Date();
  const timeStr = now.toLocaleString('en-US', {
    timeZone: 'America/Los_Angeles',
    weekday: 'long',
    year: 'numeric',
    month: 'long',
    day: 'numeric',
    hour: 'numeric',
    minute: '2-digit',
    hour12: true
  });

  let prompt = `You are AIVA (AI-VA), Brandon Burgan's AI assistant. This is a voice conversation through the AIVA app.

CURRENT DATE AND TIME: ${timeStr} (Pacific Time)

VOICE CONVERSATION RULES (CRITICAL):
- Keep responses SHORT and CONVERSATIONAL (1-3 sentences max)
- Use contractions and casual language
- NO markdown, bullet points, or formatting — everything is spoken aloud
- Start with a brief greeting then WAIT for Brandon to tell you what he needs
- Don't volunteer information (calendar, tasks) unless he asks
- Be helpful and concise

TASK RULES:
- Unless explicitly asked, don't mention finished/done tasks
- When listing tasks, default to non-done tasks only

PERSONALITY:
You are warm, efficient, and proactive. You understand Brandon's needs before he asks.`;

  // Append dynamic context if available
  if (ctx) {
    if (ctx.activeTasks?.length) {
      const tasks = ctx.activeTasks.slice(0, 10).map(t => `- ${t.title} (${t.status})`).join('\n');
      prompt += `\n\nACTIVE TASKS (reference only — do NOT mention unless asked):\n${tasks}`;
    }
    if (ctx.recentChat?.length) {
      const recent = ctx.recentChat.slice(-10).map(m => `${m.from}: ${m.text?.slice(0, 100)}`).join('\n');
      prompt += `\n\nRECENT CHAT (reference only):\n${recent}`;
    }
    const calArray = Array.isArray(ctx.calendar) ? ctx.calendar : [];
    if (calArray.length) {
      const nowTime = new Date();
      const futureEvents = calArray.filter(e => {
        const eventTime = e.time || e.start || '';
        if (!eventTime) return true; // undated events are always kept
        // FIX: new Date() never throws on bad input — it yields an Invalid
        // Date whose comparisons are always false, so the old try/catch
        // silently DROPPED unparseable events instead of keeping them as the
        // catch { return true } intended. Detect Invalid Date explicitly.
        const eventDate = new Date(eventTime);
        if (Number.isNaN(eventDate.getTime())) return true; // keep unparseable
        return eventDate > nowTime;
      });
      if (futureEvents.length) {
        const events = futureEvents.slice(0, 5).map(e => `- ${e.title || e.summary} (${e.time || e.start || ''})`).join('\n');
        prompt += `\n\nUPCOMING CALENDAR (reference only — do NOT mention unless asked):\n${events}`;
      }
    }
  }

  return prompt;
}
114
+
115
/**
 * Transcribe a raw PCM16 audio buffer with the local Whisper CLI.
 *
 * The buffer is assumed to be 24 kHz, 16-bit, mono PCM (the client capture
 * format); it is wrapped in a WAV header, downsampled to 16 kHz via ffmpeg
 * when available, then fed to `whisper --model tiny`.
 *
 * @param {Buffer} audioBuffer - Raw PCM16 samples (no WAV header).
 * @returns {Promise<string>} Transcript text ('' when nothing was recognized).
 * @throws When the Whisper CLI fails or exceeds its 30s timeout.
 */
async function transcribeAudio(audioBuffer) {
  // NOTE: the previous version dynamically imported form-data here; it was
  // never used (leftover from the OpenAI Whisper API path) and has been
  // removed so the dead third-party dependency is no longer required.

  // Wrap raw PCM16 in a proper WAV header (24kHz, 16-bit, mono)
  const sampleRate = 24000;
  const numChannels = 1;
  const bitsPerSample = 16;
  const dataSize = audioBuffer.length;
  const wavHeader = Buffer.alloc(44);
  wavHeader.write('RIFF', 0);
  wavHeader.writeUInt32LE(36 + dataSize, 4);
  wavHeader.write('WAVE', 8);
  wavHeader.write('fmt ', 12);
  wavHeader.writeUInt32LE(16, 16);              // fmt chunk size
  wavHeader.writeUInt16LE(1, 20);               // audio format 1 = PCM
  wavHeader.writeUInt16LE(numChannels, 22);
  wavHeader.writeUInt32LE(sampleRate, 24);
  wavHeader.writeUInt32LE(sampleRate * numChannels * bitsPerSample / 8, 28); // byte rate
  wavHeader.writeUInt16LE(numChannels * bitsPerSample / 8, 32);              // block align
  wavHeader.writeUInt16LE(bitsPerSample, 34);
  wavHeader.write('data', 36);
  wavHeader.writeUInt32LE(dataSize, 40);

  const tempPath = path.join('/tmp', `voice-${Date.now()}.wav`);
  const convertedPath = tempPath.replace('.wav', '-converted.wav');
  fs.writeFileSync(tempPath, Buffer.concat([wavHeader, audioBuffer]));

  try {
    // Downsample to 16 kHz mono for Whisper; fall back to the raw 24 kHz WAV
    // if ffmpeg is unavailable or the conversion fails.
    try {
      execSync(`ffmpeg -y -i "${tempPath}" -ar 16000 -ac 1 -acodec pcm_s16le "${convertedPath}" 2>/dev/null`);
      fs.unlinkSync(tempPath);
    } catch (e) {
      fs.renameSync(tempPath, convertedPath);
    }

    // Use LOCAL Whisper CLI (free, no API key needed)
    const result = execSync(
      `whisper "${convertedPath}" --model tiny --language en --output_format txt --output_dir /tmp 2>/dev/null`,
      { timeout: 30000 }
    ).toString().trim();

    // Whisper writes <basename>.txt into --output_dir; some builds print the
    // transcript to stdout instead, so fall back to captured output.
    const txtPath = convertedPath.replace('.wav', '.txt');
    let transcript = '';
    if (fs.existsSync(txtPath)) {
      transcript = fs.readFileSync(txtPath, 'utf-8').trim();
      fs.unlinkSync(txtPath);
    } else {
      transcript = result;
    }

    return transcript;
  } catch (e) {
    console.error('[voice-call-local] Transcription error:', e.message);
    throw e;
  } finally {
    // FIX: the old error path removed only tempPath, leaking the converted
    // WAV and any extra Whisper output files in /tmp. Clean up every
    // intermediate file on both success and failure.
    for (const f of [tempPath, convertedPath]) {
      if (fs.existsSync(f)) fs.unlinkSync(f);
    }
    for (const ext of ['.txt', '.vtt', '.srt', '.tsv', '.json']) {
      const f = convertedPath.replace('.wav', ext);
      if (fs.existsSync(f)) fs.unlinkSync(f);
    }
  }
}
188
+
189
// Scrub accidental tool/agent markup (exec traces, code fences, inline code,
// warning lines) out of a response so it is safe to speak aloud. Falls back
// to a canned apology when stripping leaves nothing meaningful behind.
function stripToolMarkup(text) {
  const looksLikeToolOutput =
    text.includes('Exec:') || text.includes('```') || text.includes('curl ') || text.includes('🛠️');
  if (!looksLikeToolOutput) return text;

  console.warn('[voice-call-local] Stripping tool markup from response');
  let cleaned = text
    .replace(/⚠️.*$/gm, '')
    .replace(/🛠️.*$/gm, '')
    .replace(/```[\s\S]*?```/g, '')
    .replace(/`[^`]+`/g, '')
    .trim();
  if (!cleaned || cleaned.length < 5) {
    cleaned = "Hey! Sorry, I had a hiccup there. What were you saying?";
  }
  return cleaned;
}
199
+
200
/**
 * Stream an LLM response through the OpenClaw proxy with sentence-level TTS
 * chunking.
 *
 * Retry strategy: up to 3 attempts with 0s/3s/8s backoff; only the first
 * attempt uses SSE streaming, retries fall back to a single non-streaming
 * request. Each complete sentence is synthesized to speech and delivered via
 * onSentence as it becomes available.
 *
 * @param {Array<{role: string, content: string}>} messages - Conversation so far.
 * @param {string} systemPrompt - Base system prompt; voice-only instructions
 *   are appended to discourage tool/markdown output.
 * @param {{onSentence?: function, onDone?: function}} [callbacks]
 *   onSentence(sentenceText, audioBase64) fires once per spoken chunk;
 *   onDone() fires after the response has been fully handled.
 * @returns {Promise<string>} The full assembled response text.
 * @throws {Error} When every proxy attempt fails (onDone is NOT called then).
 */
async function queryMainAgentStreaming(messages, systemPrompt, { onSentence, onDone } = {}) {
  const basePayload = {
    model: 'claude-sonnet-4-5',
    messages: [
      { role: 'system', content: systemPrompt + '\n\nCRITICAL: You are in a VOICE conversation. Respond with plain spoken text ONLY. Do NOT use any tools, commands, code blocks, markdown, or special formatting. Just talk naturally.' },
      ...messages
    ],
    max_tokens: 500,
    temperature: 0.7,
    user: 'voice-call-brandon'
  };

  // Backoff delays per attempt; index 0 runs immediately.
  const delays = [0, 3000, 8000];
  for (let attempt = 0; attempt < delays.length; attempt++) {
    if (delays[attempt] > 0) await new Promise(r => setTimeout(r, delays[attempt]));
    const useStream = attempt === 0; // fallback to non-streaming on retries
    try {
      const payload = { ...basePayload, ...(useStream ? { stream: true } : {}) };
      const response = await fetch(OPENCLAW_API, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENCLAW_AUTH}`,
          'x-openclaw-agent-id': 'voice',
          'x-openclaw-session-key': 'agent:voice:openai-user:voice-call-brandon'
        },
        body: JSON.stringify(payload)
      });
      if (!response.ok) {
        // Non-2xx: log a truncated error body and move to the next attempt.
        const err = await response.text();
        console.error(`[voice-call-local] Proxy attempt ${attempt+1} failed: ${response.status} ${err.substring(0, 100)}`);
        continue;
      }

      if (!useStream) {
        // Non-streaming fallback: one JSON body, one TTS pass over the whole
        // response, then done.
        const result = await response.json();
        let text = stripToolMarkup(result.choices?.[0]?.message?.content || '');
        if (!text || text.includes('No response from OpenClaw')) continue;
        console.log('[voice-call-local] Got response via OpenClaw proxy (non-streaming fallback)');
        if (onSentence) {
          try {
            const audio = await generateSpeech(text);
            onSentence(text, audio.toString('base64'));
          } catch (e) { console.error('[voice-call-local] TTS error in fallback:', e.message); }
        }
        if (onDone) onDone();
        return text;
      }

      // Streaming path: parse SSE
      console.log('[voice-call-local] Streaming response from proxy...');
      let fullText = '';
      let sentenceBuffer = '';
      const ttsPromises = []; // track in-flight TTS to await at end

      // Splits sentenceBuffer on sentence-final punctuation and kicks off TTS
      // for each complete sentence. The buffer bookkeeping is synchronous
      // (only the TTS IIFEs are async), so calling this without await from
      // the 'data' handler cannot interleave buffer mutations.
      const processCompleteSentences = async (flush = false) => {
        // Find sentence boundaries: ., !, or ? followed by whitespace.
        const sentenceEndRegex = /([.!?])\s+/g;
        let match;
        let lastEnd = 0;
        const sentences = [];

        while ((match = sentenceEndRegex.exec(sentenceBuffer)) !== null) {
          // Keep the punctuation mark, drop the trailing whitespace.
          sentences.push(sentenceBuffer.substring(lastEnd, match.index + 1));
          lastEnd = match.index + match[0].length;
        }

        if (sentences.length > 0) {
          // Retain the trailing incomplete fragment for the next delta.
          sentenceBuffer = sentenceBuffer.substring(lastEnd);
          for (const sentence of sentences) {
            const cleaned = stripToolMarkup(sentence.trim());
            if (cleaned.length < 2) continue;
            if (onSentence) {
              // Fire-and-track: TTS runs concurrently; awaited via ttsPromises.
              const p = (async () => {
                try {
                  const audio = await generateSpeech(cleaned);
                  onSentence(cleaned, audio.toString('base64'));
                } catch (e) { console.error('[voice-call-local] TTS chunk error:', e.message); }
              })();
              ttsPromises.push(p);
            }
          }
        }

        // On flush, speak whatever is left even without closing punctuation.
        if (flush && sentenceBuffer.trim().length > 1) {
          const cleaned = stripToolMarkup(sentenceBuffer.trim());
          sentenceBuffer = '';
          if (cleaned.length >= 2 && onSentence) {
            const p = (async () => {
              try {
                const audio = await generateSpeech(cleaned);
                onSentence(cleaned, audio.toString('base64'));
              } catch (e) { console.error('[voice-call-local] TTS flush error:', e.message); }
            })();
            ttsPromises.push(p);
          }
        }
      };

      // Read SSE stream — node-fetch exposes the body as a Node Readable,
      // hence the .on('data'/'end'/'error') event API.
      const reader = response.body;
      let remainder = ''; // partial SSE line carried across chunks

      await new Promise((resolve, reject) => {
        reader.on('data', (chunk) => {
          const text = remainder + chunk.toString();
          const lines = text.split('\n');
          remainder = lines.pop() || '';

          for (const line of lines) {
            if (!line.startsWith('data: ')) continue;
            const data = line.slice(6).trim();
            if (data === '[DONE]') continue;
            try {
              const parsed = JSON.parse(data);
              const delta = parsed.choices?.[0]?.delta?.content || '';
              if (delta) {
                fullText += delta;
                sentenceBuffer += delta;
                processCompleteSentences(false);
              }
            } catch (e) { /* ignore parse errors on SSE lines */ }
          }
        });
        reader.on('end', resolve);
        reader.on('error', reject);
      });

      // Flush remaining text
      await processCompleteSentences(true);
      // Wait for all TTS chunks to finish
      await Promise.all(ttsPromises);

      fullText = stripToolMarkup(fullText);
      // Empty or sentinel response: treat as a failed attempt and retry.
      if (!fullText || fullText.includes('No response from OpenClaw')) continue;

      console.log('[voice-call-local] Streaming response complete:', fullText.substring(0, 100));
      if (onDone) onDone();
      return fullText;

    } catch (e) {
      // Network/stream error: log and let the loop retry with backoff.
      console.error(`[voice-call-local] Proxy attempt ${attempt+1} error:`, e.message);
    }
  }

  throw new Error('All proxy attempts failed');
}
353
+
354
// Compatibility shim: older call sites expect a plain promise of the full
// response text, so delegate to the streaming variant with no callbacks.
async function queryMainAgent(messages, systemPrompt) {
  const fullResponse = await queryMainAgentStreaming(messages, systemPrompt);
  return fullResponse;
}
358
+
359
// Apply a linear fade-in over the first `durationMs` of a 24 kHz PCM16 WAV
// buffer (44-byte header assumed) to suppress the pop/click at the start of
// TTS playback. Returns a new buffer; the input is left untouched.
function addFadeIn(audioBuffer, durationMs = 20) {
  const WAV_HEADER_BYTES = 44;
  const SAMPLE_RATE = 24000; // Kokoro default
  const rampLength = Math.floor((SAMPLE_RATE * durationMs) / 1000);
  const faded = Buffer.from(audioBuffer);

  let sampleIndex = 0;
  while (sampleIndex < rampLength) {
    const byteOffset = WAV_HEADER_BYTES + sampleIndex * 2;
    if (byteOffset + 1 >= faded.length) break; // ran past the sample data
    const gain = sampleIndex / rampLength; // ramps 0.0 → 1.0 across the fade
    faded.writeInt16LE(Math.round(faded.readInt16LE(byteOffset) * gain), byteOffset);
    sampleIndex++;
  }
  return faded;
}
374
+
375
/**
 * Synthesize speech for `text` via the local TTS server.
 * @param {string} text - Plain text to speak.
 * @returns {Promise<Buffer>} WAV audio with a fade-in applied.
 * @throws {Error} When the TTS server is unreachable or returns non-2xx.
 */
async function generateSpeech(text) {
  try {
    const ttsResponse = await fetch(LOCAL_TTS_API, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text, rate: 200 })
    });

    if (!ttsResponse.ok) {
      const error = await ttsResponse.text();
      throw new Error(`TTS error: ${error}`);
    }

    const rawAudio = Buffer.from(await ttsResponse.arrayBuffer());
    return addFadeIn(rawAudio);
  } catch (err) {
    console.error('[voice-call-local] TTS error:', err.message);
    throw err;
  }
}
399
+
400
+ function setupVoiceCall(io) {
401
+ io.on('connection', (socket) => {
402
+
403
+ socket.on('voice-call-start', async (data) => {
404
+ console.log('[voice-call-local] Call started by:', data?.userId || 'unknown');
405
+
406
+ // Fetch context for system prompt
407
+ const ctx = await fetchContext().catch(() => null);
408
+ const systemPrompt = buildSystemPrompt(ctx);
409
+
410
+ const callSession = {
411
+ id: Date.now().toString(),
412
+ userId: data?.userId || 'brandon',
413
+ startedAt: new Date().toISOString(),
414
+ transcript: [], // Full conversation history
415
+ messages: [], // OpenAI-format messages for context
416
+ systemPrompt,
417
+ audioBuffer: null, // Accumulator for incoming audio chunks
418
+ isRecording: false,
419
+ processingResponse: false
420
+ };
421
+
422
+ activeCalls.set(socket.id, callSession);
423
+
424
+ // Auto-start recording (frontend sends continuous audio stream)
425
+ callSession.isRecording = true;
426
+ callSession.audioBuffer = Buffer.alloc(0);
427
+
428
+ // Set up silence detection — process audio after 2s of silence
429
+ callSession.silenceTimer = null;
430
+ callSession.lastAudioTime = Date.now();
431
+
432
+ // Send ready signal
433
+ socket.emit('voice-call-ready', { callId: callSession.id });
434
+ socket.emit('voice-call-status', { status: 'listening' });
435
+ console.log('[voice-call-local] Call ready:', callSession.id);
436
+ });
437
+
438
+ // User starts speaking (push-to-talk or VAD trigger)
439
+ socket.on('voice-call-start-recording', () => {
440
+ const session = activeCalls.get(socket.id);
441
+ if (!session) return;
442
+
443
+ session.isRecording = true;
444
+ session.audioBuffer = Buffer.alloc(0);
445
+ socket.emit('voice-call-status', { status: 'recording' });
446
+ console.log('[voice-call-local] Recording started');
447
+ });
448
+
449
+ // Receive audio chunks (continuous stream with silence detection)
450
+ let chunkCount = 0;
451
+ socket.on('voice-call-audio-chunk', (data) => {
452
+ const session = activeCalls.get(socket.id);
453
+ if (!session || !session.isRecording || session.processingResponse) return;
454
+
455
+ if (data?.audio) {
456
+ const chunk = Buffer.from(data.audio, 'base64');
457
+
458
+ // Check if this chunk has meaningful audio (not silence)
459
+ const samples = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
460
+ let rms = 0;
461
+ for (let i = 0; i < samples.length; i++) rms += samples[i] * samples[i];
462
+ rms = Math.sqrt(rms / samples.length);
463
+
464
+ // Barge-in: user speaks while agent is playing TTS
465
+ if (session.isPlaying && rms > 1000) {
466
+ console.log('[voice-call-local] Barge-in detected! Stopping playback.');
467
+ session.isPlaying = false;
468
+ session.audioBuffer = Buffer.alloc(0);
469
+ session.hasVoice = true;
470
+ session.lastAudioTime = Date.now();
471
+ if (session.silenceTimer) { clearTimeout(session.silenceTimer); session.silenceTimer = null; }
472
+ socket.emit('voice-call-barge-in', {});
473
+ }
474
+
475
+ // Don't accumulate audio while TTS is playing (unless barge-in just triggered above)
476
+ if (session.isPlaying) return;
477
+
478
+ session.audioBuffer = Buffer.concat([session.audioBuffer, chunk]);
479
+ chunkCount++;
480
+ if (chunkCount % 50 === 1) console.log(`[voice-call-local] Chunk #${chunkCount}, buffer: ${session.audioBuffer.length} bytes`);
481
+
482
+ if (chunkCount % 50 === 1) console.log(`[voice-call-local] RMS: ${Math.round(rms)}, hasVoice: ${!!session.hasVoice}`);
483
+ if (rms > 1000) { // Voice detected
484
+ session.lastAudioTime = Date.now();
485
+ if (!session.hasVoice) {
486
+ session.hasVoice = true;
487
+ console.log('[voice-call-local] Voice detected! RMS:', Math.round(rms));
488
+ socket.emit('voice-call-status', { status: 'recording' });
489
+ }
490
+ }
491
+
492
+ // If voice is active again, cancel the silence timer (they're still talking)
493
+ if (rms > 1000) {
494
+ if (session.silenceTimer) { clearTimeout(session.silenceTimer); session.silenceTimer = null; }
495
+ }
496
+
497
+ // After voice was detected, if we're now in silence, start the timer (only once)
498
+ if (session.hasVoice && rms <= 1000 && !session.silenceTimer) {
499
+ session.silenceTimer = setTimeout(async () => {
500
+ session.silenceTimer = null; // Clear reference so new timer can start
501
+ if (!session.isRecording || session.processingResponse) return;
502
+ if (session.audioBuffer.length < 16000) return; // Too short
503
+
504
+ session.processingResponse = true;
505
+ session.hasVoice = false;
506
+ console.log('[voice-call-local] Silence detected, processing...');
507
+ socket.emit('voice-call-status', { status: 'processing' });
508
+
509
+ try {
510
+ console.log('[voice-call-local] Transcribing audio...', session.audioBuffer.length, 'bytes');
511
+ const transcript = await transcribeAudio(session.audioBuffer);
512
+
513
+ if (!transcript) {
514
+ socket.emit('voice-call-status', { status: 'listening' });
515
+ session.audioBuffer = Buffer.alloc(0);
516
+ session.processingResponse = false;
517
+ return;
518
+ }
519
+
520
+ // Filter Whisper hallucinations
521
+ const hallucinations = ['thank you', 'thanks for watching', 'thank you for watching', 'bye', 'you', 'the end', "i'm sorry", 'goodbye', 'well congratulations', 'thanks', 'okay'];
522
+ const normalized = transcript.toLowerCase().replace(/[.,!?]/g, '').trim();
523
+ if ((hallucinations.includes(normalized) || normalized.length < 3) && session.audioBuffer.length < 144000) {
524
+ console.log('[voice-call-local] Filtered likely hallucination:', transcript);
525
+ session.audioBuffer = Buffer.alloc(0);
526
+ session.processingResponse = false;
527
+ socket.emit('voice-call-status', { status: 'listening' });
528
+ return;
529
+ }
530
+
531
+ console.log('[voice-call-local] User said:', transcript);
532
+ socket.emit('voice-call-user-transcript', { text: transcript });
533
+
534
+ // Get AI response
535
+ session.messages.push({ role: 'user', content: transcript });
536
+ session.transcript.push({ role: 'user', text: transcript, ts: new Date().toISOString() });
537
+
538
+ session.isPlaying = true;
539
+ const aiResponse = await queryMainAgentStreaming(session.messages, session.systemPrompt, {
540
+ onSentence: (text, audioBase64) => {
541
+ socket.emit('voice-call-transcript-delta', { text });
542
+ socket.emit('voice-call-audio-delta', { audio: audioBase64 });
543
+ },
544
+ onDone: () => { session.isPlaying = false; socket.emit('voice-call-response-done', {}); }
545
+ });
546
+ console.log('[voice-call-local] Agent response:', aiResponse?.substring(0, 100));
547
+
548
+ if (aiResponse) {
549
+ session.messages.push({ role: 'assistant', content: aiResponse });
550
+ session.transcript.push({ role: 'assistant', text: aiResponse, ts: new Date().toISOString() });
551
+ }
552
+ } catch (e) {
553
+ console.error('[voice-call-local] Processing error:', e.message);
554
+ socket.emit('voice-call-error', { error: e.message });
555
+ }
556
+
557
+ session.audioBuffer = Buffer.alloc(0);
558
+ session.processingResponse = false;
559
+ socket.emit('voice-call-status', { status: 'listening' });
560
+ }, 1500); // Allow natural pauses between sentences
561
+ }
562
+ }
563
+ });
564
+
565
+ // User stops speaking
566
+ socket.on('voice-call-stop-recording', async () => {
567
+ const session = activeCalls.get(socket.id);
568
+ if (!session || !session.isRecording) return;
569
+
570
+ session.isRecording = false;
571
+ socket.emit('voice-call-status', { status: 'processing' });
572
+
573
+ try {
574
+ // Transcribe audio
575
+ console.log('[voice-call-local] Transcribing audio...');
576
+ const transcript = await transcribeAudio(session.audioBuffer);
577
+
578
+ if (!transcript) {
579
+ socket.emit('voice-call-error', { error: 'Could not understand audio' });
580
+ socket.emit('voice-call-status', { status: 'listening' });
581
+ return;
582
+ }
583
+
584
+ // Filter Whisper hallucinations
585
+ const hallucinations2 = ['thank you', 'thanks for watching', 'thank you for watching', 'bye', 'you', 'the end', "i'm sorry", 'goodbye', 'well congratulations', 'thanks', 'okay'];
586
+ const normalized2 = transcript.toLowerCase().replace(/[.,!?]/g, '').trim();
587
+ if ((hallucinations2.includes(normalized2) || normalized2.length < 3) && session.audioBuffer.length < 144000) {
588
+ console.log('[voice-call-local] Filtered likely hallucination:', transcript);
589
+ session.audioBuffer = Buffer.alloc(0);
590
+ session.processingResponse = false;
591
+ socket.emit('voice-call-status', { status: 'listening' });
592
+ return;
593
+ }
594
+
595
+ console.log('[voice-call-local] User said:', transcript);
596
+ session.transcript.push({ role: 'user', text: transcript });
597
+ session.messages.push({ role: 'user', content: transcript });
598
+ socket.emit('voice-call-user-transcript', { text: transcript });
599
+
600
+ // Query main agent with streaming
601
+ console.log('[voice-call-local] Querying main agent (streaming)...');
602
+ session.processingResponse = true;
603
+ socket.emit('voice-call-status', { status: 'thinking' });
604
+
605
+ let firstChunkSent = false;
606
+ session.isPlaying = true;
607
+ const responseText = await queryMainAgentStreaming(session.messages, session.systemPrompt, {
608
+ onSentence: (text, audioBase64) => {
609
+ if (!firstChunkSent) {
610
+ socket.emit('voice-call-status', { status: 'speaking' });
611
+ firstChunkSent = true;
612
+ }
613
+ socket.emit('voice-call-transcript-delta', { text });
614
+ socket.emit('voice-call-audio-delta', { audio: audioBase64 });
615
+ // Also emit legacy event for compatibility
616
+ socket.emit('voice-call-audio-response', { audio: audioBase64, text });
617
+ },
618
+ onDone: () => { session.isPlaying = false; socket.emit('voice-call-response-done', { text: '' }); }
619
+ });
620
+
621
+ if (!responseText) {
622
+ throw new Error('No response from agent');
623
+ }
624
+
625
+ console.log('[voice-call-local] Agent responded:', responseText.substring(0, 100));
626
+ session.transcript.push({ role: 'assistant', text: responseText });
627
+ session.messages.push({ role: 'assistant', content: responseText });
628
+
629
+ console.log('[voice-call-local] Response sent (streamed)');
630
+ socket.emit('voice-call-status', { status: 'listening' });
631
+ session.processingResponse = false;
632
+
633
+ } catch (e) {
634
+ console.error('[voice-call-local] Processing error:', e.message);
635
+ socket.emit('voice-call-error', { error: e.message });
636
+ socket.emit('voice-call-status', { status: 'listening' });
637
+ session.processingResponse = false;
638
+ }
639
+ });
640
+
641
// Client explicitly hung up: finalize the call (save transcript, notify
// the main agent) and drop the session. No-op when no call is active.
socket.on('voice-call-end', () => {
  const session = activeCalls.get(socket.id);
  if (session) {
    cleanupCall(socket.id, session);
  }
});
646
+
647
// Transport dropped without an explicit hang-up: treat it like a call end
// so the transcript is still persisted and the session entry is released.
socket.on('disconnect', () => {
  const session = activeCalls.get(socket.id);
  if (!session) return;
  console.log('[voice-call-local] Socket disconnected, cleaning up call');
  cleanupCall(socket.id, session);
});
654
+ });
655
+
656
/**
 * Tear down a finished voice-call session.
 *
 * If any conversation happened, the transcript is written to
 * ~/.openclaw/workspace/memory/call-logs as a pending JSON log and the full
 * transcript is pushed to the main agent via the local tools-invoke HTTP API
 * (best-effort: failures are logged, never thrown). Finally the session is
 * removed from the activeCalls registry.
 *
 * @param {string} socketId - socket.io id keying the session in activeCalls.
 * @param {{transcript: Array<{role: string, text: string}>, startedAt: string}} session
 *   The call session; `startedAt` is assumed to be an ISO timestamp — TODO confirm
 *   against the session-creation code (not visible here).
 */
function cleanupCall(socketId, session) {
  console.log(`[voice-call-local] Call ended (${session.transcript.length} turns)`);

  // Only persist/notify when there was actual conversation.
  if (session.transcript.length > 0) {
    try {
      const logDir = path.join(process.env.HOME || '', '.openclaw', 'workspace', 'memory', 'call-logs');
      fs.mkdirSync(logDir, { recursive: true });
      const ts = new Date().toISOString().replace(/[:.]/g, '-');
      const pendingFile = path.join(logDir, `pending_local_${ts}.json`);

      // Calculate duration. Guard against a missing/unparseable startedAt,
      // which would otherwise propagate NaN into the log file ("duration": null
      // after JSON.stringify) and "NaNm NaNs" into the wake message.
      const startMs = new Date(session.startedAt).getTime();
      const durationSeconds = Number.isFinite(startMs)
        ? Math.max(0, Math.round((Date.now() - startMs) / 1000))
        : 0;
      const durationMin = Math.floor(durationSeconds / 60);
      const durationSec = durationSeconds % 60;

      const logData = {
        type: 'local-voice-call',
        timestamp: new Date().toISOString(),
        duration: durationSeconds,
        transcript: session.transcript,
      };

      fs.writeFileSync(pendingFile, JSON.stringify(logData, null, 2));
      console.log(`[voice-call-local] Transcript saved: ${pendingFile}`);

      // Render the transcript as plain text for the wake hook.
      const readableTranscript = session.transcript.map(t =>
        `${t.role === 'user' ? 'Brandon' : 'AIVA'}: ${t.text}`
      ).join('\n');

      const wakeText = `[VOICE-CALL-COMPLETE] Voice call with Brandon just ended. Duration: ${durationMin}m ${durationSec}s.\n\nFull transcript:\n${readableTranscript}\n\nProcess any action items from this conversation immediately. If Brandon asked for something to be done, delegate it. If there are insights about Brandon's preferences or decisions, log them to memory.`;

      // Send transcript directly to main agent via the sessions_send tool.
      const invokeData = JSON.stringify({
        tool: 'sessions_send',
        args: {
          sessionKey: 'agent:main:main',
          message: wakeText
        }
      });

      const invokeReq = require('http').request({
        hostname: '127.0.0.1',
        port: 18789,
        path: '/tools/invoke',
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENCLAW_AUTH}`
        }
      }, (res) => {
        // The response body is never inspected; drain it with resume() so the
        // socket is released (previously it was buffered into an unused local).
        res.resume();
        res.on('end', () => {
          console.log(`[voice-call-local] sessions_send result: ${res.statusCode}`);
        });
      });
      // Best-effort notification: log the failure rather than crash cleanup.
      invokeReq.on('error', e => console.error('[voice-call-local] sessions_send failed:', e.message));
      invokeReq.write(invokeData);
      invokeReq.end();

    } catch (e) {
      // Disk or serialization failure must not prevent session removal below.
      console.error('[voice-call-local] Failed to save transcript:', e.message);
    }
  }

  activeCalls.delete(socketId);
}
727
+
728
+ console.log('[voice-call-local] Voice call handler initialized (Local TTS + Main Agent)');
729
+ }
730
+
731
// Sole public entry point of this module.
module.exports = { setupVoiceCall };