@kernel.chat/kbot 3.51.0 → 3.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. package/README.md +43 -9
  2. package/dist/agent-protocol.test.d.ts +2 -0
  3. package/dist/agent-protocol.test.d.ts.map +1 -0
  4. package/dist/agent-protocol.test.js +730 -0
  5. package/dist/agent-protocol.test.js.map +1 -0
  6. package/dist/agent.d.ts.map +1 -1
  7. package/dist/agent.js +34 -10
  8. package/dist/agent.js.map +1 -1
  9. package/dist/agents/replit.js +1 -1
  10. package/dist/auth.js +3 -3
  11. package/dist/auth.js.map +1 -1
  12. package/dist/behaviour.d.ts +30 -0
  13. package/dist/behaviour.d.ts.map +1 -0
  14. package/dist/behaviour.js +191 -0
  15. package/dist/behaviour.js.map +1 -0
  16. package/dist/bench.d.ts +64 -0
  17. package/dist/bench.d.ts.map +1 -0
  18. package/dist/bench.js +973 -0
  19. package/dist/bench.js.map +1 -0
  20. package/dist/bootstrap.js +1 -1
  21. package/dist/bootstrap.js.map +1 -1
  22. package/dist/cli.js +144 -29
  23. package/dist/cli.js.map +1 -1
  24. package/dist/cloud-agent.d.ts +77 -0
  25. package/dist/cloud-agent.d.ts.map +1 -0
  26. package/dist/cloud-agent.js +743 -0
  27. package/dist/cloud-agent.js.map +1 -0
  28. package/dist/context.test.d.ts +2 -0
  29. package/dist/context.test.d.ts.map +1 -0
  30. package/dist/context.test.js +561 -0
  31. package/dist/context.test.js.map +1 -0
  32. package/dist/evolution.d.ts.map +1 -1
  33. package/dist/evolution.js +4 -1
  34. package/dist/evolution.js.map +1 -1
  35. package/dist/github-release.d.ts +61 -0
  36. package/dist/github-release.d.ts.map +1 -0
  37. package/dist/github-release.js +451 -0
  38. package/dist/github-release.js.map +1 -0
  39. package/dist/graph-memory.test.d.ts +2 -0
  40. package/dist/graph-memory.test.d.ts.map +1 -0
  41. package/dist/graph-memory.test.js +946 -0
  42. package/dist/graph-memory.test.js.map +1 -0
  43. package/dist/init-science.d.ts +43 -0
  44. package/dist/init-science.d.ts.map +1 -0
  45. package/dist/init-science.js +477 -0
  46. package/dist/init-science.js.map +1 -0
  47. package/dist/integrations/ableton-m4l.d.ts +124 -0
  48. package/dist/integrations/ableton-m4l.d.ts.map +1 -0
  49. package/dist/integrations/ableton-m4l.js +338 -0
  50. package/dist/integrations/ableton-m4l.js.map +1 -0
  51. package/dist/integrations/ableton-osc.d.ts.map +1 -1
  52. package/dist/integrations/ableton-osc.js +6 -2
  53. package/dist/integrations/ableton-osc.js.map +1 -1
  54. package/dist/lab.d.ts +45 -0
  55. package/dist/lab.d.ts.map +1 -0
  56. package/dist/lab.js +1020 -0
  57. package/dist/lab.js.map +1 -0
  58. package/dist/lsp-deep.d.ts +101 -0
  59. package/dist/lsp-deep.d.ts.map +1 -0
  60. package/dist/lsp-deep.js +689 -0
  61. package/dist/lsp-deep.js.map +1 -0
  62. package/dist/memory.test.d.ts +2 -0
  63. package/dist/memory.test.d.ts.map +1 -0
  64. package/dist/memory.test.js +369 -0
  65. package/dist/memory.test.js.map +1 -0
  66. package/dist/multi-session.d.ts +164 -0
  67. package/dist/multi-session.d.ts.map +1 -0
  68. package/dist/multi-session.js +885 -0
  69. package/dist/multi-session.js.map +1 -0
  70. package/dist/music-learning.d.ts +181 -0
  71. package/dist/music-learning.d.ts.map +1 -0
  72. package/dist/music-learning.js +340 -0
  73. package/dist/music-learning.js.map +1 -0
  74. package/dist/self-eval.d.ts.map +1 -1
  75. package/dist/self-eval.js +5 -2
  76. package/dist/self-eval.js.map +1 -1
  77. package/dist/skill-system.d.ts +68 -0
  78. package/dist/skill-system.d.ts.map +1 -0
  79. package/dist/skill-system.js +386 -0
  80. package/dist/skill-system.js.map +1 -0
  81. package/dist/streaming.d.ts.map +1 -1
  82. package/dist/streaming.js +0 -1
  83. package/dist/streaming.js.map +1 -1
  84. package/dist/teach.d.ts +136 -0
  85. package/dist/teach.d.ts.map +1 -0
  86. package/dist/teach.js +915 -0
  87. package/dist/teach.js.map +1 -0
  88. package/dist/telemetry.d.ts +1 -1
  89. package/dist/telemetry.d.ts.map +1 -1
  90. package/dist/telemetry.js.map +1 -1
  91. package/dist/tools/ableton.d.ts.map +1 -1
  92. package/dist/tools/ableton.js +24 -8
  93. package/dist/tools/ableton.js.map +1 -1
  94. package/dist/tools/arrangement-engine.d.ts +2 -0
  95. package/dist/tools/arrangement-engine.d.ts.map +1 -0
  96. package/dist/tools/arrangement-engine.js +644 -0
  97. package/dist/tools/arrangement-engine.js.map +1 -0
  98. package/dist/tools/browser-agent.js +2 -2
  99. package/dist/tools/browser-agent.js.map +1 -1
  100. package/dist/tools/forge.d.ts.map +1 -1
  101. package/dist/tools/forge.js +15 -26
  102. package/dist/tools/forge.js.map +1 -1
  103. package/dist/tools/git.d.ts.map +1 -1
  104. package/dist/tools/git.js +10 -7
  105. package/dist/tools/git.js.map +1 -1
  106. package/dist/tools/index.d.ts.map +1 -1
  107. package/dist/tools/index.js +5 -0
  108. package/dist/tools/index.js.map +1 -1
  109. package/dist/tools/producer-engine.d.ts +71 -0
  110. package/dist/tools/producer-engine.d.ts.map +1 -0
  111. package/dist/tools/producer-engine.js +1859 -0
  112. package/dist/tools/producer-engine.js.map +1 -0
  113. package/dist/tools/sound-designer.d.ts +2 -0
  114. package/dist/tools/sound-designer.d.ts.map +1 -0
  115. package/dist/tools/sound-designer.js +896 -0
  116. package/dist/tools/sound-designer.js.map +1 -0
  117. package/dist/voice-realtime.d.ts +54 -0
  118. package/dist/voice-realtime.d.ts.map +1 -0
  119. package/dist/voice-realtime.js +805 -0
  120. package/dist/voice-realtime.js.map +1 -0
  121. package/package.json +11 -4
@@ -0,0 +1,805 @@
1
+ // kbot Voice Realtime — Bidirectional real-time voice conversation
2
+ //
3
+ // Real-time voice mode with natural turn-taking, VAD, streaming TTS,
4
+ // interrupt handling, and waveform visualization.
5
+ //
6
+ // Usage:
7
+ // import { startRealtimeVoice } from './voice-realtime.js'
8
+ // await startRealtimeVoice({ stt: 'whisper-local', tts: 'system', continuous: true })
9
+ import { execSync, spawn } from 'node:child_process';
10
+ import { createInterface } from 'node:readline';
11
+ import { tmpdir, homedir } from 'node:os';
12
+ import { join } from 'node:path';
13
+ import { existsSync, unlinkSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'node:fs';
14
+ import chalk from 'chalk';
15
+ // ---------------------------------------------------------------------------
16
+ // Constants
17
+ // ---------------------------------------------------------------------------
18
+ const KBOT_DIR = join(homedir(), '.kbot');
19
+ const VOICE_DIR = join(KBOT_DIR, 'voice');
20
+ const ACCENT = chalk.hex('#A78BFA');
21
+ const DIM = chalk.dim;
22
+ const OLLAMA_DEFAULT = process.env.OLLAMA_HOST || 'http://localhost:11434';
23
+ const WAVE = [' ', '\u2581', '\u2582', '\u2583', '\u2584', '\u2585', '\u2586', '\u2587', '\u2588'];
24
+ // ---------------------------------------------------------------------------
25
+ // Helpers
26
+ // ---------------------------------------------------------------------------
27
+ const has = (cmd) => {
28
+ try {
29
+ execSync(`which ${cmd}`, { stdio: 'ignore' });
30
+ return true;
31
+ }
32
+ catch {
33
+ return false;
34
+ }
35
+ };
36
+ const hasSox = () => has('sox') && has('rec');
37
+ const hasRec = () => has('rec') || has('arecord');
38
+ function loadKey(provider) {
39
+ const p = join(KBOT_DIR, 'config.json');
40
+ if (!existsSync(p))
41
+ return null;
42
+ try {
43
+ const c = JSON.parse(readFileSync(p, 'utf-8'));
44
+ return (c.providers?.[provider]?.apiKey || (c.provider === provider && c.apiKey) || null);
45
+ }
46
+ catch {
47
+ return null;
48
+ }
49
+ }
50
+ function detectSTT(preferred) {
51
+ if (preferred === 'whisper-local' || preferred === 'whisper-api' || preferred === 'system')
52
+ return preferred;
53
+ if (has('whisper') || has('whisper.cpp') || has('whisper-cpp'))
54
+ return 'whisper-local';
55
+ if (loadKey('openai'))
56
+ return 'whisper-api';
57
+ return 'system';
58
+ }
59
+ function detectTTS(preferred) {
60
+ if (preferred === 'system' || preferred === 'elevenlabs' || preferred === 'openai-tts')
61
+ return preferred;
62
+ if (process.platform === 'darwin' || has('espeak') || has('piper'))
63
+ return 'system';
64
+ if (loadKey('openai'))
65
+ return 'openai-tts';
66
+ if (loadKey('elevenlabs'))
67
+ return 'elevenlabs';
68
+ return 'system';
69
+ }
70
+ function stripMd(text) {
71
+ return text
72
+ .replace(/```(\w+)?\n[\s\S]*?```/g, (_, l) => l ? `Here's a ${l} snippet.` : 'Here\'s a code snippet.')
73
+ .replace(/`([^`]+)`/g, '$1')
74
+ .replace(/^#{1,6}\s+/gm, '')
75
+ .replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1')
76
+ .replace(/_{1,3}([^_]+)_{1,3}/g, '$1')
77
+ .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
78
+ .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
79
+ .replace(/^>\s+/gm, '')
80
+ .replace(/^[\s]*[-*+]\s+/gm, '')
81
+ .replace(/^[\s]*\d+\.\s+/gm, '')
82
+ .replace(/\n{3,}/g, '\n\n')
83
+ .trim();
84
+ }
85
+ const shellSafe = (t) => t.replace(/[;&|`$(){}[\]!#\\]/g, '');
86
+ function splitSentences(text) {
87
+ return (text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g) || [text]).map(s => s.trim()).filter(Boolean);
88
+ }
89
+ // ---------------------------------------------------------------------------
90
+ // Waveform visualization
91
+ // ---------------------------------------------------------------------------
92
+ function showWaveform(volume) {
93
+ const bars = [];
94
+ for (let i = 0; i < 30; i++) {
95
+ const v = Math.max(0, Math.min(1, volume + Math.sin(i * 0.5 + Date.now() * 0.001) * 0.3 * volume));
96
+ bars.push(WAVE[Math.round(v * (WAVE.length - 1))]);
97
+ }
98
+ process.stdout.write(`\r ${chalk.cyan('\uD83C\uDF99')} ${chalk.cyan(bars.join(''))} `);
99
+ }
100
+ function showStatus(status, extra) {
101
+ const labels = {
102
+ listening: chalk.cyan('\uD83C\uDF99 Listening...'),
103
+ processing: chalk.yellow('\uD83D\uDD04 Processing...'),
104
+ speaking: chalk.green('\uD83D\uDD0A Speaking...'),
105
+ idle: chalk.dim('\u23F8 Idle'),
106
+ };
107
+ process.stdout.write(`\r${' '.repeat(80)}\r ${labels[status]}${extra ? ` ${DIM(extra)}` : ''}`);
108
+ }
109
+ // ---------------------------------------------------------------------------
110
+ // Voice Activity Detection (VAD) via sox
111
+ // ---------------------------------------------------------------------------
112
+ class VAD {
113
+ cfg;
114
+ proc = null;
115
+ speaking = false;
116
+ silenceAt = 0;
117
+ speechAt = 0;
118
+ onStart;
119
+ onEnd;
120
+ onVol;
121
+ constructor(cfg) { this.cfg = cfg; }
122
+ start(cb) {
123
+ this.onStart = cb.onStart;
124
+ this.onEnd = cb.onEnd;
125
+ this.onVol = cb.onVol;
126
+ if (!hasSox())
127
+ return;
128
+ this.proc = spawn('rec', [
129
+ '-q', '-r', String(this.cfg.sampleRate), '-c', '1',
130
+ '-t', 'raw', '-b', '16', '-e', 'signed-integer', '-',
131
+ ], { stdio: ['ignore', 'pipe', 'ignore'] });
132
+ this.proc.stdout?.on('data', (chunk) => {
133
+ let sum = 0;
134
+ const n = chunk.length / 2;
135
+ for (let i = 0; i < chunk.length - 1; i += 2) {
136
+ const s = chunk.readInt16LE(i);
137
+ sum += s * s;
138
+ }
139
+ const vol = Math.min(1, Math.sqrt(sum / (n || 1)) / 32768);
140
+ this.onVol?.(vol);
141
+ this.process(vol);
142
+ });
143
+ }
144
+ process(vol) {
145
+ const now = Date.now();
146
+ if (!this.speaking) {
147
+ if (vol > this.cfg.threshold) {
148
+ this.speaking = true;
149
+ this.speechAt = now;
150
+ this.silenceAt = 0;
151
+ this.onStart?.();
152
+ }
153
+ }
154
+ else {
155
+ if (vol < this.cfg.threshold) {
156
+ if (!this.silenceAt)
157
+ this.silenceAt = now;
158
+ else if (now - this.silenceAt >= this.cfg.silenceDuration * 1000) {
159
+ if ((now - this.speechAt) / 1000 >= this.cfg.minSpeechDuration)
160
+ this.onEnd?.();
161
+ this.speaking = false;
162
+ this.silenceAt = 0;
163
+ }
164
+ }
165
+ else {
166
+ this.silenceAt = 0;
167
+ }
168
+ }
169
+ }
170
+ stop() {
171
+ this.proc?.kill('SIGTERM');
172
+ this.proc = null;
173
+ this.speaking = false;
174
+ }
175
+ }
176
+ // ---------------------------------------------------------------------------
177
+ // Audio recording
178
+ // ---------------------------------------------------------------------------
179
+ async function record(path, maxSec, silenceSec) {
180
+ return new Promise(resolve => {
181
+ const proc = has('rec')
182
+ ? spawn('rec', [
183
+ path, 'rate', '16k', 'channels', '1',
184
+ 'silence', '1', '0.1', '1.5%', '1', String(silenceSec), '1.5%',
185
+ 'trim', '0', String(maxSec),
186
+ ], { stdio: ['ignore', 'ignore', 'ignore'] })
187
+ : spawn('arecord', [
188
+ '-f', 'S16_LE', '-r', '16000', '-c', '1', '-d', String(maxSec), path,
189
+ ], { stdio: ['ignore', 'ignore', 'ignore'] });
190
+ const t = setTimeout(() => proc.kill('SIGTERM'), (maxSec + 3) * 1000);
191
+ proc.on('close', () => { clearTimeout(t); resolve(existsSync(path)); });
192
+ proc.on('error', () => { clearTimeout(t); resolve(false); });
193
+ });
194
+ }
195
+ // ---------------------------------------------------------------------------
196
+ // STT engines (fallback chain)
197
+ // ---------------------------------------------------------------------------
198
+ async function sttLocal(path, lang) {
199
+ const cmd = ['whisper', 'whisper.cpp', 'whisper-cpp'].find(has);
200
+ if (!cmd)
201
+ throw new Error('No whisper binary');
202
+ return execSync(`${cmd} "${path}" --model base --language ${lang} --output_format txt 2>/dev/null`, { encoding: 'utf-8', timeout: 60_000 }).trim();
203
+ }
204
+ async function sttAPI(path, key, lang) {
205
+ return execSync(`curl -s https://api.openai.com/v1/audio/transcriptions ` +
206
+ `-H "Authorization: Bearer ${key}" ` +
207
+ `-F "file=@${path}" -F "model=whisper-1" -F "language=${lang}" -F "response_format=text"`, { encoding: 'utf-8', timeout: 30_000 }).trim();
208
+ }
209
+ async function transcribe(path, engine, lang, key) {
210
+ const chain = [];
211
+ if (engine === 'whisper-local')
212
+ chain.push(() => sttLocal(path, lang));
213
+ if (key)
214
+ chain.push(() => sttAPI(path, key, lang));
215
+ if (engine !== 'whisper-local' && ['whisper', 'whisper.cpp', 'whisper-cpp'].some(has))
216
+ chain.push(() => sttLocal(path, lang));
217
+ for (const fn of chain) {
218
+ try {
219
+ const t = await fn();
220
+ if (t)
221
+ return t;
222
+ }
223
+ catch { /* next */ }
224
+ }
225
+ return '';
226
+ }
227
+ // ---------------------------------------------------------------------------
228
+ // TTS engines (fallback chain)
229
+ // ---------------------------------------------------------------------------
230
+ async function ttsSystem(text, voice, rate) {
231
+ const t = shellSafe(text);
232
+ if (!t)
233
+ return null;
234
+ if (process.platform === 'darwin')
235
+ return spawn('say', ['-v', voice, '-r', String(rate), t], { stdio: 'ignore' });
236
+ if (has('piper')) {
237
+ const p = spawn('piper', ['--output-raw'], { stdio: ['pipe', 'pipe', 'ignore'] });
238
+ const a = spawn('aplay', ['-r', '22050', '-f', 'S16_LE', '-'], { stdio: ['pipe', 'ignore', 'ignore'] });
239
+ p.stdout?.pipe(a.stdin);
240
+ p.stdin?.write(t);
241
+ p.stdin?.end();
242
+ return a;
243
+ }
244
+ if (has('espeak'))
245
+ return spawn('espeak', [t], { stdio: 'ignore' });
246
+ return null;
247
+ }
248
+ async function ttsOpenAI(text, key) {
249
+ const t = shellSafe(stripMd(text));
250
+ if (!t)
251
+ return null;
252
+ const tmp = join(tmpdir(), `kbot-tts-${Date.now()}.mp3`);
253
+ try {
254
+ execSync(`curl -s https://api.openai.com/v1/audio/speech -H "Authorization: Bearer ${key}" ` +
255
+ `-H "Content-Type: application/json" -d '${JSON.stringify({ model: 'tts-1', input: t, voice: 'nova' })}' --output "${tmp}"`, { timeout: 30_000 });
256
+ if (!existsSync(tmp))
257
+ return null;
258
+ const player = process.platform === 'darwin' ? 'afplay' : has('mpv') ? 'mpv' : has('aplay') ? 'aplay' : null;
259
+ if (!player) {
260
+ try {
261
+ unlinkSync(tmp);
262
+ }
263
+ catch { }
264
+ return null;
265
+ }
266
+ const p = spawn(player, player === 'mpv' ? ['--no-terminal', tmp] : [tmp], { stdio: 'ignore' });
267
+ p.on('close', () => { try {
268
+ unlinkSync(tmp);
269
+ }
270
+ catch { } });
271
+ return p;
272
+ }
273
+ catch {
274
+ try {
275
+ unlinkSync(tmp);
276
+ }
277
+ catch { }
278
+ return null;
279
+ }
280
+ }
281
+ async function ttsElevenLabs(text, key, voiceId) {
282
+ const t = stripMd(text);
283
+ if (!t)
284
+ return null;
285
+ const tmp = join(tmpdir(), `kbot-el-${Date.now()}.mp3`);
286
+ try {
287
+ execSync(`curl -s "https://api.elevenlabs.io/v1/text-to-speech/${voiceId}" ` +
288
+ `-H "xi-api-key: ${key}" -H "Content-Type: application/json" ` +
289
+ `-d '${JSON.stringify({ text: t, model_id: 'eleven_monolingual_v1' })}' --output "${tmp}"`, { timeout: 30_000 });
290
+ if (!existsSync(tmp))
291
+ return null;
292
+ const player = process.platform === 'darwin' ? 'afplay' : has('mpv') ? 'mpv' : null;
293
+ if (!player) {
294
+ try {
295
+ unlinkSync(tmp);
296
+ }
297
+ catch { }
298
+ return null;
299
+ }
300
+ const p = spawn(player, player === 'mpv' ? ['--no-terminal', tmp] : [tmp], { stdio: 'ignore' });
301
+ p.on('close', () => { try {
302
+ unlinkSync(tmp);
303
+ }
304
+ catch { } });
305
+ return p;
306
+ }
307
+ catch {
308
+ try {
309
+ unlinkSync(tmp);
310
+ }
311
+ catch { }
312
+ return null;
313
+ }
314
+ }
315
+ async function speak(text, st) {
316
+ switch (st.session.ttsEngine) {
317
+ case 'system': return ttsSystem(text, st.voice, st.rate);
318
+ case 'openai-tts': return st.openaiKey ? ttsOpenAI(text, st.openaiKey) : ttsSystem(text, st.voice, st.rate);
319
+ case 'elevenlabs': return st.elevenKey ? ttsElevenLabs(text, st.elevenKey, st.elevenVoiceId) : ttsSystem(text, st.voice, st.rate);
320
+ default: return null;
321
+ }
322
+ }
323
+ // ---------------------------------------------------------------------------
324
+ // Streaming TTS — sentence-level chunking
325
+ // ---------------------------------------------------------------------------
326
+ async function speakChunked(text, st, isInterrupted) {
327
+ const sentences = splitSentences(stripMd(text));
328
+ for (let i = 0; i < sentences.length; i++) {
329
+ if (isInterrupted())
330
+ break;
331
+ st.session.status = 'speaking';
332
+ showStatus('speaking', `(${i + 1}/${sentences.length})`);
333
+ const proc = await speak(sentences[i], st);
334
+ if (!proc)
335
+ continue;
336
+ st.ttsProc = proc;
337
+ await new Promise(resolve => {
338
+ const iv = setInterval(() => { if (isInterrupted()) {
339
+ clearInterval(iv);
340
+ proc.kill('SIGTERM');
341
+ } }, 100);
342
+ proc.on('close', () => { clearInterval(iv); if (st.ttsProc === proc)
343
+ st.ttsProc = null; resolve(); });
344
+ proc.on('error', () => { clearInterval(iv); if (st.ttsProc === proc)
345
+ st.ttsProc = null; resolve(); });
346
+ });
347
+ }
348
+ }
349
+ function interruptTTS(st) {
350
+ if (st.ttsProc) {
351
+ st.ttsProc.kill('SIGTERM');
352
+ st.ttsProc = null;
353
+ }
354
+ st.interrupted = true;
355
+ st.session.status = 'idle';
356
+ }
357
+ // ---------------------------------------------------------------------------
358
+ // Voice commands
359
+ // ---------------------------------------------------------------------------
360
+ function parseCommand(text) {
361
+ const l = text.toLowerCase().trim();
362
+ if (/^(stop|cancel|shut up|be quiet|enough)$/.test(l))
363
+ return { action: 'stop' };
364
+ if (/^(pause|hold on|wait)$/.test(l))
365
+ return { action: 'pause' };
366
+ if (/^(save this|save conversation|save)$/.test(l))
367
+ return { action: 'save' };
368
+ if (/^(exit|quit|goodbye|bye|end)$/.test(l))
369
+ return { action: 'exit' };
370
+ const m = l.match(/^switch\s+to\s+(\w+)$/);
371
+ if (m)
372
+ return { action: 'switch', arg: m[1] };
373
+ return null;
374
+ }
375
+ // ---------------------------------------------------------------------------
376
+ // Audio recording to ~/.kbot/voice/
377
+ // ---------------------------------------------------------------------------
378
+ function ensureVoiceDir() { if (!existsSync(VOICE_DIR))
379
+ mkdirSync(VOICE_DIR, { recursive: true }); }
380
+ function saveRecording(src, sessionId, idx, role) {
381
+ ensureVoiceDir();
382
+ const dest = join(VOICE_DIR, `${sessionId}-${String(idx).padStart(3, '0')}-${role}.wav`);
383
+ try {
384
+ writeFileSync(dest, readFileSync(src));
385
+ return dest;
386
+ }
387
+ catch {
388
+ return undefined;
389
+ }
390
+ }
391
+ function audioDuration(path) {
392
+ try {
393
+ if (has('soxi'))
394
+ return parseFloat(execSync(`soxi -D "${path}" 2>/dev/null`, { encoding: 'utf-8' })) || 0;
395
+ return statSync(path).size / 32000;
396
+ }
397
+ catch {
398
+ return 0;
399
+ }
400
+ }
401
+ function saveSession(s) {
402
+ ensureVoiceDir();
403
+ try {
404
+ writeFileSync(join(VOICE_DIR, `${s.id}.json`), JSON.stringify({
405
+ id: s.id, timestamp: new Date().toISOString(), turns: s.history.length,
406
+ sttEngine: s.sttEngine, ttsEngine: s.ttsEngine, language: s.language, history: s.history,
407
+ }, null, 2));
408
+ }
409
+ catch { /* non-critical */ }
410
+ }
411
+ // ---------------------------------------------------------------------------
412
+ // Ollama LLM
413
+ // ---------------------------------------------------------------------------
414
+ async function checkOllama(host) {
415
+ try {
416
+ return (await fetch(`${host}/api/tags`, { signal: AbortSignal.timeout(3000) })).ok;
417
+ }
418
+ catch {
419
+ return false;
420
+ }
421
+ }
422
+ async function chat(msg, st, agent) {
423
+ const sys = [
424
+ 'You are kbot, a helpful AI assistant in a real-time voice conversation.',
425
+ 'Keep responses concise and conversational. Avoid code blocks, markdown, long lists, URLs.',
426
+ 'Use natural speech. Ask if the user wants detail on complex topics.',
427
+ agent !== 'auto' ? `You are the "${agent}" specialist.` : '',
428
+ ].filter(Boolean).join(' ');
429
+ const body = JSON.stringify({
430
+ model: st.ollamaModel,
431
+ messages: [{ role: 'system', content: sys }, ...st.messages.slice(-10), { role: 'user', content: msg }],
432
+ stream: false, options: { temperature: 0.7, num_predict: 400 },
433
+ });
434
+ const res = await fetch(`${st.ollamaHost}/api/chat`, {
435
+ method: 'POST', headers: { 'Content-Type': 'application/json' },
436
+ body, signal: AbortSignal.timeout(60_000),
437
+ });
438
+ if (!res.ok)
439
+ throw new Error(`Ollama ${res.status}`);
440
+ const data = await res.json();
441
+ return data.message?.content?.trim() || '(no response)';
442
+ }
443
+ // ---------------------------------------------------------------------------
444
+ // Text input fallback
445
+ // ---------------------------------------------------------------------------
446
+ function textInput(prompt) {
447
+ return new Promise(resolve => {
448
+ const rl = createInterface({ input: process.stdin, output: process.stdout });
449
+ rl.question(prompt, a => { rl.close(); resolve(a.trim()); });
450
+ });
451
+ }
452
+ // ---------------------------------------------------------------------------
453
+ // Single voice turn (non-VAD mode)
454
+ // ---------------------------------------------------------------------------
455
+ async function voiceTurn(st, agent) {
456
+ const { session } = st;
457
+ const idx = session.history.length;
458
+ let userText = '';
459
+ const tmp = join(tmpdir(), `kbot-rt-${Date.now()}.wav`);
460
+ // 1. Capture input
461
+ if (hasRec() && session.sttEngine !== 'system') {
462
+ session.status = 'listening';
463
+ showStatus('listening');
464
+ console.log();
465
+ const ok = await record(tmp, 30, st.vad.silenceDuration);
466
+ if (!ok || !existsSync(tmp)) {
467
+ console.log(chalk.yellow(' No audio captured.'));
468
+ userText = await textInput(chalk.cyan(' You: '));
469
+ }
470
+ else {
471
+ saveRecording(tmp, session.id, idx, 'user');
472
+ session.status = 'processing';
473
+ showStatus('processing', '(transcribing)');
474
+ userText = await transcribe(tmp, session.sttEngine, session.language, st.openaiKey);
475
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
476
+ if (userText) {
477
+ console.log(` ${DIM('You:')} ${chalk.white(userText)}`);
478
+ session.history.push({ role: 'user', text: userText, duration: audioDuration(tmp), timestamp: new Date().toISOString() });
479
+ }
480
+ else {
481
+ console.log(chalk.yellow(' Could not transcribe.'));
482
+ userText = await textInput(chalk.cyan(' You: '));
483
+ }
484
+ }
485
+ try {
486
+ if (existsSync(tmp))
487
+ unlinkSync(tmp);
488
+ }
489
+ catch { }
490
+ }
491
+ else {
492
+ userText = await textInput(chalk.cyan('\n You: '));
493
+ }
494
+ if (!userText || !st.running)
495
+ return { cont: true };
496
+ if (session.history.length === idx)
497
+ session.history.push({ role: 'user', text: userText, duration: 0, timestamp: new Date().toISOString() });
498
+ // 2. Voice commands
499
+ const cmd = parseCommand(userText);
500
+ if (cmd) {
501
+ if (cmd.action === 'exit') {
502
+ console.log(ACCENT(' kbot: ') + 'Goodbye!');
503
+ await speakChunked('Goodbye!', st, () => false);
504
+ return { cont: false };
505
+ }
506
+ if (cmd.action === 'stop') {
507
+ interruptTTS(st);
508
+ console.log(DIM(' (stopped)'));
509
+ return { cont: true };
510
+ }
511
+ if (cmd.action === 'pause') {
512
+ interruptTTS(st);
513
+ await new Promise(resolve => {
514
+ console.log(chalk.yellow('\n \u23F8 Paused') + DIM(' — press Enter to resume'));
515
+ const rl = createInterface({ input: process.stdin, output: process.stdout });
516
+ rl.question(DIM(' > '), a => { rl.close(); if (a.trim() === 'exit')
517
+ st.running = false; resolve(); });
518
+ });
519
+ return { cont: st.running };
520
+ }
521
+ if (cmd.action === 'switch' && cmd.arg) {
522
+ console.log(ACCENT(' kbot: ') + `Switching to ${cmd.arg}.`);
523
+ await speakChunked(`Switching to ${cmd.arg}.`, st, () => false);
524
+ return { cont: true, newAgent: cmd.arg };
525
+ }
526
+ if (cmd.action === 'save') {
527
+ saveSession(session);
528
+ console.log(chalk.green(` Saved: ${VOICE_DIR}/${session.id}.json`));
529
+ await speakChunked('Conversation saved.', st, () => false);
530
+ return { cont: true };
531
+ }
532
+ }
533
+ // 3. LLM response
534
+ session.status = 'processing';
535
+ showStatus('processing', '(thinking)');
536
+ st.messages.push({ role: 'user', content: userText });
537
+ try {
538
+ const resp = await chat(userText, st, agent);
539
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
540
+ console.log(ACCENT(' kbot: ') + resp);
541
+ st.messages.push({ role: 'assistant', content: resp });
542
+ session.history.push({ role: 'assistant', text: resp, duration: 0, timestamp: new Date().toISOString() });
543
+ st.interrupted = false;
544
+ await speakChunked(resp, st, () => st.interrupted);
545
+ session.status = 'idle';
546
+ }
547
+ catch (e) {
548
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
549
+ console.log(chalk.red(` Error: ${e instanceof Error ? e.message : e}`));
550
+ await speakChunked('Sorry, I encountered an error.', st, () => false);
551
+ }
552
+ return { cont: true };
553
+ }
554
+ // ---------------------------------------------------------------------------
555
+ // VAD-driven continuous loop
556
+ // ---------------------------------------------------------------------------
557
+ async function vadLoop(st, agent) {
558
+ const vad = new VAD(st.vad);
559
+ let recProc = null, recPath = '', recording = false;
560
+ const startRec = () => {
561
+ if (recording)
562
+ return;
563
+ recording = true;
564
+ if (st.session.status === 'speaking') {
565
+ interruptTTS(st);
566
+ console.log(DIM('\n (interrupted)'));
567
+ }
568
+ st.session.status = 'listening';
569
+ recPath = join(tmpdir(), `kbot-vad-${Date.now()}.wav`);
570
+ if (has('rec'))
571
+ recProc = spawn('rec', [recPath, 'rate', '16k', 'channels', '1', 'trim', '0', '30'], { stdio: ['ignore', 'ignore', 'ignore'] });
572
+ };
573
+ const stopRec = async () => {
574
+ if (!recording || !recProc)
575
+ return;
576
+ recording = false;
577
+ recProc.kill('SIGTERM');
578
+ recProc = null;
579
+ await new Promise(r => setTimeout(r, 200));
580
+ if (!existsSync(recPath))
581
+ return;
582
+ st.session.status = 'processing';
583
+ showStatus('processing', '(transcribing)');
584
+ const text = await transcribe(recPath, st.session.sttEngine, st.session.language, st.openaiKey);
585
+ try {
586
+ unlinkSync(recPath);
587
+ }
588
+ catch { }
589
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
590
+ if (!text)
591
+ return;
592
+ console.log(` ${DIM('You:')} ${chalk.white(text)}`);
593
+ const cmd = parseCommand(text);
594
+ if (cmd) {
595
+ if (cmd.action === 'exit') {
596
+ st.running = false;
597
+ return;
598
+ }
599
+ if (cmd.action === 'pause') {
600
+ vad.stop();
601
+ await new Promise(r => {
602
+ console.log(chalk.yellow('\n \u23F8 Paused'));
603
+ const rl = createInterface({ input: process.stdin, output: process.stdout });
604
+ rl.question(DIM(' > '), () => { rl.close(); r(); });
605
+ });
606
+ if (st.running)
607
+ vad.start({ onStart: startRec, onEnd: () => { stopRec(); }, onVol: v => { if (st.session.status !== 'speaking')
608
+ showWaveform(v); } });
609
+ return;
610
+ }
611
+ if (cmd.action === 'save') {
612
+ saveSession(st.session);
613
+ console.log(chalk.green(' Saved.'));
614
+ return;
615
+ }
616
+ return;
617
+ }
618
+ st.session.history.push({ role: 'user', text, duration: 0, timestamp: new Date().toISOString() });
619
+ st.session.status = 'processing';
620
+ showStatus('processing', '(thinking)');
621
+ st.messages.push({ role: 'user', content: text });
622
+ try {
623
+ const resp = await chat(text, st, agent);
624
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
625
+ console.log(ACCENT(' kbot: ') + resp);
626
+ st.messages.push({ role: 'assistant', content: resp });
627
+ st.session.history.push({ role: 'assistant', text: resp, duration: 0, timestamp: new Date().toISOString() });
628
+ st.interrupted = false;
629
+ await speakChunked(resp, st, () => st.interrupted);
630
+ }
631
+ catch (e) {
632
+ process.stdout.write(`\r${' '.repeat(80)}\r`);
633
+ console.log(chalk.red(` Error: ${e instanceof Error ? e.message : e}`));
634
+ }
635
+ st.session.status = 'idle';
636
+ };
637
+ vad.start({
638
+ onStart: startRec, onEnd: () => { stopRec(); },
639
+ onVol: v => { if (st.session.status === 'listening' || st.session.status === 'idle')
640
+ showWaveform(v); },
641
+ });
642
+ console.log(DIM(' VAD active — start speaking...\n'));
643
+ await new Promise(resolve => {
644
+ const iv = setInterval(() => { if (!st.running) {
645
+ clearInterval(iv);
646
+ vad.stop();
647
+ resolve();
648
+ } }, 200);
649
+ });
650
+ }
651
+ // ---------------------------------------------------------------------------
652
+ // Banner
653
+ // ---------------------------------------------------------------------------
654
/**
 * Print the startup banner for a realtime voice session: session id, the
 * selected STT/TTS engines, VAD configuration, LLM target, recording
 * directory, and usage hints. Writes to stdout only; returns nothing.
 */
function banner(st) {
    const sess = st.session;
    const ttsDetail = sess.ttsEngine === 'system' ? DIM(` (${st.voice}, ${st.rate} wpm)`) : '';
    const vadDetail = sess.vadEnabled
        ? chalk.green('on') + DIM(` (thresh ${st.vad.threshold}, silence ${st.vad.silenceDuration}s)`)
        : chalk.yellow('off');
    const rows = [
        '',
        ACCENT.bold(' kbot Realtime Voice'),
        ACCENT(' ' + '='.repeat(44)),
        '',
        ` ${DIM('Session:')} ${DIM(sess.id)}`,
        ` ${DIM('STT:')} ${chalk.green(sess.sttEngine)}`,
        ` ${DIM('TTS:')} ${chalk.green(sess.ttsEngine)}${ttsDetail}`,
        ` ${DIM('Language:')} ${sess.language}`,
        ` ${DIM('VAD:')} ${vadDetail}`,
        ` ${DIM('Continuous:')} ${sess.continuous ? chalk.green('yes') : chalk.yellow('no')}`,
        ` ${DIM('LLM:')} ${chalk.cyan(`${st.ollamaModel} @ ${st.ollamaHost}`)}`,
        ` ${DIM('Recording:')} ${chalk.cyan(VOICE_DIR)}`,
        '',
        DIM(' Voice commands: "stop", "pause", "switch to [agent]", "save this"'),
        DIM(' Say "exit" / "goodbye" to end. Ctrl+C anytime.'),
        '',
    ];
    for (const row of rows) {
        console.log(row);
    }
}
673
+ // ---------------------------------------------------------------------------
674
+ // Entry point: startRealtimeVoice
675
+ // ---------------------------------------------------------------------------
676
/**
 * Start a real-time bidirectional voice conversation.
 *
 * - Voice Activity Detection (VAD) via sox for natural turn-taking
 * - STT fallback: whisper-local -> whisper-api -> system
 * - TTS fallback: system -> openai-tts -> elevenlabs
 * - Streaming TTS: sentence-level chunking, speaks before full response
 * - Interrupt: speaking stops if user starts talking
 * - Voice commands: stop, pause, switch to [agent], save this
 * - Audio saved to ~/.kbot/voice/ for playback/review
 * - Waveform + status visualization in terminal
 *
 * @param {object} [opts] - Optional overrides: vad, stt, tts, language,
 *   continuous, vadThreshold, vadSilence, ollamaHost, ollamaModel, voice,
 *   rate, agent. All have sensible defaults.
 * @returns {Promise<void>} Resolves when the session ends (or immediately
 *   if the Ollama backend is unreachable).
 */
export async function startRealtimeVoice(opts) {
    const id = `voice-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
    // Track the caller's intent separately from availability: the original
    // warning condition `vadEnabled && !hasSox()` was unreachable because
    // vadEnabled already implies hasSox(), so the "VAD needs sox" warning
    // never fired. Warn on the intended case instead.
    const vadRequested = opts?.vad !== false;
    const vadEnabled = vadRequested && hasSox();
    const session = {
        id, status: 'idle',
        sttEngine: detectSTT(opts?.stt), ttsEngine: detectTTS(opts?.tts),
        language: opts?.language ?? 'en', continuous: opts?.continuous ?? true,
        vadEnabled, history: [],
    };
    // Mutable runtime state shared by the recording/TTS/LLM helpers.
    const st = {
        session,
        vad: { threshold: opts?.vadThreshold ?? 0.02, silenceDuration: opts?.vadSilence ?? 1.5, minSpeechDuration: 0.3, sampleRate: 16000 },
        running: false, interrupted: false, ttsProc: null,
        ollamaHost: opts?.ollamaHost ?? OLLAMA_DEFAULT,
        ollamaModel: opts?.ollamaModel ?? 'gemma3:12b',
        messages: [], openaiKey: loadKey('openai'), elevenKey: loadKey('elevenlabs'),
        elevenVoiceId: 'EXAVITQu4vr4xnSDxMaL', voice: opts?.voice ?? 'Samantha', rate: opts?.rate ?? 190,
    };
    // Pre-flight: the LLM backend is mandatory; recorder/STT/VAD degrade gracefully.
    if (!(await checkOllama(st.ollamaHost))) {
        console.error(chalk.red(`\n Ollama is not running at ${st.ollamaHost}`));
        console.error(chalk.yellow(` Start: ollama serve && ollama pull ${st.ollamaModel}`));
        return;
    }
    if (!hasRec())
        console.warn(chalk.yellow(' \u26A0 No recorder — install sox: brew install sox\n'));
    if (session.sttEngine === 'system')
        console.warn(chalk.yellow(' \u26A0 Install whisper.cpp for STT\n'));
    if (vadRequested && !hasSox()) {
        console.warn(chalk.yellow(' \u26A0 VAD needs sox\n'));
        session.vadEnabled = false;
    }
    ensureVoiceDir();
    banner(st);
    st.running = true;
    // Stop TTS and persist the transcript on Ctrl+C / kill; the st.running
    // guard makes the handler idempotent.
    const cleanup = () => {
        if (st.running) {
            st.running = false;
            interruptTTS(st);
            console.log(DIM('\n\n Session ended.'));
            saveSession(session);
        }
    };
    process.on('SIGINT', cleanup);
    process.on('SIGTERM', cleanup);
    // Greeting
    const greeting = 'Hey! Real-time voice mode is active. Go ahead and speak.';
    console.log(ACCENT(' kbot: ') + greeting);
    await speakChunked(greeting, st, () => false);
    // Main loop: VAD-driven turn taking when available, otherwise explicit turns.
    if (session.vadEnabled) {
        await vadLoop(st, opts?.agent ?? 'auto');
    }
    else {
        let agent = opts?.agent ?? 'auto';
        while (st.running) {
            const result = await voiceTurn(st, agent);
            if (!result.cont) {
                st.running = false;
                break;
            }
            if (result.newAgent)
                agent = result.newAgent;
            if (!session.continuous)
                break;
        }
    }
    saveSession(session);
    process.removeListener('SIGINT', cleanup);
    process.removeListener('SIGTERM', cleanup);
}
759
+ // ---------------------------------------------------------------------------
760
+ // Utility exports
761
+ // ---------------------------------------------------------------------------
762
+ /** Describe real-time voice capabilities of the current system */
763
/**
 * Describe real-time voice capabilities of the current system.
 * Probes the STT/TTS engine detectors and tool availability checks and
 * returns a newline-joined, human-readable summary string.
 */
export function describeRealtimeCapabilities() {
    const report = ['Real-time Voice Capabilities:'];
    report.push(` STT: ${detectSTT()}`);
    report.push(` TTS: ${detectTTS()}`);
    report.push(` VAD: ${hasSox() ? 'available' : 'unavailable (install sox)'}`);
    report.push(` Recorder: ${hasRec() ? 'available' : 'unavailable'}`);
    report.push(` Platform: ${process.platform}`);
    report.push(` Voice dir: ${VOICE_DIR}`);
    return report.join('\n');
}
774
+ /** List saved voice sessions from ~/.kbot/voice/ */
775
/**
 * List saved voice sessions from ~/.kbot/voice/.
 * Returns an array of { id, timestamp, turns } summaries; unreadable or
 * corrupt JSON files are skipped, and any listing failure yields [].
 */
export function listVoiceSessions() {
    ensureVoiceDir();
    try {
        // NOTE(review): shelling out to `ls` is POSIX-only; fs.readdirSync
        // would be portable — confirm Windows is out of scope before changing.
        const listing = execSync(`ls "${VOICE_DIR}"/*.json 2>/dev/null || true`, { encoding: 'utf-8' });
        const sessions = [];
        for (const file of listing.split('\n')) {
            if (!file)
                continue;
            try {
                const data = JSON.parse(readFileSync(file, 'utf-8'));
                sessions.push({ id: data.id, timestamp: data.timestamp, turns: data.turns });
            }
            catch {
                // Corrupt/unreadable session file — omit it from the listing.
            }
        }
        return sessions;
    }
    catch {
        return [];
    }
}
793
+ /** Load a saved voice session by ID */
794
/**
 * Load a saved voice session by ID from ~/.kbot/voice/<id>.json.
 * Returns the parsed session object, or null when the file is missing
 * or contains invalid JSON.
 */
export function getVoiceSession(sessionId) {
    const sessionPath = join(VOICE_DIR, `${sessionId}.json`);
    if (existsSync(sessionPath)) {
        try {
            return JSON.parse(readFileSync(sessionPath, 'utf-8'));
        }
        catch {
            // Corrupt JSON — treat the same as a missing session.
        }
    }
    return null;
}
805
+ //# sourceMappingURL=voice-realtime.js.map