@kernel.chat/kbot 3.51.0 → 3.52.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -9
- package/dist/agent-protocol.test.d.ts +2 -0
- package/dist/agent-protocol.test.d.ts.map +1 -0
- package/dist/agent-protocol.test.js +730 -0
- package/dist/agent-protocol.test.js.map +1 -0
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +34 -10
- package/dist/agent.js.map +1 -1
- package/dist/auth.js +3 -3
- package/dist/auth.js.map +1 -1
- package/dist/bench.d.ts +64 -0
- package/dist/bench.d.ts.map +1 -0
- package/dist/bench.js +973 -0
- package/dist/bench.js.map +1 -0
- package/dist/cli.js +144 -29
- package/dist/cli.js.map +1 -1
- package/dist/cloud-agent.d.ts +77 -0
- package/dist/cloud-agent.d.ts.map +1 -0
- package/dist/cloud-agent.js +743 -0
- package/dist/cloud-agent.js.map +1 -0
- package/dist/context.test.d.ts +2 -0
- package/dist/context.test.d.ts.map +1 -0
- package/dist/context.test.js +561 -0
- package/dist/context.test.js.map +1 -0
- package/dist/evolution.d.ts.map +1 -1
- package/dist/evolution.js +4 -1
- package/dist/evolution.js.map +1 -1
- package/dist/github-release.d.ts +61 -0
- package/dist/github-release.d.ts.map +1 -0
- package/dist/github-release.js +451 -0
- package/dist/github-release.js.map +1 -0
- package/dist/graph-memory.test.d.ts +2 -0
- package/dist/graph-memory.test.d.ts.map +1 -0
- package/dist/graph-memory.test.js +946 -0
- package/dist/graph-memory.test.js.map +1 -0
- package/dist/init-science.d.ts +43 -0
- package/dist/init-science.d.ts.map +1 -0
- package/dist/init-science.js +477 -0
- package/dist/init-science.js.map +1 -0
- package/dist/lab.d.ts +45 -0
- package/dist/lab.d.ts.map +1 -0
- package/dist/lab.js +1020 -0
- package/dist/lab.js.map +1 -0
- package/dist/lsp-deep.d.ts +101 -0
- package/dist/lsp-deep.d.ts.map +1 -0
- package/dist/lsp-deep.js +689 -0
- package/dist/lsp-deep.js.map +1 -0
- package/dist/memory.test.d.ts +2 -0
- package/dist/memory.test.d.ts.map +1 -0
- package/dist/memory.test.js +369 -0
- package/dist/memory.test.js.map +1 -0
- package/dist/multi-session.d.ts +164 -0
- package/dist/multi-session.d.ts.map +1 -0
- package/dist/multi-session.js +885 -0
- package/dist/multi-session.js.map +1 -0
- package/dist/self-eval.d.ts.map +1 -1
- package/dist/self-eval.js +5 -2
- package/dist/self-eval.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +0 -1
- package/dist/streaming.js.map +1 -1
- package/dist/teach.d.ts +136 -0
- package/dist/teach.d.ts.map +1 -0
- package/dist/teach.js +915 -0
- package/dist/teach.js.map +1 -0
- package/dist/telemetry.d.ts +1 -1
- package/dist/telemetry.d.ts.map +1 -1
- package/dist/telemetry.js.map +1 -1
- package/dist/tools/browser-agent.js +2 -2
- package/dist/tools/browser-agent.js.map +1 -1
- package/dist/tools/forge.d.ts.map +1 -1
- package/dist/tools/forge.js +15 -26
- package/dist/tools/forge.js.map +1 -1
- package/dist/tools/git.d.ts.map +1 -1
- package/dist/tools/git.js +10 -7
- package/dist/tools/git.js.map +1 -1
- package/dist/voice-realtime.d.ts +54 -0
- package/dist/voice-realtime.d.ts.map +1 -0
- package/dist/voice-realtime.js +805 -0
- package/dist/voice-realtime.js.map +1 -0
- package/package.json +10 -3
|
@@ -0,0 +1,805 @@
|
|
|
1
|
+
// kbot Voice Realtime — Bidirectional real-time voice conversation
|
|
2
|
+
//
|
|
3
|
+
// Real-time voice mode with natural turn-taking, VAD, streaming TTS,
|
|
4
|
+
// interrupt handling, and waveform visualization.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// import { startRealtimeVoice } from './voice-realtime.js'
|
|
8
|
+
// await startRealtimeVoice({ stt: 'whisper-local', tts: 'system', continuous: true })
|
|
9
|
+
import { execFileSync, execSync, spawn } from 'node:child_process';
import { createInterface } from 'node:readline';
import { tmpdir, homedir } from 'node:os';
import { join } from 'node:path';
import { existsSync, unlinkSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'node:fs';
import chalk from 'chalk';
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Constants
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
const KBOT_DIR = join(homedir(), '.kbot');  // kbot config root
const VOICE_DIR = join(KBOT_DIR, 'voice');  // saved sessions + recordings
const ACCENT = chalk.hex('#A78BFA');        // brand accent colour
const DIM = chalk.dim;                      // muted-text helper
const OLLAMA_DEFAULT = process.env.OLLAMA_HOST || 'http://localhost:11434'; // local LLM endpoint
// Glyph ramp (blank → full block) used to draw the mic waveform.
const WAVE = [' ', '\u2581', '\u2582', '\u2583', '\u2584', '\u2585', '\u2586', '\u2587', '\u2588'];
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Helpers
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
/** True when `cmd` resolves on PATH (probed with `which`); never throws. */
const has = (cmd) => {
    let found = true;
    try {
        execSync(`which ${cmd}`, { stdio: 'ignore' });
    } catch {
        found = false;
    }
    return found;
};
/** SoX is installed together with its `rec` front-end (needed for VAD capture). */
const hasSox = () => has('sox') && has('rec');
/** Some recorder exists: SoX `rec` or ALSA `arecord`. */
const hasRec = () => has('rec') || has('arecord');
|
|
38
|
+
/**
 * Look up the API key for `provider` in ~/.kbot/config.json.
 * Prefers providers[provider].apiKey; falls back to the top-level apiKey when
 * the active provider matches. Returns null when absent or unreadable.
 */
function loadKey(provider) {
    const cfgPath = join(KBOT_DIR, 'config.json');
    if (!existsSync(cfgPath)) {
        return null;
    }
    try {
        const cfg = JSON.parse(readFileSync(cfgPath, 'utf-8'));
        const scoped = cfg.providers?.[provider]?.apiKey;
        if (scoped) {
            return scoped;
        }
        if (cfg.provider === provider && cfg.apiKey) {
            return cfg.apiKey;
        }
        return null;
    } catch {
        return null; // malformed JSON or unreadable file — treat as no key
    }
}
|
|
50
|
+
/** Resolve the STT engine: explicit preference → local whisper → OpenAI key → system. */
function detectSTT(preferred) {
    const explicit = ['whisper-local', 'whisper-api', 'system'];
    if (explicit.includes(preferred)) {
        return preferred;
    }
    if (['whisper', 'whisper.cpp', 'whisper-cpp'].some(has)) {
        return 'whisper-local';
    }
    return loadKey('openai') ? 'whisper-api' : 'system';
}
|
|
59
|
+
/** Resolve the TTS engine: explicit preference → system voice → OpenAI → ElevenLabs → system. */
function detectTTS(preferred) {
    if (preferred === 'system' || preferred === 'elevenlabs' || preferred === 'openai-tts') {
        return preferred;
    }
    const systemCapable = process.platform === 'darwin' || has('espeak') || has('piper');
    if (systemCapable) {
        return 'system';
    }
    if (loadKey('openai')) {
        return 'openai-tts';
    }
    return loadKey('elevenlabs') ? 'elevenlabs' : 'system';
}
|
|
70
|
+
/**
 * Flatten markdown into plain speakable text: code fences become a short
 * spoken placeholder, inline formatting/links/lists/quotes are unwrapped,
 * and runs of blank lines are collapsed.
 */
function stripMd(text) {
    // Ordered rewrite rules — fences go first so inline rules never touch fence bodies.
    const rules = [
        [/```(\w+)?\n[\s\S]*?```/g, (_, l) => l ? `Here's a ${l} snippet.` : 'Here\'s a code snippet.'],
        [/`([^`]+)`/g, '$1'],
        [/^#{1,6}\s+/gm, ''],
        [/\*{1,3}([^*]+)\*{1,3}/g, '$1'],
        [/_{1,3}([^_]+)_{1,3}/g, '$1'],
        [/\[([^\]]+)\]\([^)]+\)/g, '$1'],
        [/!\[([^\]]*)\]\([^)]+\)/g, '$1'],
        [/^>\s+/gm, ''],
        [/^[\s]*[-*+]\s+/gm, ''],
        [/^[\s]*\d+\.\s+/gm, ''],
        [/\n{3,}/g, '\n\n'],
    ];
    let out = text;
    for (const [pattern, replacement] of rules) {
        out = out.replace(pattern, replacement);
    }
    return out.trim();
}
|
|
85
|
+
// Strip shell metacharacters before text is ever interpolated into a command line.
const shellSafe = (t) => t.replace(/[;&|`$(){}[\]!#\\]/g, '');
/** Split text into trimmed sentences on . ! ? boundaries; a trailing fragment is kept. */
function splitSentences(text) {
    const pieces = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g) ?? [text];
    const trimmed = pieces.map((piece) => piece.trim());
    return trimmed.filter(Boolean);
}
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
// Waveform visualization
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
/** Redraw a 30-column animated amplitude bar on the current line for mic volume (0..1). */
function showWaveform(volume) {
    const cells = Array.from({ length: 30 }, (_, i) => {
        const wobble = Math.sin(i * 0.5 + Date.now() * 0.001) * 0.3 * volume;
        const level = Math.max(0, Math.min(1, volume + wobble));
        return WAVE[Math.round(level * (WAVE.length - 1))];
    });
    process.stdout.write(`\r ${chalk.cyan('\uD83C\uDF99')} ${chalk.cyan(cells.join(''))} `);
}
|
|
100
|
+
/** Overwrite the current terminal line with a coloured status label (+ optional dim suffix). */
function showStatus(status, extra) {
    const labels = {
        listening: chalk.cyan('\uD83C\uDF99 Listening...'),
        processing: chalk.yellow('\uD83D\uDD04 Processing...'),
        speaking: chalk.green('\uD83D\uDD0A Speaking...'),
        idle: chalk.dim('\u23F8 Idle'),
    };
    const suffix = extra ? ` ${DIM(extra)}` : '';
    process.stdout.write(`\r${' '.repeat(80)}\r ${labels[status]}${suffix}`);
}
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Voice Activity Detection (VAD) via sox
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
/**
 * Energy-based voice activity detector fed raw 16-bit PCM from SoX `rec`.
 * Fires onStart when RMS volume crosses cfg.threshold, onEnd after
 * cfg.silenceDuration seconds of silence (only if the utterance lasted at
 * least cfg.minSpeechDuration seconds), and onVol with every chunk's volume.
 */
class VAD {
    cfg;
    proc = null;       // capture child process (null when not running)
    speaking = false;  // currently inside an utterance?
    silenceAt = 0;     // timestamp silence began (0 = not in silence)
    speechAt = 0;      // timestamp the utterance began
    onStart;
    onEnd;
    onVol;
    constructor(cfg) { this.cfg = cfg; }
    start(cb) {
        this.onStart = cb.onStart;
        this.onEnd = cb.onEnd;
        this.onVol = cb.onVol;
        if (!hasSox())
            return; // no capture backend — VAD silently disabled
        this.proc = spawn('rec', [
            '-q', '-r', String(this.cfg.sampleRate), '-c', '1',
            '-t', 'raw', '-b', '16', '-e', 'signed-integer', '-',
        ], { stdio: ['ignore', 'pipe', 'ignore'] });
        this.proc.stdout?.on('data', (chunk) => {
            // RMS over the little-endian 16-bit samples in this chunk, normalised to 0..1.
            let energy = 0;
            const samples = chunk.length / 2;
            for (let off = 0; off < chunk.length - 1; off += 2) {
                const sample = chunk.readInt16LE(off);
                energy += sample * sample;
            }
            const vol = Math.min(1, Math.sqrt(energy / (samples || 1)) / 32768);
            this.onVol?.(vol);
            this.process(vol);
        });
    }
    process(vol) {
        const now = Date.now();
        if (!this.speaking) {
            if (vol <= this.cfg.threshold)
                return;
            // Speech onset.
            this.speaking = true;
            this.speechAt = now;
            this.silenceAt = 0;
            this.onStart?.();
            return;
        }
        if (vol >= this.cfg.threshold) {
            this.silenceAt = 0; // speech resumed — restart the silence timer
            return;
        }
        if (!this.silenceAt) {
            this.silenceAt = now;
        }
        else if (now - this.silenceAt >= this.cfg.silenceDuration * 1000) {
            // Long enough silence: close the utterance (ignore too-short blips).
            if ((now - this.speechAt) / 1000 >= this.cfg.minSpeechDuration)
                this.onEnd?.();
            this.speaking = false;
            this.silenceAt = 0;
        }
    }
    stop() {
        this.proc?.kill('SIGTERM');
        this.proc = null;
        this.speaking = false;
    }
}
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Audio recording
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
/**
 * Record up to `maxSec` seconds of 16 kHz mono audio to `path`.
 * Prefers SoX `rec` (auto-stops after `silenceSec` of trailing silence);
 * falls back to ALSA `arecord` (fixed duration — no silence detection there).
 * Resolves true when an output file exists.
 */
async function record(path, maxSec, silenceSec) {
    return new Promise(resolve => {
        const quiet = { stdio: ['ignore', 'ignore', 'ignore'] };
        const proc = has('rec')
            ? spawn('rec', [
                path, 'rate', '16k', 'channels', '1',
                'silence', '1', '0.1', '1.5%', '1', String(silenceSec), '1.5%',
                'trim', '0', String(maxSec),
            ], quiet)
            : spawn('arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-d', String(maxSec), path], quiet);
        // Hard kill a few seconds past the nominal limit in case the tool hangs.
        const watchdog = setTimeout(() => proc.kill('SIGTERM'), (maxSec + 3) * 1000);
        const finish = (ok) => { clearTimeout(watchdog); resolve(ok); };
        proc.on('close', () => finish(existsSync(path)));
        proc.on('error', () => finish(false));
    });
}
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
// STT engines (fallback chain)
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
/** Transcribe a WAV file with whichever local whisper binary is installed; throws when none is. */
async function sttLocal(path, lang) {
    const bin = ['whisper', 'whisper.cpp', 'whisper-cpp'].find(has);
    if (!bin) {
        throw new Error('No whisper binary');
    }
    const out = execSync(
        `${bin} "${path}" --model base --language ${lang} --output_format txt 2>/dev/null`,
        { encoding: 'utf-8', timeout: 60_000 },
    );
    return out.trim();
}
|
|
204
|
+
/**
 * Transcribe a WAV file with the OpenAI Whisper API via curl.
 * Uses execFileSync with an argument vector so the API key, file path, and
 * language are never parsed by a shell — the previous string-built command
 * broke on (and was injectable through) shell metacharacters in those values.
 * Returns the trimmed transcription text; throws on curl failure/timeout.
 */
async function sttAPI(path, key, lang) {
    const out = execFileSync('curl', [
        '-s', 'https://api.openai.com/v1/audio/transcriptions',
        '-H', `Authorization: Bearer ${key}`,
        '-F', `file=@${path}`,
        '-F', 'model=whisper-1',
        '-F', `language=${lang}`,
        '-F', 'response_format=text',
    ], { encoding: 'utf-8', timeout: 30_000 });
    return out.trim();
}
|
|
209
|
+
/**
 * Transcribe with a fallback chain: the preferred engine first, then the
 * OpenAI API when a key exists, then any local whisper as a last resort.
 * Returns '' when every engine fails or yields nothing.
 */
async function transcribe(path, engine, lang, key) {
    const attempts = [];
    if (engine === 'whisper-local') {
        attempts.push(() => sttLocal(path, lang));
    }
    if (key) {
        attempts.push(() => sttAPI(path, key, lang));
    }
    const localAvailable = ['whisper', 'whisper.cpp', 'whisper-cpp'].some(has);
    if (engine !== 'whisper-local' && localAvailable) {
        attempts.push(() => sttLocal(path, lang));
    }
    for (const attempt of attempts) {
        try {
            const text = await attempt();
            if (text) {
                return text;
            }
        }
        catch { /* try the next engine */ }
    }
    return '';
}
|
|
227
|
+
// ---------------------------------------------------------------------------
|
|
228
|
+
// TTS engines (fallback chain)
|
|
229
|
+
// ---------------------------------------------------------------------------
|
|
230
|
+
/**
 * Speak `text` with a local TTS backend and return the player child process,
 * or null when no backend exists. Preference: macOS `say` → piper|aplay → espeak.
 */
async function ttsSystem(text, voice, rate) {
    const clean = shellSafe(text);
    if (!clean) {
        return null;
    }
    if (process.platform === 'darwin') {
        return spawn('say', ['-v', voice, '-r', String(rate), clean], { stdio: 'ignore' });
    }
    if (has('piper')) {
        // piper synthesises raw PCM on stdout; pipe it straight into aplay.
        const synth = spawn('piper', ['--output-raw'], { stdio: ['pipe', 'pipe', 'ignore'] });
        const player = spawn('aplay', ['-r', '22050', '-f', 'S16_LE', '-'], { stdio: ['pipe', 'ignore', 'ignore'] });
        synth.stdout?.pipe(player.stdin);
        synth.stdin?.write(clean);
        synth.stdin?.end();
        return player;
    }
    if (has('espeak')) {
        return spawn('espeak', [clean], { stdio: 'ignore' });
    }
    return null;
}
|
|
248
|
+
/**
 * Speak `text` through the OpenAI TTS API; returns the audio player process
 * or null on any failure.
 * Fix: the JSON request body is written to a temp file and posted with
 * `-d @file` — the old command inlined JSON inside single quotes, so any
 * apostrophe in the text (shellSafe does not strip `'`) broke the shell
 * command; it also no longer lossily strips shell metacharacters from the
 * spoken text.
 */
async function ttsOpenAI(text, key) {
    const t = stripMd(text);
    if (!t)
        return null;
    const stamp = Date.now();
    const tmp = join(tmpdir(), `kbot-tts-${stamp}.mp3`);
    const payload = join(tmpdir(), `kbot-tts-${stamp}.json`);
    const cleanup = (p) => { try { unlinkSync(p); } catch { } };
    try {
        writeFileSync(payload, JSON.stringify({ model: 'tts-1', input: t, voice: 'nova' }));
        execSync(`curl -s https://api.openai.com/v1/audio/speech -H "Authorization: Bearer ${key}" ` +
            `-H "Content-Type: application/json" -d @"${payload}" --output "${tmp}"`, { timeout: 30_000 });
        if (!existsSync(tmp))
            return null;
        const player = process.platform === 'darwin' ? 'afplay' : has('mpv') ? 'mpv' : has('aplay') ? 'aplay' : null;
        if (!player) {
            cleanup(tmp);
            return null;
        }
        const p = spawn(player, player === 'mpv' ? ['--no-terminal', tmp] : [tmp], { stdio: 'ignore' });
        p.on('close', () => cleanup(tmp));
        p.on('error', () => cleanup(tmp));
        return p;
    }
    catch {
        cleanup(tmp);
        return null;
    }
    finally {
        cleanup(payload); // request body is never needed after curl returns
    }
}
|
|
281
|
+
/**
 * Speak `text` through the ElevenLabs TTS API; returns the audio player
 * process or null on any failure.
 * Fix: the JSON request body is written to a temp file and posted with
 * `-d @file` — the old command inlined the (unescaped) text inside single
 * quotes, so any apostrophe broke — or injected into — the shell command.
 */
async function ttsElevenLabs(text, key, voiceId) {
    const t = stripMd(text);
    if (!t)
        return null;
    const stamp = Date.now();
    const tmp = join(tmpdir(), `kbot-el-${stamp}.mp3`);
    const payload = join(tmpdir(), `kbot-el-${stamp}.json`);
    const cleanup = (p) => { try { unlinkSync(p); } catch { } };
    try {
        writeFileSync(payload, JSON.stringify({ text: t, model_id: 'eleven_monolingual_v1' }));
        execSync(`curl -s "https://api.elevenlabs.io/v1/text-to-speech/${voiceId}" ` +
            `-H "xi-api-key: ${key}" -H "Content-Type: application/json" ` +
            `-d @"${payload}" --output "${tmp}"`, { timeout: 30_000 });
        if (!existsSync(tmp))
            return null;
        const player = process.platform === 'darwin' ? 'afplay' : has('mpv') ? 'mpv' : null;
        if (!player) {
            cleanup(tmp);
            return null;
        }
        const p = spawn(player, player === 'mpv' ? ['--no-terminal', tmp] : [tmp], { stdio: 'ignore' });
        p.on('close', () => cleanup(tmp));
        p.on('error', () => cleanup(tmp));
        return p;
    }
    catch {
        cleanup(tmp);
        return null;
    }
    finally {
        cleanup(payload); // request body is never needed after curl returns
    }
}
|
|
315
|
+
/** Dispatch to the configured TTS engine, falling back to system TTS when an API key is missing. */
async function speak(text, st) {
    const engine = st.session.ttsEngine;
    if (engine === 'system') {
        return ttsSystem(text, st.voice, st.rate);
    }
    if (engine === 'openai-tts') {
        return st.openaiKey ? ttsOpenAI(text, st.openaiKey) : ttsSystem(text, st.voice, st.rate);
    }
    if (engine === 'elevenlabs') {
        return st.elevenKey ? ttsElevenLabs(text, st.elevenKey, st.elevenVoiceId) : ttsSystem(text, st.voice, st.rate);
    }
    return null;
}
|
|
323
|
+
// ---------------------------------------------------------------------------
|
|
324
|
+
// Streaming TTS — sentence-level chunking
|
|
325
|
+
// ---------------------------------------------------------------------------
|
|
326
|
+
/**
 * Speak a response sentence-by-sentence so playback starts immediately and
 * can be cut off between — or during — sentences via `isInterrupted`.
 */
async function speakChunked(text, st, isInterrupted) {
    const sentences = splitSentences(stripMd(text));
    for (let i = 0; i < sentences.length; i++) {
        if (isInterrupted()) {
            break;
        }
        st.session.status = 'speaking';
        showStatus('speaking', `(${i + 1}/${sentences.length})`);
        const proc = await speak(sentences[i], st);
        if (!proc) {
            continue; // no TTS backend for this chunk — skip it, keep going
        }
        st.ttsProc = proc;
        await new Promise(resolve => {
            // Poll for interruption while this sentence plays; kill mid-sentence.
            const poll = setInterval(() => {
                if (isInterrupted()) {
                    clearInterval(poll);
                    proc.kill('SIGTERM');
                }
            }, 100);
            const done = () => {
                clearInterval(poll);
                if (st.ttsProc === proc) {
                    st.ttsProc = null;
                }
                resolve();
            };
            proc.on('close', done);
            proc.on('error', done);
        });
    }
}
|
|
349
|
+
/** Kill any in-flight TTS playback and flag the current response as interrupted. */
function interruptTTS(st) {
    const playing = st.ttsProc;
    if (playing) {
        playing.kill('SIGTERM');
        st.ttsProc = null;
    }
    st.interrupted = true;
    st.session.status = 'idle';
}
|
|
357
|
+
// ---------------------------------------------------------------------------
|
|
358
|
+
// Voice commands
|
|
359
|
+
// ---------------------------------------------------------------------------
|
|
360
|
+
/**
 * Map an utterance to a voice command ({ action, arg? }) or null for
 * ordinary speech. Matching is case-insensitive on the whole phrase.
 */
function parseCommand(text) {
    const phrase = text.toLowerCase().trim();
    const simple = [
        ['stop', /^(stop|cancel|shut up|be quiet|enough)$/],
        ['pause', /^(pause|hold on|wait)$/],
        ['save', /^(save this|save conversation|save)$/],
        ['exit', /^(exit|quit|goodbye|bye|end)$/],
    ];
    for (const [action, pattern] of simple) {
        if (pattern.test(phrase)) {
            return { action };
        }
    }
    const switched = phrase.match(/^switch\s+to\s+(\w+)$/);
    return switched ? { action: 'switch', arg: switched[1] } : null;
}
|
|
375
|
+
// ---------------------------------------------------------------------------
|
|
376
|
+
// Audio recording to ~/.kbot/voice/
|
|
377
|
+
// ---------------------------------------------------------------------------
|
|
378
|
+
/** Create ~/.kbot/voice (and parents) on first use. */
function ensureVoiceDir() {
    if (!existsSync(VOICE_DIR)) {
        mkdirSync(VOICE_DIR, { recursive: true });
    }
}
|
|
380
|
+
/**
 * Archive a temp recording into ~/.kbot/voice as <session>-<idx>-<role>.wav.
 * Returns the destination path, or undefined when the copy fails (best-effort).
 */
function saveRecording(src, sessionId, idx, role) {
    ensureVoiceDir();
    const name = `${sessionId}-${String(idx).padStart(3, '0')}-${role}.wav`;
    const dest = join(VOICE_DIR, name);
    try {
        writeFileSync(dest, readFileSync(src));
        return dest;
    }
    catch {
        return undefined;
    }
}
|
|
391
|
+
/**
 * Best-effort duration (seconds) of an audio file: `soxi -D` when available,
 * otherwise estimated from the byte size assuming 16 kHz 16-bit mono PCM
 * (32 000 bytes/second). Returns 0 on any failure.
 */
function audioDuration(path) {
    try {
        if (has('soxi')) {
            const out = execSync(`soxi -D "${path}" 2>/dev/null`, { encoding: 'utf-8' });
            return parseFloat(out) || 0;
        }
        return statSync(path).size / 32000;
    }
    catch {
        return 0;
    }
}
|
|
401
|
+
/** Persist a session transcript to ~/.kbot/voice/<id>.json; failures are non-critical. */
function saveSession(s) {
    ensureVoiceDir();
    const payload = {
        id: s.id,
        timestamp: new Date().toISOString(),
        turns: s.history.length,
        sttEngine: s.sttEngine,
        ttsEngine: s.ttsEngine,
        language: s.language,
        history: s.history,
    };
    try {
        writeFileSync(join(VOICE_DIR, `${s.id}.json`), JSON.stringify(payload, null, 2));
    }
    catch { /* non-critical */ }
}
|
|
411
|
+
// ---------------------------------------------------------------------------
|
|
412
|
+
// Ollama LLM
|
|
413
|
+
// ---------------------------------------------------------------------------
|
|
414
|
+
/** True when an Ollama server answers GET /api/tags at `host` within 3 seconds. */
async function checkOllama(host) {
    try {
        const res = await fetch(`${host}/api/tags`, { signal: AbortSignal.timeout(3000) });
        return res.ok;
    }
    catch {
        return false; // unreachable, refused, or timed out
    }
}
|
|
422
|
+
/**
 * Send one user message (plus the last 10 turns of context) to the Ollama
 * chat API and return the assistant's reply text.
 * Throws on HTTP errors or a 60 s timeout.
 */
async function chat(msg, st, agent) {
    const sysParts = [
        'You are kbot, a helpful AI assistant in a real-time voice conversation.',
        'Keep responses concise and conversational. Avoid code blocks, markdown, long lists, URLs.',
        'Use natural speech. Ask if the user wants detail on complex topics.',
    ];
    if (agent !== 'auto') {
        sysParts.push(`You are the "${agent}" specialist.`);
    }
    const res = await fetch(`${st.ollamaHost}/api/chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model: st.ollamaModel,
            messages: [
                { role: 'system', content: sysParts.join(' ') },
                ...st.messages.slice(-10),
                { role: 'user', content: msg },
            ],
            stream: false,
            options: { temperature: 0.7, num_predict: 400 },
        }),
        signal: AbortSignal.timeout(60_000),
    });
    if (!res.ok) {
        throw new Error(`Ollama ${res.status}`);
    }
    const data = await res.json();
    return data.message?.content?.trim() || '(no response)';
}
|
|
443
|
+
// ---------------------------------------------------------------------------
|
|
444
|
+
// Text input fallback
|
|
445
|
+
// ---------------------------------------------------------------------------
|
|
446
|
+
/** Prompt on stdin and resolve with the trimmed answer (keyboard fallback for voice). */
function textInput(prompt) {
    return new Promise(resolve => {
        const rl = createInterface({ input: process.stdin, output: process.stdout });
        rl.question(prompt, (answer) => {
            rl.close();
            resolve(answer.trim());
        });
    });
}
|
|
452
|
+
// ---------------------------------------------------------------------------
|
|
453
|
+
// Single voice turn (non-VAD mode)
|
|
454
|
+
// ---------------------------------------------------------------------------
|
|
455
|
+
/**
 * Run one push-to-talk style conversation turn (the non-VAD path):
 * capture audio (or fall back to typed input), transcribe it, handle voice
 * commands, then fetch and speak an LLM reply.
 * @param st    mutable runtime state (session, messages, keys, flags)
 * @param agent active specialist agent name, or 'auto'
 * @returns { cont } — whether the outer loop should continue — plus
 *          optionally { newAgent } after a "switch to X" command.
 */
async function voiceTurn(st, agent) {
    const { session } = st;
    // History length before this turn — later used to detect whether the
    // transcription branch already logged the user entry.
    const idx = session.history.length;
    let userText = '';
    const tmp = join(tmpdir(), `kbot-rt-${Date.now()}.wav`);
    // 1. Capture input
    if (hasRec() && session.sttEngine !== 'system') {
        session.status = 'listening';
        showStatus('listening');
        console.log();
        const ok = await record(tmp, 30, st.vad.silenceDuration);
        if (!ok || !existsSync(tmp)) {
            // Recording failed — fall back to keyboard input.
            console.log(chalk.yellow(' No audio captured.'));
            userText = await textInput(chalk.cyan(' You: '));
        }
        else {
            // Archive the raw audio before it is deleted below.
            saveRecording(tmp, session.id, idx, 'user');
            session.status = 'processing';
            showStatus('processing', '(transcribing)');
            userText = await transcribe(tmp, session.sttEngine, session.language, st.openaiKey);
            process.stdout.write(`\r${' '.repeat(80)}\r`); // clear the status line
            if (userText) {
                console.log(` ${DIM('You:')} ${chalk.white(userText)}`);
                session.history.push({ role: 'user', text: userText, duration: audioDuration(tmp), timestamp: new Date().toISOString() });
            }
            else {
                // STT produced nothing — fall back to keyboard input.
                console.log(chalk.yellow(' Could not transcribe.'));
                userText = await textInput(chalk.cyan(' You: '));
            }
        }
        // Best-effort temp-file cleanup.
        try {
            if (existsSync(tmp))
                unlinkSync(tmp);
        }
        catch { }
    }
    else {
        // No recorder available (or system STT selected): text-only turn.
        userText = await textInput(chalk.cyan('\n You: '));
    }
    if (!userText || !st.running)
        return { cont: true };
    // Log the user turn unless the transcription branch already did.
    if (session.history.length === idx)
        session.history.push({ role: 'user', text: userText, duration: 0, timestamp: new Date().toISOString() });
    // 2. Voice commands
    const cmd = parseCommand(userText);
    if (cmd) {
        if (cmd.action === 'exit') {
            console.log(ACCENT(' kbot: ') + 'Goodbye!');
            await speakChunked('Goodbye!', st, () => false);
            return { cont: false };
        }
        if (cmd.action === 'stop') {
            interruptTTS(st);
            console.log(DIM(' (stopped)'));
            return { cont: true };
        }
        if (cmd.action === 'pause') {
            interruptTTS(st);
            // Block until the user presses Enter; typing "exit" ends the session.
            await new Promise(resolve => {
                console.log(chalk.yellow('\n \u23F8 Paused') + DIM(' — press Enter to resume'));
                const rl = createInterface({ input: process.stdin, output: process.stdout });
                rl.question(DIM(' > '), a => { rl.close(); if (a.trim() === 'exit')
                    st.running = false; resolve(); });
            });
            return { cont: st.running };
        }
        if (cmd.action === 'switch' && cmd.arg) {
            // The caller is responsible for acting on newAgent.
            console.log(ACCENT(' kbot: ') + `Switching to ${cmd.arg}.`);
            await speakChunked(`Switching to ${cmd.arg}.`, st, () => false);
            return { cont: true, newAgent: cmd.arg };
        }
        if (cmd.action === 'save') {
            saveSession(session);
            console.log(chalk.green(` Saved: ${VOICE_DIR}/${session.id}.json`));
            await speakChunked('Conversation saved.', st, () => false);
            return { cont: true };
        }
    }
    // 3. LLM response
    session.status = 'processing';
    showStatus('processing', '(thinking)');
    st.messages.push({ role: 'user', content: userText });
    try {
        const resp = await chat(userText, st, agent);
        process.stdout.write(`\r${' '.repeat(80)}\r`);
        console.log(ACCENT(' kbot: ') + resp);
        st.messages.push({ role: 'assistant', content: resp });
        session.history.push({ role: 'assistant', text: resp, duration: 0, timestamp: new Date().toISOString() });
        // Reset the interrupt flag so a fresh utterance can cut playback off.
        st.interrupted = false;
        await speakChunked(resp, st, () => st.interrupted);
        session.status = 'idle';
    }
    catch (e) {
        process.stdout.write(`\r${' '.repeat(80)}\r`);
        console.log(chalk.red(` Error: ${e instanceof Error ? e.message : e}`));
        await speakChunked('Sorry, I encountered an error.', st, () => false);
    }
    return { cont: true };
}
|
|
554
|
+
// ---------------------------------------------------------------------------
|
|
555
|
+
// VAD-driven continuous loop
|
|
556
|
+
// ---------------------------------------------------------------------------
|
|
557
|
+
/**
 * Continuous hands-free conversation loop driven by the VAD:
 * speech onset starts a recording (interrupting any TTS playback, enabling
 * barge-in); speech end stops it, transcribes, handles voice commands, and
 * speaks the LLM reply. Runs until st.running is cleared.
 * @param st    mutable runtime state (session, messages, keys, flags)
 * @param agent active specialist agent name, or 'auto'
 */
async function vadLoop(st, agent) {
    const vad = new VAD(st.vad);
    // Recorder state shared by the two closures below.
    let recProc = null, recPath = '', recording = false;
    // VAD onStart: begin capturing a fresh utterance (barge-in kills TTS).
    const startRec = () => {
        if (recording)
            return;
        recording = true;
        if (st.session.status === 'speaking') {
            interruptTTS(st);
            console.log(DIM('\n (interrupted)'));
        }
        st.session.status = 'listening';
        recPath = join(tmpdir(), `kbot-vad-${Date.now()}.wav`);
        // 30 s hard cap per utterance; requires SoX `rec`.
        if (has('rec'))
            recProc = spawn('rec', [recPath, 'rate', '16k', 'channels', '1', 'trim', '0', '30'], { stdio: ['ignore', 'ignore', 'ignore'] });
    };
    // VAD onEnd: finish capture, transcribe, and run the full turn.
    const stopRec = async () => {
        if (!recording || !recProc)
            return;
        recording = false;
        recProc.kill('SIGTERM');
        recProc = null;
        // Give the recorder a moment to flush and close the WAV file.
        await new Promise(r => setTimeout(r, 200));
        if (!existsSync(recPath))
            return;
        st.session.status = 'processing';
        showStatus('processing', '(transcribing)');
        const text = await transcribe(recPath, st.session.sttEngine, st.session.language, st.openaiKey);
        try {
            unlinkSync(recPath);
        }
        catch { }
        process.stdout.write(`\r${' '.repeat(80)}\r`); // clear the status line
        if (!text)
            return;
        console.log(` ${DIM('You:')} ${chalk.white(text)}`);
        const cmd = parseCommand(text);
        if (cmd) {
            if (cmd.action === 'exit') {
                // The outer poll below notices st.running and tears everything down.
                st.running = false;
                return;
            }
            if (cmd.action === 'pause') {
                // Suspend capture entirely; Enter restarts the VAD.
                vad.stop();
                await new Promise(r => {
                    console.log(chalk.yellow('\n \u23F8 Paused'));
                    const rl = createInterface({ input: process.stdin, output: process.stdout });
                    rl.question(DIM(' > '), () => { rl.close(); r(); });
                });
                if (st.running)
                    vad.start({ onStart: startRec, onEnd: () => { stopRec(); }, onVol: v => { if (st.session.status !== 'speaking')
                        showWaveform(v); } });
                return;
            }
            if (cmd.action === 'save') {
                saveSession(st.session);
                console.log(chalk.green(' Saved.'));
                return;
            }
            // Other commands (e.g. stop) need no action here — capture already ended.
            return;
        }
        st.session.history.push({ role: 'user', text, duration: 0, timestamp: new Date().toISOString() });
        st.session.status = 'processing';
        showStatus('processing', '(thinking)');
        st.messages.push({ role: 'user', content: text });
        try {
            const resp = await chat(text, st, agent);
            process.stdout.write(`\r${' '.repeat(80)}\r`);
            console.log(ACCENT(' kbot: ') + resp);
            st.messages.push({ role: 'assistant', content: resp });
            st.session.history.push({ role: 'assistant', text: resp, duration: 0, timestamp: new Date().toISOString() });
            // Reset so a new utterance (startRec above) can interrupt playback.
            st.interrupted = false;
            await speakChunked(resp, st, () => st.interrupted);
        }
        catch (e) {
            process.stdout.write(`\r${' '.repeat(80)}\r`);
            console.log(chalk.red(` Error: ${e instanceof Error ? e.message : e}`));
        }
        st.session.status = 'idle';
    };
    vad.start({
        onStart: startRec, onEnd: () => { stopRec(); },
        onVol: v => { if (st.session.status === 'listening' || st.session.status === 'idle')
            showWaveform(v); },
    });
    console.log(DIM(' VAD active — start speaking...\n'));
    // Keep the loop alive until something clears st.running, then stop the VAD.
    await new Promise(resolve => {
        const iv = setInterval(() => { if (!st.running) {
            clearInterval(iv);
            vad.stop();
            resolve();
        } }, 200);
    });
}
|
|
651
|
+
// ---------------------------------------------------------------------------
|
|
652
|
+
// Banner
|
|
653
|
+
// ---------------------------------------------------------------------------
|
|
654
|
+
/**
 * Print the session banner: active engines, VAD configuration, LLM target,
 * recording directory, and the voice-command cheat sheet.
 */
function banner(st) {
    const s = st.session;
    // Suffix shown after the TTS engine only when the system voice is in use.
    const ttsDetail = s.ttsEngine === 'system' ? DIM(` (${st.voice}, ${st.rate} wpm)`) : '';
    const vadDetail = s.vadEnabled
        ? chalk.green('on') + DIM(` (thresh ${st.vad.threshold}, silence ${st.vad.silenceDuration}s)`)
        : chalk.yellow('off');
    const rows = [
        '',
        ACCENT.bold(' kbot Realtime Voice'),
        ACCENT(' ' + '='.repeat(44)),
        '',
        ` ${DIM('Session:')} ${DIM(s.id)}`,
        ` ${DIM('STT:')} ${chalk.green(s.sttEngine)}`,
        ` ${DIM('TTS:')} ${chalk.green(s.ttsEngine)}${ttsDetail}`,
        ` ${DIM('Language:')} ${s.language}`,
        ` ${DIM('VAD:')} ${vadDetail}`,
        ` ${DIM('Continuous:')} ${s.continuous ? chalk.green('yes') : chalk.yellow('no')}`,
        ` ${DIM('LLM:')} ${chalk.cyan(`${st.ollamaModel} @ ${st.ollamaHost}`)}`,
        ` ${DIM('Recording:')} ${chalk.cyan(VOICE_DIR)}`,
        '',
        DIM(' Voice commands: "stop", "pause", "switch to [agent]", "save this"'),
        DIM(' Say "exit" / "goodbye" to end. Ctrl+C anytime.'),
        '',
    ];
    for (const row of rows) {
        console.log(row);
    }
}
|
|
673
|
+
// ---------------------------------------------------------------------------
|
|
674
|
+
// Entry point: startRealtimeVoice
|
|
675
|
+
// ---------------------------------------------------------------------------
|
|
676
|
+
/**
 * Start a real-time bidirectional voice conversation.
 *
 * - Voice Activity Detection (VAD) via sox for natural turn-taking
 * - STT fallback: whisper-local -> whisper-api -> system
 * - TTS fallback: system -> openai-tts -> elevenlabs
 * - Streaming TTS: sentence-level chunking, speaks before full response
 * - Interrupt: speaking stops if user starts talking
 * - Voice commands: stop, pause, switch to [agent], save this
 * - Audio saved to ~/.kbot/voice/ for playback/review
 * - Waveform + status visualization in terminal
 *
 * @param {object} [opts] - Optional settings: vad, stt, tts, language,
 *   continuous, vadThreshold, vadSilence, ollamaHost, ollamaModel, voice,
 *   rate, agent.
 * @returns {Promise<void>} Resolves when the session ends (or Ollama is down).
 */
export async function startRealtimeVoice(opts) {
    // Unique session id: base36 timestamp + random suffix.
    const id = `voice-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
    // VAD is on unless explicitly disabled, and only when sox is installed.
    const vadEnabled = opts?.vad !== false && hasSox();
    const session = {
        id, status: 'idle',
        sttEngine: detectSTT(opts?.stt), ttsEngine: detectTTS(opts?.tts),
        language: opts?.language ?? 'en', continuous: opts?.continuous ?? true,
        vadEnabled, history: [],
    };
    // Mutable runtime state shared by the recording/TTS/LLM helpers.
    const st = {
        session,
        vad: { threshold: opts?.vadThreshold ?? 0.02, silenceDuration: opts?.vadSilence ?? 1.5, minSpeechDuration: 0.3, sampleRate: 16000 },
        running: false, interrupted: false, ttsProc: null,
        ollamaHost: opts?.ollamaHost ?? OLLAMA_DEFAULT,
        ollamaModel: opts?.ollamaModel ?? 'gemma3:12b',
        messages: [], openaiKey: loadKey('openai'), elevenKey: loadKey('elevenlabs'),
        elevenVoiceId: 'EXAVITQu4vr4xnSDxMaL', voice: opts?.voice ?? 'Samantha', rate: opts?.rate ?? 190,
    };
    // Pre-flight: the LLM backend is mandatory; recorder/STT/VAD degrade with warnings.
    if (!(await checkOllama(st.ollamaHost))) {
        console.error(chalk.red(`\n Ollama is not running at ${st.ollamaHost}`));
        console.error(chalk.yellow(` Start: ollama serve && ollama pull ${st.ollamaModel}`));
        return;
    }
    if (!hasRec())
        console.warn(chalk.yellow(' \u26A0 No recorder — install sox: brew install sox\n'));
    if (session.sttEngine === 'system')
        console.warn(chalk.yellow(' \u26A0 Install whisper.cpp for STT\n'));
    // FIX: the original condition was `vadEnabled && !hasSox()`, which can never
    // be true because `vadEnabled` already implies `hasSox()` — so users who
    // wanted VAD but lack sox were silently downgraded with no warning. Warn on
    // the intended condition: VAD requested (or defaulted on) but sox missing.
    if (opts?.vad !== false && !hasSox()) {
        console.warn(chalk.yellow(' \u26A0 VAD needs sox\n'));
        session.vadEnabled = false;
    }
    ensureVoiceDir();
    banner(st);
    st.running = true;
    // Idempotent shutdown: stop any speaking TTS process and persist the
    // transcript exactly once, even if both SIGINT and SIGTERM arrive.
    const cleanup = () => {
        if (st.running) {
            st.running = false;
            interruptTTS(st);
            console.log(DIM('\n\n Session ended.'));
            saveSession(session);
        }
    };
    process.on('SIGINT', cleanup);
    process.on('SIGTERM', cleanup);
    // Greeting (spoken and printed; never interruptible).
    const greeting = 'Hey! Real-time voice mode is active. Go ahead and speak.';
    console.log(ACCENT(' kbot: ') + greeting);
    await speakChunked(greeting, st, () => false);
    // Main loop: hands-free VAD loop when available, otherwise explicit turns.
    if (session.vadEnabled) {
        await vadLoop(st, opts?.agent ?? 'auto');
    }
    else {
        let agent = opts?.agent ?? 'auto';
        while (st.running) {
            const result = await voiceTurn(st, agent);
            if (!result.cont) {
                st.running = false;
                break;
            }
            if (result.newAgent)
                agent = result.newAgent;
            if (!session.continuous)
                break;
        }
    }
    // Normal exit path: persist and detach signal handlers. (After a signal,
    // cleanup already saved once; this overwrite is harmless.)
    saveSession(session);
    process.removeListener('SIGINT', cleanup);
    process.removeListener('SIGTERM', cleanup);
}
|
|
759
|
+
// ---------------------------------------------------------------------------
|
|
760
|
+
// Utility exports
|
|
761
|
+
// ---------------------------------------------------------------------------
|
|
762
|
+
/**
 * Describe real-time voice capabilities of the current system.
 * @returns {string} Multi-line, human-readable capability report.
 */
export function describeRealtimeCapabilities() {
    const report = [];
    report.push('Real-time Voice Capabilities:');
    report.push(` STT: ${detectSTT()}`);
    report.push(` TTS: ${detectTTS()}`);
    report.push(` VAD: ${hasSox() ? 'available' : 'unavailable (install sox)'}`);
    report.push(` Recorder: ${hasRec() ? 'available' : 'unavailable'}`);
    report.push(` Platform: ${process.platform}`);
    report.push(` Voice dir: ${VOICE_DIR}`);
    return report.join('\n');
}
|
|
774
|
+
/**
 * List saved voice sessions from ~/.kbot/voice/.
 * @returns {Array<{id: string, timestamp: string, turns: number}>} One summary
 *   per readable session file; unreadable/corrupt files are skipped.
 */
export function listVoiceSessions() {
    ensureVoiceDir();
    // NOTE(review): globs via the `ls` shell builtin, which is POSIX-only;
    // fs.readdirSync would be portable — left as-is to preserve behavior.
    try {
        const files = execSync(`ls "${VOICE_DIR}"/*.json 2>/dev/null || true`, { encoding: 'utf-8' })
            .split('\n')
            .filter(Boolean);
        const sessions = [];
        for (const file of files) {
            try {
                const data = JSON.parse(readFileSync(file, 'utf-8'));
                sessions.push({ id: data.id, timestamp: data.timestamp, turns: data.turns });
            }
            catch {
                // Corrupt or unreadable session file — skip it.
            }
        }
        return sessions;
    }
    catch {
        return [];
    }
}
|
|
793
|
+
/**
 * Load a saved voice session by ID.
 * @param {string} sessionId - Session identifier (basename of the JSON file).
 * @returns {object|null} Parsed session data, or null when the file is
 *   missing, unreadable, or not valid JSON.
 */
export function getVoiceSession(sessionId) {
    const file = join(VOICE_DIR, `${sessionId}.json`);
    try {
        return existsSync(file) ? JSON.parse(readFileSync(file, 'utf-8')) : null;
    }
    catch {
        return null;
    }
}
|
|
805
|
+
//# sourceMappingURL=voice-realtime.js.map
|