@kernel.chat/kbot 3.62.0 → 3.64.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,362 @@
1
+ // kbot Voice Input — Local-first speech-to-text foundation
2
+ //
3
+ // Push-to-talk flow: start recording → transcribe locally → return text
4
+ //
5
+ // Transcription backends (priority order):
6
+ // 1. whisper.cpp / openai-whisper CLI binary (fully local, $0)
7
+ // 2. Ollama with a whisper-compatible model (fully local, $0)
8
+ // 3. Falls back to text input if neither is available
9
+ //
10
+ // Recording backends:
11
+ // - macOS: `rec` (sox) — 16kHz mono WAV with silence detection
12
+ // - Linux: `arecord` (ALSA) — 16kHz mono WAV with fixed duration
13
+ //
14
+ // No cloud APIs. No subscriptions. BYOK philosophy.
15
+ //
16
+ // Usage:
17
+ // import { getVoiceInput, checkVoiceInputStatus } from './voice-input.js'
18
+ // const text = await getVoiceInput() // record + transcribe
19
+ // const status = await checkVoiceInputStatus() // check readiness
20
+ import { execSync, spawn } from 'node:child_process';
21
+ import { homedir } from 'node:os';
22
+ import { join } from 'node:path';
23
+ import { existsSync, unlinkSync, mkdirSync, statSync } from 'node:fs';
24
+ // ---------------------------------------------------------------------------
25
+ // Constants
26
+ // ---------------------------------------------------------------------------
27
+ const KBOT_DIR = join(homedir(), '.kbot');
28
+ const VOICE_TMP_DIR = join(KBOT_DIR, 'voice-tmp');
29
+ const OLLAMA_HOST = process.env.OLLAMA_HOST || 'http://localhost:11434';
30
+ // ---------------------------------------------------------------------------
31
+ // Platform detection helpers
32
+ // ---------------------------------------------------------------------------
33
/**
 * Check whether an executable is available on PATH.
 *
 * @param {string} cmd - Bare command name (no path components, no arguments).
 * @returns {boolean} true if the command resolves on PATH.
 */
function commandExists(cmd) {
    // Guard: the name is interpolated into a shell command line, so reject
    // anything that is not a plain command token to rule out shell injection.
    if (!/^[A-Za-z0-9._-]+$/.test(cmd))
        return false;
    try {
        // `command -v` is POSIX-specified; `which` is not guaranteed to exist.
        execSync(`command -v ${cmd}`, { stdio: 'ignore' });
        return true;
    }
    catch {
        return false;
    }
}
42
/**
 * Pick the first available microphone recorder.
 * Preference order: sox `rec`, then ALSA `arecord`.
 *
 * @returns {'rec'|'arecord'|'none'} Detected recorder backend.
 */
function detectRecorder() {
    for (const candidate of ['rec', 'arecord']) {
        if (commandExists(candidate))
            return candidate;
    }
    return 'none';
}
49
/**
 * Locate a whisper CLI binary on PATH.
 * Checks common binary names in priority order.
 *
 * @returns {string|null} The first matching command name, or null if none found.
 */
function getWhisperCliPath() {
    const candidates = ['whisper', 'whisper.cpp', 'whisper-cpp'];
    const found = candidates.find((candidate) => commandExists(candidate));
    return found ?? null;
}
57
/**
 * Probe the Ollama HTTP API for liveness.
 *
 * @param {string} host - Base URL, e.g. http://localhost:11434.
 * @returns {Promise<boolean>} true if /api/tags answers with a 2xx status
 *   within 3 seconds; false on any error or timeout.
 */
async function isOllamaReachable(host) {
    try {
        const response = await fetch(`${host}/api/tags`, {
            signal: AbortSignal.timeout(3000),
        });
        return response.ok;
    }
    catch {
        return false;
    }
}
68
/**
 * Ask Ollama whether any installed model name contains `modelName`
 * (case-insensitive substring match against /api/tags).
 *
 * @param {string} host - Ollama base URL.
 * @param {string} modelName - Substring to look for, e.g. 'whisper'.
 * @returns {Promise<boolean>} true if a matching model is installed; false on
 *   any error, timeout, or malformed response.
 */
async function ollamaHasWhisperModel(host, modelName) {
    try {
        const response = await fetch(`${host}/api/tags`, {
            signal: AbortSignal.timeout(3000),
        });
        if (!response.ok)
            return false;
        const payload = await response.json();
        if (!payload.models)
            return false;
        const needle = modelName.toLowerCase();
        return payload.models.some((entry) => entry.name.toLowerCase().includes(needle));
    }
    catch {
        return false;
    }
}
84
/**
 * Choose the transcription backend. A local whisper CLI wins over Ollama;
 * 'none' if neither is available.
 *
 * @param {string|null} whisperCli - Whisper CLI command name, or null.
 * @param {boolean} ollamaWhisper - Whether Ollama has a whisper model.
 * @returns {'whisper-cli'|'ollama'|'none'} Selected backend.
 */
function detectTranscriber(whisperCli, ollamaWhisper) {
    return whisperCli ? 'whisper-cli' : (ollamaWhisper ? 'ollama' : 'none');
}
91
+ // ---------------------------------------------------------------------------
92
+ // Audio recording
93
+ // ---------------------------------------------------------------------------
94
/** Create the voice temp directory (and any missing parents) if it does not exist yet. */
function ensureVoiceTmpDir() {
    if (existsSync(VOICE_TMP_DIR))
        return;
    mkdirSync(VOICE_TMP_DIR, { recursive: true });
}
99
/**
 * Build a unique .wav path inside the voice temp directory,
 * creating the directory on demand.
 *
 * @returns {string} Absolute path for a fresh temp WAV file.
 */
function generateTmpPath() {
    ensureVoiceTmpDir();
    const stamp = Date.now();
    const nonce = Math.random().toString(36).slice(2, 8);
    return join(VOICE_TMP_DIR, `voice-${stamp}-${nonce}.wav`);
}
104
/**
 * Delete a file if it exists. Failures are swallowed — this is best-effort
 * cleanup of temp audio, not a correctness concern.
 *
 * @param {string} path - File to remove.
 */
function cleanupFile(path) {
    try {
        if (!existsSync(path))
            return;
        unlinkSync(path);
    }
    catch {
        // best effort cleanup
    }
}
113
/**
 * Record audio from the microphone into `outputPath`.
 *
 * NOTE(review): despite the old wording ("returns the path ... or null"),
 * this resolves to a BOOLEAN — true when the output file exists and holds
 * audio beyond the 44-byte WAV header, false on any failure.
 *
 * @param {string} outputPath - Destination WAV file path.
 * @param {'rec'|'arecord'|'none'} recorder - Recording backend to use.
 * @param {number} maxSeconds - Hard cap on recording length, in seconds.
 * @param {string|number} silenceThreshold - sox silence threshold (percent);
 *   ignored by the arecord branch.
 * @returns {Promise<boolean>} true if a non-empty recording was captured.
 */
async function recordAudio(outputPath, recorder, maxSeconds, silenceThreshold) {
    if (recorder === 'none')
        return false;
    return new Promise((resolve) => {
        let proc;
        if (recorder === 'rec') {
            // sox rec: 16kHz mono WAV, auto-stop on silence after speech
            // silence 1 0.1 <threshold>% = start recording after sound above threshold
            // silence 1 2.0 <threshold>% = stop recording after 2s silence below threshold
            proc = spawn('rec', [
                outputPath,
                'rate', '16k',
                'channels', '1',
                'silence', '1', '0.1', `${silenceThreshold}%`,
                '1', '2.0', `${silenceThreshold}%`,
                'trim', '0', String(maxSeconds),
            ], {
                stdio: ['ignore', 'ignore', 'ignore'],
            });
        }
        else {
            // arecord: fixed-duration recording at 16kHz mono
            proc = spawn('arecord', [
                '-f', 'S16_LE',
                '-r', '16000',
                '-c', '1',
                '-d', String(maxSeconds),
                outputPath,
            ], {
                stdio: ['ignore', 'ignore', 'ignore'],
            });
        }
        // Safety timeout — kill if recording hangs past the expected duration
        // (5s of grace beyond maxSeconds for process startup/teardown).
        const timeout = setTimeout(() => {
            proc.kill('SIGTERM');
        }, (maxSeconds + 5) * 1000);
        proc.on('close', () => {
            clearTimeout(timeout);
            // Verify the file exists and has content (not just a header)
            if (existsSync(outputPath)) {
                try {
                    const stat = statSync(outputPath);
                    resolve(stat.size > 44); // WAV header is 44 bytes; need actual audio data
                }
                catch {
                    // statSync raced with file removal or permissions — treat as failure
                    resolve(false);
                }
            }
            else {
                resolve(false);
            }
        });
        proc.on('error', () => {
            // Spawn itself failed (e.g. binary vanished after detection)
            clearTimeout(timeout);
            resolve(false);
        });
    });
}
175
+ // ---------------------------------------------------------------------------
176
+ // Transcription — whisper.cpp CLI
177
+ // ---------------------------------------------------------------------------
178
/**
 * Transcribe a WAV file with a local whisper CLI binary.
 *
 * Both whisper.cpp and openai-whisper accept --model and --language, and
 * print the transcription with --output_format txt.
 *
 * @param {string} audioPath - Path to the recorded WAV file.
 * @param {string} whisperCmd - CLI command name (e.g. 'whisper').
 * @param {string} model - Whisper model name (e.g. 'base').
 * @param {string} language - ISO language code (e.g. 'en').
 * @returns {string} Trimmed transcription text from stdout.
 * @throws If the CLI exits non-zero or exceeds the 120s timeout.
 */
function transcribeWithWhisperCli(audioPath, whisperCmd, model, language) {
    // Suppress stderr via the stdio option instead of a `2>/dev/null` shell
    // redirect — portable, and keeps the command line free of shell syntax.
    const output = execSync(
        `${whisperCmd} "${audioPath}" --model ${model} --language ${language} --output_format txt`,
        { encoding: 'utf-8', timeout: 120_000, stdio: ['ignore', 'pipe', 'ignore'] },
    );
    return output.trim();
}
184
+ // ---------------------------------------------------------------------------
185
+ // Transcription — Ollama (whisper-compatible audio model)
186
+ // ---------------------------------------------------------------------------
187
/**
 * Transcribe audio through an Ollama multimodal model.
 *
 * Ollama has no dedicated whisper endpoint as of 2026-03, so the WAV bytes
 * are sent base64-encoded in the `images` field of /api/generate — the
 * multimodal-input pattern used by audio-capable models hosted on Ollama.
 * A text-only model will not produce a real transcription from this payload.
 *
 * @param {string} audioPath - Path to the WAV file to transcribe.
 * @param {string} host - Ollama base URL.
 * @param {string} modelName - Ollama model to invoke.
 * @returns {Promise<string>} Trimmed transcription text.
 * @throws {Error} When the HTTP response is not ok or the request times out.
 */
async function transcribeWithOllama(audioPath, host, modelName) {
    const { readFileSync } = await import('node:fs');
    const encodedAudio = readFileSync(audioPath).toString('base64');
    const response = await fetch(`${host}/api/generate`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model: modelName,
            prompt: 'Transcribe this audio to text. Return only the transcription, no commentary.',
            images: [encodedAudio],
            stream: false,
        }),
        signal: AbortSignal.timeout(120_000),
    });
    if (!response.ok) {
        const detail = await response.text().catch(() => `HTTP ${response.status}`);
        throw new Error(`Ollama transcription failed: ${detail}`);
    }
    const payload = await response.json();
    return (payload.response || '').trim();
}
217
+ // ---------------------------------------------------------------------------
218
+ // Public API
219
+ // ---------------------------------------------------------------------------
220
/**
 * Check voice input system status — microphone, transcription engine, models.
 * Call this to diagnose issues before recording.
 *
 * @param {{ollamaHost?: string, ollamaWhisperModel?: string}} [options]
 * @returns {Promise<object>} Availability flags plus human-readable
 *   `issues` (blocking problems) and `suggestions` (install/start hints).
 */
export async function checkVoiceInputStatus(options) {
    const host = options?.ollamaHost || OLLAMA_HOST;
    const whisperModel = options?.ollamaWhisperModel || 'whisper';
    const recorder = detectRecorder();
    const whisperCli = getWhisperCliPath();
    const ollamaReachable = await isOllamaReachable(host);
    let ollamaHasWhisper = false;
    if (ollamaReachable) {
        ollamaHasWhisper = await ollamaHasWhisperModel(host, whisperModel);
    }
    const transcriber = detectTranscriber(whisperCli, ollamaHasWhisper);
    const issues = [];
    const suggestions = [];
    // Recorder diagnostics
    if (recorder === 'none') {
        issues.push('No audio recorder found (need `rec` from sox or `arecord` from ALSA)');
        suggestions.push(process.platform === 'darwin'
            ? 'Install sox: brew install sox'
            : 'Install sox: sudo apt install sox OR sudo apt install alsa-utils');
    }
    // Transcriber diagnostics
    if (transcriber === 'none') {
        issues.push('No transcription engine found');
        suggestions.push('Install whisper.cpp: brew install whisper-cpp (macOS)', 'Or pull a whisper model in Ollama: ollama pull whisper', 'Or install openai-whisper: pip install openai-whisper');
    }
    // Ollama diagnostics (suggestions only — Ollama is one of several backends)
    if (!ollamaReachable) {
        suggestions.push(`Ollama not reachable at ${host}. Start it: ollama serve`);
    }
    else if (!ollamaHasWhisper) {
        suggestions.push(`Ollama running but no whisper model found. Pull one: ollama pull whisper`);
    }
    return {
        available: recorder !== 'none' && transcriber !== 'none',
        recorder,
        transcriber,
        whisperCliPath: whisperCli,
        ollamaReachable,
        ollamaHasWhisper,
        issues,
        suggestions,
    };
}
272
/**
 * Record audio from the microphone and transcribe it locally.
 * Returns the transcribed text.
 *
 * This is the main entry point — call this for push-to-talk.
 *
 * Backend selection: a local whisper CLI is preferred; Ollama is used as the
 * primary transcriber only when no CLI is found, and as a fallback when the
 * CLI throws. The temp WAV file is always deleted before returning or throwing
 * (except transcriber-detection failures, which occur before recording).
 *
 * @param {{model?: string, language?: string, maxRecordSeconds?: number,
 *   silenceThreshold?: string, ollamaHost?: string,
 *   ollamaWhisperModel?: string}} [options]
 * @returns {Promise<{text: string, source: string, durationMs: number,
 *   audioFile: null}>} Trimmed transcription plus which backend produced it.
 * @throws Error if no recorder or transcriber is available, recording
 *   captures nothing, or every transcription attempt fails.
 */
export async function getVoiceInput(options) {
    const model = options?.model ?? 'base';
    const language = options?.language ?? 'en';
    const maxRecordSeconds = options?.maxRecordSeconds ?? 15;
    const silenceThreshold = options?.silenceThreshold ?? '1.5';
    const ollamaHost = options?.ollamaHost ?? OLLAMA_HOST;
    const ollamaWhisperModel = options?.ollamaWhisperModel ?? 'whisper';
    // Detect available backends
    const recorder = detectRecorder();
    if (recorder === 'none') {
        throw new Error('No audio recorder found. Install sox (brew install sox) or alsa-utils (sudo apt install alsa-utils).');
    }
    const whisperCli = getWhisperCliPath();
    const ollamaReachable = await isOllamaReachable(ollamaHost);
    const ollamaWhisper = ollamaReachable
        ? await ollamaHasWhisperModel(ollamaHost, ollamaWhisperModel)
        : false;
    const transcriber = detectTranscriber(whisperCli, ollamaWhisper);
    if (transcriber === 'none') {
        throw new Error('No transcription engine available.\n' +
            'Install one of:\n' +
            ' - whisper.cpp: brew install whisper-cpp\n' +
            ' - openai-whisper: pip install openai-whisper\n' +
            ' - Ollama whisper: ollama pull whisper');
    }
    // Record — durationMs below covers recording AND transcription time
    const audioPath = generateTmpPath();
    const startTime = Date.now();
    const recorded = await recordAudio(audioPath, recorder, maxRecordSeconds, silenceThreshold);
    if (!recorded) {
        cleanupFile(audioPath);
        throw new Error('Recording failed — no audio captured. Check microphone permissions and that the mic is connected.');
    }
    // Transcribe
    let text = '';
    let source = transcriber;
    try {
        if (transcriber === 'whisper-cli' && whisperCli) {
            text = transcribeWithWhisperCli(audioPath, whisperCli, model, language);
        }
        else if (transcriber === 'ollama') {
            text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
        }
    }
    catch (err) {
        // If primary transcriber fails, try fallback; the error surfaced on
        // total failure is the PRIMARY transcriber's (err), not the fallback's.
        if (transcriber === 'whisper-cli' && ollamaWhisper) {
            try {
                text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
                source = 'ollama';
            }
            catch {
                cleanupFile(audioPath);
                throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
            }
        }
        else {
            cleanupFile(audioPath);
            throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    const durationMs = Date.now() - startTime;
    // Clean up the temp audio file
    cleanupFile(audioPath);
    if (!text) {
        throw new Error('Transcription returned empty text — microphone may not have captured speech.');
    }
    return {
        text: text.trim(),
        source,
        durationMs,
        audioFile: null, // cleaned up
    };
}
354
/**
 * Quick check: can voice input work right now?
 *
 * @param {{ollamaHost?: string, ollamaWhisperModel?: string}} [options]
 * @returns {Promise<boolean>} true if both a recorder and a transcriber are available.
 */
export async function isVoiceInputAvailable(options) {
    const { available } = await checkVoiceInputStatus(options);
    return available;
}
362
+ //# sourceMappingURL=voice-input.js.map
@@ -2,7 +2,7 @@
2
2
  "name": "kbot",
3
3
  "display_name": "kbot",
4
4
  "description": "Open-source terminal AI agent — 670+ tools, 35 agents, science, finance, security, music production, and more.",
5
- "version": "3.60.1",
5
+ "version": "3.62.0",
6
6
  "homepage": "https://kernel.chat",
7
7
  "repository": "https://github.com/isaacsight/kernel",
8
8
  "license": "MIT",
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@kernel.chat/kbot",
3
- "version": "3.62.0",
4
- "description": "Open-source terminal AI agent. 670+ tools, 35 agents, 20 providers. Fully local, fully sovereign. MIT.",
3
+ "version": "3.64.0",
4
+ "description": "Open-source terminal AI agent. 686+ tools, 35 agents, 20 providers. Fully local, fully sovereign. MIT.",
5
5
  "type": "module",
6
6
  "repository": {
7
7
  "type": "git",