@kernel.chat/kbot 3.63.0 → 3.64.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,132 @@
1
+ // kbot Voice Input Tools — Agent-accessible push-to-talk transcription
2
+ //
3
+ // Exposes local voice input to kbot's tool system so agents can:
4
+ // - Listen and transcribe speech via push-to-talk
5
+ // - Check voice input availability (mic, whisper model, recorder)
6
+ //
7
+ // Fully local — no cloud APIs. Uses whisper.cpp or Ollama whisper.
8
+ import { registerTool } from './index.js';
9
+ import { getVoiceInput, checkVoiceInputStatus } from '../voice-input.js';
10
+ export function registerVoiceInputTools() {
11
+ // ── voice_listen ──
12
+ // Start listening and return transcribed text
13
+ registerTool({
14
+ name: 'voice_listen',
15
+ description: 'Listen via microphone and transcribe speech to text using local whisper. Push-to-talk: records until silence is detected (up to max duration), then transcribes locally at $0 cost. Requires sox (rec) and whisper.cpp or Ollama whisper model. Returns the transcribed text.',
16
+ parameters: {
17
+ model: {
18
+ type: 'string',
19
+ description: 'Whisper model size: tiny, base, small, medium, large (default: base). Larger = more accurate but slower.',
20
+ required: false,
21
+ default: 'base',
22
+ },
23
+ language: {
24
+ type: 'string',
25
+ description: 'Language code for transcription (default: en). Examples: en, es, fr, de, ja, zh',
26
+ required: false,
27
+ default: 'en',
28
+ },
29
+ max_seconds: {
30
+ type: 'number',
31
+ description: 'Maximum recording duration in seconds (default: 15). Recording auto-stops on silence.',
32
+ required: false,
33
+ default: 15,
34
+ },
35
+ silence_threshold: {
36
+ type: 'string',
37
+ description: 'Silence detection threshold as percentage for sox (default: 1.5). Lower = more sensitive.',
38
+ required: false,
39
+ default: '1.5',
40
+ },
41
+ },
42
+ tier: 'free',
43
+ timeout: 180_000, // 3 min — recording (up to 15s) + transcription (up to 2 min)
44
+ execute: async (args) => {
45
+ const model = args.model || 'base';
46
+ const language = args.language || 'en';
47
+ const maxSeconds = args.max_seconds || 15;
48
+ const silenceThreshold = args.silence_threshold || '1.5';
49
+ // Validate model
50
+ const validModels = ['tiny', 'base', 'small', 'medium', 'large'];
51
+ if (!validModels.includes(model)) {
52
+ return `Error: invalid model "${model}". Choose from: ${validModels.join(', ')}`;
53
+ }
54
+ try {
55
+ const result = await getVoiceInput({
56
+ model: model,
57
+ language,
58
+ maxRecordSeconds: maxSeconds,
59
+ silenceThreshold,
60
+ });
61
+ const lines = [
62
+ `Transcription: ${result.text}`,
63
+ '',
64
+ ` Backend: ${result.source}`,
65
+ ` Duration: ${result.durationMs}ms`,
66
+ ` Language: ${language}`,
67
+ ` Model: ${model}`,
68
+ ];
69
+ return lines.join('\n');
70
+ }
71
+ catch (err) {
72
+ const message = err instanceof Error ? err.message : String(err);
73
+ // If it's a setup issue, include helpful guidance
74
+ if (message.includes('No audio recorder') || message.includes('No transcription engine')) {
75
+ const status = await checkVoiceInputStatus();
76
+ const guidance = [
77
+ `Error: ${message}`,
78
+ '',
79
+ 'Setup suggestions:',
80
+ ...status.suggestions.map(s => ` - ${s}`),
81
+ ];
82
+ return guidance.join('\n');
83
+ }
84
+ return `Error: ${message}`;
85
+ }
86
+ },
87
+ });
88
+ // ── voice_status ──
89
+ // Check if voice input is available (mic permissions, whisper model)
90
+ registerTool({
91
+ name: 'voice_status',
92
+ description: 'Check voice input readiness — reports whether a microphone recorder (sox/arecord) and transcription engine (whisper.cpp/Ollama whisper) are available. Lists any issues and installation suggestions. Call this before voice_listen to diagnose problems.',
93
+ parameters: {},
94
+ tier: 'free',
95
+ execute: async () => {
96
+ const status = await checkVoiceInputStatus();
97
+ const lines = [
98
+ 'Voice Input Status',
99
+ '══════════════════',
100
+ `Available: ${status.available ? 'YES' : 'NO'}`,
101
+ '',
102
+ 'Recorder:',
103
+ ` Backend: ${status.recorder === 'none' ? 'NOT FOUND' : status.recorder}`,
104
+ ...(status.recorder === 'rec' ? [' (sox rec — silence detection, auto-stop)'] : []),
105
+ ...(status.recorder === 'arecord' ? [' (ALSA arecord — fixed duration)'] : []),
106
+ '',
107
+ 'Transcription:',
108
+ ` Backend: ${status.transcriber === 'none' ? 'NOT FOUND' : status.transcriber}`,
109
+ ...(status.whisperCliPath ? [` Whisper CLI: ${status.whisperCliPath}`] : []),
110
+ ` Ollama reachable: ${status.ollamaReachable ? 'yes' : 'no'}`,
111
+ ` Ollama whisper model: ${status.ollamaHasWhisper ? 'yes' : 'no'}`,
112
+ ];
113
+ if (status.issues.length > 0) {
114
+ lines.push('', 'Issues:');
115
+ for (const issue of status.issues) {
116
+ lines.push(` ! ${issue}`);
117
+ }
118
+ }
119
+ if (status.suggestions.length > 0) {
120
+ lines.push('', 'Suggestions:');
121
+ for (const suggestion of status.suggestions) {
122
+ lines.push(` - ${suggestion}`);
123
+ }
124
+ }
125
+ if (status.available) {
126
+ lines.push('', 'Ready to use. Call voice_listen to start recording.');
127
+ }
128
+ return lines.join('\n');
129
+ },
130
+ });
131
+ } // end registerVoiceInputTools
132
+ //# sourceMappingURL=voice-input-tools.js.map
@@ -0,0 +1,53 @@
1
/** Whisper model sizes accepted by the voice input API. */
export type WhisperModel =
    | 'tiny'
    | 'base'
    | 'small'
    | 'medium'
    | 'large';
/** Engine used for transcription; `'none'` when no engine was detected. */
export type TranscriptionBackend =
    | 'whisper-cli'
    | 'ollama'
    | 'none';
/** Command-line recorder used to capture audio; `'none'` when no recorder was detected. */
export type RecorderBackend =
    | 'rec'
    | 'arecord'
    | 'none';
4
/** Optional settings for recording and transcription; every field has a default. */
export interface VoiceInputOptions {
    /** Whisper model size to transcribe with. Defaults to `'base'`. */
    model?: WhisperModel;
    /** Language code passed to the transcriber. Defaults to `'en'`. */
    language?: string;
    /** Upper bound on the recording length, in seconds. Defaults to 15. */
    maxRecordSeconds?: number;
    /** Percentage threshold sox uses to detect silence and stop recording. Defaults to `'1.5'`. */
    silenceThreshold?: string;
    /** Base URL of the Ollama server. Defaults to the OLLAMA_HOST env var, else `'http://localhost:11434'`. */
    ollamaHost?: string;
    /** Name of the Ollama model used for transcription. Defaults to `'whisper'`. */
    ollamaWhisperModel?: string;
}
18
/** Readiness report produced by `checkVoiceInputStatus`. */
export interface VoiceInputStatus {
    /** True only when both a recorder and a transcriber were detected. */
    available: boolean;
    /** Recorder command that was found, or `'none'`. */
    recorder: RecorderBackend;
    /** Transcription engine that would be used, or `'none'`. */
    transcriber: TranscriptionBackend;
    /** Name of the whisper command that was found, or null when none was. */
    whisperCliPath: string | null;
    /** Whether the Ollama server answered the model-list request successfully. */
    ollamaReachable: boolean;
    /** Whether a model whose name contains the requested whisper model name is listed by Ollama. */
    ollamaHasWhisper: boolean;
    /** Problems that prevent voice input from working. */
    issues: string[];
    /** Install / setup hints. */
    suggestions: string[];
}
28
/** Outcome of a successful `getVoiceInput` call. */
export interface TranscriptionResult {
    /** The transcribed speech, trimmed of surrounding whitespace. */
    text: string;
    /** Backend that produced `text`. */
    source: TranscriptionBackend;
    /** Elapsed milliseconds from the start of recording to the end of transcription. */
    durationMs: number;
    /** Path of the recorded audio, or null when the temporary file has been removed. */
    audioFile: string | null;
}
34
/**
 * Inspect the voice input setup: recorder command, transcription engine and
 * Ollama model availability.
 *
 * Use it to diagnose problems before attempting a recording.
 *
 * @param options - Optional Ollama host and whisper model name overrides.
 * @returns The detected backends together with issues and setup suggestions.
 */
export declare function checkVoiceInputStatus(options?: Pick<VoiceInputOptions, 'ollamaHost' | 'ollamaWhisperModel'>): Promise<VoiceInputStatus>;
39
/**
 * Capture speech from the microphone and transcribe it locally.
 *
 * Main entry point for push-to-talk.
 *
 * @param options - Optional model, language, recording and Ollama settings.
 * @returns The transcription together with the backend used and elapsed time.
 * @throws Error if no recorder or transcriber is available
 */
export declare function getVoiceInput(options?: VoiceInputOptions): Promise<TranscriptionResult>;
48
/**
 * Convenience check for whether voice input can be used right now.
 *
 * @param options - Optional Ollama host and whisper model name overrides.
 * @returns True when both a recorder and a transcriber are available.
 */
export declare function isVoiceInputAvailable(options?: Pick<VoiceInputOptions, 'ollamaHost' | 'ollamaWhisperModel'>): Promise<boolean>;
53
+ //# sourceMappingURL=voice-input.d.ts.map
@@ -0,0 +1,362 @@
1
+ // kbot Voice Input — Local-first speech-to-text foundation
2
+ //
3
+ // Push-to-talk flow: start recording → transcribe locally → return text
4
+ //
5
+ // Transcription backends (priority order):
6
+ // 1. whisper.cpp / openai-whisper CLI binary (fully local, $0)
7
+ // 2. Ollama with a whisper-compatible model (fully local, $0)
8
+ // If neither is available, getVoiceInput() throws an Error that lists install options
9
+ //
10
+ // Recording backends:
11
+ // - macOS: `rec` (sox) — 16kHz mono WAV with silence detection
12
+ // - Linux: `arecord` (ALSA) — 16kHz mono WAV with fixed duration
13
+ //
14
+ // No cloud APIs. No subscriptions. BYOK philosophy.
15
+ //
16
+ // Usage:
17
+ // import { getVoiceInput, checkVoiceInputStatus } from './voice-input.js'
18
+ // const { text } = await getVoiceInput() // record + transcribe
19
+ // const status = await checkVoiceInputStatus() // check readiness
20
+ import { execSync, spawn } from 'node:child_process';
21
+ import { homedir } from 'node:os';
22
+ import { join } from 'node:path';
23
+ import { existsSync, unlinkSync, mkdirSync, statSync } from 'node:fs';
24
+ // ---------------------------------------------------------------------------
25
+ // Constants
26
+ // ---------------------------------------------------------------------------
27
/** Per-user kbot state directory (`.kbot` inside the home directory). */
const KBOT_DIR = join(homedir(), '.kbot');
/** Scratch directory for temporary voice recordings. */
const VOICE_TMP_DIR = join(KBOT_DIR, 'voice-tmp');
/** Default Ollama base URL: the OLLAMA_HOST environment variable when set, else localhost. */
const OLLAMA_HOST = process.env.OLLAMA_HOST || 'http://localhost:11434';
30
+ // ---------------------------------------------------------------------------
31
+ // Platform detection helpers
32
+ // ---------------------------------------------------------------------------
33
/**
 * Report whether `cmd` can be resolved by `which`.
 *
 * @param cmd - Command name to look up.
 * @returns True when `which` exits successfully, false on any failure.
 */
function commandExists(cmd) {
    let found = true;
    try {
        execSync(`which ${cmd}`, { stdio: 'ignore' });
    }
    catch {
        // Non-zero exit (or failure to run `which`) means the command is not usable.
        found = false;
    }
    return found;
}
42
+ function detectRecorder() {
43
+ if (commandExists('rec'))
44
+ return 'rec';
45
+ if (commandExists('arecord'))
46
+ return 'arecord';
47
+ return 'none';
48
+ }
49
+ function getWhisperCliPath() {
50
+ // Check common binary names in priority order
51
+ for (const cmd of ['whisper', 'whisper.cpp', 'whisper-cpp']) {
52
+ if (commandExists(cmd))
53
+ return cmd;
54
+ }
55
+ return null;
56
+ }
57
/**
 * Probe an Ollama server by requesting `${host}/api/tags` with a 3 second timeout.
 *
 * @param host - Base URL of the Ollama server.
 * @returns True when the response status is OK; false on any error or timeout.
 */
async function isOllamaReachable(host) {
    let reachable = false;
    try {
        const { ok } = await fetch(`${host}/api/tags`, {
            signal: AbortSignal.timeout(3000),
        });
        reachable = ok;
    }
    catch {
        // Network failure, timeout or invalid host: leave `reachable` false.
    }
    return reachable;
}
68
/**
 * Check whether the Ollama server lists a model whose name contains
 * `modelName` (case-insensitive).
 *
 * @param host - Base URL of the Ollama server.
 * @param modelName - Substring to look for in the listed model names.
 * @returns True when a matching model is listed; false otherwise, including
 *   on request failure, timeout (3 s) or an unexpected response body.
 */
async function ollamaHasWhisperModel(host, modelName) {
    try {
        const response = await fetch(`${host}/api/tags`, {
            signal: AbortSignal.timeout(3000),
        });
        if (response.ok) {
            const { models } = await response.json();
            if (models) {
                const needle = modelName.toLowerCase();
                return models.some((entry) => entry.name.toLowerCase().includes(needle));
            }
        }
    }
    catch {
        // Fall through to the negative answer below.
    }
    return false;
}
84
/**
 * Choose the transcription backend: the whisper CLI wins over Ollama.
 *
 * @param whisperCli - Whisper command name, or a falsy value when none was found.
 * @param ollamaWhisper - Whether Ollama offers a whisper model.
 * @returns 'whisper-cli', 'ollama', or 'none'.
 */
function detectTranscriber(whisperCli, ollamaWhisper) {
    return whisperCli ? 'whisper-cli' : ollamaWhisper ? 'ollama' : 'none';
}
91
+ // ---------------------------------------------------------------------------
92
+ // Audio recording
93
+ // ---------------------------------------------------------------------------
94
/** Create the voice scratch directory (and any missing parents) unless it already exists. */
function ensureVoiceTmpDir() {
    if (existsSync(VOICE_TMP_DIR)) {
        return;
    }
    mkdirSync(VOICE_TMP_DIR, { recursive: true });
}
99
/**
 * Build a fresh path for a temporary recording inside the voice scratch
 * directory, creating that directory first if needed.
 *
 * @returns A path of the form `<dir>/voice-<timestamp>-<random>.wav`.
 */
function generateTmpPath() {
    ensureVoiceTmpDir();
    const stamp = Date.now();
    const suffix = Math.random().toString(36).slice(2, 8);
    return join(VOICE_TMP_DIR, `voice-${stamp}-${suffix}.wav`);
}
104
/**
 * Delete `path` if it exists. Best effort: any failure is swallowed.
 *
 * @param path - File to remove.
 */
function cleanupFile(path) {
    try {
        if (!existsSync(path)) {
            return;
        }
        unlinkSync(path);
    }
    catch {
        // best effort cleanup
    }
}
113
/**
 * Record audio from the microphone into `outputPath` at 16 kHz, mono.
 *
 * @param outputPath - File the recorder writes to.
 * @param recorder - 'rec' (sox, stops on silence), 'arecord' (fixed duration) or 'none'.
 * @param maxSeconds - Upper bound on the recording length, in seconds.
 * @param silenceThreshold - Silence threshold percentage; only used by `rec`.
 * @returns Promise resolving to true when the output file exists and is larger
 *   than a bare 44-byte WAV header, and to false otherwise (no recorder,
 *   invalid duration, recorder failed to start, or nothing was captured).
 *   The promise never rejects.
 */
async function recordAudio(outputPath, recorder, maxSeconds, silenceThreshold) {
    if (recorder === 'none')
        return false;
    // Coerce before doing arithmetic: a string such as "20" would otherwise be
    // concatenated ("20" + 5) when computing the safety timeout below.
    const limitSeconds = Number(maxSeconds);
    if (!Number.isFinite(limitSeconds) || limitSeconds <= 0)
        return false;
    return new Promise((resolve) => {
        const stdio = ['ignore', 'ignore', 'ignore'];
        let proc;
        if (recorder === 'rec') {
            // sox rec: 16kHz mono WAV, auto-stop on silence after speech
            // silence 1 0.1 <threshold>% = start recording after sound above threshold
            // silence 1 2.0 <threshold>% = stop recording after 2s silence below threshold
            proc = spawn('rec', [
                outputPath,
                'rate', '16k',
                'channels', '1',
                'silence', '1', '0.1', `${silenceThreshold}%`,
                '1', '2.0', `${silenceThreshold}%`,
                'trim', '0', String(limitSeconds),
            ], { stdio });
        }
        else {
            // arecord: fixed-duration recording at 16kHz mono
            proc = spawn('arecord', [
                '-f', 'S16_LE',
                '-r', '16000',
                '-c', '1',
                '-d', String(limitSeconds),
                outputPath,
            ], { stdio });
        }
        // Safety timeout — kill if recording hangs
        const timeout = setTimeout(() => {
            proc.kill('SIGTERM');
        }, (limitSeconds + 5) * 1000);
        // 'error' and 'close' can both fire for one process; settle only once.
        let settled = false;
        const finish = (captured) => {
            if (settled)
                return;
            settled = true;
            clearTimeout(timeout);
            resolve(captured);
        };
        proc.on('close', () => {
            // Verify the file exists and has content (not just a header)
            try {
                // WAV header is 44 bytes; need actual audio data
                finish(existsSync(outputPath) && statSync(outputPath).size > 44);
            }
            catch {
                finish(false);
            }
        });
        proc.on('error', () => finish(false));
    });
}
175
+ // ---------------------------------------------------------------------------
176
+ // Transcription — whisper.cpp CLI
177
+ // ---------------------------------------------------------------------------
178
/**
 * Transcribe an audio file by running a whisper command-line binary and
 * returning its trimmed standard output.
 *
 * The command is executed through a shell, so `model` and `language` are
 * restricted to plain tokens and the audio path is single-quoted.
 *
 * @param audioPath - Path of the audio file to transcribe.
 * @param whisperCmd - Whisper command to run.
 * @param model - Model name passed as `--model`.
 * @param language - Language passed as `--language`.
 * @returns The command's standard output, trimmed.
 * @throws Error when `model` or `language` contains characters other than
 *   letters, digits, `.`, `_` or `-`, and whatever `execSync` throws when the
 *   command fails or exceeds the 120 second timeout.
 */
function transcribeWithWhisperCli(audioPath, whisperCmd, model, language) {
    // Reject anything that could be interpreted by the shell.
    const safeToken = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;
    if (!safeToken.test(String(model))) {
        throw new Error(`Invalid whisper model: ${model}`);
    }
    if (!safeToken.test(String(language))) {
        throw new Error(`Invalid transcription language: ${language}`);
    }
    // Single quotes keep spaces, `$`, backticks and double quotes literal;
    // an embedded single quote is closed, escaped and reopened.
    const quotedPath = `'${String(audioPath).replace(/'/g, "'\\''")}'`;
    // NOTE(review): assumes the CLI accepts these flags and prints the
    // transcript to stdout with --output_format txt — confirm per binary.
    const output = execSync(`${whisperCmd} ${quotedPath} --model ${model} --language ${language} --output_format txt 2>/dev/null`, { encoding: 'utf-8', timeout: 120_000 }).trim();
    return output;
}
184
+ // ---------------------------------------------------------------------------
185
+ // Transcription — Ollama (whisper-compatible audio model)
186
+ // ---------------------------------------------------------------------------
187
+ async function transcribeWithOllama(audioPath, host, modelName) {
188
+ // Ollama doesn't have a native whisper endpoint as of 2026-03.
189
+ // But some audio-capable models can transcribe when given a base64-encoded
190
+ // audio file as an "image" (multimodal input). This is the pattern used by
191
+ // models like whisper variants on Ollama.
192
+ //
193
+ // If the model supports the /api/generate endpoint with images, we send
194
+ // the audio as a base64 payload. If not, we fall back to asking the model
195
+ // to transcribe (text-only, which won't work for actual audio).
196
+ const { readFileSync } = await import('node:fs');
197
+ const audioBytes = readFileSync(audioPath);
198
+ const audioBase64 = audioBytes.toString('base64');
199
+ const res = await fetch(`${host}/api/generate`, {
200
+ method: 'POST',
201
+ headers: { 'Content-Type': 'application/json' },
202
+ body: JSON.stringify({
203
+ model: modelName,
204
+ prompt: 'Transcribe this audio to text. Return only the transcription, no commentary.',
205
+ images: [audioBase64],
206
+ stream: false,
207
+ }),
208
+ signal: AbortSignal.timeout(120_000),
209
+ });
210
+ if (!res.ok) {
211
+ const err = await res.text().catch(() => `HTTP ${res.status}`);
212
+ throw new Error(`Ollama transcription failed: ${err}`);
213
+ }
214
+ const data = await res.json();
215
+ return (data.response || '').trim();
216
+ }
217
+ // ---------------------------------------------------------------------------
218
+ // Public API
219
+ // ---------------------------------------------------------------------------
220
+ /**
221
+ * Check voice input system status — microphone, transcription engine, models.
222
+ * Call this to diagnose issues before recording.
223
+ */
224
+ export async function checkVoiceInputStatus(options) {
225
+ const host = options?.ollamaHost || OLLAMA_HOST;
226
+ const whisperModel = options?.ollamaWhisperModel || 'whisper';
227
+ const recorder = detectRecorder();
228
+ const whisperCli = getWhisperCliPath();
229
+ const ollamaReachable = await isOllamaReachable(host);
230
+ const ollamaHasWhisper = ollamaReachable
231
+ ? await ollamaHasWhisperModel(host, whisperModel)
232
+ : false;
233
+ const transcriber = detectTranscriber(whisperCli, ollamaHasWhisper);
234
+ const issues = [];
235
+ const suggestions = [];
236
+ // Check recorder
237
+ if (recorder === 'none') {
238
+ issues.push('No audio recorder found (need `rec` from sox or `arecord` from ALSA)');
239
+ if (process.platform === 'darwin') {
240
+ suggestions.push('Install sox: brew install sox');
241
+ }
242
+ else {
243
+ suggestions.push('Install sox: sudo apt install sox OR sudo apt install alsa-utils');
244
+ }
245
+ }
246
+ // Check transcriber
247
+ if (transcriber === 'none') {
248
+ issues.push('No transcription engine found');
249
+ suggestions.push('Install whisper.cpp: brew install whisper-cpp (macOS)');
250
+ suggestions.push('Or pull a whisper model in Ollama: ollama pull whisper');
251
+ suggestions.push('Or install openai-whisper: pip install openai-whisper');
252
+ }
253
+ // Ollama status
254
+ if (!ollamaReachable) {
255
+ suggestions.push(`Ollama not reachable at ${host}. Start it: ollama serve`);
256
+ }
257
+ else if (!ollamaHasWhisper) {
258
+ suggestions.push(`Ollama running but no whisper model found. Pull one: ollama pull whisper`);
259
+ }
260
+ const available = recorder !== 'none' && transcriber !== 'none';
261
+ return {
262
+ available,
263
+ recorder,
264
+ transcriber,
265
+ whisperCliPath: whisperCli,
266
+ ollamaReachable,
267
+ ollamaHasWhisper,
268
+ issues,
269
+ suggestions,
270
+ };
271
+ }
272
+ /**
273
+ * Record audio from the microphone and transcribe it locally.
274
+ * Returns the transcribed text.
275
+ *
276
+ * This is the main entry point — call this for push-to-talk.
277
+ *
278
+ * @throws Error if no recorder or transcriber is available
279
+ */
280
+ export async function getVoiceInput(options) {
281
+ const model = options?.model ?? 'base';
282
+ const language = options?.language ?? 'en';
283
+ const maxRecordSeconds = options?.maxRecordSeconds ?? 15;
284
+ const silenceThreshold = options?.silenceThreshold ?? '1.5';
285
+ const ollamaHost = options?.ollamaHost ?? OLLAMA_HOST;
286
+ const ollamaWhisperModel = options?.ollamaWhisperModel ?? 'whisper';
287
+ // Detect available backends
288
+ const recorder = detectRecorder();
289
+ if (recorder === 'none') {
290
+ throw new Error('No audio recorder found. Install sox (brew install sox) or alsa-utils (sudo apt install alsa-utils).');
291
+ }
292
+ const whisperCli = getWhisperCliPath();
293
+ const ollamaReachable = await isOllamaReachable(ollamaHost);
294
+ const ollamaWhisper = ollamaReachable
295
+ ? await ollamaHasWhisperModel(ollamaHost, ollamaWhisperModel)
296
+ : false;
297
+ const transcriber = detectTranscriber(whisperCli, ollamaWhisper);
298
+ if (transcriber === 'none') {
299
+ throw new Error('No transcription engine available.\n' +
300
+ 'Install one of:\n' +
301
+ ' - whisper.cpp: brew install whisper-cpp\n' +
302
+ ' - openai-whisper: pip install openai-whisper\n' +
303
+ ' - Ollama whisper: ollama pull whisper');
304
+ }
305
+ // Record
306
+ const audioPath = generateTmpPath();
307
+ const startTime = Date.now();
308
+ const recorded = await recordAudio(audioPath, recorder, maxRecordSeconds, silenceThreshold);
309
+ if (!recorded) {
310
+ cleanupFile(audioPath);
311
+ throw new Error('Recording failed — no audio captured. Check microphone permissions and that the mic is connected.');
312
+ }
313
+ // Transcribe
314
+ let text = '';
315
+ let source = transcriber;
316
+ try {
317
+ if (transcriber === 'whisper-cli' && whisperCli) {
318
+ text = transcribeWithWhisperCli(audioPath, whisperCli, model, language);
319
+ }
320
+ else if (transcriber === 'ollama') {
321
+ text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
322
+ }
323
+ }
324
+ catch (err) {
325
+ // If primary transcriber fails, try fallback
326
+ if (transcriber === 'whisper-cli' && ollamaWhisper) {
327
+ try {
328
+ text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
329
+ source = 'ollama';
330
+ }
331
+ catch {
332
+ cleanupFile(audioPath);
333
+ throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
334
+ }
335
+ }
336
+ else {
337
+ cleanupFile(audioPath);
338
+ throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
339
+ }
340
+ }
341
+ const durationMs = Date.now() - startTime;
342
+ // Clean up the temp audio file
343
+ cleanupFile(audioPath);
344
+ if (!text) {
345
+ throw new Error('Transcription returned empty text — microphone may not have captured speech.');
346
+ }
347
+ return {
348
+ text: text.trim(),
349
+ source,
350
+ durationMs,
351
+ audioFile: null, // cleaned up
352
+ };
353
+ }
354
+ /**
355
+ * Quick check: can voice input work right now?
356
+ * Returns true if both a recorder and transcriber are available.
357
+ */
358
+ export async function isVoiceInputAvailable(options) {
359
+ const status = await checkVoiceInputStatus(options);
360
+ return status.available;
361
+ }
362
+ //# sourceMappingURL=voice-input.js.map
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@kernel.chat/kbot",
3
- "version": "3.63.0",
4
- "description": "Open-source terminal AI agent. 676+ tools, 35 agents, 20 providers. Fully local, fully sovereign. MIT.",
3
+ "version": "3.64.0",
4
+ "description": "Open-source terminal AI agent. 686+ tools, 35 agents, 20 providers. Fully local, fully sovereign. MIT.",
5
5
  "type": "module",
6
6
  "repository": {
7
7
  "type": "git",