@kernel.chat/kbot 3.63.0 → 3.64.0
This diff compares the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- package/dist/agent.js +11 -0
- package/dist/buddy.d.ts +33 -0
- package/dist/buddy.js +468 -0
- package/dist/memory-scanner.d.ts +60 -0
- package/dist/memory-scanner.js +461 -0
- package/dist/tools/buddy-tools.d.ts +2 -0
- package/dist/tools/buddy-tools.js +63 -0
- package/dist/tools/index.js +3 -0
- package/dist/tools/memory-scanner-tools.d.ts +2 -0
- package/dist/tools/memory-scanner-tools.js +87 -0
- package/dist/tools/voice-input-tools.d.ts +2 -0
- package/dist/tools/voice-input-tools.js +132 -0
- package/dist/voice-input.d.ts +53 -0
- package/dist/voice-input.js +362 -0
- package/package.json +2 -2
package/dist/tools/voice-input-tools.js
ADDED
@@ -0,0 +1,132 @@
+// kbot Voice Input Tools — Agent-accessible push-to-talk transcription
+//
+// Exposes local voice input to kbot's tool system so agents can:
+// - Listen and transcribe speech via push-to-talk
+// - Check voice input availability (mic, whisper model, recorder)
+//
+// Fully local — no cloud APIs. Uses whisper.cpp or Ollama whisper.
+import { registerTool } from './index.js';
+import { getVoiceInput, checkVoiceInputStatus } from '../voice-input.js';
+export function registerVoiceInputTools() {
+    // ── voice_listen ──
+    // Start listening and return transcribed text
+    registerTool({
+        name: 'voice_listen',
+        description: 'Listen via microphone and transcribe speech to text using local whisper. Push-to-talk: records until silence is detected (up to max duration), then transcribes locally at $0 cost. Requires sox (rec) and whisper.cpp or Ollama whisper model. Returns the transcribed text.',
+        parameters: {
+            model: {
+                type: 'string',
+                description: 'Whisper model size: tiny, base, small, medium, large (default: base). Larger = more accurate but slower.',
+                required: false,
+                default: 'base',
+            },
+            language: {
+                type: 'string',
+                description: 'Language code for transcription (default: en). Examples: en, es, fr, de, ja, zh',
+                required: false,
+                default: 'en',
+            },
+            max_seconds: {
+                type: 'number',
+                description: 'Maximum recording duration in seconds (default: 15). Recording auto-stops on silence.',
+                required: false,
+                default: 15,
+            },
+            silence_threshold: {
+                type: 'string',
+                description: 'Silence detection threshold as percentage for sox (default: 1.5). Lower = more sensitive.',
+                required: false,
+                default: '1.5',
+            },
+        },
+        tier: 'free',
+        timeout: 180_000, // 3 min — recording (up to 15s) + transcription (up to 2 min)
+        execute: async (args) => {
+            const model = args.model || 'base';
+            const language = args.language || 'en';
+            const maxSeconds = args.max_seconds || 15;
+            const silenceThreshold = args.silence_threshold || '1.5';
+            // Validate model
+            const validModels = ['tiny', 'base', 'small', 'medium', 'large'];
+            if (!validModels.includes(model)) {
+                return `Error: invalid model "${model}". Choose from: ${validModels.join(', ')}`;
+            }
+            try {
+                const result = await getVoiceInput({
+                    model: model,
+                    language,
+                    maxRecordSeconds: maxSeconds,
+                    silenceThreshold,
+                });
+                const lines = [
+                    `Transcription: ${result.text}`,
+                    '',
+                    `  Backend: ${result.source}`,
+                    `  Duration: ${result.durationMs}ms`,
+                    `  Language: ${language}`,
+                    `  Model: ${model}`,
+                ];
+                return lines.join('\n');
+            }
+            catch (err) {
+                const message = err instanceof Error ? err.message : String(err);
+                // If it's a setup issue, include helpful guidance
+                if (message.includes('No audio recorder') || message.includes('No transcription engine')) {
+                    const status = await checkVoiceInputStatus();
+                    const guidance = [
+                        `Error: ${message}`,
+                        '',
+                        'Setup suggestions:',
+                        ...status.suggestions.map(s => `  - ${s}`),
+                    ];
+                    return guidance.join('\n');
+                }
+                return `Error: ${message}`;
+            }
+        },
+    });
+    // ── voice_status ──
+    // Check if voice input is available (mic permissions, whisper model)
+    registerTool({
+        name: 'voice_status',
+        description: 'Check voice input readiness — reports whether a microphone recorder (sox/arecord) and transcription engine (whisper.cpp/Ollama whisper) are available. Lists any issues and installation suggestions. Call this before voice_listen to diagnose problems.',
+        parameters: {},
+        tier: 'free',
+        execute: async () => {
+            const status = await checkVoiceInputStatus();
+            const lines = [
+                'Voice Input Status',
+                '══════════════════',
+                `Available: ${status.available ? 'YES' : 'NO'}`,
+                '',
+                'Recorder:',
+                `  Backend: ${status.recorder === 'none' ? 'NOT FOUND' : status.recorder}`,
+                ...(status.recorder === 'rec' ? ['  (sox rec — silence detection, auto-stop)'] : []),
+                ...(status.recorder === 'arecord' ? ['  (ALSA arecord — fixed duration)'] : []),
+                '',
+                'Transcription:',
+                `  Backend: ${status.transcriber === 'none' ? 'NOT FOUND' : status.transcriber}`,
+                ...(status.whisperCliPath ? [`  Whisper CLI: ${status.whisperCliPath}`] : []),
+                `  Ollama reachable: ${status.ollamaReachable ? 'yes' : 'no'}`,
+                `  Ollama whisper model: ${status.ollamaHasWhisper ? 'yes' : 'no'}`,
+            ];
+            if (status.issues.length > 0) {
+                lines.push('', 'Issues:');
+                for (const issue of status.issues) {
+                    lines.push(`  ! ${issue}`);
+                }
+            }
+            if (status.suggestions.length > 0) {
+                lines.push('', 'Suggestions:');
+                for (const suggestion of status.suggestions) {
+                    lines.push(`  - ${suggestion}`);
+                }
+            }
+            if (status.available) {
+                lines.push('', 'Ready to use. Call voice_listen to start recording.');
+            }
+            return lines.join('\n');
+        },
+    });
+} // end registerVoiceInputTools
+//# sourceMappingURL=voice-input-tools.js.map
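For context on how these registrations become callable: below is a minimal sketch of a registry that would satisfy the `registerTool()` calls above. It is illustrative only. kbot's real registry lives in `package/dist/tools/index.js`, which this diff does not display, so the `ToolSpec` shape and the `registry` map are assumptions inferred from the call sites.

```ts
// Hypothetical registry sketch — NOT kbot's actual tools/index.js.
type ToolArgs = Record<string, unknown>;

interface ToolSpec {
    name: string;
    description: string;
    parameters: Record<string, unknown>;
    tier: string;
    timeout?: number;
    // voice_listen takes args; voice_status ignores them (fewer params is fine in TS)
    execute: (args: ToolArgs) => Promise<string>;
}

const registry = new Map<string, ToolSpec>();

export function registerTool(spec: ToolSpec): void {
    registry.set(spec.name, spec);
}

// Once registerVoiceInputTools() has run, an agent loop could dispatch:
//   const tool = registry.get('voice_listen');
//   const reply = await tool?.execute({ model: 'small', max_seconds: 10 });
```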
package/dist/voice-input.d.ts
ADDED
@@ -0,0 +1,53 @@
+export type WhisperModel = 'tiny' | 'base' | 'small' | 'medium' | 'large';
+export type TranscriptionBackend = 'whisper-cli' | 'ollama' | 'none';
+export type RecorderBackend = 'rec' | 'arecord' | 'none';
+export interface VoiceInputOptions {
+    /** Whisper model size (default: 'base') */
+    model?: WhisperModel;
+    /** Language code for transcription (default: 'en') */
+    language?: string;
+    /** Max recording duration in seconds (default: 15) */
+    maxRecordSeconds?: number;
+    /** Silence threshold for sox-based recording stop, as a percentage (default: '1.5') */
+    silenceThreshold?: string;
+    /** Ollama host URL (default: OLLAMA_HOST env or 'http://localhost:11434') */
+    ollamaHost?: string;
+    /** Ollama whisper model name (default: 'whisper') */
+    ollamaWhisperModel?: string;
+}
+export interface VoiceInputStatus {
+    available: boolean;
+    recorder: RecorderBackend;
+    transcriber: TranscriptionBackend;
+    whisperCliPath: string | null;
+    ollamaReachable: boolean;
+    ollamaHasWhisper: boolean;
+    issues: string[];
+    suggestions: string[];
+}
+export interface TranscriptionResult {
+    text: string;
+    source: TranscriptionBackend;
+    durationMs: number;
+    audioFile: string | null;
+}
+/**
+ * Check voice input system status — microphone, transcription engine, models.
+ * Call this to diagnose issues before recording.
+ */
+export declare function checkVoiceInputStatus(options?: Pick<VoiceInputOptions, 'ollamaHost' | 'ollamaWhisperModel'>): Promise<VoiceInputStatus>;
+/**
+ * Record audio from the microphone and transcribe it locally.
+ * Returns the transcription result (text plus backend and timing metadata).
+ *
+ * This is the main entry point — call this for push-to-talk.
+ *
+ * @throws Error if no recorder or transcriber is available
+ */
+export declare function getVoiceInput(options?: VoiceInputOptions): Promise<TranscriptionResult>;
+/**
+ * Quick check: can voice input work right now?
+ * Returns true if both a recorder and transcriber are available.
+ */
+export declare function isVoiceInputAvailable(options?: Pick<VoiceInputOptions, 'ollamaHost' | 'ollamaWhisperModel'>): Promise<boolean>;
+//# sourceMappingURL=voice-input.d.ts.map
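The declarations above are the module's whole consumer-facing surface. Here is a minimal usage sketch, assuming a caller that sits next to the dist file (hence the relative import); `dictate` is a hypothetical helper, not part of the package:

```ts
import { checkVoiceInputStatus, getVoiceInput, isVoiceInputAvailable } from './voice-input.js';

async function dictate(): Promise<string | null> {
    // Probe the environment first: recorder (sox/arecord) + transcriber (whisper CLI/Ollama).
    if (!(await isVoiceInputAvailable())) {
        const status = await checkVoiceInputStatus();
        console.error('Voice input unavailable:', status.issues.join('; '));
        status.suggestions.forEach(s => console.error('  -', s));
        return null;
    }
    // Record (sox auto-stops ~2s after silence; arecord runs the full duration), then transcribe.
    const result = await getVoiceInput({ model: 'small', language: 'en', maxRecordSeconds: 10 });
    console.log(`[${result.source}] ${result.durationMs}ms: ${result.text}`);
    return result.text;
}
```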
package/dist/voice-input.js
ADDED
@@ -0,0 +1,362 @@
+// kbot Voice Input — Local-first speech-to-text foundation
+//
+// Push-to-talk flow: start recording → transcribe locally → return text
+//
+// Transcription backends (priority order):
+//   1. whisper.cpp / openai-whisper CLI binary (fully local, $0)
+//   2. Ollama with a whisper-compatible model (fully local, $0)
+//   3. Falls back to text input if neither is available
+//
+// Recording backends:
+//   - macOS: `rec` (sox) — 16kHz mono WAV with silence detection
+//   - Linux: `arecord` (ALSA) — 16kHz mono WAV with fixed duration
+//
+// No cloud APIs. No subscriptions. BYOK philosophy.
+//
+// Usage:
+//   import { getVoiceInput, checkVoiceInputStatus } from './voice-input.js'
+//   const { text } = await getVoiceInput()        // record + transcribe
+//   const status = await checkVoiceInputStatus()  // check readiness
+import { execSync, spawn } from 'node:child_process';
+import { homedir } from 'node:os';
+import { join } from 'node:path';
+import { existsSync, unlinkSync, mkdirSync, statSync } from 'node:fs';
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+const KBOT_DIR = join(homedir(), '.kbot');
+const VOICE_TMP_DIR = join(KBOT_DIR, 'voice-tmp');
+const OLLAMA_HOST = process.env.OLLAMA_HOST || 'http://localhost:11434';
+// ---------------------------------------------------------------------------
+// Platform detection helpers
+// ---------------------------------------------------------------------------
+function commandExists(cmd) {
+    try {
+        execSync(`which ${cmd}`, { stdio: 'ignore' });
+        return true;
+    }
+    catch {
+        return false;
+    }
+}
+function detectRecorder() {
+    if (commandExists('rec'))
+        return 'rec';
+    if (commandExists('arecord'))
+        return 'arecord';
+    return 'none';
+}
+function getWhisperCliPath() {
+    // Check common binary names in priority order
+    for (const cmd of ['whisper', 'whisper.cpp', 'whisper-cpp']) {
+        if (commandExists(cmd))
+            return cmd;
+    }
+    return null;
+}
+async function isOllamaReachable(host) {
+    try {
+        const res = await fetch(`${host}/api/tags`, {
+            signal: AbortSignal.timeout(3000),
+        });
+        return res.ok;
+    }
+    catch {
+        return false;
+    }
+}
+async function ollamaHasWhisperModel(host, modelName) {
+    try {
+        const res = await fetch(`${host}/api/tags`, {
+            signal: AbortSignal.timeout(3000),
+        });
+        if (!res.ok)
+            return false;
+        const data = await res.json();
+        if (!data.models)
+            return false;
+        return data.models.some(m => m.name.toLowerCase().includes(modelName.toLowerCase()));
+    }
+    catch {
+        return false;
+    }
+}
+function detectTranscriber(whisperCli, ollamaWhisper) {
+    if (whisperCli)
+        return 'whisper-cli';
+    if (ollamaWhisper)
+        return 'ollama';
+    return 'none';
+}
+// ---------------------------------------------------------------------------
+// Audio recording
+// ---------------------------------------------------------------------------
+function ensureVoiceTmpDir() {
+    if (!existsSync(VOICE_TMP_DIR)) {
+        mkdirSync(VOICE_TMP_DIR, { recursive: true });
+    }
+}
+function generateTmpPath() {
+    ensureVoiceTmpDir();
+    const id = `voice-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
+    return join(VOICE_TMP_DIR, `${id}.wav`);
+}
+function cleanupFile(path) {
+    try {
+        if (existsSync(path))
+            unlinkSync(path);
+    }
+    catch {
+        // best effort cleanup
+    }
+}
+/**
+ * Record audio from the microphone into outputPath.
+ * Resolves true if a WAV file with actual audio data was written, false otherwise.
+ */
+async function recordAudio(outputPath, recorder, maxSeconds, silenceThreshold) {
+    if (recorder === 'none')
+        return false;
+    return new Promise((resolve) => {
+        let proc;
+        if (recorder === 'rec') {
+            // sox rec: 16kHz mono WAV, auto-stop on silence after speech
+            // silence 1 0.1 <threshold>% = start recording after sound above threshold
+            // silence 1 2.0 <threshold>% = stop recording after 2s silence below threshold
+            proc = spawn('rec', [
+                outputPath,
+                'rate', '16k',
+                'channels', '1',
+                'silence', '1', '0.1', `${silenceThreshold}%`,
+                '1', '2.0', `${silenceThreshold}%`,
+                'trim', '0', String(maxSeconds),
+            ], {
+                stdio: ['ignore', 'ignore', 'ignore'],
+            });
+        }
+        else {
+            // arecord: fixed-duration recording at 16kHz mono
+            proc = spawn('arecord', [
+                '-f', 'S16_LE',
+                '-r', '16000',
+                '-c', '1',
+                '-d', String(maxSeconds),
+                outputPath,
+            ], {
+                stdio: ['ignore', 'ignore', 'ignore'],
+            });
+        }
+        // Safety timeout — kill if recording hangs
+        const timeout = setTimeout(() => {
+            proc.kill('SIGTERM');
+        }, (maxSeconds + 5) * 1000);
+        proc.on('close', () => {
+            clearTimeout(timeout);
+            // Verify the file exists and has content (not just a header)
+            if (existsSync(outputPath)) {
+                try {
+                    const stat = statSync(outputPath);
+                    resolve(stat.size > 44); // WAV header is 44 bytes; need actual audio data
+                }
+                catch {
+                    resolve(false);
+                }
+            }
+            else {
+                resolve(false);
+            }
+        });
+        proc.on('error', () => {
+            clearTimeout(timeout);
+            resolve(false);
+        });
+    });
+}
+// ---------------------------------------------------------------------------
+// Transcription — whisper.cpp CLI
+// ---------------------------------------------------------------------------
+function transcribeWithWhisperCli(audioPath, whisperCmd, model, language) {
+    // Both whisper.cpp and openai-whisper support --model and --language
+    // whisper.cpp outputs to stdout with --output_format txt
+    const output = execSync(`${whisperCmd} "${audioPath}" --model ${model} --language ${language} --output_format txt 2>/dev/null`, { encoding: 'utf-8', timeout: 120_000 }).trim();
+    return output;
+}
+// ---------------------------------------------------------------------------
+// Transcription — Ollama (whisper-compatible audio model)
+// ---------------------------------------------------------------------------
+async function transcribeWithOllama(audioPath, host, modelName) {
+    // Ollama doesn't have a native whisper endpoint as of 2026-03.
+    // But some audio-capable models can transcribe when given a base64-encoded
+    // audio file as an "image" (multimodal input). This is the pattern used by
+    // models like whisper variants on Ollama.
+    //
+    // If the model supports the /api/generate endpoint with images, we send
+    // the audio as a base64 payload. If not, we fall back to asking the model
+    // to transcribe (text-only, which won't work for actual audio).
+    const { readFileSync } = await import('node:fs');
+    const audioBytes = readFileSync(audioPath);
+    const audioBase64 = audioBytes.toString('base64');
+    const res = await fetch(`${host}/api/generate`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+            model: modelName,
+            prompt: 'Transcribe this audio to text. Return only the transcription, no commentary.',
+            images: [audioBase64],
+            stream: false,
+        }),
+        signal: AbortSignal.timeout(120_000),
+    });
+    if (!res.ok) {
+        const err = await res.text().catch(() => `HTTP ${res.status}`);
+        throw new Error(`Ollama transcription failed: ${err}`);
+    }
+    const data = await res.json();
+    return (data.response || '').trim();
+}
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Check voice input system status — microphone, transcription engine, models.
+ * Call this to diagnose issues before recording.
+ */
+export async function checkVoiceInputStatus(options) {
+    const host = options?.ollamaHost || OLLAMA_HOST;
+    const whisperModel = options?.ollamaWhisperModel || 'whisper';
+    const recorder = detectRecorder();
+    const whisperCli = getWhisperCliPath();
+    const ollamaReachable = await isOllamaReachable(host);
+    const ollamaHasWhisper = ollamaReachable
+        ? await ollamaHasWhisperModel(host, whisperModel)
+        : false;
+    const transcriber = detectTranscriber(whisperCli, ollamaHasWhisper);
+    const issues = [];
+    const suggestions = [];
+    // Check recorder
+    if (recorder === 'none') {
+        issues.push('No audio recorder found (need `rec` from sox or `arecord` from ALSA)');
+        if (process.platform === 'darwin') {
+            suggestions.push('Install sox: brew install sox');
+        }
+        else {
+            suggestions.push('Install sox: sudo apt install sox OR sudo apt install alsa-utils');
+        }
+    }
+    // Check transcriber
+    if (transcriber === 'none') {
+        issues.push('No transcription engine found');
+        suggestions.push('Install whisper.cpp: brew install whisper-cpp (macOS)');
+        suggestions.push('Or pull a whisper model in Ollama: ollama pull whisper');
+        suggestions.push('Or install openai-whisper: pip install openai-whisper');
+    }
+    // Ollama status
+    if (!ollamaReachable) {
+        suggestions.push(`Ollama not reachable at ${host}. Start it: ollama serve`);
+    }
+    else if (!ollamaHasWhisper) {
+        suggestions.push(`Ollama running but no whisper model found. Pull one: ollama pull whisper`);
+    }
+    const available = recorder !== 'none' && transcriber !== 'none';
+    return {
+        available,
+        recorder,
+        transcriber,
+        whisperCliPath: whisperCli,
+        ollamaReachable,
+        ollamaHasWhisper,
+        issues,
+        suggestions,
+    };
+}
+/**
+ * Record audio from the microphone and transcribe it locally.
+ * Returns the transcription result (text plus backend and timing metadata).
+ *
+ * This is the main entry point — call this for push-to-talk.
+ *
+ * @throws Error if no recorder or transcriber is available
+ */
+export async function getVoiceInput(options) {
+    const model = options?.model ?? 'base';
+    const language = options?.language ?? 'en';
+    const maxRecordSeconds = options?.maxRecordSeconds ?? 15;
+    const silenceThreshold = options?.silenceThreshold ?? '1.5';
+    const ollamaHost = options?.ollamaHost ?? OLLAMA_HOST;
+    const ollamaWhisperModel = options?.ollamaWhisperModel ?? 'whisper';
+    // Detect available backends
+    const recorder = detectRecorder();
+    if (recorder === 'none') {
+        throw new Error('No audio recorder found. Install sox (brew install sox) or alsa-utils (sudo apt install alsa-utils).');
+    }
+    const whisperCli = getWhisperCliPath();
+    const ollamaReachable = await isOllamaReachable(ollamaHost);
+    const ollamaWhisper = ollamaReachable
+        ? await ollamaHasWhisperModel(ollamaHost, ollamaWhisperModel)
+        : false;
+    const transcriber = detectTranscriber(whisperCli, ollamaWhisper);
+    if (transcriber === 'none') {
+        throw new Error('No transcription engine available.\n' +
+            'Install one of:\n' +
+            '  - whisper.cpp: brew install whisper-cpp\n' +
+            '  - openai-whisper: pip install openai-whisper\n' +
+            '  - Ollama whisper: ollama pull whisper');
+    }
+    // Record
+    const audioPath = generateTmpPath();
+    const startTime = Date.now();
+    const recorded = await recordAudio(audioPath, recorder, maxRecordSeconds, silenceThreshold);
+    if (!recorded) {
+        cleanupFile(audioPath);
+        throw new Error('Recording failed — no audio captured. Check microphone permissions and that the mic is connected.');
+    }
+    // Transcribe
+    let text = '';
+    let source = transcriber;
+    try {
+        if (transcriber === 'whisper-cli' && whisperCli) {
+            text = transcribeWithWhisperCli(audioPath, whisperCli, model, language);
+        }
+        else if (transcriber === 'ollama') {
+            text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
+        }
+    }
+    catch (err) {
+        // If primary transcriber fails, try fallback
+        if (transcriber === 'whisper-cli' && ollamaWhisper) {
+            try {
+                text = await transcribeWithOllama(audioPath, ollamaHost, ollamaWhisperModel);
+                source = 'ollama';
+            }
+            catch {
+                cleanupFile(audioPath);
+                throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
+            }
+        }
+        else {
+            cleanupFile(audioPath);
+            throw new Error(`Transcription failed: ${err instanceof Error ? err.message : String(err)}`);
+        }
+    }
+    const durationMs = Date.now() - startTime;
+    // Clean up the temp audio file
+    cleanupFile(audioPath);
+    if (!text) {
+        throw new Error('Transcription returned empty text — microphone may not have captured speech.');
+    }
+    return {
+        text: text.trim(),
+        source,
+        durationMs,
+        audioFile: null, // cleaned up
+    };
+}
+/**
+ * Quick check: can voice input work right now?
+ * Returns true if both a recorder and transcriber are available.
+ */
+export async function isVoiceInputAvailable(options) {
+    const status = await checkVoiceInputStatus(options);
+    return status.available;
+}
+//# sourceMappingURL=voice-input.js.map
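The sox arguments assembled in `recordAudio()` carry the push-to-talk behavior, so it is worth spelling out the equivalent invocation. A sketch with the default options (`maxSeconds = 15`, `silenceThreshold = '1.5'`); the output filename is illustrative:

```ts
// Equivalent shell command for the default 'rec' spawn in recordAudio():
//
//   rec out.wav rate 16k channels 1 silence 1 0.1 1.5% 1 2.0 1.5% trim 0 15
//
// Reading the sox effects chain:
//   rate 16k / channels 1  -> 16 kHz mono WAV (the sample format whisper expects)
//   silence 1 0.1 1.5%     -> start keeping audio once 0.1s rises above 1.5%
//   1 2.0 1.5%             -> stop once 2.0s stays below 1.5%
//   trim 0 15              -> hard cap at 15 seconds regardless of silence
const defaultRecArgs: string[] = [
    'out.wav', // illustrative; the real code writes to ~/.kbot/voice-tmp/voice-<id>.wav
    'rate', '16k',
    'channels', '1',
    'silence', '1', '0.1', '1.5%',
    '1', '2.0', '1.5%',
    'trim', '0', '15',
];
```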
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@kernel.chat/kbot",
-  "version": "3.63.0",
-  "description": "Open-source terminal AI agent.
+  "version": "3.64.0",
+  "description": "Open-source terminal AI agent. 686+ tools, 35 agents, 20 providers. Fully local, fully sovereign. MIT.",
   "type": "module",
   "repository": {
     "type": "git",