shmakk 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +23 -0
- package/LICENSE +21 -0
- package/README.md +138 -0
- package/bin/shmakk.js +2 -0
- package/docs/index.html +581 -0
- package/docs/voice.md +181 -0
- package/package.json +58 -0
- package/scripts/patch-onnxruntime.js +82 -0
- package/src/agent.js +0 -0
- package/src/audit.js +18 -0
- package/src/cli.js +177 -0
- package/src/completions.js +167 -0
- package/src/control.js +250 -0
- package/src/correction.js +159 -0
- package/src/endpoints.js +52 -0
- package/src/global-doctor.js +33 -0
- package/src/global-setup.js +62 -0
- package/src/glossary.js +235 -0
- package/src/history-parser.js +166 -0
- package/src/hooks/bash.js +43 -0
- package/src/hooks/fish.js +25 -0
- package/src/hooks/index.js +14 -0
- package/src/hooks/zsh.js +42 -0
- package/src/index.js +166 -0
- package/src/llm.js +45 -0
- package/src/markers.js +113 -0
- package/src/orchestrator.js +61 -0
- package/src/profiles.js +19 -0
- package/src/prompt-cache.js +83 -0
- package/src/pty.js +107 -0
- package/src/review.js +75 -0
- package/src/safety.js +77 -0
- package/src/services/stt.js +131 -0
- package/src/services/tts.js +307 -0
- package/src/services/voice.js +362 -0
- package/src/session.js +604 -0
- package/src/setup-voice.js +108 -0
- package/src/shell.js +32 -0
- package/src/skills.js +309 -0
- package/src/subagent.js +42 -0
- package/src/system-prompt.js +261 -0
- package/src/tools.js +386 -0
- package/src/web.js +228 -0
- package/src/workspace-index.js +213 -0
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
// Voice input service for shmakk.
|
|
2
|
+
// VAD-based recording: starts on speech, stops after silence.
|
|
3
|
+
// Transcribes via in-process Whisper ONNX. Supports TTS interrupt.
|
|
4
|
+
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const os = require('os');
|
|
8
|
+
const { spawn, execSync } = require('child_process');
|
|
9
|
+
|
|
10
|
+
/**
 * Read a non-negative number from an environment variable.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unset, empty,
 *   not numeric, or negative.
 * @returns {number} The parsed value, or `fallback`.
 */
function _envFloat(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw === '') return fallback;
  const value = Number.parseFloat(raw);
  // A NaN here would silently disable every `<` comparison made against the
  // setting and would be handed to sox as the literal string "NaN".
  return Number.isFinite(value) && value >= 0 ? value : fallback;
}

// Directory (under the OS temp dir) that holds short-lived recordings.
const AUDIO_DIR = path.join(os.tmpdir(), 'shmakk-voice');
// Default upper bound, in seconds, for a single recording.
const MAX_RECORD_SEC = 30;
// Values passed to the sox `silence` and `pad` effects when recording.
const SILENCE_SEC = _envFloat('SHMAKK_VOICE_SILENCE_SEC', 1.8);
const SILENCE_THRESHOLD = process.env.SHMAKK_VOICE_SILENCE_THRESHOLD || '2%';
const SILENCE_START_SEC = _envFloat('SHMAKK_VOICE_SILENCE_START_SEC', 0.15);
const PAD_START_SEC = _envFloat('SHMAKK_VOICE_PAD_START_SEC', 0.3);
// Post-recording RMS gate (0..1, on int16 normalized). Below this is treated
// as noise/silence and never sent to Whisper. Tunable for noisy rooms.
const MIN_RMS = _envFloat('SHMAKK_VOICE_MIN_RMS', 0.003);
// Minimum captured speech duration in seconds (anything shorter is noise).
const MIN_SPEECH_SEC = _envFloat('SHMAKK_VOICE_MIN_SPEECH_SEC', 0.8);
|
|
21
|
+
|
|
22
|
+
// Track active TTS playback process so we can kill it on interrupt
|
|
23
|
+
let _ttsProc = null;
|
|
24
|
+
let _ttsKilled = false;
|
|
25
|
+
function _setTtsProc(proc) { _ttsProc = proc; _ttsKilled = false; }
|
|
26
|
+
function _isTtsKilled() { return _ttsKilled; }
|
|
27
|
+
|
|
28
|
+
function _killTts() {
|
|
29
|
+
if (_ttsProc) {
|
|
30
|
+
try { _ttsProc.kill('SIGTERM'); } catch {}
|
|
31
|
+
_ttsProc = null;
|
|
32
|
+
}
|
|
33
|
+
_ttsKilled = true;
|
|
34
|
+
// Also cancel sentence streaming (avoids wasted generation)
|
|
35
|
+
try {
|
|
36
|
+
const tts = require('./tts');
|
|
37
|
+
tts.stopSpeaking();
|
|
38
|
+
} catch {}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// The recorder child process currently running, if any, so that it can be
// terminated from outside (e.g. on Ctrl+C).
let _recorderProc = null;

/**
 * Send SIGTERM to the tracked recorder process and forget it.
 * Does nothing when no recorder is tracked; kill errors are ignored.
 */
function _killRecorder() {
  if (!_recorderProc) return;
  try {
    _recorderProc.kill('SIGTERM');
  } catch {
    // the process may already have exited
  }
  _recorderProc = null;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Make sure the recordings directory (AUDIO_DIR) exists.
 *
 * `recursive: true` makes the call a no-op when the directory is already
 * there, so no separate existence check is needed. The directory holds
 * microphone captures and lives in the shared temp dir, so a newly created
 * one is restricted to the current user (mode 0o700). An existing directory
 * keeps whatever permissions it already has.
 *
 * @throws {Error} If the directory cannot be created (e.g. a regular file
 *   already occupies the path).
 */
function ensureAudioDir() {
  fs.mkdirSync(AUDIO_DIR, { recursive: true, mode: 0o700 });
}
|
|
54
|
+
|
|
55
|
+
// Supported recorders in priority order. Sox (`rec`, then `sox`) comes first
// because it supports VAD silence detection; ffmpeg and arecord are
// fixed-duration fallbacks.
const _RECORDER_CANDIDATES = [
  { cmd: 'rec', ext: '.wav', label: 'rec (Sox)', vad: true, useSoxInput: false },
  { cmd: 'sox', ext: '.wav', label: 'sox (Sox)', vad: true, useSoxInput: true },
  { cmd: 'ffmpeg', ext: '.wav', label: 'ffmpeg', vad: false },
  { cmd: 'arecord', ext: '.wav', label: 'arecord (ALSA)', vad: false },
];

/**
 * Choose the highest-priority recorder mentioned in `which` output.
 *
 * @param {string} whichOut - Newline-separated output of `which`; each line
 *   is either a bare command name or a path ending in `/<command>`.
 * @returns {{cmd: string, ext: string, label: string, vad: boolean, useSoxInput?: boolean}|null}
 *   A fresh descriptor object, or null when no candidate is listed.
 */
function _pickRecorder(whichOut) {
  const found = String(whichOut || '')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);
  for (const candidate of _RECORDER_CANDIDATES) {
    const present = found.some(
      (entry) => entry === candidate.cmd || entry.endsWith(`/${candidate.cmd}`),
    );
    if (present) return { ...candidate };
  }
  return null;
}

// Cached recorder detection — only runs once per process lifetime.
// `undefined` means "not probed yet"; `null` means "probed, nothing found".
let _cachedRecorder = undefined;

/**
 * Detect an available audio recorder.
 * Sox is preferred because it supports VAD silence detection.
 *
 * @returns {{cmd: string, ext: string, label: string, vad: boolean, useSoxInput?: boolean}|null}
 *   The recorder descriptor, or null if none was found. The result is cached.
 */
function detectRecorder() {
  if (_cachedRecorder !== undefined) return _cachedRecorder;

  // Use a single `which` call with all candidates for instant detection.
  let whichOut = '';
  try {
    whichOut = execSync('which rec sox ffmpeg arecord 2>/dev/null', {
      encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'], timeout: 1000,
    });
  } catch (err) {
    // `which` exits non-zero as soon as ANY of the names is missing, which
    // makes execSync throw even though the commands that do exist were
    // printed. Recover that output instead of reporting "nothing found".
    if (err && typeof err.stdout === 'string') whichOut = err.stdout;
  }

  _cachedRecorder = _pickRecorder(whichOut);
  return _cachedRecorder;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Build the command-line arguments for a recorder.
 *
 * Every recorder is asked for 16 kHz mono so the captured file has the same
 * format whichever tool produced it.
 *
 * @param {{cmd: string, vad?: boolean, useSoxInput?: boolean}} recorder
 * @param {string} outFile - Path of the WAV file to write.
 * @param {number} maxDurationSec - Hard cap on the recording length.
 * @returns {string[]|null} Argument list, or null for an unknown recorder.
 */
function _buildRecorderArgs(recorder, outFile, maxDurationSec) {
  if (recorder.vad) {
    // sox needs explicit input type when used directly (not via rec).
    const input = recorder.useSoxInput ? ['-t', 'pulseaudio', 'default'] : [];
    return [
      '-q',
      ...input,
      '-r', '16000', '-c', '1',
      '-t', 'wav', outFile,
      'silence', '1', String(SILENCE_START_SEC), SILENCE_THRESHOLD,
      '1', String(SILENCE_SEC), SILENCE_THRESHOLD,
      'pad', String(PAD_START_SEC), '0',
      'trim', '0', String(maxDurationSec),
    ];
  }
  if (recorder.cmd === 'ffmpeg') {
    return ['-y', '-f', 'pulse', '-i', 'default', '-ac', '1', '-ar', '16000',
      '-t', String(maxDurationSec), outFile];
  }
  if (recorder.cmd === 'arecord') {
    // Previously `-f cd` (16-bit, 44.1 kHz, stereo), unlike every other
    // recorder here; request 16-bit 16 kHz mono explicitly instead.
    return ['-q', '-f', 'S16_LE', '-r', '16000', '-c', '1',
      '-t', 'wav', '-d', String(maxDurationSec), outFile];
  }
  return null;
}

/**
 * Record audio — uses VAD silence detection if sox is available.
 * With sox: the `silence` effect trims leading quiet and stops after
 * SILENCE_SEC of quiet. Without sox: falls back to fixed-duration recording.
 *
 * The child process is tracked in `_recorderProc` while it runs, and is sent
 * SIGTERM if it is still alive 5 seconds after `maxDurationSec`.
 *
 * @param {{cmd: string, vad?: boolean, useSoxInput?: boolean}} recorder
 * @param {string} outFile - Path of the WAV file to write.
 * @param {{maxDurationSec?: number}} [options]
 * @returns {Promise<void>} Resolves when the recorder exits with an accepted
 *   status; rejects with an Error for an unknown recorder, a spawn failure,
 *   or any other exit code.
 */
function recordAudio(recorder, outFile, { maxDurationSec = MAX_RECORD_SEC } = {}) {
  return new Promise((resolve, reject) => {
    const args = _buildRecorderArgs(recorder, outFile, maxDurationSec);
    if (!args) {
      reject(new Error('Unknown recorder'));
      return;
    }

    const proc = spawn(recorder.cmd, args, { stdio: ['ignore', 'ignore', 'ignore'] });

    // Track for external kill (Ctrl+C)
    _recorderProc = proc;

    const timeout = setTimeout(() => { try { proc.kill('SIGTERM'); } catch {} },
      (maxDurationSec + 5) * 1000);

    // Shared teardown for both the 'exit' and 'error' paths.
    const release = () => {
      clearTimeout(timeout);
      if (_recorderProc === proc) _recorderProc = null;
    };

    proc.on('exit', (code) => {
      release();
      // sox exits 0 normally; ffmpeg exits 255 on SIGTERM; 143 = killed by
      // SIGTERM; null = Node reports no exit code when a signal ended the child.
      if (code === 0 || code === null || code === 255 || code === 143 || code === 141) resolve();
      else reject(new Error(`recorder exited ${code}`));
    });
    proc.on('error', (err) => { release(); reject(err); });
  });
}
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Transcribe an audio file using in-process Whisper ONNX.
|
|
171
|
+
*/
|
|
172
|
+
async function transcribeAudio(audioPath, opts = {}) {
|
|
173
|
+
const { transcribe } = require('./stt');
|
|
174
|
+
return transcribe(audioPath, { language: opts.language || 'english' });
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
// Utterances that mean "stop talking". recordAndTranscribe compares the
// lowercased transcript against this set; a match kills TTS and the
// transcript is discarded.
const STOP_WORDS = new Set(['stop', 'quiet', 'shut up', 'silence', 'enough', 'cancel']);

// Whisper hallucination patterns — common output on silence/non-speech.
// Each pattern is tested against the trimmed text with trailing punctuation
// removed (see filterHallucination).
const HALLUCINATION_PATTERNS = [
  /^you[.!?]*$/i,
  /^(bye|goodbye)\s*(you)?[.!?]*$/i,
  /^thank\s*you[.!?]*$/i,
  /^(thanks\s*for\s*watching|please\s*subscribe|subscribe)[.!?]*$/i,
  /^(the|a|an|um|uh|uhh|hmm|mhm|bye|goodbye|stop|quiet|go|okay|ok|yeah|yes|no)[.!?]*$/i,
  /^(i'?m\s+)?(sorry|fine)[.!?]*$/i,
  /^[.,;:!?]+$/,
  /^[\s.,;:!?]*$/,
  // Very short utterances that are mostly punctuation/symbols: 1-3 leading
  // symbols, at most two word characters, then only symbols. Unicode-aware:
  // with ASCII-only \W/\w every letter of a non-Latin script counted as a
  // "symbol", so whole sentences in such scripts were thrown away.
  /^[^\p{L}\p{N}]{1,3}[\p{L}\p{N}_]{0,2}[^\p{L}\p{N}]*$/u,
];

/**
 * Drop transcripts that look like Whisper hallucinations.
 *
 * @param {string} text - Raw transcript.
 * @returns {string} '' when the text matches a hallucination pattern or is at
 *   most two characters with no letter or digit; otherwise `text` unchanged.
 *   Falsy input is returned as-is.
 */
function filterHallucination(text) {
  if (!text) return text;
  const cleaned = text.trim().replace(/[.,!?;:]+$/, '').trim();
  if (HALLUCINATION_PATTERNS.some((re) => re.test(cleaned))) return '';
  // Single character or just non-alphanumeric (letters/digits of any script
  // count as alphanumeric).
  if (cleaned.length <= 2 && !/[\p{L}\p{N}]/u.test(cleaned)) return '';
  return text;
}
|
|
208
|
+
|
|
209
|
+
/**
 * High-level: record from microphone → transcribe → return text.
 * Kills any active TTS playback when recording starts (interrupt).
 *
 * @param {object} [options]
 * @param {string} [options.language] - Transcription language; 'english' when omitted.
 * @param {number} [options.maxDurationSec] - Recording cap; MAX_RECORD_SEC when omitted.
 * @param {Function} [options.onStart] - Called just before recording begins.
 * @param {Function} [options.onStop] - Called after recording finishes
 *   successfully (not called when recording fails).
 * @returns {Promise<string>} The accepted transcript, or '' when the capture
 *   was too quiet/short, was a stop word, or was filtered as a hallucination.
 * @throws {Error} When no recorder is installed, or recording/transcription fails.
 */
async function recordAndTranscribe({ language, maxDurationSec, onStart, onStop } = {}) {
  ensureAudioDir();
  const recorder = detectRecorder();
  if (!recorder) {
    throw new Error(
      'No audio recorder found. Install sox (recommended): sudo pacman -S sox'
    );
  }

  // Kill TTS so the AI stops talking when user starts speaking
  _killTts();

  const outFile = path.join(AUDIO_DIR, `voice-${Date.now()}.wav`);
  if (onStart) onStart();
  try {
    await recordAudio(recorder, outFile, { maxDurationSec: maxDurationSec || MAX_RECORD_SEC });
  } catch (err) {
    cleanupFile(outFile);
    throw err;
  }
  if (onStop) onStop();

  // Energy gate: if the captured audio is too quiet or too short, drop it
  // without invoking Whisper. Whisper invents text from near-silence
  // ("Bye you.", "Thank you for watching."), so we must filter at the
  // audio level, not just the text level.
  try {
    const { rms, durationSec } = audioStats(outFile);
    const tooQuiet = rms < MIN_RMS;
    if (tooQuiet || durationSec < MIN_SPEECH_SEC) {
      // Name the gate that actually fired so the tuning hint is not misleading.
      const reason = tooQuiet ? 'too quiet' : 'too short';
      const knob = tooQuiet ? 'SHMAKK_VOICE_MIN_RMS' : 'SHMAKK_VOICE_MIN_SPEECH_SEC';
      process.stderr.write(
        `\r\x1b[90m[voice] skip (${reason}): rms=${rms.toFixed(4)} dur=${durationSec.toFixed(2)}s — tune with ${knob}\x1b[0m\n`,
      );
      cleanupFile(outFile);
      return '';
    }
    if (process.env.SHMAKK_VOICE_DEBUG) {
      process.stderr.write(
        `\r\x1b[90m[voice] accept: rms=${rms.toFixed(4)} dur=${durationSec.toFixed(2)}s\x1b[0m\n`,
      );
    }
  } catch {
    // Diagnostics are best-effort; fall through and let Whisper try.
  }

  try {
    const text = await transcribeAudio(outFile, { language: language || 'english' });

    // Check for stop words on the RAW transcript — kill TTS and discard.
    // This has to happen before hallucination filtering: that filter also
    // erases "stop" and "quiet", which made those two stop words unreachable.
    // All trailing punctuation is stripped so "Stop!!" and "Stop..." match.
    const spoken = typeof text === 'string'
      ? text.toLowerCase().trim().replace(/[.,!?;:]+$/, '').trim()
      : '';
    if (spoken && STOP_WORDS.has(spoken)) {
      _killTts();
      process.stderr.write(`\r\x1b[33m🤫 stopped\x1b[0m\n`);
      return '';
    }

    // Filter common Whisper hallucinations (standalone "You", "Thank you", etc.)
    const filtered = filterHallucination(text);
    // Write transcript to stderr so it shows in terminal but isn't injected as input
    if (filtered) process.stderr.write(`\r\x1b[36m🎤 ${filtered}\x1b[0m\n`);
    return filtered;
  } finally {
    cleanupFile(outFile);
  }
}
|
|
268
|
+
|
|
269
|
+
/**
|
|
270
|
+
* Compute audio stats from a captured WAV — used to drop near-silent
|
|
271
|
+
* recordings before they reach Whisper (which hallucinates on silence).
|
|
272
|
+
* Returns { rms, durationSec } where rms is in [0,1] over int16-normalized samples.
|
|
273
|
+
*/
|
|
274
|
+
function audioStats(wavPath) {
|
|
275
|
+
try {
|
|
276
|
+
const { WaveFile } = require('wavefile');
|
|
277
|
+
const wav = new WaveFile(fs.readFileSync(wavPath));
|
|
278
|
+
const sampleRate = wav.fmt.sampleRate || 16000;
|
|
279
|
+
// toBuffer/getSamples handles bit-depth normalization
|
|
280
|
+
const samples = wav.getSamples(true, Int16Array); // mono, int16
|
|
281
|
+
if (!samples || !samples.length) return { rms: 0, durationSec: 0 };
|
|
282
|
+
let sumSq = 0;
|
|
283
|
+
for (let i = 0; i < samples.length; i++) {
|
|
284
|
+
const v = samples[i] / 32768;
|
|
285
|
+
sumSq += v * v;
|
|
286
|
+
}
|
|
287
|
+
return {
|
|
288
|
+
rms: Math.sqrt(sumSq / samples.length),
|
|
289
|
+
durationSec: samples.length / sampleRate,
|
|
290
|
+
};
|
|
291
|
+
} catch {
|
|
292
|
+
return { rms: 1, durationSec: 999 }; // fail open — let Whisper try
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/**
 * Best-effort removal of a temporary file.
 * A missing file is not an error (`force: true`), and any other failure is
 * ignored.
 *
 * @param {string} p - Path of the file to delete.
 */
function cleanupFile(p) {
  const removal = { force: true };
  try {
    fs.rmSync(p, removal);
  } catch {
    // nothing useful to do if a temp file cannot be removed
  }
}
|
|
301
|
+
|
|
302
|
+
/**
 * Report whether detectRecorder() found a recorder on this system.
 *
 * @returns {boolean} false only when detectRecorder() returns null.
 */
function isAvailable() {
  const recorder = detectRecorder();
  return recorder !== null;
}
|
|
308
|
+
|
|
309
|
+
/**
 * Quick microphone test: make a short recording (2 second cap) and report
 * the size of the resulting file. The temporary file is always removed.
 *
 * @returns {Promise<{ok: boolean, recorder: string|null, fileSize: number|null, error: string|null}>}
 *   `ok` is false when no recorder exists, recording failed, or fewer than
 *   100 bytes were written.
 */
async function testMicrophone() {
  const recorder = detectRecorder();
  if (!recorder) {
    return {
      ok: false,
      recorder: null,
      fileSize: null,
      error:
        'No audio recorder found. Install sox, arecord (alsa-utils), or ffmpeg.',
    };
  }

  // Every remaining outcome reports the same recorder label.
  const report = (ok, fileSize, error) => ({
    ok,
    recorder: recorder.label,
    fileSize,
    error,
  });

  ensureAudioDir();
  const extension = recorder.ext || '.wav';
  const outFile = path.join(AUDIO_DIR, `mic-test-${Date.now()}${extension}`);

  try {
    await recordAudio(recorder, outFile, { maxDurationSec: 2 });
    const { size } = fs.statSync(outFile);
    // less than 100 bytes = probably silence/error
    if (size < 100) {
      return report(
        false,
        size,
        `Recorded only ${size} bytes — microphone may be muted or disconnected.`,
      );
    }
    return report(true, size, null);
  } catch (err) {
    return report(false, null, err.message);
  } finally {
    cleanupFile(outFile);
  }
}
|
|
346
|
+
|
|
347
|
+
module.exports = {
|
|
348
|
+
recordAndTranscribe,
|
|
349
|
+
transcribeAudio,
|
|
350
|
+
testMicrophone,
|
|
351
|
+
isAvailable,
|
|
352
|
+
detectRecorder,
|
|
353
|
+
MAX_RECORD_SEC,
|
|
354
|
+
_killTts,
|
|
355
|
+
_killRecorder,
|
|
356
|
+
_setTtsProc,
|
|
357
|
+
_isTtsKilled,
|
|
358
|
+
/** Preload STT model in background so first transcription is instant. */
|
|
359
|
+
preloadSTT() {
|
|
360
|
+
try { require('./stt')._ensureModel(); } catch {}
|
|
361
|
+
},
|
|
362
|
+
};
|