shmakk 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,362 @@
1
+ // Voice input service for shmakk.
2
+ // VAD-based recording: starts on speech, stops after silence.
3
+ // Transcribes via in-process Whisper ONNX. Supports TTS interrupt.
4
+
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+ const os = require('os');
8
+ const { spawn, execSync } = require('child_process');
9
+
10
/**
 * Read a floating-point tunable from the environment.
 *
 * Falls back to `fallback` when the variable is unset, empty, or does not
 * parse to a finite number. Without this guard a typo such as
 * SHMAKK_VOICE_MIN_RMS=abc becomes NaN, and every `x < NaN` comparison is
 * false — which silently turns the corresponding gate off.
 *
 * @param {string} name - environment variable name
 * @param {number} fallback - value used when the variable is missing/invalid
 * @returns {number}
 */
function _envFloat(name, fallback) {
  const parsed = parseFloat(process.env[name]);
  return Number.isFinite(parsed) ? parsed : fallback;
}

// Scratch directory for captured WAV files (inside the OS temp dir).
const AUDIO_DIR = path.join(os.tmpdir(), 'shmakk-voice');
// Hard upper bound on a single recording, in seconds.
const MAX_RECORD_SEC = 30;
// sox `silence` effect: seconds of quiet that end a recording.
const SILENCE_SEC = _envFloat('SHMAKK_VOICE_SILENCE_SEC', 1.8);
// sox `silence` effect threshold; passed to sox verbatim, so it stays a string.
const SILENCE_THRESHOLD = process.env.SHMAKK_VOICE_SILENCE_THRESHOLD || '2%';
// sox `silence` effect: seconds of sound above the threshold that start a recording.
const SILENCE_START_SEC = _envFloat('SHMAKK_VOICE_SILENCE_START_SEC', 0.15);
// sox `pad` effect: seconds of padding added at the start of the capture.
const PAD_START_SEC = _envFloat('SHMAKK_VOICE_PAD_START_SEC', 0.3);
// Post-recording RMS gate (0..1, on int16 normalized). Below this is treated
// as noise/silence and never sent to Whisper. Tunable for noisy rooms.
const MIN_RMS = _envFloat('SHMAKK_VOICE_MIN_RMS', 0.003);
// Minimum captured speech duration in seconds (anything shorter is noise).
const MIN_SPEECH_SEC = _envFloat('SHMAKK_VOICE_MIN_SPEECH_SEC', 0.8);
21
+
22
// Handle to the TTS playback process that is currently registered (if any),
// plus a flag that records whether playback has been interrupted since the
// last registration.
let _ttsProc = null;
let _ttsKilled = false;

/** Register `proc` as the active TTS playback process and clear the interrupt flag. */
function _setTtsProc(proc) {
  _ttsKilled = false;
  _ttsProc = proc;
}

/** @returns {boolean} true if _killTts() ran after the most recent _setTtsProc(). */
function _isTtsKilled() {
  return _ttsKilled;
}

/**
 * Interrupt TTS: send SIGTERM to the registered playback process (if any),
 * mark playback as killed, and ask the ./tts module to stop speaking.
 * Every step is best-effort — failures are ignored.
 */
function _killTts() {
  if (_ttsProc) {
    try {
      _ttsProc.kill('SIGTERM');
    } catch {
      // kill() failed; nothing more we can do here
    }
    _ttsProc = null;
  }

  // The flag is raised even when no process was registered.
  _ttsKilled = true;

  // Also cancel sentence streaming (avoids wasted generation)
  try {
    require('./tts').stopSpeaking();
  } catch {
    // ./tts missing or stopSpeaking() threw — ignore
  }
}
40
+
41
// The recorder child process currently running, kept so that it can be
// terminated from outside (e.g. on Ctrl+C).
let _recorderProc = null;

/** Send SIGTERM to the tracked recorder process, if there is one, and forget it. */
function _killRecorder() {
  if (!_recorderProc) return;
  try {
    _recorderProc.kill('SIGTERM');
  } catch {
    // kill() failed — ignore
  }
  _recorderProc = null;
}
50
+
51
/** Create AUDIO_DIR (including missing parents) unless the path already exists. */
function ensureAudioDir() {
  if (fs.existsSync(AUDIO_DIR)) return;
  fs.mkdirSync(AUDIO_DIR, { recursive: true });
}
54
+
55
/**
 * Detect an available audio recorder.
 * Sox is preferred because it supports VAD silence detection.
 *
 * Candidates are probed in priority order: rec, sox, ffmpeg, arecord.
 * The result is cached, so detection runs at most once per process lifetime.
 *
 * @returns {{cmd: string, ext: string, label: string, vad: boolean, useSoxInput?: boolean}|null}
 *   descriptor of the first candidate found, or null if none is installed.
 */
let _cachedRecorder = undefined;
function detectRecorder() {
  if (_cachedRecorder !== undefined) return _cachedRecorder;

  // Use a single `which` call with all candidates for instant detection
  let whichOut = '';
  try {
    whichOut = execSync('which rec sox ffmpeg arecord 2>/dev/null', {
      encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'], timeout: 1000,
    });
  } catch (err) {
    // `which` exits non-zero as soon as ANY candidate is missing, and execSync
    // throws on a non-zero exit. The paths that WERE found are still on the
    // error's stdout — use them instead of reporting "nothing installed".
    if (err && err.stdout != null) whichOut = String(err.stdout);
  }

  const found = whichOut.split('\n').map((line) => line.trim()).filter(Boolean);
  // Accept either a bare name or an absolute path ending in "/<name>".
  const isInstalled = (name) =>
    found.some((entry) => entry === name || entry.endsWith(`/${name}`));

  // Priority order matters: the first installed entry wins.
  const candidates = [
    // 1. rec (sox frontend) — preferred
    { cmd: 'rec', ext: '.wav', label: 'rec (Sox)', vad: true, useSoxInput: false },
    // 2. sox with explicit pulse input
    { cmd: 'sox', ext: '.wav', label: 'sox (Sox)', vad: true, useSoxInput: true },
    // 3. ffmpeg — no VAD, fixed duration fallback
    { cmd: 'ffmpeg', ext: '.wav', label: 'ffmpeg', vad: false },
    // 4. arecord — no VAD, fixed duration fallback
    { cmd: 'arecord', ext: '.wav', label: 'arecord (ALSA)', vad: false },
  ];

  _cachedRecorder = candidates.find((candidate) => isInstalled(candidate.cmd)) || null;
  return _cachedRecorder;
}
104
+
105
/**
 * Record audio — uses VAD silence detection if sox is available.
 * With sox/rec: the `silence` effect starts the capture on sound and stops it
 * after SILENCE_SEC of quiet; `trim` caps the captured length.
 * Without sox: falls back to fixed-duration recording (ffmpeg / arecord).
 *
 * All backends are asked for 16 kHz mono so the resulting WAV has the same
 * shape regardless of which recorder was detected.
 *
 * @param {{cmd: string, vad?: boolean, useSoxInput?: boolean}} recorder
 *   descriptor as returned by detectRecorder()
 * @param {string} outFile - path of the WAV file to write
 * @param {{maxDurationSec?: number}} [options] - recording cap in seconds
 * @returns {Promise<void>} resolves when the recorder exits normally or was
 *   terminated by a signal; rejects on spawn failure, an unexpected exit
 *   code, or an unknown recorder.
 */
function recordAudio(recorder, outFile, { maxDurationSec = MAX_RECORD_SEC } = {}) {
  return new Promise((resolve, reject) => {
    const limit = String(maxDurationSec);
    let args;

    if (recorder.vad) {
      // sox VAD: record until silence.
      // sox needs explicit input type when used directly (not via rec)
      const input = recorder.useSoxInput ? ['-t', 'pulseaudio', 'default'] : [];
      args = [
        '-q',
        ...input,
        '-r', '16000', '-c', '1',
        '-t', 'wav', outFile,
        'silence', '1', String(SILENCE_START_SEC), SILENCE_THRESHOLD,
        '1', String(SILENCE_SEC), SILENCE_THRESHOLD,
        'pad', String(PAD_START_SEC), '0',
        'trim', '0', limit,
      ];
    } else if (recorder.cmd === 'ffmpeg') {
      args = ['-y', '-f', 'pulse', '-i', 'default', '-ac', '1', '-ar', '16000',
        '-t', limit, outFile];
    } else if (recorder.cmd === 'arecord') {
      // Was `-f cd` (16-bit, 44.1 kHz, stereo). Every other backend captures
      // 16 kHz mono, so request the same here instead of a different format.
      args = ['-q', '-f', 'S16_LE', '-r', '16000', '-c', '1',
        '-t', 'wav', '-d', limit, outFile];
    } else {
      reject(new Error('Unknown recorder'));
      return;
    }

    const proc = spawn(recorder.cmd, args, { stdio: ['ignore', 'ignore', 'ignore'] });

    // Track for external kill (Ctrl+C)
    _recorderProc = proc;

    // Safety net: the VAD backends may wait indefinitely for speech to start,
    // so terminate the recorder a few seconds past the requested maximum.
    const timeout = setTimeout(() => {
      try { proc.kill('SIGTERM'); } catch {}
    }, (maxDurationSec + 5) * 1000);

    // Shared teardown for both the 'exit' and 'error' paths.
    const release = () => {
      clearTimeout(timeout);
      if (_recorderProc === proc) _recorderProc = null;
    };

    proc.on('exit', (code) => {
      release();
      // sox exits 0 normally; ffmpeg exits 255 on SIGTERM; 143 = killed by SIGTERM.
      // `null` means the process was ended by a signal (our timeout or Ctrl+C).
      if (code === 0 || code === null || code === 255 || code === 143 || code === 141) resolve();
      else reject(new Error(`recorder exited ${code}`));
    });
    proc.on('error', (err) => {
      release();
      reject(err);
    });
  });
}
168
+
169
/**
 * Transcribe an audio file by delegating to `transcribe` from ./stt.
 *
 * @param {string} audioPath - path of the audio file to transcribe
 * @param {{language?: string}} [opts] - `language` falls back to 'english'
 * @returns {Promise<*>} whatever ./stt's `transcribe` resolves to
 */
async function transcribeAudio(audioPath, opts = {}) {
  // Loaded lazily, at call time, rather than at module load.
  const { transcribe } = require('./stt');
  const language = opts.language || 'english';
  return transcribe(audioPath, { language });
}
176
+
177
/**
 * Spoken commands that interrupt the assistant instead of being passed on as
 * input. recordAndTranscribe() — the high-level "record from microphone →
 * transcribe → return text" entry point, which also kills any active TTS
 * playback when recording starts — compares the lowercased transcript
 * against this set; on a match it kills TTS and returns ''.
 */
const STOP_WORDS = new Set([
  'stop',
  'quiet',
  'shut up',
  'silence',
  'enough',
  'cancel',
]);
182
+
183
// Phrases Whisper tends to emit for silence or non-speech audio. Each pattern
// is matched against the whole transcript after surrounding whitespace and
// trailing punctuation have been removed (see filterHallucination).
const HALLUCINATION_PATTERNS = [
  /^you[.!?]*$/i,
  /^(bye|goodbye)\s*(you)?[.!?]*$/i,
  /^thank\s*you[.!?]*$/i,
  /^(thanks\s*for\s*watching|please\s*subscribe|subscribe)[.!?]*$/i,
  /^(the|a|an|um|uh|uhh|hmm|mhm|bye|goodbye|stop|quiet|go|okay|ok|yeah|yes|no)[.!?]*$/i,
  /^(i'?m\s+)?(sorry|fine)[.!?]*$/i,
  /^[.,;:!?]+$/,
  /^[\s.,;:!?]*$/,
  // very short utterances that are mostly punctuation/symbols
  /^[\W_]{1,3}\w{0,2}[\W_]*$/,
];

/**
 * Drop transcripts that look like Whisper hallucinations.
 *
 * @param {string} text - raw transcript
 * @returns {string} '' when the transcript matches a hallucination pattern or
 *   is at most two characters with no ASCII letter/digit; otherwise the
 *   original `text`, untouched. Falsy input is returned as-is.
 */
function filterHallucination(text) {
  if (!text) return text;

  // Compare on a trimmed copy without trailing punctuation; the caller still
  // receives the original string when it passes.
  const core = text.trim().replace(/[.,!?;:]+$/, '').trim();

  const looksHallucinated = HALLUCINATION_PATTERNS.some((pattern) => pattern.test(core));
  if (looksHallucinated) return '';

  // Single character or just non-alphanumeric
  const isTinySymbolRun = core.length <= 2 && !/[a-z0-9]/i.test(core);
  return isTinySymbolRun ? '' : text;
}
208
+
209
/**
 * High-level: record from microphone → transcribe → return text.
 * Kills any active TTS playback when recording starts (interrupt).
 *
 * @param {object} [options]
 * @param {string} [options.language] - transcription language (default 'english')
 * @param {number} [options.maxDurationSec] - recording cap (default MAX_RECORD_SEC)
 * @param {Function} [options.onStart] - called just before recording begins
 * @param {Function} [options.onStop] - called after recording finished successfully
 * @returns {Promise<string>} the transcript, or '' when the capture was too
 *   quiet/short, looked like a Whisper hallucination, or was a stop word.
 * @throws {Error} when no recorder is installed, or when recording or
 *   transcription fails.
 */
async function recordAndTranscribe({ language, maxDurationSec, onStart, onStop } = {}) {
  ensureAudioDir();
  const recorder = detectRecorder();
  if (!recorder) {
    throw new Error(
      'No audio recorder found. Install sox (recommended): sudo pacman -S sox'
    );
  }

  // Kill TTS so the AI stops talking when user starts speaking
  _killTts();

  const outFile = path.join(AUDIO_DIR, `voice-${Date.now()}.wav`);
  // One finally covers every exit path, so the temp file is removed even if a
  // callback throws between recording and transcription.
  try {
    if (onStart) onStart();
    await recordAudio(recorder, outFile, { maxDurationSec: maxDurationSec || MAX_RECORD_SEC });
    if (onStop) onStop();

    // Energy gate: if the captured audio is too quiet or too short, drop it
    // without invoking Whisper. Whisper invents text from near-silence
    // ("Bye you.", "Thank you for watching."), so we must filter at the
    // audio level, not just the text level.
    try {
      const { rms, durationSec } = audioStats(outFile);
      if (rms < MIN_RMS || durationSec < MIN_SPEECH_SEC) {
        process.stderr.write(
          `\r\x1b[90m[voice] skip (too quiet): rms=${rms.toFixed(4)} dur=${durationSec.toFixed(2)}s — tune with SHMAKK_VOICE_MIN_RMS\x1b[0m\n`,
        );
        return '';
      }
      if (process.env.SHMAKK_VOICE_DEBUG) {
        process.stderr.write(
          `\r\x1b[90m[voice] accept: rms=${rms.toFixed(4)} dur=${durationSec.toFixed(2)}s\x1b[0m\n`,
        );
      }
    } catch {
      // Deliberately best-effort: a failure while gating must not block
      // transcription, so fall through and let Whisper try.
    }

    const text = await transcribeAudio(outFile, { language: language || 'english' });

    // Check for stop words on the RAW transcript — kill TTS and discard.
    // This has to run before filterHallucination(): its patterns also match
    // "stop" and "quiet", so checking afterwards made those two stop words
    // unreachable. All trailing punctuation is stripped ("Stop!!" counts).
    const spoken = typeof text === 'string'
      ? text.toLowerCase().trim().replace(/[.!?]+$/, '').trim()
      : '';
    if (STOP_WORDS.has(spoken)) {
      _killTts();
      process.stderr.write(`\r\x1b[33m🤫 stopped\x1b[0m\n`);
      return '';
    }

    // Filter common Whisper hallucinations (standalone "You", "Thank you", etc.)
    const filtered = filterHallucination(text);
    // Write transcript to stderr so it shows in terminal but isn't injected as input
    if (filtered) process.stderr.write(`\r\x1b[36m🎤 ${filtered}\x1b[0m\n`);
    return filtered;
  } finally {
    cleanupFile(outFile);
  }
}
268
+
269
/**
 * Compute audio stats from a captured WAV — used to drop near-silent
 * recordings before they reach Whisper (which hallucinates on silence).
 *
 * @param {string} wavPath - path of the WAV file to inspect
 * @returns {{rms: number, durationSec: number}} RMS over samples scaled by
 *   1/32768 and the duration in seconds. Returns zeros for an empty file and
 *   { rms: 1, durationSec: 999 } when the file cannot be read or parsed, so
 *   that callers fail open.
 */
function audioStats(wavPath) {
  try {
    const { WaveFile } = require('wavefile');
    const wav = new WaveFile(fs.readFileSync(wavPath));
    const sampleRate = wav.fmt.sampleRate || 16000;
    // getSamples(true, …) returns the channels interleaved in a single array,
    // so its length is frames × channels. Divide by the channel count below,
    // otherwise a stereo capture reports twice its real duration.
    const channels = wav.fmt.numChannels || 1;
    // NOTE(review): dividing by 32768 assumes 16-bit PCM input — confirm that
    // every recorder backend writes 16-bit WAV.
    const samples = wav.getSamples(true, Int16Array);
    if (!samples || !samples.length) return { rms: 0, durationSec: 0 };
    let sumSq = 0;
    for (let i = 0; i < samples.length; i++) {
      const v = samples[i] / 32768;
      sumSq += v * v;
    }
    return {
      rms: Math.sqrt(sumSq / samples.length),
      durationSec: samples.length / channels / sampleRate,
    };
  } catch {
    return { rms: 1, durationSec: 999 }; // fail open — let Whisper try
  }
}
295
+
296
/**
 * Best-effort removal of a file: a missing path is not an error (`force`),
 * and any other failure is swallowed.
 */
function cleanupFile(p) {
  const removeOptions = { force: true };
  try {
    fs.rmSync(p, removeOptions);
  } catch {
    // ignore — cleanup must never throw
  }
}
301
+
302
/**
 * Report whether detectRecorder() found a usable microphone recorder.
 *
 * @returns {boolean} false only when detectRecorder() returns null
 */
function isAvailable() {
  const recorder = detectRecorder();
  return recorder !== null;
}
308
+
309
/**
 * Quick microphone test: make a short recording (capped at 2 seconds) and
 * report the size of the resulting file. Never throws for recording
 * problems — they are reported through the `error` field instead.
 *
 * @returns {Promise<{ok: boolean, recorder: string|null, fileSize: number|null, error: string|null}>}
 */
async function testMicrophone() {
  const recorder = detectRecorder();
  if (!recorder) {
    return {
      ok: false,
      recorder: null,
      fileSize: null,
      error:
        'No audio recorder found. Install sox, arecord (alsa-utils), or ffmpeg.',
    };
  }

  ensureAudioDir();
  const extension = recorder.ext || '.wav';
  const outFile = path.join(AUDIO_DIR, `mic-test-${Date.now()}${extension}`);

  try {
    await recordAudio(recorder, outFile, { maxDurationSec: 2 });
    const { size } = fs.statSync(outFile);

    // less than 100 bytes = probably silence/error
    if (size < 100) {
      return {
        ok: false,
        recorder: recorder.label,
        fileSize: size,
        error: `Recorded only ${size} bytes — microphone may be muted or disconnected.`,
      };
    }
    return { ok: true, recorder: recorder.label, fileSize: size, error: null };
  } catch (err) {
    return { ok: false, recorder: recorder.label, fileSize: null, error: err.message };
  } finally {
    // The test capture is never kept, whatever the outcome.
    cleanupFile(outFile);
  }
}
346
+
347
+ module.exports = {
348
+ recordAndTranscribe,
349
+ transcribeAudio,
350
+ testMicrophone,
351
+ isAvailable,
352
+ detectRecorder,
353
+ MAX_RECORD_SEC,
354
+ _killTts,
355
+ _killRecorder,
356
+ _setTtsProc,
357
+ _isTtsKilled,
358
+ /** Preload STT model in background so first transcription is instant. */
359
+ preloadSTT() {
360
+ try { require('./stt')._ensureModel(); } catch {}
361
+ },
362
+ };