@tritard/waterbrother 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tritard/waterbrother",
3
- "version": "0.9.2",
3
+ "version": "0.9.4",
4
4
  "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
5
5
  "type": "module",
6
6
  "bin": {
package/src/agent.js CHANGED
@@ -74,7 +74,8 @@ When you use tools:
74
74
  - avoid hype such as "premium", "luxurious", "studio-grade", or "improved!"
75
75
  - Explain what you changed and why.
76
76
  - Never claim you ran commands you did not run.
77
- - If a tool fails, show the failure and recover.`;
77
+ - If a tool fails, show the failure and recover.
78
+ - You are a coding tool for real software engineering work. If a request is clearly a joke, hypothetical, non-technical, or not related to actual software development, respond conversationally WITHOUT using any tools. Do not create files, write scripts, or make edits for non-engineering requests. Examples of things you should NOT build: personality generators, dating advice scripts, joke apps, horoscope generators, or any request that is clearly not serious engineering work.`;
78
79
 
79
80
  const COMPACTION_SYSTEM_PROMPT = `You summarize coding assistant transcripts for context compaction.
80
81
  Output concise markdown with these sections:
package/src/cli.js CHANGED
@@ -167,7 +167,8 @@ const INTERACTIVE_COMMANDS = [
167
167
  { name: "/models", description: "Select model from list" },
168
168
  { name: "/feedback", description: "Report a bug or share feedback" },
169
169
  { name: "/cost", description: "Show session token usage and cost breakdown" },
170
- { name: "/diff", description: "Show git changes in the current repo" }
170
+ { name: "/diff", description: "Show git changes in the current repo" },
171
+ { name: "/voice", description: "Toggle voice dictation (press space to record)" }
171
172
  ];
172
173
 
173
174
  const AGENT_PROFILES = ["coder", "designer", "reviewer", "planner"];
@@ -4459,6 +4460,8 @@ async function readInteractiveLine(options = {}) {
4459
4460
  const output = process.stdout;
4460
4461
  const initialRaw = Boolean(input.isRaw);
4461
4462
  const getFooterText = typeof options.getFooterText === "function" ? options.getFooterText : null;
4463
+ const voiceSession = options.voiceSession || null;
4464
+ const grokConfig = options.grokConfig || null;
4462
4465
 
4463
4466
  return new Promise((resolve, reject) => {
4464
4467
  let buffer = "";
@@ -4468,6 +4471,10 @@ async function readInteractiveLine(options = {}) {
4468
4471
  let ignoredPasteEnters = 0;
4469
4472
  let pasteSuppressUntil = 0;
4470
4473
 
4474
+ // Voice recording state
4475
+ let voiceRecording = false;
4476
+ let voiceIndicator = "";
4477
+
4471
4478
  function finish(nextValue) {
4472
4479
  if (settled) return;
4473
4480
  settled = true;
@@ -4490,8 +4497,9 @@ async function readInteractiveLine(options = {}) {
4490
4497
  selectedIndex = 0;
4491
4498
  }
4492
4499
 
4500
+ const displayBuffer = voiceIndicator ? `${buffer} ${voiceIndicator}` : buffer;
4493
4501
  const writePrompt = () => {
4494
- output.write(formatPromptRow(buffer, columns));
4502
+ output.write(formatPromptRow(displayBuffer, columns));
4495
4503
  };
4496
4504
 
4497
4505
  output.write("\r\x1b[2K");
@@ -4625,6 +4633,60 @@ async function readInteractiveLine(options = {}) {
4625
4633
  return;
4626
4634
  }
4627
4635
 
4636
+ // Voice: spacebar on empty/trailing-space triggers a 5-second recording.
4637
+ // Uses fixed duration with clean sox exit — same code path as test-capture.mjs.
4638
+ if (voiceSession && !voiceRecording && str === " " && (buffer.length === 0 || buffer.endsWith(" "))) {
4639
+ voiceRecording = true;
4640
+ voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
4641
+ render();
4642
+
4643
+ (async () => {
4644
+ try {
4645
+ const result = await voiceSession.recordAndTranscribe(5);
4646
+ voiceRecording = false;
4647
+
4648
+ if (result && typeof result === "object" && result.error) {
4649
+ voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
4650
+ render();
4651
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4652
+ return;
4653
+ }
4654
+
4655
+ const rawText = typeof result === "string" ? result : "";
4656
+ voiceIndicator = "";
4657
+ if (!rawText) {
4658
+ render();
4659
+ return;
4660
+ }
4661
+
4662
+ const insertPoint = buffer.length;
4663
+ buffer += rawText;
4664
+ render();
4665
+
4666
+ if (grokConfig && grokConfig.apiKey) {
4667
+ voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
4668
+ render();
4669
+ voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
4670
+ voiceIndicator = "";
4671
+ if (settled) return;
4672
+ if (corrected && corrected !== rawText) {
4673
+ const before = buffer.slice(0, insertPoint);
4674
+ const after = buffer.slice(insertPoint + rawText.length);
4675
+ buffer = before + corrected + after;
4676
+ }
4677
+ render();
4678
+ });
4679
+ }
4680
+ } catch (err) {
4681
+ voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
4682
+ voiceRecording = false;
4683
+ render();
4684
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4685
+ }
4686
+ })();
4687
+ return;
4688
+ }
4689
+
4628
4690
  if (isPrintableKey(str, key)) {
4629
4691
  buffer += str;
4630
4692
  selectedIndex = 0;
@@ -5015,7 +5077,7 @@ async function promptLoop(agent, session, context) {
5015
5077
  let line = normalizeInteractiveInput(
5016
5078
  await readInteractiveLine({
5017
5079
  getFooterText(inputBuffer) {
5018
- return buildInteractiveFooter({
5080
+ const footer = buildInteractiveFooter({
5019
5081
  agent,
5020
5082
  cwd: context.cwd,
5021
5083
  sessionId: currentSession.id,
@@ -5023,7 +5085,17 @@ async function promptLoop(agent, session, context) {
5023
5085
  lastUsage: context.lastUsage,
5024
5086
  costTracker: context.costTracker
5025
5087
  });
5026
- }
5088
+ if (context.voiceModeEnabled) {
5089
+ return "Voice ON — press space to record (5s) | " + footer;
5090
+ }
5091
+ return footer;
5092
+ },
5093
+ voiceSession: context.voiceModeEnabled ? context.voiceSession : null,
5094
+ grokConfig: context.voiceModeEnabled ? {
5095
+ apiKey: context.runtime.apiKey,
5096
+ baseUrl: context.runtime.baseUrl,
5097
+ model: context.runtime.model
5098
+ } : null
5027
5099
  })
5028
5100
  );
5029
5101
  if (!line) continue;
@@ -6680,6 +6752,30 @@ async function promptLoop(agent, session, context) {
6680
6752
  continue;
6681
6753
  }
6682
6754
 
6755
+ if (line === "/voice") {
6756
+ if (!context.voiceModeEnabled) {
6757
+ try {
6758
+ if (!context.voiceSession) {
6759
+ const { setupVoice } = await import("./voice.js");
6760
+ context.voiceSession = await setupVoice((msg) => console.log(msg));
6761
+ }
6762
+ context.voiceModeEnabled = true;
6763
+ console.log("Voice mode ON. Press spacebar to record (5 seconds).");
6764
+ console.log(dim("Tip: Grok will auto-correct technical terms after transcription."));
6765
+ } catch (error) {
6766
+ console.log(`Voice mode failed: ${error instanceof Error ? error.message : String(error)}`);
6767
+ }
6768
+ } else {
6769
+ context.voiceModeEnabled = false;
6770
+ if (context.voiceSession) {
6771
+ context.voiceSession.destroy();
6772
+ context.voiceSession = null;
6773
+ }
6774
+ console.log("Voice mode OFF.");
6775
+ }
6776
+ continue;
6777
+ }
6778
+
6683
6779
  if (line.startsWith("/")) {
6684
6780
  console.log("Unknown slash command. Use /help.");
6685
6781
  continue;
package/src/voice.js ADDED
@@ -0,0 +1,511 @@
1
+ import { execFile, spawn } from "node:child_process";
2
+ import fs from "node:fs/promises";
3
+ import { createRequire } from "node:module";
4
+ import path from "node:path";
5
+ import process from "node:process";
6
+ import { pathToFileURL } from "node:url";
7
+ import { promisify } from "node:util";
8
+ import { createChatCompletion } from "./grok-client.js";
9
+
10
+ const execFileAsync = promisify(execFile);
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Paths
14
+ // ---------------------------------------------------------------------------
15
+
16
+ const MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
17
+ const MODEL_FILES = [
18
+ "preprocess.onnx",
19
+ "encode.int8.onnx",
20
+ "uncached_decode.int8.onnx",
21
+ "cached_decode.int8.onnx",
22
+ "tokens.txt"
23
+ ];
24
+
25
+ function getWaterbrotherHome() {
26
+ const home = process.env.HOME || process.env.USERPROFILE || "";
27
+ return path.join(home, ".waterbrother");
28
+ }
29
+
30
+ function getModelsDir() {
31
+ return path.join(getWaterbrotherHome(), "models", MODEL_DIR_NAME);
32
+ }
33
+
34
+ function getVoiceRuntimeDir() {
35
+ return path.join(getWaterbrotherHome(), "voice-runtime");
36
+ }
37
+
38
+ // ---------------------------------------------------------------------------
39
+ // System checks
40
+ // ---------------------------------------------------------------------------
41
+
42
+ async function hasBin(name) {
43
+ const cmd = process.platform === "win32" ? "where" : "which";
44
+ try {
45
+ await execFileAsync(cmd, [name]);
46
+ return true;
47
+ } catch {
48
+ return false;
49
+ }
50
+ }
51
+
52
+ async function checkSox() {
53
+ const cmd = process.platform === "win32" ? "where" : "which";
54
+ try {
55
+ const { stdout } = await execFileAsync(cmd, ["sox"]);
56
+ return { ok: true, path: String(stdout || "").trim().split("\n")[0] || null };
57
+ } catch {}
58
+
59
+ // On Windows, winget portable installs don't add to PATH — search known locations
60
+ if (process.platform === "win32") {
61
+ const localAppData = process.env.LOCALAPPDATA;
62
+ if (localAppData) {
63
+ const packagesDir = path.join(localAppData, "Microsoft", "WinGet", "Packages");
64
+ try {
65
+ const entries = await fs.readdir(packagesDir);
66
+ for (const entry of entries) {
67
+ if (!entry.toLowerCase().includes("sox")) continue;
68
+ const entryPath = path.join(packagesDir, entry);
69
+ const subEntries = await fs.readdir(entryPath, { recursive: true });
70
+ for (const sub of subEntries) {
71
+ if (path.basename(sub).toLowerCase() === "sox.exe") {
72
+ const fullPath = path.join(entryPath, sub);
73
+ const soxDir = path.dirname(fullPath);
74
+ process.env.PATH = `${soxDir};${process.env.PATH}`;
75
+ return { ok: true, path: fullPath };
76
+ }
77
+ }
78
+ }
79
+ } catch {}
80
+ }
81
+ }
82
+
83
+ return { ok: false, path: null };
84
+ }
85
+
86
+ async function checkSherpaOnnx() {
87
+ const runtimeDir = getVoiceRuntimeDir();
88
+ const markerPath = path.join(runtimeDir, "node_modules", "sherpa-onnx-node", "package.json");
89
+ try {
90
+ await fs.access(markerPath);
91
+ return { ok: true };
92
+ } catch {
93
+ return { ok: false };
94
+ }
95
+ }
96
+
97
+ async function checkModel() {
98
+ const dir = getModelsDir();
99
+ try {
100
+ const entries = await fs.readdir(dir);
101
+ const missing = MODEL_FILES.filter((f) => !entries.includes(f));
102
+ return { ok: missing.length === 0, dir, missing };
103
+ } catch {
104
+ return { ok: false, dir, missing: MODEL_FILES };
105
+ }
106
+ }
107
+
108
+ // ---------------------------------------------------------------------------
109
+ // Model download
110
+ // ---------------------------------------------------------------------------
111
+
112
+ const MODEL_ARCHIVE_URL =
113
+ `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${MODEL_DIR_NAME}.tar.bz2`;
114
+
115
+ async function downloadModel(onProgress) {
116
+ const modelsRoot = path.dirname(getModelsDir());
117
+ await fs.mkdir(modelsRoot, { recursive: true });
118
+
119
+ if (onProgress) onProgress({ status: "downloading" });
120
+
121
+ const response = await fetch(MODEL_ARCHIVE_URL, { redirect: "follow" });
122
+ if (!response.ok) {
123
+ throw new Error(`Failed to download model archive: HTTP ${response.status}`);
124
+ }
125
+
126
+ const contentLength = Number(response.headers.get("content-length")) || 0;
127
+ const reader = response.body.getReader();
128
+ const chunks = [];
129
+ let downloaded = 0;
130
+
131
+ while (true) {
132
+ const { done, value } = await reader.read();
133
+ if (done) break;
134
+ chunks.push(value);
135
+ downloaded += value.length;
136
+ if (onProgress && contentLength > 0) {
137
+ onProgress({ status: "progress", downloaded, total: contentLength });
138
+ }
139
+ }
140
+
141
+ // Write archive to temp file, then extract
142
+ const archivePath = path.join(modelsRoot, `${MODEL_DIR_NAME}.tar.bz2`);
143
+ const archiveBuffer = Buffer.concat(chunks);
144
+ await fs.writeFile(archivePath, archiveBuffer);
145
+ if (onProgress) onProgress({ status: "extracting" });
146
+
147
+ await execFileAsync("tar", ["xjf", archivePath, "-C", modelsRoot], {
148
+ timeout: 120_000
149
+ });
150
+
151
+ // Clean up archive
152
+ await fs.unlink(archivePath).catch(() => {});
153
+ if (onProgress) onProgress({ status: "done", size: archiveBuffer.length });
154
+ }
155
+
156
+ // ---------------------------------------------------------------------------
157
+ // Recognizer lifecycle
158
+ // ---------------------------------------------------------------------------
159
+
160
+ let _sherpaOnnx = null;
161
+
162
+ async function loadSherpaOnnx() {
163
+ if (_sherpaOnnx) return _sherpaOnnx;
164
+
165
+ // Load from the local voice-runtime install via createRequire
166
+ const runtimeDir = getVoiceRuntimeDir();
167
+ const fakePath = path.join(runtimeDir, "loader.cjs");
168
+ const require = createRequire(fakePath);
169
+ _sherpaOnnx = require("sherpa-onnx-node");
170
+ return _sherpaOnnx;
171
+ }
172
+
173
+ function createRecognizer() {
174
+ const sherpa = _sherpaOnnx;
175
+ if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
176
+
177
+ const dir = getModelsDir();
178
+ const config = {
179
+ modelConfig: {
180
+ moonshine: {
181
+ preprocessor: path.join(dir, "preprocess.onnx"),
182
+ encoder: path.join(dir, "encode.int8.onnx"),
183
+ uncachedDecoder: path.join(dir, "uncached_decode.int8.onnx"),
184
+ cachedDecoder: path.join(dir, "cached_decode.int8.onnx")
185
+ },
186
+ tokens: path.join(dir, "tokens.txt"),
187
+ provider: "cpu",
188
+ numThreads: 2,
189
+ debug: 0
190
+ }
191
+ };
192
+
193
+ return new sherpa.OfflineRecognizer(config);
194
+ }
195
+
196
+ // ---------------------------------------------------------------------------
197
+ // Audio device detection (Windows)
198
+ // ---------------------------------------------------------------------------
199
+
200
+ const WAVEIN_ENUM_SCRIPT = `
201
+ Add-Type -TypeDefinition @"
202
+ using System;
203
+ using System.Runtime.InteropServices;
204
+ public class WaveInHelper {
205
+ [DllImport("winmm.dll")]
206
+ public static extern uint waveInGetNumDevs();
207
+ [DllImport("winmm.dll", CharSet = CharSet.Auto)]
208
+ public static extern uint waveInGetDevCapsW(uint id, ref WAVEINCAPS caps, uint size);
209
+ [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Auto)]
210
+ public struct WAVEINCAPS {
211
+ public ushort wMid;
212
+ public ushort wPid;
213
+ public uint vDriverVersion;
214
+ [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 32)]
215
+ public string szPname;
216
+ public uint dwFormats;
217
+ public ushort wChannels;
218
+ public ushort wReserved1;
219
+ }
220
+ }
221
+ "@
222
+ $$n = [WaveInHelper]::waveInGetNumDevs()
223
+ for ($$i = 0; $$i -lt $$n; $$i++) {
224
+ $$c = New-Object WaveInHelper+WAVEINCAPS
225
+ [WaveInHelper]::waveInGetDevCapsW($$i, [ref]$$c, [Runtime.InteropServices.Marshal]::SizeOf($$c)) | Out-Null
226
+ Write-Output "$$i|$$($$c.szPname)"
227
+ }
228
+ `.replace(/\$\$/g, "$");
229
+
230
+ async function detectAudioDevice(soxPath, log) {
231
+ if (process.platform !== "win32") return null;
232
+
233
+ try {
234
+ const { stdout } = await execFileAsync("powershell.exe", [
235
+ "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", WAVEIN_ENUM_SCRIPT
236
+ ], { timeout: 10000 });
237
+
238
+ const devices = String(stdout).trim().split("\n")
239
+ .map((line) => line.trim())
240
+ .filter(Boolean)
241
+ .map((line) => {
242
+ const sep = line.indexOf("|");
243
+ return { index: line.slice(0, sep), name: line.slice(sep + 1) };
244
+ });
245
+
246
+ if (devices.length === 0) return "default";
247
+
248
+ // Score each device: prefer hardware mics, deprioritize virtual/mixer devices
249
+ const scored = devices.map((d) => {
250
+ const lower = d.name.toLowerCase();
251
+ let score = 0;
252
+ if (lower.startsWith("headset microphone")) score += 100;
253
+ else if (lower.startsWith("microphone (")) score += 50;
254
+ else if (lower.startsWith("microphone")) score += 30;
255
+ else if (lower.startsWith("line in")) score += 20;
256
+ if (/sonar|virtual|nahimic|nvidia/i.test(d.name)) score -= 200;
257
+ return { ...d, score };
258
+ });
259
+
260
+ scored.sort((a, b) => b.score - a.score);
261
+ const best = scored[0];
262
+ log(` audio device: ${best.name} (device ${best.index})`);
263
+ return best.index;
264
+ } catch {
265
+ return "default";
266
+ }
267
+ }
268
+
269
+ // ---------------------------------------------------------------------------
270
+ // Audio capture via sox
271
+ // ---------------------------------------------------------------------------
272
+
273
+ // Record for a fixed duration, sox exits cleanly. This is the exact same
274
+ // approach as test-capture.mjs which is the only code path proven to work.
275
+ async function captureAudio(soxPath, audioDevice, durationSec) {
276
+ const isWin = process.platform === "win32";
277
+ const inputArgs = isWin
278
+ ? ["-t", "waveaudio", audioDevice || "default"]
279
+ : ["-d"];
280
+
281
+ const args = [
282
+ ...inputArgs,
283
+ "-t", "raw", "-r", "16000", "-c", "1", "-b", "16", "-e", "signed-integer",
284
+ "-", "trim", "0", String(durationSec)
285
+ ];
286
+
287
+ const { stdout } = await execFileAsync(soxPath, args, {
288
+ timeout: (durationSec + 5) * 1000,
289
+ maxBuffer: durationSec * 32000 + 1024,
290
+ encoding: "buffer"
291
+ });
292
+
293
+ const samples = new Float32Array(Math.floor(stdout.length / 2));
294
+ for (let i = 0; i < samples.length; i++) {
295
+ samples[i] = stdout.readInt16LE(i * 2) / 32768.0;
296
+ }
297
+ return samples;
298
+ }
299
+
300
+ // ---------------------------------------------------------------------------
301
+ // Transcription
302
+ // ---------------------------------------------------------------------------
303
+
304
+ function transcribe(recognizer, audioSamples) {
305
+ if (!audioSamples || audioSamples.length < 1600) {
306
+ return "";
307
+ }
308
+
309
+ const stream = recognizer.createStream();
310
+ stream.acceptWaveform({ sampleRate: 16000, samples: audioSamples });
311
+ recognizer.decode(stream);
312
+ return recognizer.getResult(stream).text.trim();
313
+ }
314
+
315
+ // ---------------------------------------------------------------------------
316
+ // Grok correction pass
317
+ // ---------------------------------------------------------------------------
318
+
319
+ const CORRECTION_SYSTEM_PROMPT =
320
+ "You are a transcription corrector for a coding CLI. Fix speech-to-text errors. " +
321
+ "Properly format technical terms (camelCase, snake_case, file paths, CLI flags, function names). " +
322
+ "Return ONLY the corrected text, nothing else. If the text is already correct, return it unchanged.";
323
+
324
+ async function correctTranscript(rawText, { apiKey, baseUrl, model }) {
325
+ if (!rawText || !apiKey) return rawText;
326
+
327
+ try {
328
+ const completion = await createChatCompletion({
329
+ apiKey,
330
+ baseUrl,
331
+ model: model || "grok-3-mini",
332
+ messages: [
333
+ { role: "system", content: CORRECTION_SYSTEM_PROMPT },
334
+ { role: "user", content: rawText }
335
+ ],
336
+ temperature: 0
337
+ });
338
+
339
+ const corrected = (completion?.message?.content || "").trim();
340
+ return corrected || rawText;
341
+ } catch {
342
+ return rawText;
343
+ }
344
+ }
345
+
346
+ // ---------------------------------------------------------------------------
347
+ // Auto-install helpers
348
+ // ---------------------------------------------------------------------------
349
+
350
+ async function runShell(command, args, { label, log, timeout = 300_000 } = {}) {
351
+ if (log && label) log(` Installing ${label}...`);
352
+ const { stdout, stderr } = await execFileAsync(command, args, {
353
+ timeout,
354
+ env: process.env,
355
+ shell: process.platform === "win32"
356
+ });
357
+ return { stdout, stderr };
358
+ }
359
+
360
+ async function installSox(log) {
361
+ const platform = process.platform;
362
+ if (platform === "darwin") {
363
+ if (!await hasBin("brew")) throw new Error("Cannot auto-install sox: Homebrew not found.");
364
+ await runShell("brew", ["install", "sox"], { label: "sox via Homebrew", log });
365
+ } else if (platform === "win32") {
366
+ if (await hasBin("winget")) {
367
+ await runShell("winget", ["install", "--id", "ChrisBagwell.SoX", "-e", "--accept-source-agreements", "--accept-package-agreements"], { label: "sox via winget", log });
368
+ } else if (await hasBin("choco")) {
369
+ await runShell("choco", ["install", "sox", "-y"], { label: "sox via Chocolatey", log });
370
+ } else {
371
+ throw new Error("Cannot auto-install sox: neither winget nor choco found.");
372
+ }
373
+ } else {
374
+ if (await hasBin("apt-get")) {
375
+ await runShell("sudo", ["apt-get", "install", "-y", "sox"], { label: "sox via apt", log });
376
+ } else if (await hasBin("dnf")) {
377
+ await runShell("sudo", ["dnf", "install", "-y", "sox"], { label: "sox via dnf", log });
378
+ } else {
379
+ throw new Error("Cannot auto-install sox: neither apt-get nor dnf found.");
380
+ }
381
+ }
382
+ }
383
+
384
+ function getNativeAddonPackage() {
385
+ const { platform, arch } = process;
386
+ if (platform === "win32" && arch === "x64") return "sherpa-onnx-win-x64";
387
+ if (platform === "win32" && arch === "ia32") return "sherpa-onnx-win-ia32";
388
+ if (platform === "darwin" && arch === "arm64") return "sherpa-onnx-darwin-arm64";
389
+ if (platform === "darwin" && arch === "x64") return "sherpa-onnx-darwin-x64";
390
+ if (platform === "linux" && arch === "x64") return "sherpa-onnx-linux-x64";
391
+ if (platform === "linux" && arch === "arm64") return "sherpa-onnx-linux-arm64";
392
+ return null;
393
+ }
394
+
395
+ async function installSherpaOnnx(log) {
396
+ const runtimeDir = getVoiceRuntimeDir();
397
+ await fs.mkdir(runtimeDir, { recursive: true });
398
+
399
+ const nativePkg = getNativeAddonPackage();
400
+ if (!nativePkg) {
401
+ throw new Error(`Unsupported platform: ${process.platform}-${process.arch}`);
402
+ }
403
+
404
+ // Include the platform-specific native addon as a direct dependency
405
+ const pkgPath = path.join(runtimeDir, "package.json");
406
+ await fs.writeFile(pkgPath, JSON.stringify({
407
+ name: "waterbrother-voice-runtime",
408
+ version: "1.0.0",
409
+ private: true,
410
+ dependencies: {
411
+ "sherpa-onnx-node": "^1.12.0",
412
+ [nativePkg]: "^1.12.0"
413
+ }
414
+ }, null, 2));
415
+
416
+ log(" Installing sherpa-onnx-node (this may take a minute)...");
417
+ const npmCmd = process.platform === "win32" ? "npm.cmd" : "npm";
418
+ await execFileAsync(npmCmd, ["install", "--no-audit", "--no-fund"], {
419
+ cwd: runtimeDir,
420
+ timeout: 300_000,
421
+ env: process.env,
422
+ shell: process.platform === "win32"
423
+ });
424
+ }
425
+
426
+ // ---------------------------------------------------------------------------
427
+ // Setup orchestrator
428
+ // ---------------------------------------------------------------------------
429
+
430
+ export async function setupVoice(onStatus) {
431
+ const log = onStatus || (() => {});
432
+
433
+ log("Setting up voice mode...");
434
+
435
+ // 1. sox — check, auto-install if missing
436
+ let sox = await checkSox();
437
+ if (!sox.ok) {
438
+ await installSox(log);
439
+ sox = await checkSox();
440
+ if (!sox.ok) throw new Error("sox installed but not found. Restart your terminal and try again.");
441
+ }
442
+ log(` sox: ${sox.path}`);
443
+
444
+ // 2. sherpa-onnx — install into ~/.waterbrother/voice-runtime/ if missing
445
+ let sherpa = await checkSherpaOnnx();
446
+ if (!sherpa.ok) {
447
+ await installSherpaOnnx(log);
448
+ sherpa = await checkSherpaOnnx();
449
+ if (!sherpa.ok) throw new Error("sherpa-onnx-node install failed. Check ~/.waterbrother/voice-runtime/ for errors.");
450
+ }
451
+ log(" sherpa-onnx: ready");
452
+
453
+ // 3. Model — auto-download if missing
454
+ const model = await checkModel();
455
+ if (!model.ok) {
456
+ log(" Downloading Moonshine Base model (~250 MB)...");
457
+ await downloadModel(({ status, downloaded, total, size }) => {
458
+ if (status === "progress" && total > 0) {
459
+ const pct = Math.round((downloaded / total) * 100);
460
+ process.stdout.write(`\r ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
461
+ } else if (status === "extracting") {
462
+ process.stdout.write(`\r Extracting... \n`);
463
+ } else if (status === "done") {
464
+ log(` Done (${formatBytes(size)})`);
465
+ }
466
+ });
467
+ log(" Model ready.");
468
+ } else {
469
+ log(" Moonshine Base: ready");
470
+ }
471
+
472
+ // 4. Detect audio device (Windows)
473
+ const soxPath = sox.path;
474
+ const audioDevice = await detectAudioDevice(soxPath, log);
475
+
476
+ // 5. Initialize recognizer
477
+ await loadSherpaOnnx();
478
+ const recognizer = createRecognizer();
479
+
480
+ return {
481
+ // Record for a fixed duration (sox exits cleanly, no kill).
482
+ // Returns transcribed text or { error: "..." }.
483
+ async recordAndTranscribe(durationSec = 5) {
484
+ const samples = await captureAudio(soxPath, audioDevice, durationSec);
485
+ const durationMs = Math.round((samples.length / 16000) * 1000);
486
+ let maxAmp = 0;
487
+ for (const v of samples) { const a = Math.abs(v); if (a > maxAmp) maxAmp = a; }
488
+ const text = transcribe(recognizer, samples);
489
+ if (text) return text;
490
+ if (samples.length < 1600) return { error: `Recording too short (${durationMs}ms)` };
491
+ if (maxAmp < 0.01) return { error: `Silence (${durationMs}ms, amp=${maxAmp.toFixed(4)}) — mic not active` };
492
+ return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
493
+ },
494
+
495
+ async correctTranscript(rawText, grokConfig) {
496
+ return correctTranscript(rawText, grokConfig);
497
+ },
498
+
499
+ destroy() {}
500
+ };
501
+ }
502
+
503
+ // ---------------------------------------------------------------------------
504
+ // Helpers
505
+ // ---------------------------------------------------------------------------
506
+
507
+ function formatBytes(bytes) {
508
+ if (bytes < 1024) return `${bytes} B`;
509
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
510
+ return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
511
+ }