@tritard/waterbrother 0.14.12 → 0.14.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/package.json +1 -1
  2. package/src/cli.js +292 -52
  3. package/src/voice.js +305 -33
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tritard/waterbrother",
3
- "version": "0.14.12",
3
+ "version": "0.14.13",
4
4
  "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -172,7 +172,8 @@ const INTERACTIVE_COMMANDS = [
172
172
  { name: "/feedback", description: "Report a bug or share feedback" },
173
173
  { name: "/cost", description: "Show session token usage and cost breakdown" },
174
174
  { name: "/diff", description: "Show git changes in the current repo" },
175
- { name: "/voice", description: "Toggle voice dictation (press space to record)" }
175
+ { name: "/voice", description: "Toggle voice dictation (press space to record)" },
176
+ { name: "/speak", description: "Toggle TTS — agent reads responses aloud (esc to stop)" }
176
177
  ];
177
178
 
178
179
  const AGENT_PROFILES = ["coder", "designer", "reviewer", "planner"];
@@ -4133,6 +4134,15 @@ async function runTextTurnInteractive({
4133
4134
  });
4134
4135
 
4135
4136
  printAssistantOutput(renderedAssistantText);
4137
+
4138
+ // TTS: speak the response when speak mode is active
4139
+ if (context.speakModeEnabled && context.voiceSession?.hasTts?.()) {
4140
+ context.voiceSession.speakFull(renderedAssistantText, {
4141
+ apiKey: context.runtime.apiKey,
4142
+ baseUrl: context.runtime.baseUrl,
4143
+ });
4144
+ }
4145
+
4136
4146
  await setSessionRunState(currentSession, agent, "done");
4137
4147
  printTurnSummary(turnSummary, response, { modelId: agent.getModel(), costTracker: context.costTracker, traceMode: context.runtime.traceMode });
4138
4148
  printTraceTimeline(turnSummary, context.runtime.traceMode);
@@ -4469,6 +4479,7 @@ async function readInteractiveLine(options = {}) {
4469
4479
 
4470
4480
  return new Promise((resolve, reject) => {
4471
4481
  let buffer = "";
4482
+ let cursorPos = 0;
4472
4483
  let selectedIndex = 0;
4473
4484
  let settled = false;
4474
4485
  let ignoredPastePrintable = 0;
@@ -4478,6 +4489,38 @@ async function readInteractiveLine(options = {}) {
4478
4489
  // Voice recording state
4479
4490
  let voiceRecording = false;
4480
4491
  let voiceIndicator = "";
4492
+ // Tracks the colored region for voice-dictated text
4493
+ // { start, length, state: 'raw'|'corrected', sweepPos: number (chars already swept green) }
4494
+ let voiceSegment = null;
4495
+ let voiceSegmentTimer = null;
4496
+ let voiceSweepTimer = null;
4497
+
4498
+ function sweepToGreen(start, length) {
4499
+ if (voiceSweepTimer) clearInterval(voiceSweepTimer);
4500
+ if (!voiceSegment) return;
4501
+ voiceSegment.sweepPos = 0;
4502
+ voiceSegment.state = "corrected";
4503
+ const text = buffer.slice(start, start + length);
4504
+ const words = text.split(/(?<=\s)/); // split keeping spaces
4505
+ let charsDone = 0;
4506
+ let wordIdx = 0;
4507
+ voiceSweepTimer = setInterval(() => {
4508
+ if (settled || wordIdx >= words.length) {
4509
+ clearInterval(voiceSweepTimer);
4510
+ voiceSweepTimer = null;
4511
+ if (voiceSegment) voiceSegment.sweepPos = length;
4512
+ render();
4513
+ // Fade to normal after sweep completes
4514
+ if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
4515
+ voiceSegmentTimer = setTimeout(() => { voiceSegment = null; render(); }, 2000);
4516
+ return;
4517
+ }
4518
+ charsDone += words[wordIdx].length;
4519
+ wordIdx++;
4520
+ if (voiceSegment) voiceSegment.sweepPos = charsDone;
4521
+ render();
4522
+ }, 80);
4523
+ }
4481
4524
 
4482
4525
  function finish(nextValue) {
4483
4526
  if (settled) return;
@@ -4501,7 +4544,24 @@ async function readInteractiveLine(options = {}) {
4501
4544
  selectedIndex = 0;
4502
4545
  }
4503
4546
 
4504
- const displayBuffer = voiceIndicator ? `${buffer} ${voiceIndicator}` : buffer;
4547
+ // Apply visual coloring to voice-dictated text segments
4548
+ let coloredBuffer = buffer;
4549
+ if (voiceSegment && voiceSegment.start < buffer.length) {
4550
+ const s = voiceSegment;
4551
+ const before = buffer.slice(0, s.start);
4552
+ const seg = buffer.slice(s.start, s.start + s.length);
4553
+ const after = buffer.slice(s.start + s.length);
4554
+ if (s.state === "corrected" && typeof s.sweepPos === "number" && s.sweepPos < s.length) {
4555
+ // Sweep: green for swept portion, magenta for remaining
4556
+ const swept = seg.slice(0, s.sweepPos);
4557
+ const remaining = seg.slice(s.sweepPos);
4558
+ coloredBuffer = `${before}\x1b[32m${swept}\x1b[35m${remaining}\x1b[0m${after}`;
4559
+ } else {
4560
+ const color = s.state === "corrected" ? "\x1b[32m" : "\x1b[35m";
4561
+ coloredBuffer = `${before}${color}${seg}\x1b[0m${after}`;
4562
+ }
4563
+ }
4564
+ const displayBuffer = voiceIndicator ? `${coloredBuffer} ${voiceIndicator}` : coloredBuffer;
4505
4565
  const writePrompt = () => {
4506
4566
  output.write(formatPromptRow(displayBuffer, columns));
4507
4567
  };
@@ -4539,6 +4599,13 @@ async function readInteractiveLine(options = {}) {
4539
4599
  output.write("\r");
4540
4600
  writePrompt();
4541
4601
  }
4602
+
4603
+ // Position terminal cursor at cursorPos within the buffer
4604
+ if (cursorPos < buffer.length) {
4605
+ const prefixLen = USER_PREFIX.length + 1; // "you> " visible chars
4606
+ const col = prefixLen + cursorPos + 1; // 1-based column
4607
+ output.write(`\x1b[${col}G`);
4608
+ }
4542
4609
  }
4543
4610
 
4544
4611
  function cleanup() {
@@ -4560,6 +4627,7 @@ async function readInteractiveLine(options = {}) {
4560
4627
  if (!isExact || buffer === "/") {
4561
4628
  if (suggestionHasArgs(selected.name)) {
4562
4629
  buffer = nextValue;
4630
+ cursorPos = buffer.length;
4563
4631
  selectedIndex = 0;
4564
4632
  render();
4565
4633
  return;
@@ -4593,6 +4661,12 @@ async function readInteractiveLine(options = {}) {
4593
4661
  return;
4594
4662
  }
4595
4663
 
4664
+ // Escape: stop TTS playback if speaking
4665
+ if (key?.name === "escape" && voiceSession?.isSpeaking?.()) {
4666
+ voiceSession.stopSpeaking();
4667
+ return;
4668
+ }
4669
+
4596
4670
  if (key?.name === "return" || key?.name === "enter" || str === "\n" || str === "\r") {
4597
4671
  if (suppressSubmit) return;
4598
4672
  handleSubmit();
@@ -4600,14 +4674,44 @@ async function readInteractiveLine(options = {}) {
4600
4674
  }
4601
4675
 
4602
4676
  if (key?.name === "backspace") {
4603
- if (buffer.length > 0) {
4604
- buffer = buffer.slice(0, -1);
4677
+ if (cursorPos > 0) {
4678
+ buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
4679
+ cursorPos--;
4680
+ selectedIndex = 0;
4681
+ render();
4682
+ }
4683
+ return;
4684
+ }
4685
+
4686
+ if (key?.name === "delete") {
4687
+ if (cursorPos < buffer.length) {
4688
+ buffer = buffer.slice(0, cursorPos) + buffer.slice(cursorPos + 1);
4605
4689
  selectedIndex = 0;
4606
4690
  render();
4607
4691
  }
4608
4692
  return;
4609
4693
  }
4610
4694
 
4695
+ if (key?.name === "left") {
4696
+ if (cursorPos > 0) { cursorPos--; render(); }
4697
+ return;
4698
+ }
4699
+
4700
+ if (key?.name === "right") {
4701
+ if (cursorPos < buffer.length) { cursorPos++; render(); }
4702
+ return;
4703
+ }
4704
+
4705
+ if (key?.name === "home" || (key?.ctrl && key?.name === "a")) {
4706
+ if (cursorPos > 0) { cursorPos = 0; render(); }
4707
+ return;
4708
+ }
4709
+
4710
+ if (key?.name === "end" || (key?.ctrl && key?.name === "e")) {
4711
+ if (cursorPos < buffer.length) { cursorPos = buffer.length; render(); }
4712
+ return;
4713
+ }
4714
+
4611
4715
  if (key?.name === "up") {
4612
4716
  const suggestions = getSlashMenuSuggestions(buffer);
4613
4717
  if (suggestions.length > 0) {
@@ -4631,68 +4735,163 @@ async function readInteractiveLine(options = {}) {
4631
4735
  if (suggestions.length > 0) {
4632
4736
  const selected = suggestions[selectedIndex >= 0 ? selectedIndex : 0];
4633
4737
  buffer = commandInputFromSuggestion(selected.name);
4738
+ cursorPos = buffer.length;
4634
4739
  selectedIndex = 0;
4635
4740
  render();
4636
4741
  }
4637
4742
  return;
4638
4743
  }
4639
4744
 
4640
- // Voice: spacebar on empty/trailing-space triggers a 5-second recording.
4641
- // Uses fixed duration with clean sox exit same code path as test-capture.mjs.
4642
- if (voiceSession && !voiceRecording && str === " " && (buffer.length === 0 || buffer.endsWith(" "))) {
4643
- voiceRecording = true;
4644
- voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
4745
+ // Voice: spacebar stops active streaming recording
4746
+ if (voiceSession && voiceRecording && voiceSession.hasStreaming?.() && str === " ") {
4747
+ voiceRecording = false;
4748
+ voiceIndicator = "\x1b[36m[finalizing...]\x1b[0m";
4645
4749
  render();
4646
4750
 
4647
4751
  (async () => {
4648
4752
  try {
4649
- const result = await voiceSession.recordAndTranscribe(5);
4650
- voiceRecording = false;
4753
+ const result = await voiceSession.stopStreaming();
4754
+ if (result && typeof result === "object" && result.error) {
4755
+ voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
4756
+ render();
4757
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4758
+ return;
4759
+ }
4651
4760
 
4652
- if (result && typeof result === "object" && result.error) {
4653
- voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
4761
+ const finalText = typeof result === "string" ? result : "";
4762
+ voiceIndicator = "";
4763
+ if (!finalText) { render(); return; }
4764
+
4765
+ // Replace streaming preview with Moonshine final result
4766
+ const insertPoint = voiceSegment ? voiceSegment.start : buffer.length;
4767
+ const prevLength = voiceSegment ? voiceSegment.length : 0;
4768
+ const before = buffer.slice(0, insertPoint);
4769
+ const after = buffer.slice(insertPoint + prevLength);
4770
+ buffer = before + finalText + after;
4771
+ if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
4772
+ voiceSegment = { start: insertPoint, length: finalText.length, state: "raw" };
4773
+ cursorPos = insertPoint + finalText.length;
4654
4774
  render();
4655
- setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4656
- return;
4657
- }
4658
4775
 
4659
- const rawText = typeof result === "string" ? result : "";
4660
- voiceIndicator = "";
4661
- if (!rawText) {
4776
+ // Fire Grok correction with visual sweep
4777
+ if (grokConfig && grokConfig.apiKey) {
4778
+ voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
4779
+ render();
4780
+ voiceSession.correctTranscript(finalText, grokConfig).then((corrected) => {
4781
+ voiceIndicator = "";
4782
+ if (settled) return;
4783
+ const textToSweep = (corrected && corrected !== finalText) ? corrected : finalText;
4784
+ if (corrected && corrected !== finalText) {
4785
+ const b = buffer.slice(0, insertPoint);
4786
+ const a = buffer.slice(insertPoint + finalText.length);
4787
+ buffer = b + corrected + a;
4788
+ cursorPos = insertPoint + corrected.length;
4789
+ }
4790
+ voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
4791
+ render();
4792
+ sweepToGreen(insertPoint, textToSweep.length);
4793
+ });
4794
+ } else {
4795
+ // No Grok — sweep the Moonshine result directly
4796
+ sweepToGreen(insertPoint, finalText.length);
4797
+ }
4798
+ } catch (err) {
4799
+ voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
4662
4800
  render();
4663
- return;
4801
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4664
4802
  }
4803
+ })();
4804
+ return;
4805
+ }
4665
4806
 
4666
- const insertPoint = buffer.length;
4667
- buffer += rawText;
4807
+ // Voice: spacebar starts recording (only on empty/trailing-space at end of buffer)
4808
+ if (voiceSession && !voiceRecording && str === " " && cursorPos === buffer.length && (buffer.length === 0 || buffer.endsWith(" "))) {
4809
+ voiceRecording = true;
4810
+ const insertPoint = buffer.length;
4811
+ if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
4812
+ if (voiceSweepTimer) { clearInterval(voiceSweepTimer); voiceSweepTimer = null; }
4813
+ let lastStreamRender = 0;
4814
+ let streamRenderPending = null;
4815
+
4816
+ // Try streaming mode first, fall back to batch
4817
+ const streamingStarted = voiceSession.hasStreaming?.() && voiceSession.startStreaming((partialText) => {
4818
+ // Live update: replace voice segment with streaming partial result
4819
+ const before = buffer.slice(0, insertPoint);
4820
+ const prevLength = voiceSegment ? voiceSegment.length : 0;
4821
+ const after = buffer.slice(insertPoint + prevLength);
4822
+ buffer = before + partialText + after;
4823
+ voiceSegment = { start: insertPoint, length: partialText.length, state: "raw" };
4824
+ cursorPos = insertPoint + partialText.length;
4825
+ // Throttle renders to max ~8fps to prevent flicker
4826
+ const now = Date.now();
4827
+ if (now - lastStreamRender >= 120) {
4828
+ lastStreamRender = now;
4829
+ if (streamRenderPending) { clearTimeout(streamRenderPending); streamRenderPending = null; }
4668
4830
  render();
4831
+ } else if (!streamRenderPending) {
4832
+ streamRenderPending = setTimeout(() => { streamRenderPending = null; lastStreamRender = Date.now(); render(); }, 120);
4833
+ }
4834
+ });
4669
4835
 
4670
- if (grokConfig && grokConfig.apiKey) {
4671
- voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
4836
+ if (streamingStarted) {
4837
+ voiceIndicator = "\x1b[31m[recording — space to stop]\x1b[0m";
4838
+ voiceSegment = { start: insertPoint, length: 0, state: "raw" };
4839
+ render();
4840
+ } else {
4841
+ // Batch fallback (no streaming recognizer)
4842
+ voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
4843
+ render();
4844
+ (async () => {
4845
+ try {
4846
+ const result = await voiceSession.recordAndTranscribe(5);
4847
+ voiceRecording = false;
4848
+ if (result && typeof result === "object" && result.error) {
4849
+ voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
4850
+ render();
4851
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4852
+ return;
4853
+ }
4854
+ const rawText = typeof result === "string" ? result : "";
4855
+ voiceIndicator = "";
4856
+ if (!rawText) { render(); return; }
4857
+ buffer += rawText;
4858
+ voiceSegment = { start: insertPoint, length: rawText.length, state: "raw" };
4859
+ cursorPos = buffer.length;
4672
4860
  render();
4673
- voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
4674
- voiceIndicator = "";
4675
- if (settled) return;
4676
- if (corrected && corrected !== rawText) {
4677
- const before = buffer.slice(0, insertPoint);
4678
- const after = buffer.slice(insertPoint + rawText.length);
4679
- buffer = before + corrected + after;
4680
- }
4861
+ if (grokConfig && grokConfig.apiKey) {
4862
+ voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
4681
4863
  render();
4682
- });
4864
+ voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
4865
+ voiceIndicator = "";
4866
+ if (settled) return;
4867
+ const textToSweep = (corrected && corrected !== rawText) ? corrected : rawText;
4868
+ if (corrected && corrected !== rawText) {
4869
+ const b = buffer.slice(0, insertPoint);
4870
+ const a = buffer.slice(insertPoint + rawText.length);
4871
+ buffer = b + corrected + a;
4872
+ cursorPos = insertPoint + corrected.length;
4873
+ }
4874
+ voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
4875
+ render();
4876
+ sweepToGreen(insertPoint, textToSweep.length);
4877
+ });
4878
+ } else {
4879
+ sweepToGreen(insertPoint, rawText.length);
4880
+ }
4881
+ } catch (err) {
4882
+ voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
4883
+ voiceRecording = false;
4884
+ render();
4885
+ setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4683
4886
  }
4684
- } catch (err) {
4685
- voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
4686
- voiceRecording = false;
4687
- render();
4688
- setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
4689
- }
4690
- })();
4887
+ })();
4888
+ }
4691
4889
  return;
4692
4890
  }
4693
4891
 
4694
4892
  if (isPrintableKey(str, key)) {
4695
- buffer += str;
4893
+ buffer = buffer.slice(0, cursorPos) + str + buffer.slice(cursorPos);
4894
+ cursorPos += str.length;
4696
4895
  selectedIndex = 0;
4697
4896
  render();
4698
4897
 
@@ -4733,7 +4932,8 @@ async function readInteractiveLine(options = {}) {
4733
4932
 
4734
4933
  if (looksLikePastedBlock) {
4735
4934
  if (normalized) {
4736
- buffer += normalized;
4935
+ buffer = buffer.slice(0, cursorPos) + normalized + buffer.slice(cursorPos);
4936
+ cursorPos += normalized.length;
4737
4937
  selectedIndex = 0;
4738
4938
  render();
4739
4939
  }
@@ -4758,8 +4958,9 @@ async function readInteractiveLine(options = {}) {
4758
4958
  return;
4759
4959
  }
4760
4960
  if (ch === "\u007f" || ch === "\b") {
4761
- if (buffer.length > 0) {
4762
- buffer = buffer.slice(0, -1);
4961
+ if (cursorPos > 0) {
4962
+ buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
4963
+ cursorPos--;
4763
4964
  selectedIndex = 0;
4764
4965
  render();
4765
4966
  }
@@ -4767,7 +4968,8 @@ async function readInteractiveLine(options = {}) {
4767
4968
  }
4768
4969
  if (ch.charCodeAt(0) < 32 || ch.charCodeAt(0) === 127) continue;
4769
4970
  if (ch.includes("\x1b")) continue;
4770
- buffer += ch;
4971
+ buffer = buffer.slice(0, cursorPos) + ch + buffer.slice(cursorPos);
4972
+ cursorPos++;
4771
4973
  selectedIndex = 0;
4772
4974
  render();
4773
4975
  }
@@ -5066,8 +5268,9 @@ async function promptLoop(agent, session, context) {
5066
5268
  }
5067
5269
 
5068
5270
  async function handleNaturalInput(line) {
5069
- // Product builder intake: detect "I want a recipe app" in any mode
5070
- if (detectProductRequest(line)) {
5271
+ // Product builder intake: only in standard/guide mode. Expert mode uses cockpit.
5272
+ const currentMode = agent.getExperienceMode();
5273
+ if (detectProductRequest(line) && currentMode !== "expert") {
5071
5274
  const intent = parseProductIntent(line);
5072
5275
 
5073
5276
  // Extract everything from the request — no interactive prompts (stdin is unreliable on Windows)
@@ -5513,8 +5716,14 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
5513
5716
  lastUsage: context.lastUsage,
5514
5717
  costTracker: context.costTracker
5515
5718
  });
5516
- if (context.voiceModeEnabled) {
5517
- return "Voice ON — space to record, space to stop | " + footer;
5719
+ const modes = [];
5720
+ if (context.voiceModeEnabled) modes.push("Voice ON");
5721
+ if (context.speakModeEnabled) modes.push("Speak ON");
5722
+ if (modes.length > 0) {
5723
+ const hints = [];
5724
+ if (context.voiceModeEnabled) hints.push("space to record/stop");
5725
+ if (context.speakModeEnabled) hints.push("esc stops speech");
5726
+ return `${modes.join(" | ")} — ${hints.join(", ")} | ${footer}`;
5518
5727
  }
5519
5728
  return footer;
5520
5729
  },
@@ -7292,18 +7501,49 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
7292
7501
  context.voiceSession = await setupVoice((msg) => console.log(msg));
7293
7502
  }
7294
7503
  context.voiceModeEnabled = true;
7295
- console.log("Voice mode ON. Press spacebar to record (5 seconds).");
7504
+ console.log("Voice dictation ON. Press spacebar to record, spacebar to stop.");
7296
7505
  console.log(dim("Tip: Grok will auto-correct technical terms after transcription."));
7297
7506
  } catch (error) {
7298
7507
  console.log(`Voice mode failed: ${error instanceof Error ? error.message : String(error)}`);
7299
7508
  }
7300
7509
  } else {
7301
7510
  context.voiceModeEnabled = false;
7302
- if (context.voiceSession) {
7511
+ // Only tear down session if speak mode is also off
7512
+ if (!context.speakModeEnabled && context.voiceSession) {
7513
+ context.voiceSession.destroy();
7514
+ context.voiceSession = null;
7515
+ }
7516
+ console.log("Voice dictation OFF.");
7517
+ }
7518
+ continue;
7519
+ }
7520
+
7521
+ if (line === "/speak") {
7522
+ if (!context.speakModeEnabled) {
7523
+ try {
7524
+ if (!context.runtime.apiKey) {
7525
+ console.log("Speak mode requires an xAI API key. Set your API key first.");
7526
+ continue;
7527
+ }
7528
+ // Voice session is needed for sox playback; set up if not already done
7529
+ if (!context.voiceSession) {
7530
+ const { setupVoice } = await import("./voice.js");
7531
+ context.voiceSession = await setupVoice((msg) => console.log(msg));
7532
+ }
7533
+ context.speakModeEnabled = true;
7534
+ console.log("Speak mode ON. Agent will read responses aloud via xAI.");
7535
+ console.log(dim("Tip: Press Escape to stop speech mid-sentence."));
7536
+ } catch (error) {
7537
+ console.log(`Speak mode failed: ${error instanceof Error ? error.message : String(error)}`);
7538
+ }
7539
+ } else {
7540
+ context.speakModeEnabled = false;
7541
+ // Only tear down session if voice mode is also off
7542
+ if (!context.voiceModeEnabled && context.voiceSession) {
7303
7543
  context.voiceSession.destroy();
7304
7544
  context.voiceSession = null;
7305
7545
  }
7306
- console.log("Voice mode OFF.");
7546
+ console.log("Speak mode OFF.");
7307
7547
  }
7308
7548
  continue;
7309
7549
  }
package/src/voice.js CHANGED
@@ -13,8 +13,9 @@ const execFileAsync = promisify(execFile);
13
13
  // Paths
14
14
  // ---------------------------------------------------------------------------
15
15
 
16
- const MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
17
- const MODEL_FILES = [
16
+ // STT model (Moonshine)
17
+ const STT_MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
18
+ const STT_MODEL_FILES = [
18
19
  "preprocess.onnx",
19
20
  "encode.int8.onnx",
20
21
  "uncached_decode.int8.onnx",
@@ -22,15 +23,33 @@ const MODEL_FILES = [
22
23
  "tokens.txt"
23
24
  ];
24
25
 
26
+ // Streaming STT model (Zipformer 20M)
27
+ const STREAM_MODEL_DIR_NAME = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17";
28
+ const STREAM_MODEL_FILES = [
29
+ "encoder-epoch-99-avg-1.int8.onnx",
30
+ "decoder-epoch-99-avg-1.int8.onnx",
31
+ "joiner-epoch-99-avg-1.int8.onnx",
32
+ "tokens.txt"
33
+ ];
34
+
35
+ // xAI TTS API
36
+ const XAI_TTS_URL = "https://api.x.ai/v1/tts";
37
+
25
38
  function getWaterbrotherHome() {
26
39
  const home = process.env.HOME || process.env.USERPROFILE || "";
27
40
  return path.join(home, ".waterbrother");
28
41
  }
29
42
 
30
- function getModelsDir() {
31
- return path.join(getWaterbrotherHome(), "models", MODEL_DIR_NAME);
43
+ function getSttModelsDir() {
44
+ return path.join(getWaterbrotherHome(), "models", STT_MODEL_DIR_NAME);
45
+ }
46
+
47
+ function getStreamModelsDir() {
48
+ return path.join(getWaterbrotherHome(), "models", STREAM_MODEL_DIR_NAME);
32
49
  }
33
50
 
51
+
52
+
34
53
  function getVoiceRuntimeDir() {
35
54
  return path.join(getWaterbrotherHome(), "voice-runtime");
36
55
  }
@@ -94,31 +113,49 @@ async function checkSherpaOnnx() {
94
113
  }
95
114
  }
96
115
 
97
- async function checkModel() {
98
- const dir = getModelsDir();
116
+ async function checkSttModel() {
117
+ const dir = getSttModelsDir();
118
+ try {
119
+ const entries = await fs.readdir(dir);
120
+ const missing = STT_MODEL_FILES.filter((f) => !entries.includes(f));
121
+ return { ok: missing.length === 0, dir, missing };
122
+ } catch {
123
+ return { ok: false, dir, missing: STT_MODEL_FILES };
124
+ }
125
+ }
126
+
127
+ async function checkStreamModel() {
128
+ const dir = getStreamModelsDir();
99
129
  try {
100
130
  const entries = await fs.readdir(dir);
101
- const missing = MODEL_FILES.filter((f) => !entries.includes(f));
131
+ const missing = STREAM_MODEL_FILES.filter((f) => !entries.includes(f));
102
132
  return { ok: missing.length === 0, dir, missing };
103
133
  } catch {
104
- return { ok: false, dir, missing: MODEL_FILES };
134
+ return { ok: false, dir, missing: STREAM_MODEL_FILES };
105
135
  }
106
136
  }
107
137
 
138
+
139
+
108
140
  // ---------------------------------------------------------------------------
109
141
  // Model download
110
142
  // ---------------------------------------------------------------------------
111
143
 
112
- const MODEL_ARCHIVE_URL =
113
- `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${MODEL_DIR_NAME}.tar.bz2`;
144
+ const STT_MODEL_ARCHIVE_URL =
145
+ `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STT_MODEL_DIR_NAME}.tar.bz2`;
114
146
 
115
- async function downloadModel(onProgress) {
116
- const modelsRoot = path.dirname(getModelsDir());
147
+ const STREAM_MODEL_ARCHIVE_URL =
148
+ `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STREAM_MODEL_DIR_NAME}.tar.bz2`;
149
+
150
+
151
+
152
+ async function downloadArchive(archiveUrl, dirName, onProgress) {
153
+ const modelsRoot = path.join(getWaterbrotherHome(), "models");
117
154
  await fs.mkdir(modelsRoot, { recursive: true });
118
155
 
119
156
  if (onProgress) onProgress({ status: "downloading" });
120
157
 
121
- const response = await fetch(MODEL_ARCHIVE_URL, { redirect: "follow" });
158
+ const response = await fetch(archiveUrl, { redirect: "follow" });
122
159
  if (!response.ok) {
123
160
  throw new Error(`Failed to download model archive: HTTP ${response.status}`);
124
161
  }
@@ -139,7 +176,7 @@ async function downloadModel(onProgress) {
139
176
  }
140
177
 
141
178
  // Write archive to temp file, then extract
142
- const archivePath = path.join(modelsRoot, `${MODEL_DIR_NAME}.tar.bz2`);
179
+ const archivePath = path.join(modelsRoot, `${dirName}.tar.bz2`);
143
180
  const archiveBuffer = Buffer.concat(chunks);
144
181
  await fs.writeFile(archivePath, archiveBuffer);
145
182
  if (onProgress) onProgress({ status: "extracting" });
@@ -174,7 +211,7 @@ function createRecognizer() {
174
211
  const sherpa = _sherpaOnnx;
175
212
  if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
176
213
 
177
- const dir = getModelsDir();
214
+ const dir = getSttModelsDir();
178
215
  const config = {
179
216
  modelConfig: {
180
217
  moonshine: {
@@ -193,6 +230,35 @@ function createRecognizer() {
193
230
  return new sherpa.OfflineRecognizer(config);
194
231
  }
195
232
 
233
+
234
+
235
+ function createStreamingRecognizer() {
236
+ const sherpa = _sherpaOnnx;
237
+ if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
238
+
239
+ const dir = getStreamModelsDir();
240
+ const config = {
241
+ featConfig: { sampleRate: 16000, featureDim: 80 },
242
+ modelConfig: {
243
+ transducer: {
244
+ encoder: path.join(dir, "encoder-epoch-99-avg-1.int8.onnx"),
245
+ decoder: path.join(dir, "decoder-epoch-99-avg-1.int8.onnx"),
246
+ joiner: path.join(dir, "joiner-epoch-99-avg-1.int8.onnx"),
247
+ },
248
+ tokens: path.join(dir, "tokens.txt"),
249
+ numThreads: 2,
250
+ provider: "cpu",
251
+ debug: 0,
252
+ },
253
+ decodingMethod: "greedy_search",
254
+ enableEndpoint: true,
255
+ rule1MinTrailingSilence: 2.4,
256
+ rule2MinTrailingSilence: 1.2,
257
+ rule3MinUtteranceLength: 20,
258
+ };
259
+ return new sherpa.OnlineRecognizer(config);
260
+ }
261
+
196
262
  // ---------------------------------------------------------------------------
197
263
  // Audio device detection (Windows)
198
264
  // ---------------------------------------------------------------------------
@@ -450,32 +516,50 @@ export async function setupVoice(onStatus) {
450
516
  }
451
517
  log(" sherpa-onnx: ready");
452
518
 
453
- // 3. Model — auto-download if missing
454
- const model = await checkModel();
455
- if (!model.ok) {
456
- log(" Downloading Moonshine Base model (~250 MB)...");
457
- await downloadModel(({ status, downloaded, total, size }) => {
458
- if (status === "progress" && total > 0) {
459
- const pct = Math.round((downloaded / total) * 100);
460
- process.stdout.write(`\r ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
461
- } else if (status === "extracting") {
462
- process.stdout.write(`\r Extracting... \n`);
463
- } else if (status === "done") {
464
- log(` Done (${formatBytes(size)})`);
465
- }
466
- });
467
- log(" Model ready.");
519
+ // 3. STT model — auto-download if missing
520
+ const sttModel = await checkSttModel();
521
+ if (!sttModel.ok) {
522
+ log(" Downloading Moonshine Base STT model (~250 MB)...");
523
+ await downloadArchive(STT_MODEL_ARCHIVE_URL, STT_MODEL_DIR_NAME, downloadProgressHandler(log));
524
+ log(" STT model ready.");
468
525
  } else {
469
526
  log(" Moonshine Base: ready");
470
527
  }
471
528
 
472
- // 4. Detect audio device (Windows)
529
+ // 4. Streaming STT model — auto-download if missing
530
+ const streamModel = await checkStreamModel();
531
+ if (!streamModel.ok) {
532
+ log(" Downloading Zipformer streaming STT model (~122 MB)...");
533
+ await downloadArchive(STREAM_MODEL_ARCHIVE_URL, STREAM_MODEL_DIR_NAME, downloadProgressHandler(log));
534
+ log(" Streaming STT model ready.");
535
+ } else {
536
+ log(" Zipformer streaming: ready");
537
+ }
538
+
539
+ // 5. Detect audio device (Windows)
473
540
  const soxPath = sox.path;
474
541
  const audioDevice = await detectAudioDevice(soxPath, log);
475
542
 
476
- // 5. Initialize recognizer
543
+ // 6. Initialize recognizers
477
544
  await loadSherpaOnnx();
478
545
  const recognizer = createRecognizer();
546
+ let streamingRecognizer = null;
547
+ try {
548
+ streamingRecognizer = createStreamingRecognizer();
549
+ log(" Zipformer streaming: initialized");
550
+ } catch (err) {
551
+ log(` Zipformer streaming: failed (${err.message}) — falling back to batch mode`);
552
+ }
553
+ log(" TTS: xAI voice API (requires /speak + API key)");
554
+
555
+ // Active streaming recording state
556
+ let _streamingSox = null;
557
+ let _streamingStream = null;
558
+ let _streamingChunks = [];
559
+
560
+ // TTS playback state
561
+ let _ttsPlayback = null;
562
+ let _ttsCancelled = false;
479
563
 
480
564
  return {
481
565
  // Record for a fixed duration (sox exits cleanly, no kill).
@@ -492,11 +576,186 @@ export async function setupVoice(onStatus) {
492
576
  return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
493
577
  },
494
578
 
579
+ // Start streaming recognition. onPartial(text) fires as words are recognized.
580
+ // Returns true if streaming started, false if falling back to batch.
581
+ startStreaming(onPartial) {
582
+ if (!streamingRecognizer) return false;
583
+
584
+ _streamingChunks = [];
585
+ _streamingStream = streamingRecognizer.createStream();
586
+
587
+ const isWin = process.platform === "win32";
588
+ const inputArgs = isWin
589
+ ? ["-t", "waveaudio", audioDevice || "default"]
590
+ : ["-d"];
591
+ const args = [
592
+ ...inputArgs,
593
+ "-t", "raw", "-r", "16000", "-c", "1", "-b", "16", "-e", "signed-integer",
594
+ "-"
595
+ ];
596
+
597
+ _streamingSox = spawn(soxPath, args, { stdio: ["ignore", "pipe", "ignore"] });
598
+
599
+ _streamingSox.stdout.on("data", (chunk) => {
600
+ _streamingChunks.push(chunk);
601
+ const samples = new Float32Array(Math.floor(chunk.length / 2));
602
+ for (let i = 0; i < samples.length; i++) {
603
+ samples[i] = chunk.readInt16LE(i * 2) / 32768.0;
604
+ }
605
+
606
+ _streamingStream.acceptWaveform({ sampleRate: 16000, samples });
607
+ while (streamingRecognizer.isReady(_streamingStream)) {
608
+ streamingRecognizer.decode(_streamingStream);
609
+ }
610
+
611
+ const text = streamingRecognizer.getResult(_streamingStream).text.trim();
612
+ if (text) onPartial(text);
613
+ });
614
+
615
+ return true;
616
+ },
617
+
618
+ // Stop streaming and finalize with Moonshine for accuracy.
619
+ // Returns final text or { error: "..." }.
620
+ async stopStreaming() {
621
+ if (_streamingSox) {
622
+ _streamingSox.kill();
623
+ _streamingSox = null;
624
+ }
625
+ if (_streamingStream) {
626
+ streamingRecognizer.reset(_streamingStream);
627
+ _streamingStream = null;
628
+ }
629
+
630
+ // Combine all captured chunks and run Moonshine for final accuracy
631
+ if (_streamingChunks.length === 0) {
632
+ return { error: "No audio captured" };
633
+ }
634
+ const fullBuffer = Buffer.concat(_streamingChunks);
635
+ _streamingChunks = [];
636
+ const samples = new Float32Array(Math.floor(fullBuffer.length / 2));
637
+ for (let i = 0; i < samples.length; i++) {
638
+ samples[i] = fullBuffer.readInt16LE(i * 2) / 32768.0;
639
+ }
640
+
641
+ const durationMs = Math.round((samples.length / 16000) * 1000);
642
+ let maxAmp = 0;
643
+ for (const v of samples) { const a = Math.abs(v); if (a > maxAmp) maxAmp = a; }
644
+ const text = transcribe(recognizer, samples);
645
+ if (text) return text;
646
+ if (samples.length < 1600) return { error: `Recording too short (${durationMs}ms)` };
647
+ if (maxAmp < 0.01) return { error: `Silence (${durationMs}ms, amp=${maxAmp.toFixed(4)}) — mic not active` };
648
+ return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
649
+ },
650
+
651
+ hasStreaming() { return streamingRecognizer !== null; },
652
+
495
653
  async correctTranscript(rawText, grokConfig) {
496
654
  return correctTranscript(rawText, grokConfig);
497
655
  },
498
656
 
499
- destroy() {}
657
+ // Speak full text via xAI TTS API. Streams MP3 to temp file, plays via sox.
658
+ // Can be cancelled via stopSpeaking(). Requires grokConfig with apiKey.
659
+ async speakFull(text, { apiKey, baseUrl } = {}) {
660
+ if (!text || !apiKey) return;
661
+ _ttsCancelled = false;
662
+
663
+ // Strip ANSI, markdown, code blocks, emojis
664
+ let clean = text.replace(/\x1b\[[0-9;]*m/g, "").replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "");
665
+ clean = clean.replace(/[#*_~>]/g, "");
666
+ clean = clean.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{27BF}\u{2B50}\u{2B55}\u{231A}-\u{23F3}\u{23CF}\u{200D}\u{FE0F}\u{20E3}\u{E0020}-\u{E007F}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}]/gu, "");
667
+ const lines = clean.split("\n").map((l) => l.trim()).filter(Boolean);
668
+ const prose = lines.filter((l) => !(/^[/\\+\-@{]/.test(l) || /^\d+[:|]/.test(l) || l.length < 3));
669
+ const fullText = prose.join(". ");
670
+ if (!fullText.trim()) return;
671
+
672
+ try {
673
+ const response = await fetch(XAI_TTS_URL, {
674
+ method: "POST",
675
+ headers: {
676
+ "Authorization": `Bearer ${apiKey}`,
677
+ "Content-Type": "application/json",
678
+ },
679
+ body: JSON.stringify({
680
+ text: fullText,
681
+ voice_id: "eve",
682
+ language: "en",
683
+ }),
684
+ });
685
+
686
+ if (!response.ok) return;
687
+ if (_ttsCancelled) return;
688
+
689
+ const audioBuffer = Buffer.from(await response.arrayBuffer());
690
+ if (_ttsCancelled || !audioBuffer.length) return;
691
+
692
+ const tmpDir = path.join(getWaterbrotherHome(), "tmp");
693
+ await fs.mkdir(tmpDir, { recursive: true });
694
+ const ts = Date.now();
695
+ const mp3Path = path.join(tmpDir, `tts-${ts}.mp3`);
696
+ await fs.writeFile(mp3Path, audioBuffer);
697
+
698
+ if (_ttsCancelled) { fs.unlink(mp3Path).catch(() => {}); return; }
699
+
700
+ // Play MP3 — platform-native players
701
+ const cleanupFiles = [mp3Path];
702
+ let playCmd, playArgs;
703
+ if (process.platform === "darwin") {
704
+ playCmd = "afplay";
705
+ playArgs = [mp3Path];
706
+ } else if (process.platform === "win32") {
707
+ // PowerShell MediaPlayer — write temp .ps1 to avoid escaping issues
708
+ const psPath = path.join(tmpDir, `tts-${ts}.ps1`);
709
+ await fs.writeFile(psPath, [
710
+ "Add-Type -AssemblyName PresentationCore",
711
+ "$p = New-Object System.Windows.Media.MediaPlayer",
712
+ `$p.Open([uri]"${mp3Path.replace(/\\/g, "/")}")`,
713
+ "$p.Play()",
714
+ "Start-Sleep -Milliseconds 500",
715
+ "while($p.Position -lt $p.NaturalDuration.TimeSpan){ Start-Sleep -Milliseconds 200 }",
716
+ "$p.Close()",
717
+ ].join("\n"));
718
+ cleanupFiles.push(psPath);
719
+ playCmd = "powershell.exe";
720
+ playArgs = ["-NoProfile", "-ExecutionPolicy", "Bypass", "-File", psPath];
721
+ } else {
722
+ playCmd = "mpv";
723
+ playArgs = ["--no-video", "--really-quiet", mp3Path];
724
+ }
725
+
726
+ await new Promise((resolve) => {
727
+ const child = spawn(playCmd, playArgs, { stdio: "ignore" });
728
+ _ttsPlayback = child;
729
+ child.on("exit", () => {
730
+ if (_ttsPlayback === child) _ttsPlayback = null;
731
+ for (const f of cleanupFiles) fs.unlink(f).catch(() => {});
732
+ resolve();
733
+ });
734
+ child.on("error", () => { resolve(); });
735
+ });
736
+ } catch {
737
+ // TTS failed — silently ignore
738
+ }
739
+ _ttsCancelled = false;
740
+ },
741
+
742
+ // Stop any in-progress speech playback.
743
+ stopSpeaking() {
744
+ _ttsCancelled = true;
745
+ if (_ttsPlayback) {
746
+ try { _ttsPlayback.kill(); } catch {}
747
+ _ttsPlayback = null;
748
+ }
749
+ },
750
+
751
+ isSpeaking() { return _ttsPlayback !== null; },
752
+
753
+ hasTts() { return true; },
754
+
755
+ destroy() {
756
+ if (_streamingSox) { _streamingSox.kill(); _streamingSox = null; }
757
+ this.stopSpeaking();
758
+ }
500
759
  };
501
760
  }
502
761
 
@@ -504,6 +763,19 @@ export async function setupVoice(onStatus) {
504
763
  // Helpers
505
764
  // ---------------------------------------------------------------------------
506
765
 
766
/**
 * Build the progress callback handed to downloadArchive().
 *
 * @param {(message: string) => void} log - sink for the completion message.
 * @returns {(event: { status: string, downloaded?: number, total?: number, size?: number }) => void}
 *   callback that rewrites a percentage line on stdout for "progress" events
 *   with a known total, prints an extraction notice for "extracting", and
 *   logs the final size for "done". Other statuses are ignored.
 */
function downloadProgressHandler(log) {
  return (event) => {
    const { status, downloaded, total, size } = event;

    switch (status) {
      case "progress": {
        // Without a positive total there is nothing to compute a percentage from.
        if (!(total > 0)) break;
        const pct = Math.round((downloaded / total) * 100);
        process.stdout.write(`\r ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
        break;
      }
      case "extracting":
        process.stdout.write(`\r Extracting... \n`);
        break;
      case "done":
        log(` Done (${formatBytes(size)})`);
        break;
      default:
        break;
    }
  };
}
778
+
507
779
  function formatBytes(bytes) {
508
780
  if (bytes < 1024) return `${bytes} B`;
509
781
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;