@tritard/waterbrother 0.14.12 → 0.14.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +292 -52
- package/src/voice.js +305 -33
package/package.json
CHANGED
package/src/cli.js
CHANGED
|
@@ -172,7 +172,8 @@ const INTERACTIVE_COMMANDS = [
|
|
|
172
172
|
{ name: "/feedback", description: "Report a bug or share feedback" },
|
|
173
173
|
{ name: "/cost", description: "Show session token usage and cost breakdown" },
|
|
174
174
|
{ name: "/diff", description: "Show git changes in the current repo" },
|
|
175
|
-
{ name: "/voice", description: "Toggle voice dictation (press space to record)" }
|
|
175
|
+
{ name: "/voice", description: "Toggle voice dictation (press space to record)" },
|
|
176
|
+
{ name: "/speak", description: "Toggle TTS — agent reads responses aloud (esc to stop)" }
|
|
176
177
|
];
|
|
177
178
|
|
|
178
179
|
const AGENT_PROFILES = ["coder", "designer", "reviewer", "planner"];
|
|
@@ -4133,6 +4134,15 @@ async function runTextTurnInteractive({
|
|
|
4133
4134
|
});
|
|
4134
4135
|
|
|
4135
4136
|
printAssistantOutput(renderedAssistantText);
|
|
4137
|
+
|
|
4138
|
+
// TTS: speak the response when speak mode is active
|
|
4139
|
+
if (context.speakModeEnabled && context.voiceSession?.hasTts?.()) {
|
|
4140
|
+
context.voiceSession.speakFull(renderedAssistantText, {
|
|
4141
|
+
apiKey: context.runtime.apiKey,
|
|
4142
|
+
baseUrl: context.runtime.baseUrl,
|
|
4143
|
+
});
|
|
4144
|
+
}
|
|
4145
|
+
|
|
4136
4146
|
await setSessionRunState(currentSession, agent, "done");
|
|
4137
4147
|
printTurnSummary(turnSummary, response, { modelId: agent.getModel(), costTracker: context.costTracker, traceMode: context.runtime.traceMode });
|
|
4138
4148
|
printTraceTimeline(turnSummary, context.runtime.traceMode);
|
|
@@ -4469,6 +4479,7 @@ async function readInteractiveLine(options = {}) {
|
|
|
4469
4479
|
|
|
4470
4480
|
return new Promise((resolve, reject) => {
|
|
4471
4481
|
let buffer = "";
|
|
4482
|
+
let cursorPos = 0;
|
|
4472
4483
|
let selectedIndex = 0;
|
|
4473
4484
|
let settled = false;
|
|
4474
4485
|
let ignoredPastePrintable = 0;
|
|
@@ -4478,6 +4489,38 @@ async function readInteractiveLine(options = {}) {
|
|
|
4478
4489
|
// Voice recording state
|
|
4479
4490
|
let voiceRecording = false;
|
|
4480
4491
|
let voiceIndicator = "";
|
|
4492
|
+
// Tracks the colored region for voice-dictated text
|
|
4493
|
+
// { start, length, state: 'raw'|'corrected', sweepPos: number (chars already swept green) }
|
|
4494
|
+
let voiceSegment = null;
|
|
4495
|
+
let voiceSegmentTimer = null;
|
|
4496
|
+
let voiceSweepTimer = null;
|
|
4497
|
+
|
|
4498
|
+
// Animate the dictated text from its "raw" colour to green, one word per tick.
//
// `start` / `length` locate the dictated text inside `buffer`. The current
// `voiceSegment` is flagged as "corrected" and its `sweepPos` is advanced every
// 80 ms by the length of the next word. Once all words are swept (or the prompt
// has settled) the interval is cancelled, the segment is shown fully swept, and
// a 2 s timer is armed that drops the segment highlight altogether.
function sweepToGreen(start, length) {
  // A sweep that is still running is superseded by this one.
  if (voiceSweepTimer) clearInterval(voiceSweepTimer);
  if (!voiceSegment) return;

  voiceSegment.sweepPos = 0;
  voiceSegment.state = "corrected";

  // The lookbehind split keeps each word's trailing whitespace attached, so the
  // piece lengths add up to the length of the sliced text.
  const pieces = buffer.slice(start, start + length).split(/(?<=\s)/);
  let nextPiece = 0;
  let sweptChars = 0;

  const finishSweep = () => {
    clearInterval(voiceSweepTimer);
    voiceSweepTimer = null;
    if (voiceSegment) voiceSegment.sweepPos = length;
    render();
    // Fade back to the normal colour shortly after the sweep completes.
    if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
    voiceSegmentTimer = setTimeout(() => {
      voiceSegment = null;
      render();
    }, 2000);
  };

  const tick = () => {
    if (settled || nextPiece >= pieces.length) {
      finishSweep();
      return;
    }
    sweptChars += pieces[nextPiece].length;
    nextPiece += 1;
    if (voiceSegment) voiceSegment.sweepPos = sweptChars;
    render();
  };

  voiceSweepTimer = setInterval(tick, 80);
}
|
|
4481
4524
|
|
|
4482
4525
|
function finish(nextValue) {
|
|
4483
4526
|
if (settled) return;
|
|
@@ -4501,7 +4544,24 @@ async function readInteractiveLine(options = {}) {
|
|
|
4501
4544
|
selectedIndex = 0;
|
|
4502
4545
|
}
|
|
4503
4546
|
|
|
4504
|
-
|
|
4547
|
+
// Apply visual coloring to voice-dictated text segments
|
|
4548
|
+
let coloredBuffer = buffer;
|
|
4549
|
+
if (voiceSegment && voiceSegment.start < buffer.length) {
|
|
4550
|
+
const s = voiceSegment;
|
|
4551
|
+
const before = buffer.slice(0, s.start);
|
|
4552
|
+
const seg = buffer.slice(s.start, s.start + s.length);
|
|
4553
|
+
const after = buffer.slice(s.start + s.length);
|
|
4554
|
+
if (s.state === "corrected" && typeof s.sweepPos === "number" && s.sweepPos < s.length) {
|
|
4555
|
+
// Sweep: green for swept portion, magenta for remaining
|
|
4556
|
+
const swept = seg.slice(0, s.sweepPos);
|
|
4557
|
+
const remaining = seg.slice(s.sweepPos);
|
|
4558
|
+
coloredBuffer = `${before}\x1b[32m${swept}\x1b[35m${remaining}\x1b[0m${after}`;
|
|
4559
|
+
} else {
|
|
4560
|
+
const color = s.state === "corrected" ? "\x1b[32m" : "\x1b[35m";
|
|
4561
|
+
coloredBuffer = `${before}${color}${seg}\x1b[0m${after}`;
|
|
4562
|
+
}
|
|
4563
|
+
}
|
|
4564
|
+
const displayBuffer = voiceIndicator ? `${coloredBuffer} ${voiceIndicator}` : coloredBuffer;
|
|
4505
4565
|
const writePrompt = () => {
|
|
4506
4566
|
output.write(formatPromptRow(displayBuffer, columns));
|
|
4507
4567
|
};
|
|
@@ -4539,6 +4599,13 @@ async function readInteractiveLine(options = {}) {
|
|
|
4539
4599
|
output.write("\r");
|
|
4540
4600
|
writePrompt();
|
|
4541
4601
|
}
|
|
4602
|
+
|
|
4603
|
+
// Position terminal cursor at cursorPos within the buffer
|
|
4604
|
+
if (cursorPos < buffer.length) {
|
|
4605
|
+
const prefixLen = USER_PREFIX.length + 1; // "you> " visible chars
|
|
4606
|
+
const col = prefixLen + cursorPos + 1; // 1-based column
|
|
4607
|
+
output.write(`\x1b[${col}G`);
|
|
4608
|
+
}
|
|
4542
4609
|
}
|
|
4543
4610
|
|
|
4544
4611
|
function cleanup() {
|
|
@@ -4560,6 +4627,7 @@ async function readInteractiveLine(options = {}) {
|
|
|
4560
4627
|
if (!isExact || buffer === "/") {
|
|
4561
4628
|
if (suggestionHasArgs(selected.name)) {
|
|
4562
4629
|
buffer = nextValue;
|
|
4630
|
+
cursorPos = buffer.length;
|
|
4563
4631
|
selectedIndex = 0;
|
|
4564
4632
|
render();
|
|
4565
4633
|
return;
|
|
@@ -4593,6 +4661,12 @@ async function readInteractiveLine(options = {}) {
|
|
|
4593
4661
|
return;
|
|
4594
4662
|
}
|
|
4595
4663
|
|
|
4664
|
+
// Escape: stop TTS playback if speaking
|
|
4665
|
+
if (key?.name === "escape" && voiceSession?.isSpeaking?.()) {
|
|
4666
|
+
voiceSession.stopSpeaking();
|
|
4667
|
+
return;
|
|
4668
|
+
}
|
|
4669
|
+
|
|
4596
4670
|
if (key?.name === "return" || key?.name === "enter" || str === "\n" || str === "\r") {
|
|
4597
4671
|
if (suppressSubmit) return;
|
|
4598
4672
|
handleSubmit();
|
|
@@ -4600,14 +4674,44 @@ async function readInteractiveLine(options = {}) {
|
|
|
4600
4674
|
}
|
|
4601
4675
|
|
|
4602
4676
|
if (key?.name === "backspace") {
|
|
4603
|
-
if (
|
|
4604
|
-
buffer = buffer.slice(0, -1);
|
|
4677
|
+
if (cursorPos > 0) {
|
|
4678
|
+
buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
|
|
4679
|
+
cursorPos--;
|
|
4680
|
+
selectedIndex = 0;
|
|
4681
|
+
render();
|
|
4682
|
+
}
|
|
4683
|
+
return;
|
|
4684
|
+
}
|
|
4685
|
+
|
|
4686
|
+
if (key?.name === "delete") {
|
|
4687
|
+
if (cursorPos < buffer.length) {
|
|
4688
|
+
buffer = buffer.slice(0, cursorPos) + buffer.slice(cursorPos + 1);
|
|
4605
4689
|
selectedIndex = 0;
|
|
4606
4690
|
render();
|
|
4607
4691
|
}
|
|
4608
4692
|
return;
|
|
4609
4693
|
}
|
|
4610
4694
|
|
|
4695
|
+
if (key?.name === "left") {
|
|
4696
|
+
if (cursorPos > 0) { cursorPos--; render(); }
|
|
4697
|
+
return;
|
|
4698
|
+
}
|
|
4699
|
+
|
|
4700
|
+
if (key?.name === "right") {
|
|
4701
|
+
if (cursorPos < buffer.length) { cursorPos++; render(); }
|
|
4702
|
+
return;
|
|
4703
|
+
}
|
|
4704
|
+
|
|
4705
|
+
if (key?.name === "home" || (key?.ctrl && key?.name === "a")) {
|
|
4706
|
+
if (cursorPos > 0) { cursorPos = 0; render(); }
|
|
4707
|
+
return;
|
|
4708
|
+
}
|
|
4709
|
+
|
|
4710
|
+
if (key?.name === "end" || (key?.ctrl && key?.name === "e")) {
|
|
4711
|
+
if (cursorPos < buffer.length) { cursorPos = buffer.length; render(); }
|
|
4712
|
+
return;
|
|
4713
|
+
}
|
|
4714
|
+
|
|
4611
4715
|
if (key?.name === "up") {
|
|
4612
4716
|
const suggestions = getSlashMenuSuggestions(buffer);
|
|
4613
4717
|
if (suggestions.length > 0) {
|
|
@@ -4631,68 +4735,163 @@ async function readInteractiveLine(options = {}) {
|
|
|
4631
4735
|
if (suggestions.length > 0) {
|
|
4632
4736
|
const selected = suggestions[selectedIndex >= 0 ? selectedIndex : 0];
|
|
4633
4737
|
buffer = commandInputFromSuggestion(selected.name);
|
|
4738
|
+
cursorPos = buffer.length;
|
|
4634
4739
|
selectedIndex = 0;
|
|
4635
4740
|
render();
|
|
4636
4741
|
}
|
|
4637
4742
|
return;
|
|
4638
4743
|
}
|
|
4639
4744
|
|
|
4640
|
-
// Voice: spacebar
|
|
4641
|
-
|
|
4642
|
-
|
|
4643
|
-
|
|
4644
|
-
voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
|
|
4745
|
+
// Voice: spacebar stops active streaming recording
|
|
4746
|
+
if (voiceSession && voiceRecording && voiceSession.hasStreaming?.() && str === " ") {
|
|
4747
|
+
voiceRecording = false;
|
|
4748
|
+
voiceIndicator = "\x1b[36m[finalizing...]\x1b[0m";
|
|
4645
4749
|
render();
|
|
4646
4750
|
|
|
4647
4751
|
(async () => {
|
|
4648
4752
|
try {
|
|
4649
|
-
const result = await voiceSession.
|
|
4650
|
-
|
|
4753
|
+
const result = await voiceSession.stopStreaming();
|
|
4754
|
+
if (result && typeof result === "object" && result.error) {
|
|
4755
|
+
voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
|
|
4756
|
+
render();
|
|
4757
|
+
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4758
|
+
return;
|
|
4759
|
+
}
|
|
4651
4760
|
|
|
4652
|
-
|
|
4653
|
-
voiceIndicator =
|
|
4761
|
+
const finalText = typeof result === "string" ? result : "";
|
|
4762
|
+
voiceIndicator = "";
|
|
4763
|
+
if (!finalText) { render(); return; }
|
|
4764
|
+
|
|
4765
|
+
// Replace streaming preview with Moonshine final result
|
|
4766
|
+
const insertPoint = voiceSegment ? voiceSegment.start : buffer.length;
|
|
4767
|
+
const prevLength = voiceSegment ? voiceSegment.length : 0;
|
|
4768
|
+
const before = buffer.slice(0, insertPoint);
|
|
4769
|
+
const after = buffer.slice(insertPoint + prevLength);
|
|
4770
|
+
buffer = before + finalText + after;
|
|
4771
|
+
if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
|
|
4772
|
+
voiceSegment = { start: insertPoint, length: finalText.length, state: "raw" };
|
|
4773
|
+
cursorPos = insertPoint + finalText.length;
|
|
4654
4774
|
render();
|
|
4655
|
-
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4656
|
-
return;
|
|
4657
|
-
}
|
|
4658
4775
|
|
|
4659
|
-
|
|
4660
|
-
|
|
4661
|
-
|
|
4776
|
+
// Fire Grok correction with visual sweep
|
|
4777
|
+
if (grokConfig && grokConfig.apiKey) {
|
|
4778
|
+
voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
|
|
4779
|
+
render();
|
|
4780
|
+
voiceSession.correctTranscript(finalText, grokConfig).then((corrected) => {
|
|
4781
|
+
voiceIndicator = "";
|
|
4782
|
+
if (settled) return;
|
|
4783
|
+
const textToSweep = (corrected && corrected !== finalText) ? corrected : finalText;
|
|
4784
|
+
if (corrected && corrected !== finalText) {
|
|
4785
|
+
const b = buffer.slice(0, insertPoint);
|
|
4786
|
+
const a = buffer.slice(insertPoint + finalText.length);
|
|
4787
|
+
buffer = b + corrected + a;
|
|
4788
|
+
cursorPos = insertPoint + corrected.length;
|
|
4789
|
+
}
|
|
4790
|
+
voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
|
|
4791
|
+
render();
|
|
4792
|
+
sweepToGreen(insertPoint, textToSweep.length);
|
|
4793
|
+
});
|
|
4794
|
+
} else {
|
|
4795
|
+
// No Grok — sweep the Moonshine result directly
|
|
4796
|
+
sweepToGreen(insertPoint, finalText.length);
|
|
4797
|
+
}
|
|
4798
|
+
} catch (err) {
|
|
4799
|
+
voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
|
|
4662
4800
|
render();
|
|
4663
|
-
|
|
4801
|
+
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4664
4802
|
}
|
|
4803
|
+
})();
|
|
4804
|
+
return;
|
|
4805
|
+
}
|
|
4665
4806
|
|
|
4666
|
-
|
|
4667
|
-
|
|
4807
|
+
// Voice: spacebar starts recording (only on empty/trailing-space at end of buffer)
|
|
4808
|
+
if (voiceSession && !voiceRecording && str === " " && cursorPos === buffer.length && (buffer.length === 0 || buffer.endsWith(" "))) {
|
|
4809
|
+
voiceRecording = true;
|
|
4810
|
+
const insertPoint = buffer.length;
|
|
4811
|
+
if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
|
|
4812
|
+
if (voiceSweepTimer) { clearInterval(voiceSweepTimer); voiceSweepTimer = null; }
|
|
4813
|
+
let lastStreamRender = 0;
|
|
4814
|
+
let streamRenderPending = null;
|
|
4815
|
+
|
|
4816
|
+
// Try streaming mode first, fall back to batch
|
|
4817
|
+
const streamingStarted = voiceSession.hasStreaming?.() && voiceSession.startStreaming((partialText) => {
|
|
4818
|
+
// Live update: replace voice segment with streaming partial result
|
|
4819
|
+
const before = buffer.slice(0, insertPoint);
|
|
4820
|
+
const prevLength = voiceSegment ? voiceSegment.length : 0;
|
|
4821
|
+
const after = buffer.slice(insertPoint + prevLength);
|
|
4822
|
+
buffer = before + partialText + after;
|
|
4823
|
+
voiceSegment = { start: insertPoint, length: partialText.length, state: "raw" };
|
|
4824
|
+
cursorPos = insertPoint + partialText.length;
|
|
4825
|
+
// Throttle renders to max ~8fps to prevent flicker
|
|
4826
|
+
const now = Date.now();
|
|
4827
|
+
if (now - lastStreamRender >= 120) {
|
|
4828
|
+
lastStreamRender = now;
|
|
4829
|
+
if (streamRenderPending) { clearTimeout(streamRenderPending); streamRenderPending = null; }
|
|
4668
4830
|
render();
|
|
4831
|
+
} else if (!streamRenderPending) {
|
|
4832
|
+
streamRenderPending = setTimeout(() => { streamRenderPending = null; lastStreamRender = Date.now(); render(); }, 120);
|
|
4833
|
+
}
|
|
4834
|
+
});
|
|
4669
4835
|
|
|
4670
|
-
|
|
4671
|
-
|
|
4836
|
+
if (streamingStarted) {
|
|
4837
|
+
voiceIndicator = "\x1b[31m[recording — space to stop]\x1b[0m";
|
|
4838
|
+
voiceSegment = { start: insertPoint, length: 0, state: "raw" };
|
|
4839
|
+
render();
|
|
4840
|
+
} else {
|
|
4841
|
+
// Batch fallback (no streaming recognizer)
|
|
4842
|
+
voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
|
|
4843
|
+
render();
|
|
4844
|
+
(async () => {
|
|
4845
|
+
try {
|
|
4846
|
+
const result = await voiceSession.recordAndTranscribe(5);
|
|
4847
|
+
voiceRecording = false;
|
|
4848
|
+
if (result && typeof result === "object" && result.error) {
|
|
4849
|
+
voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
|
|
4850
|
+
render();
|
|
4851
|
+
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4852
|
+
return;
|
|
4853
|
+
}
|
|
4854
|
+
const rawText = typeof result === "string" ? result : "";
|
|
4855
|
+
voiceIndicator = "";
|
|
4856
|
+
if (!rawText) { render(); return; }
|
|
4857
|
+
buffer += rawText;
|
|
4858
|
+
voiceSegment = { start: insertPoint, length: rawText.length, state: "raw" };
|
|
4859
|
+
cursorPos = buffer.length;
|
|
4672
4860
|
render();
|
|
4673
|
-
|
|
4674
|
-
voiceIndicator = "";
|
|
4675
|
-
if (settled) return;
|
|
4676
|
-
if (corrected && corrected !== rawText) {
|
|
4677
|
-
const before = buffer.slice(0, insertPoint);
|
|
4678
|
-
const after = buffer.slice(insertPoint + rawText.length);
|
|
4679
|
-
buffer = before + corrected + after;
|
|
4680
|
-
}
|
|
4861
|
+
if (grokConfig && grokConfig.apiKey) {
|
|
4862
|
+
voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
|
|
4681
4863
|
render();
|
|
4682
|
-
|
|
4864
|
+
voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
|
|
4865
|
+
voiceIndicator = "";
|
|
4866
|
+
if (settled) return;
|
|
4867
|
+
const textToSweep = (corrected && corrected !== rawText) ? corrected : rawText;
|
|
4868
|
+
if (corrected && corrected !== rawText) {
|
|
4869
|
+
const b = buffer.slice(0, insertPoint);
|
|
4870
|
+
const a = buffer.slice(insertPoint + rawText.length);
|
|
4871
|
+
buffer = b + corrected + a;
|
|
4872
|
+
cursorPos = insertPoint + corrected.length;
|
|
4873
|
+
}
|
|
4874
|
+
voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
|
|
4875
|
+
render();
|
|
4876
|
+
sweepToGreen(insertPoint, textToSweep.length);
|
|
4877
|
+
});
|
|
4878
|
+
} else {
|
|
4879
|
+
sweepToGreen(insertPoint, rawText.length);
|
|
4880
|
+
}
|
|
4881
|
+
} catch (err) {
|
|
4882
|
+
voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
|
|
4883
|
+
voiceRecording = false;
|
|
4884
|
+
render();
|
|
4885
|
+
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4683
4886
|
}
|
|
4684
|
-
}
|
|
4685
|
-
|
|
4686
|
-
voiceRecording = false;
|
|
4687
|
-
render();
|
|
4688
|
-
setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
|
|
4689
|
-
}
|
|
4690
|
-
})();
|
|
4887
|
+
})();
|
|
4888
|
+
}
|
|
4691
4889
|
return;
|
|
4692
4890
|
}
|
|
4693
4891
|
|
|
4694
4892
|
if (isPrintableKey(str, key)) {
|
|
4695
|
-
buffer
|
|
4893
|
+
buffer = buffer.slice(0, cursorPos) + str + buffer.slice(cursorPos);
|
|
4894
|
+
cursorPos += str.length;
|
|
4696
4895
|
selectedIndex = 0;
|
|
4697
4896
|
render();
|
|
4698
4897
|
|
|
@@ -4733,7 +4932,8 @@ async function readInteractiveLine(options = {}) {
|
|
|
4733
4932
|
|
|
4734
4933
|
if (looksLikePastedBlock) {
|
|
4735
4934
|
if (normalized) {
|
|
4736
|
-
buffer
|
|
4935
|
+
buffer = buffer.slice(0, cursorPos) + normalized + buffer.slice(cursorPos);
|
|
4936
|
+
cursorPos += normalized.length;
|
|
4737
4937
|
selectedIndex = 0;
|
|
4738
4938
|
render();
|
|
4739
4939
|
}
|
|
@@ -4758,8 +4958,9 @@ async function readInteractiveLine(options = {}) {
|
|
|
4758
4958
|
return;
|
|
4759
4959
|
}
|
|
4760
4960
|
if (ch === "\u007f" || ch === "\b") {
|
|
4761
|
-
if (
|
|
4762
|
-
buffer = buffer.slice(0, -1);
|
|
4961
|
+
if (cursorPos > 0) {
|
|
4962
|
+
buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
|
|
4963
|
+
cursorPos--;
|
|
4763
4964
|
selectedIndex = 0;
|
|
4764
4965
|
render();
|
|
4765
4966
|
}
|
|
@@ -4767,7 +4968,8 @@ async function readInteractiveLine(options = {}) {
|
|
|
4767
4968
|
}
|
|
4768
4969
|
if (ch.charCodeAt(0) < 32 || ch.charCodeAt(0) === 127) continue;
|
|
4769
4970
|
if (ch.includes("\x1b")) continue;
|
|
4770
|
-
buffer
|
|
4971
|
+
buffer = buffer.slice(0, cursorPos) + ch + buffer.slice(cursorPos);
|
|
4972
|
+
cursorPos++;
|
|
4771
4973
|
selectedIndex = 0;
|
|
4772
4974
|
render();
|
|
4773
4975
|
}
|
|
@@ -5066,8 +5268,9 @@ async function promptLoop(agent, session, context) {
|
|
|
5066
5268
|
}
|
|
5067
5269
|
|
|
5068
5270
|
async function handleNaturalInput(line) {
|
|
5069
|
-
// Product builder intake:
|
|
5070
|
-
|
|
5271
|
+
// Product builder intake: only in standard/guide mode. Expert mode uses cockpit.
|
|
5272
|
+
const currentMode = agent.getExperienceMode();
|
|
5273
|
+
if (detectProductRequest(line) && currentMode !== "expert") {
|
|
5071
5274
|
const intent = parseProductIntent(line);
|
|
5072
5275
|
|
|
5073
5276
|
// Extract everything from the request — no interactive prompts (stdin is unreliable on Windows)
|
|
@@ -5513,8 +5716,14 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
|
|
|
5513
5716
|
lastUsage: context.lastUsage,
|
|
5514
5717
|
costTracker: context.costTracker
|
|
5515
5718
|
});
|
|
5516
|
-
|
|
5517
|
-
|
|
5719
|
+
const modes = [];
|
|
5720
|
+
if (context.voiceModeEnabled) modes.push("Voice ON");
|
|
5721
|
+
if (context.speakModeEnabled) modes.push("Speak ON");
|
|
5722
|
+
if (modes.length > 0) {
|
|
5723
|
+
const hints = [];
|
|
5724
|
+
if (context.voiceModeEnabled) hints.push("space to record/stop");
|
|
5725
|
+
if (context.speakModeEnabled) hints.push("esc stops speech");
|
|
5726
|
+
return `${modes.join(" | ")} — ${hints.join(", ")} | ${footer}`;
|
|
5518
5727
|
}
|
|
5519
5728
|
return footer;
|
|
5520
5729
|
},
|
|
@@ -7292,18 +7501,49 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
|
|
|
7292
7501
|
context.voiceSession = await setupVoice((msg) => console.log(msg));
|
|
7293
7502
|
}
|
|
7294
7503
|
context.voiceModeEnabled = true;
|
|
7295
|
-
console.log("Voice
|
|
7504
|
+
console.log("Voice dictation ON. Press spacebar to record, spacebar to stop.");
|
|
7296
7505
|
console.log(dim("Tip: Grok will auto-correct technical terms after transcription."));
|
|
7297
7506
|
} catch (error) {
|
|
7298
7507
|
console.log(`Voice mode failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
7299
7508
|
}
|
|
7300
7509
|
} else {
|
|
7301
7510
|
context.voiceModeEnabled = false;
|
|
7302
|
-
if
|
|
7511
|
+
// Only tear down session if speak mode is also off
|
|
7512
|
+
if (!context.speakModeEnabled && context.voiceSession) {
|
|
7513
|
+
context.voiceSession.destroy();
|
|
7514
|
+
context.voiceSession = null;
|
|
7515
|
+
}
|
|
7516
|
+
console.log("Voice dictation OFF.");
|
|
7517
|
+
}
|
|
7518
|
+
continue;
|
|
7519
|
+
}
|
|
7520
|
+
|
|
7521
|
+
if (line === "/speak") {
|
|
7522
|
+
if (!context.speakModeEnabled) {
|
|
7523
|
+
try {
|
|
7524
|
+
if (!context.runtime.apiKey) {
|
|
7525
|
+
console.log("Speak mode requires an xAI API key. Set your API key first.");
|
|
7526
|
+
continue;
|
|
7527
|
+
}
|
|
7528
|
+
// Voice session is needed for sox playback; set up if not already done
|
|
7529
|
+
if (!context.voiceSession) {
|
|
7530
|
+
const { setupVoice } = await import("./voice.js");
|
|
7531
|
+
context.voiceSession = await setupVoice((msg) => console.log(msg));
|
|
7532
|
+
}
|
|
7533
|
+
context.speakModeEnabled = true;
|
|
7534
|
+
console.log("Speak mode ON. Agent will read responses aloud via xAI.");
|
|
7535
|
+
console.log(dim("Tip: Press Escape to stop speech mid-sentence."));
|
|
7536
|
+
} catch (error) {
|
|
7537
|
+
console.log(`Speak mode failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
7538
|
+
}
|
|
7539
|
+
} else {
|
|
7540
|
+
context.speakModeEnabled = false;
|
|
7541
|
+
// Only tear down session if voice mode is also off
|
|
7542
|
+
if (!context.voiceModeEnabled && context.voiceSession) {
|
|
7303
7543
|
context.voiceSession.destroy();
|
|
7304
7544
|
context.voiceSession = null;
|
|
7305
7545
|
}
|
|
7306
|
-
console.log("
|
|
7546
|
+
console.log("Speak mode OFF.");
|
|
7307
7547
|
}
|
|
7308
7548
|
continue;
|
|
7309
7549
|
}
|
package/src/voice.js
CHANGED
|
@@ -13,8 +13,9 @@ const execFileAsync = promisify(execFile);
|
|
|
13
13
|
// Paths
|
|
14
14
|
// ---------------------------------------------------------------------------
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
const
|
|
16
|
+
// STT model (Moonshine)
|
|
17
|
+
const STT_MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
|
|
18
|
+
const STT_MODEL_FILES = [
|
|
18
19
|
"preprocess.onnx",
|
|
19
20
|
"encode.int8.onnx",
|
|
20
21
|
"uncached_decode.int8.onnx",
|
|
@@ -22,15 +23,33 @@ const MODEL_FILES = [
|
|
|
22
23
|
"tokens.txt"
|
|
23
24
|
];
|
|
24
25
|
|
|
26
|
+
// Streaming STT model (Zipformer 20M)
|
|
27
|
+
const STREAM_MODEL_DIR_NAME = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17";
|
|
28
|
+
const STREAM_MODEL_FILES = [
|
|
29
|
+
"encoder-epoch-99-avg-1.int8.onnx",
|
|
30
|
+
"decoder-epoch-99-avg-1.int8.onnx",
|
|
31
|
+
"joiner-epoch-99-avg-1.int8.onnx",
|
|
32
|
+
"tokens.txt"
|
|
33
|
+
];
|
|
34
|
+
|
|
35
|
+
// xAI TTS API
|
|
36
|
+
const XAI_TTS_URL = "https://api.x.ai/v1/tts";
|
|
37
|
+
|
|
25
38
|
// Resolve the per-user ".waterbrother" directory.
// Prefers $HOME, then %USERPROFILE%; when neither is set the result is the
// relative path ".waterbrother".
function getWaterbrotherHome() {
  const { HOME, USERPROFILE } = process.env;
  const userHome = HOME || USERPROFILE || "";
  return path.join(userHome, ".waterbrother");
}
|
|
29
42
|
|
|
30
|
-
function
|
|
31
|
-
return path.join(getWaterbrotherHome(), "models",
|
|
43
|
+
// Directory that holds the batch (Moonshine) STT model files:
// <waterbrother home>/models/<STT_MODEL_DIR_NAME>.
function getSttModelsDir() {
  const modelsRoot = path.join(getWaterbrotherHome(), "models");
  return path.join(modelsRoot, STT_MODEL_DIR_NAME);
}
|
|
46
|
+
|
|
47
|
+
// Directory that holds the streaming (Zipformer) STT model files:
// <waterbrother home>/models/<STREAM_MODEL_DIR_NAME>.
function getStreamModelsDir() {
  const modelsRoot = path.join(getWaterbrotherHome(), "models");
  return path.join(modelsRoot, STREAM_MODEL_DIR_NAME);
}
|
|
33
50
|
|
|
51
|
+
|
|
52
|
+
|
|
34
53
|
function getVoiceRuntimeDir() {
|
|
35
54
|
return path.join(getWaterbrotherHome(), "voice-runtime");
|
|
36
55
|
}
|
|
@@ -94,31 +113,49 @@ async function checkSherpaOnnx() {
|
|
|
94
113
|
}
|
|
95
114
|
}
|
|
96
115
|
|
|
97
|
-
async function
|
|
98
|
-
const dir =
|
|
116
|
+
// Report whether every file listed in STT_MODEL_FILES exists in the STT model
// directory.
//
// Resolves to { ok, dir, missing }:
//   ok      - true when nothing is missing
//   dir     - the directory that was inspected
//   missing - names from STT_MODEL_FILES not found in `dir`
// Never rejects: if the directory cannot be read, every file is reported missing.
async function checkSttModel() {
  const dir = getSttModelsDir();
  try {
    const present = new Set(await fs.readdir(dir));
    const missing = STT_MODEL_FILES.filter((file) => !present.has(file));
    return { ok: missing.length === 0, dir, missing };
  } catch {
    // Hand back a copy: returning the module-level array itself would let a
    // caller that mutates `missing` silently corrupt STT_MODEL_FILES.
    return { ok: false, dir, missing: [...STT_MODEL_FILES] };
  }
}
|
|
126
|
+
|
|
127
|
+
// Report whether every file listed in STREAM_MODEL_FILES exists in the
// streaming model directory.
//
// Resolves to { ok, dir, missing }:
//   ok      - true when nothing is missing
//   dir     - the directory that was inspected
//   missing - names from STREAM_MODEL_FILES not found in `dir`
// Never rejects: if the directory cannot be read, every file is reported missing.
async function checkStreamModel() {
  const dir = getStreamModelsDir();
  try {
    const present = new Set(await fs.readdir(dir));
    const missing = STREAM_MODEL_FILES.filter((file) => !present.has(file));
    return { ok: missing.length === 0, dir, missing };
  } catch {
    // Hand back a copy: returning the module-level array itself would let a
    // caller that mutates `missing` silently corrupt STREAM_MODEL_FILES.
    return { ok: false, dir, missing: [...STREAM_MODEL_FILES] };
  }
}
|
|
107
137
|
|
|
138
|
+
|
|
139
|
+
|
|
108
140
|
// ---------------------------------------------------------------------------
|
|
109
141
|
// Model download
|
|
110
142
|
// ---------------------------------------------------------------------------
|
|
111
143
|
|
|
112
|
-
const
|
|
113
|
-
`https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${
|
|
144
|
+
const STT_MODEL_ARCHIVE_URL =
|
|
145
|
+
`https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STT_MODEL_DIR_NAME}.tar.bz2`;
|
|
114
146
|
|
|
115
|
-
|
|
116
|
-
|
|
147
|
+
const STREAM_MODEL_ARCHIVE_URL =
|
|
148
|
+
`https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STREAM_MODEL_DIR_NAME}.tar.bz2`;
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
async function downloadArchive(archiveUrl, dirName, onProgress) {
|
|
153
|
+
const modelsRoot = path.join(getWaterbrotherHome(), "models");
|
|
117
154
|
await fs.mkdir(modelsRoot, { recursive: true });
|
|
118
155
|
|
|
119
156
|
if (onProgress) onProgress({ status: "downloading" });
|
|
120
157
|
|
|
121
|
-
const response = await fetch(
|
|
158
|
+
const response = await fetch(archiveUrl, { redirect: "follow" });
|
|
122
159
|
if (!response.ok) {
|
|
123
160
|
throw new Error(`Failed to download model archive: HTTP ${response.status}`);
|
|
124
161
|
}
|
|
@@ -139,7 +176,7 @@ async function downloadModel(onProgress) {
|
|
|
139
176
|
}
|
|
140
177
|
|
|
141
178
|
// Write archive to temp file, then extract
|
|
142
|
-
const archivePath = path.join(modelsRoot, `${
|
|
179
|
+
const archivePath = path.join(modelsRoot, `${dirName}.tar.bz2`);
|
|
143
180
|
const archiveBuffer = Buffer.concat(chunks);
|
|
144
181
|
await fs.writeFile(archivePath, archiveBuffer);
|
|
145
182
|
if (onProgress) onProgress({ status: "extracting" });
|
|
@@ -174,7 +211,7 @@ function createRecognizer() {
|
|
|
174
211
|
const sherpa = _sherpaOnnx;
|
|
175
212
|
if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
|
|
176
213
|
|
|
177
|
-
const dir =
|
|
214
|
+
const dir = getSttModelsDir();
|
|
178
215
|
const config = {
|
|
179
216
|
modelConfig: {
|
|
180
217
|
moonshine: {
|
|
@@ -193,6 +230,35 @@ function createRecognizer() {
|
|
|
193
230
|
return new sherpa.OfflineRecognizer(config);
|
|
194
231
|
}
|
|
195
232
|
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
// Build the sherpa-onnx OnlineRecognizer used for live (streaming) dictation.
// Model files are read from the streaming model directory.
// Throws if the sherpa-onnx-node module has not been loaded yet.
function createStreamingRecognizer() {
  const onnx = _sherpaOnnx;
  if (!onnx) throw new Error("sherpa-onnx-node not loaded");

  const modelDir = getStreamModelsDir();
  const modelFile = (fileName) => path.join(modelDir, fileName);

  const transducer = {
    encoder: modelFile("encoder-epoch-99-avg-1.int8.onnx"),
    decoder: modelFile("decoder-epoch-99-avg-1.int8.onnx"),
    joiner: modelFile("joiner-epoch-99-avg-1.int8.onnx"),
  };

  const modelConfig = {
    transducer,
    tokens: modelFile("tokens.txt"),
    numThreads: 2,
    provider: "cpu",
    debug: 0,
  };

  return new onnx.OnlineRecognizer({
    featConfig: { sampleRate: 16000, featureDim: 80 },
    modelConfig,
    decodingMethod: "greedy_search",
    // Endpoint-detection settings are handed to sherpa-onnx unchanged.
    // NOTE(review): the rule values are presumably seconds - confirm against the sherpa-onnx docs.
    enableEndpoint: true,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
  });
}
|
|
261
|
+
|
|
196
262
|
// ---------------------------------------------------------------------------
|
|
197
263
|
// Audio device detection (Windows)
|
|
198
264
|
// ---------------------------------------------------------------------------
|
|
@@ -450,32 +516,50 @@ export async function setupVoice(onStatus) {
|
|
|
450
516
|
}
|
|
451
517
|
log(" sherpa-onnx: ready");
|
|
452
518
|
|
|
453
|
-
// 3.
|
|
454
|
-
const
|
|
455
|
-
if (!
|
|
456
|
-
log(" Downloading Moonshine Base model (~250 MB)...");
|
|
457
|
-
await
|
|
458
|
-
|
|
459
|
-
const pct = Math.round((downloaded / total) * 100);
|
|
460
|
-
process.stdout.write(`\r ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
|
|
461
|
-
} else if (status === "extracting") {
|
|
462
|
-
process.stdout.write(`\r Extracting... \n`);
|
|
463
|
-
} else if (status === "done") {
|
|
464
|
-
log(` Done (${formatBytes(size)})`);
|
|
465
|
-
}
|
|
466
|
-
});
|
|
467
|
-
log(" Model ready.");
|
|
519
|
+
// 3. STT model — auto-download if missing
|
|
520
|
+
const sttModel = await checkSttModel();
|
|
521
|
+
if (!sttModel.ok) {
|
|
522
|
+
log(" Downloading Moonshine Base STT model (~250 MB)...");
|
|
523
|
+
await downloadArchive(STT_MODEL_ARCHIVE_URL, STT_MODEL_DIR_NAME, downloadProgressHandler(log));
|
|
524
|
+
log(" STT model ready.");
|
|
468
525
|
} else {
|
|
469
526
|
log(" Moonshine Base: ready");
|
|
470
527
|
}
|
|
471
528
|
|
|
472
|
-
// 4.
|
|
529
|
+
// 4. Streaming STT model — auto-download if missing
|
|
530
|
+
const streamModel = await checkStreamModel();
|
|
531
|
+
if (!streamModel.ok) {
|
|
532
|
+
log(" Downloading Zipformer streaming STT model (~122 MB)...");
|
|
533
|
+
await downloadArchive(STREAM_MODEL_ARCHIVE_URL, STREAM_MODEL_DIR_NAME, downloadProgressHandler(log));
|
|
534
|
+
log(" Streaming STT model ready.");
|
|
535
|
+
} else {
|
|
536
|
+
log(" Zipformer streaming: ready");
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
// 5. Detect audio device (Windows)
|
|
473
540
|
const soxPath = sox.path;
|
|
474
541
|
const audioDevice = await detectAudioDevice(soxPath, log);
|
|
475
542
|
|
|
476
|
-
//
|
|
543
|
+
// 6. Initialize recognizers
|
|
477
544
|
await loadSherpaOnnx();
|
|
478
545
|
const recognizer = createRecognizer();
|
|
546
|
+
let streamingRecognizer = null;
|
|
547
|
+
try {
|
|
548
|
+
streamingRecognizer = createStreamingRecognizer();
|
|
549
|
+
log(" Zipformer streaming: initialized");
|
|
550
|
+
} catch (err) {
|
|
551
|
+
log(` Zipformer streaming: failed (${err.message}) — falling back to batch mode`);
|
|
552
|
+
}
|
|
553
|
+
log(" TTS: xAI voice API (requires /speak + API key)");
|
|
554
|
+
|
|
555
|
+
// Active streaming recording state
|
|
556
|
+
let _streamingSox = null;
|
|
557
|
+
let _streamingStream = null;
|
|
558
|
+
let _streamingChunks = [];
|
|
559
|
+
|
|
560
|
+
// TTS playback state
|
|
561
|
+
let _ttsPlayback = null;
|
|
562
|
+
let _ttsCancelled = false;
|
|
479
563
|
|
|
480
564
|
return {
|
|
481
565
|
// Record for a fixed duration (sox exits cleanly, no kill).
|
|
@@ -492,11 +576,186 @@ export async function setupVoice(onStatus) {
|
|
|
492
576
|
return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
|
|
493
577
|
},
|
|
494
578
|
|
|
579
|
+
// Start streaming recognition. onPartial(text) fires as words are recognized.
|
|
580
|
+
// Returns true if streaming started, false if falling back to batch.
|
|
581
|
+
startStreaming(onPartial) {
|
|
582
|
+
if (!streamingRecognizer) return false;
|
|
583
|
+
|
|
584
|
+
_streamingChunks = [];
|
|
585
|
+
_streamingStream = streamingRecognizer.createStream();
|
|
586
|
+
|
|
587
|
+
const isWin = process.platform === "win32";
|
|
588
|
+
const inputArgs = isWin
|
|
589
|
+
? ["-t", "waveaudio", audioDevice || "default"]
|
|
590
|
+
: ["-d"];
|
|
591
|
+
const args = [
|
|
592
|
+
...inputArgs,
|
|
593
|
+
"-t", "raw", "-r", "16000", "-c", "1", "-b", "16", "-e", "signed-integer",
|
|
594
|
+
"-"
|
|
595
|
+
];
|
|
596
|
+
|
|
597
|
+
_streamingSox = spawn(soxPath, args, { stdio: ["ignore", "pipe", "ignore"] });
|
|
598
|
+
|
|
599
|
+
_streamingSox.stdout.on("data", (chunk) => {
|
|
600
|
+
_streamingChunks.push(chunk);
|
|
601
|
+
const samples = new Float32Array(Math.floor(chunk.length / 2));
|
|
602
|
+
for (let i = 0; i < samples.length; i++) {
|
|
603
|
+
samples[i] = chunk.readInt16LE(i * 2) / 32768.0;
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
_streamingStream.acceptWaveform({ sampleRate: 16000, samples });
|
|
607
|
+
while (streamingRecognizer.isReady(_streamingStream)) {
|
|
608
|
+
streamingRecognizer.decode(_streamingStream);
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
const text = streamingRecognizer.getResult(_streamingStream).text.trim();
|
|
612
|
+
if (text) onPartial(text);
|
|
613
|
+
});
|
|
614
|
+
|
|
615
|
+
return true;
|
|
616
|
+
},
|
|
617
|
+
|
|
618
|
+
// Stop streaming and finalize with Moonshine for accuracy.
|
|
619
|
+
// Returns final text or { error: "..." }.
|
|
620
|
+
async stopStreaming() {
|
|
621
|
+
if (_streamingSox) {
|
|
622
|
+
_streamingSox.kill();
|
|
623
|
+
_streamingSox = null;
|
|
624
|
+
}
|
|
625
|
+
if (_streamingStream) {
|
|
626
|
+
streamingRecognizer.reset(_streamingStream);
|
|
627
|
+
_streamingStream = null;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
// Combine all captured chunks and run Moonshine for final accuracy
|
|
631
|
+
if (_streamingChunks.length === 0) {
|
|
632
|
+
return { error: "No audio captured" };
|
|
633
|
+
}
|
|
634
|
+
const fullBuffer = Buffer.concat(_streamingChunks);
|
|
635
|
+
_streamingChunks = [];
|
|
636
|
+
const samples = new Float32Array(Math.floor(fullBuffer.length / 2));
|
|
637
|
+
for (let i = 0; i < samples.length; i++) {
|
|
638
|
+
samples[i] = fullBuffer.readInt16LE(i * 2) / 32768.0;
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
const durationMs = Math.round((samples.length / 16000) * 1000);
|
|
642
|
+
let maxAmp = 0;
|
|
643
|
+
for (const v of samples) { const a = Math.abs(v); if (a > maxAmp) maxAmp = a; }
|
|
644
|
+
const text = transcribe(recognizer, samples);
|
|
645
|
+
if (text) return text;
|
|
646
|
+
if (samples.length < 1600) return { error: `Recording too short (${durationMs}ms)` };
|
|
647
|
+
if (maxAmp < 0.01) return { error: `Silence (${durationMs}ms, amp=${maxAmp.toFixed(4)}) — mic not active` };
|
|
648
|
+
return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
|
|
649
|
+
},
|
|
650
|
+
|
|
651
|
+
hasStreaming() { return streamingRecognizer !== null; },
|
|
652
|
+
|
|
495
653
|
async correctTranscript(rawText, grokConfig) {
|
|
496
654
|
return correctTranscript(rawText, grokConfig);
|
|
497
655
|
},
|
|
498
656
|
|
|
499
|
-
|
|
657
|
+
// Speak full text via xAI TTS API. Streams MP3 to temp file, plays via sox.
|
|
658
|
+
// Can be cancelled via stopSpeaking(). Requires grokConfig with apiKey.
|
|
659
|
+
async speakFull(text, { apiKey, baseUrl } = {}) {
|
|
660
|
+
if (!text || !apiKey) return;
|
|
661
|
+
_ttsCancelled = false;
|
|
662
|
+
|
|
663
|
+
// Strip ANSI, markdown, code blocks, emojis
|
|
664
|
+
let clean = text.replace(/\x1b\[[0-9;]*m/g, "").replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "");
|
|
665
|
+
clean = clean.replace(/[#*_~>]/g, "");
|
|
666
|
+
clean = clean.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{27BF}\u{2B50}\u{2B55}\u{231A}-\u{23F3}\u{23CF}\u{200D}\u{FE0F}\u{20E3}\u{E0020}-\u{E007F}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}]/gu, "");
|
|
667
|
+
const lines = clean.split("\n").map((l) => l.trim()).filter(Boolean);
|
|
668
|
+
const prose = lines.filter((l) => !(/^[/\\+\-@{]/.test(l) || /^\d+[:|]/.test(l) || l.length < 3));
|
|
669
|
+
const fullText = prose.join(". ");
|
|
670
|
+
if (!fullText.trim()) return;
|
|
671
|
+
|
|
672
|
+
try {
|
|
673
|
+
const response = await fetch(XAI_TTS_URL, {
|
|
674
|
+
method: "POST",
|
|
675
|
+
headers: {
|
|
676
|
+
"Authorization": `Bearer ${apiKey}`,
|
|
677
|
+
"Content-Type": "application/json",
|
|
678
|
+
},
|
|
679
|
+
body: JSON.stringify({
|
|
680
|
+
text: fullText,
|
|
681
|
+
voice_id: "eve",
|
|
682
|
+
language: "en",
|
|
683
|
+
}),
|
|
684
|
+
});
|
|
685
|
+
|
|
686
|
+
if (!response.ok) return;
|
|
687
|
+
if (_ttsCancelled) return;
|
|
688
|
+
|
|
689
|
+
const audioBuffer = Buffer.from(await response.arrayBuffer());
|
|
690
|
+
if (_ttsCancelled || !audioBuffer.length) return;
|
|
691
|
+
|
|
692
|
+
const tmpDir = path.join(getWaterbrotherHome(), "tmp");
|
|
693
|
+
await fs.mkdir(tmpDir, { recursive: true });
|
|
694
|
+
const ts = Date.now();
|
|
695
|
+
const mp3Path = path.join(tmpDir, `tts-${ts}.mp3`);
|
|
696
|
+
await fs.writeFile(mp3Path, audioBuffer);
|
|
697
|
+
|
|
698
|
+
if (_ttsCancelled) { fs.unlink(mp3Path).catch(() => {}); return; }
|
|
699
|
+
|
|
700
|
+
// Play MP3 — platform-native players
|
|
701
|
+
const cleanupFiles = [mp3Path];
|
|
702
|
+
let playCmd, playArgs;
|
|
703
|
+
if (process.platform === "darwin") {
|
|
704
|
+
playCmd = "afplay";
|
|
705
|
+
playArgs = [mp3Path];
|
|
706
|
+
} else if (process.platform === "win32") {
|
|
707
|
+
// PowerShell MediaPlayer — write temp .ps1 to avoid escaping issues
|
|
708
|
+
const psPath = path.join(tmpDir, `tts-${ts}.ps1`);
|
|
709
|
+
await fs.writeFile(psPath, [
|
|
710
|
+
"Add-Type -AssemblyName PresentationCore",
|
|
711
|
+
"$p = New-Object System.Windows.Media.MediaPlayer",
|
|
712
|
+
`$p.Open([uri]"${mp3Path.replace(/\\/g, "/")}")`,
|
|
713
|
+
"$p.Play()",
|
|
714
|
+
"Start-Sleep -Milliseconds 500",
|
|
715
|
+
"while($p.Position -lt $p.NaturalDuration.TimeSpan){ Start-Sleep -Milliseconds 200 }",
|
|
716
|
+
"$p.Close()",
|
|
717
|
+
].join("\n"));
|
|
718
|
+
cleanupFiles.push(psPath);
|
|
719
|
+
playCmd = "powershell.exe";
|
|
720
|
+
playArgs = ["-NoProfile", "-ExecutionPolicy", "Bypass", "-File", psPath];
|
|
721
|
+
} else {
|
|
722
|
+
playCmd = "mpv";
|
|
723
|
+
playArgs = ["--no-video", "--really-quiet", mp3Path];
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
await new Promise((resolve) => {
|
|
727
|
+
const child = spawn(playCmd, playArgs, { stdio: "ignore" });
|
|
728
|
+
_ttsPlayback = child;
|
|
729
|
+
child.on("exit", () => {
|
|
730
|
+
if (_ttsPlayback === child) _ttsPlayback = null;
|
|
731
|
+
for (const f of cleanupFiles) fs.unlink(f).catch(() => {});
|
|
732
|
+
resolve();
|
|
733
|
+
});
|
|
734
|
+
child.on("error", () => { resolve(); });
|
|
735
|
+
});
|
|
736
|
+
} catch {
|
|
737
|
+
// TTS failed — silently ignore
|
|
738
|
+
}
|
|
739
|
+
_ttsCancelled = false;
|
|
740
|
+
},
|
|
741
|
+
|
|
742
|
+
// Stop any in-progress speech playback.
|
|
743
|
+
stopSpeaking() {
|
|
744
|
+
_ttsCancelled = true;
|
|
745
|
+
if (_ttsPlayback) {
|
|
746
|
+
try { _ttsPlayback.kill(); } catch {}
|
|
747
|
+
_ttsPlayback = null;
|
|
748
|
+
}
|
|
749
|
+
},
|
|
750
|
+
|
|
751
|
+
isSpeaking() { return _ttsPlayback !== null; },
|
|
752
|
+
|
|
753
|
+
hasTts() { return true; },
|
|
754
|
+
|
|
755
|
+
destroy() {
|
|
756
|
+
if (_streamingSox) { _streamingSox.kill(); _streamingSox = null; }
|
|
757
|
+
this.stopSpeaking();
|
|
758
|
+
}
|
|
500
759
|
};
|
|
501
760
|
}
|
|
502
761
|
|
|
@@ -504,6 +763,19 @@ export async function setupVoice(onStatus) {
|
|
|
504
763
|
// Helpers
|
|
505
764
|
// ---------------------------------------------------------------------------
|
|
506
765
|
|
|
766
|
+
/**
 * Build a download status callback.
 *
 * The returned function receives `{ status, downloaded, total, size }`:
 *  - "progress" with a positive `total`: rewrites the current stdout line
 *    with a percentage and byte counts;
 *  - "extracting": overwrites that line with an extracting notice;
 *  - "done": reports the final size through `log`.
 * Any other status is ignored.
 *
 * @param {(message: string) => void} log - sink for the completion message.
 * @returns {(event: object) => void} status callback.
 */
function downloadProgressHandler(log) {
  return (event) => {
    const { status, downloaded, total, size } = event;

    switch (status) {
      case "progress": {
        // Without a known positive total there is no percentage to show.
        if (!(total > 0)) break;
        const pct = Math.round((downloaded / total) * 100);
        process.stdout.write(`\r ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
        break;
      }
      case "extracting":
        process.stdout.write(`\r Extracting... \n`);
        break;
      case "done":
        log(` Done (${formatBytes(size)})`);
        break;
      default:
        break;
    }
  };
}
|
|
778
|
+
|
|
507
779
|
function formatBytes(bytes) {
|
|
508
780
|
if (bytes < 1024) return `${bytes} B`;
|
|
509
781
|
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|