verbalcoding 0.2.11 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.env.example +27 -1
  2. package/README.es.md +132 -0
  3. package/README.fr.md +132 -0
  4. package/README.ja.md +132 -0
  5. package/README.ko.md +132 -0
  6. package/README.md +116 -74
  7. package/README.ru.md +132 -0
  8. package/README.zh.md +131 -0
  9. package/app-node/agent_adapters.mjs +37 -5
  10. package/app-node/agent_adapters.test.mjs +13 -1
  11. package/app-node/agent_detect.mjs +73 -0
  12. package/app-node/agent_detect.test.mjs +77 -0
  13. package/app-node/install_config.mjs +3 -0
  14. package/app-node/main.mjs +339 -4
  15. package/app-node/notify.mjs +73 -0
  16. package/app-node/notify.test.mjs +68 -0
  17. package/app-node/plan_mode.mjs +174 -0
  18. package/app-node/plan_mode.test.mjs +153 -0
  19. package/app-node/smart_progress.mjs +94 -0
  20. package/app-node/smart_progress.test.mjs +66 -0
  21. package/app-node/stream_sentencer.mjs +61 -0
  22. package/app-node/stream_sentencer.test.mjs +64 -0
  23. package/app-node/streaming_tts_queue.mjs +48 -0
  24. package/app-node/streaming_tts_queue.test.mjs +58 -0
  25. package/app-node/text_routing.mjs +20 -0
  26. package/app-node/text_routing.test.mjs +23 -1
  27. package/docs/CONFIGURATION.md +69 -96
  28. package/docs/FRESH_INSTALL.md +105 -63
  29. package/docs/HERMES_VOICE.md +65 -0
  30. package/docs/MULTI_INSTANCE.md +16 -0
  31. package/docs/README.md +49 -0
  32. package/docs/RELEASE.md +42 -19
  33. package/docs/ROADMAP.md +38 -0
  34. package/docs/TROUBLESHOOTING.md +126 -0
  35. package/docs/USAGE.md +72 -40
  36. package/docs/assets/figures/verbalcoding-flow.svg +1 -1
  37. package/docs/i18n/CONFIGURATION.es.md +25 -0
  38. package/docs/i18n/CONFIGURATION.fr.md +25 -0
  39. package/docs/i18n/CONFIGURATION.ja.md +25 -0
  40. package/docs/i18n/CONFIGURATION.ko.md +25 -0
  41. package/docs/i18n/CONFIGURATION.ru.md +25 -0
  42. package/docs/i18n/CONFIGURATION.zh.md +25 -0
  43. package/docs/i18n/FRESH_INSTALL.es.md +27 -2
  44. package/docs/i18n/FRESH_INSTALL.fr.md +27 -2
  45. package/docs/i18n/FRESH_INSTALL.ja.md +27 -2
  46. package/docs/i18n/FRESH_INSTALL.ko.md +27 -2
  47. package/docs/i18n/FRESH_INSTALL.ru.md +27 -2
  48. package/docs/i18n/FRESH_INSTALL.zh.md +27 -2
  49. package/docs/i18n/HERMES_VOICE.es.md +46 -0
  50. package/docs/i18n/HERMES_VOICE.fr.md +46 -0
  51. package/docs/i18n/HERMES_VOICE.ja.md +46 -0
  52. package/docs/i18n/HERMES_VOICE.ko.md +65 -0
  53. package/docs/i18n/HERMES_VOICE.ru.md +46 -0
  54. package/docs/i18n/HERMES_VOICE.zh.md +46 -0
  55. package/docs/i18n/MULTI_INSTANCE.es.md +25 -0
  56. package/docs/i18n/MULTI_INSTANCE.fr.md +25 -0
  57. package/docs/i18n/MULTI_INSTANCE.ja.md +25 -0
  58. package/docs/i18n/MULTI_INSTANCE.ko.md +25 -0
  59. package/docs/i18n/MULTI_INSTANCE.ru.md +25 -0
  60. package/docs/i18n/MULTI_INSTANCE.zh.md +25 -0
  61. package/docs/i18n/README.es.md +20 -134
  62. package/docs/i18n/README.fr.md +20 -134
  63. package/docs/i18n/README.ja.md +20 -134
  64. package/docs/i18n/README.ko.md +20 -133
  65. package/docs/i18n/README.ru.md +20 -134
  66. package/docs/i18n/README.zh.md +20 -133
  67. package/docs/i18n/RELEASE.es.md +26 -1
  68. package/docs/i18n/RELEASE.fr.md +26 -1
  69. package/docs/i18n/RELEASE.ja.md +26 -1
  70. package/docs/i18n/RELEASE.ko.md +26 -1
  71. package/docs/i18n/RELEASE.ru.md +26 -1
  72. package/docs/i18n/RELEASE.zh.md +26 -1
  73. package/docs/i18n/TROUBLESHOOTING.es.md +39 -0
  74. package/docs/i18n/TROUBLESHOOTING.fr.md +39 -0
  75. package/docs/i18n/TROUBLESHOOTING.ja.md +39 -0
  76. package/docs/i18n/TROUBLESHOOTING.ko.md +39 -0
  77. package/docs/i18n/TROUBLESHOOTING.ru.md +39 -0
  78. package/docs/i18n/TROUBLESHOOTING.zh.md +39 -0
  79. package/docs/i18n/USAGE.es.md +25 -0
  80. package/docs/i18n/USAGE.fr.md +25 -0
  81. package/docs/i18n/USAGE.ja.md +25 -0
  82. package/docs/i18n/USAGE.ko.md +25 -0
  83. package/docs/i18n/USAGE.ru.md +25 -0
  84. package/docs/i18n/USAGE.zh.md +25 -0
  85. package/docs/superpowers/plans/2026-05-13-phase1-streaming-pipeline.md +122 -0
  86. package/docs/superpowers/plans/2026-05-13-phase10-push-notifications.md +152 -0
  87. package/docs/superpowers/plans/2026-05-13-phase2-agent-adapters.md +242 -0
  88. package/docs/superpowers/plans/2026-05-13-phase6-smart-progress.md +172 -0
  89. package/docs/superpowers/plans/2026-05-13-phase7-voice-plan-mode.md +108 -0
  90. package/package.json +2 -1
  91. package/scripts/cli.mjs +4 -3
  92. package/scripts/doctor.mjs +11 -0
  93. package/scripts/install.mjs +15 -1
@@ -0,0 +1,61 @@
1
+ import { EventEmitter } from 'node:events';
2
+
3
/** ANSI CSI escape sequences (colors, cursor control) that must not reach the TTS text. */
const ANSI_RE = /\x1b\[[0-9;?]*[ -/]*[@-~]/g;
/** Box-drawing glyphs that frame agent output; removed before sentence detection. */
const BOX_RE = /[╭╮╰╯│┊─]/g;
/** A whole line that is a VERBALCODING_PROGRESS marker; such lines are dropped entirely. */
const PROGRESS_LINE_RE = /^VERBALCODING_PROGRESS\s*:.*$/i;
/**
 * One or more sentence terminators followed by whitespace, a closing quote/bracket, or the
 * end of the buffered text. Covers ASCII `. ! ?`, the ideographic full stop, full-width
 * `！` / `？`, and the ellipsis character.
 */
const TERMINAL_RE = /[.!?。！？…]+(?=[\s"'\)\]\}]|$)/;

/**
 * Normalizes one raw output chunk: strips ANSI sequences, removes progress-marker lines,
 * deletes box-drawing characters and collapses runs of spaces/tabs into a single space.
 * Newlines are preserved.
 *
 * @param {unknown} text - Raw chunk; falsy values are treated as the empty string.
 * @returns {string} The cleaned chunk (possibly empty).
 */
function clean(text) {
  return String(text || '')
    .replace(ANSI_RE, '')
    .split(/\r?\n/)
    .filter((line) => !PROGRESS_LINE_RE.test(line.trim()))
    .join('\n')
    .replace(BOX_RE, '')
    .replace(/[ \t]+/g, ' ');
}

/**
 * Creates an incremental sentence splitter for streamed text.
 *
 * Chunks passed to `push` are cleaned and buffered. Every complete sentence found in the
 * buffer is emitted as a `'sentence'` event. If no terminator shows up, the buffer holds at
 * least `minChars` characters, and at least `maxLatencyMs` have passed since the last
 * emission (or since creation), the buffer is cut at its last space and the head is emitted.
 *
 * @param {object} [options]
 * @param {number} [options.minChars=40] - Minimum buffered length before a latency cut.
 * @param {number} [options.maxLatencyMs=800] - Quiet time required before a latency cut.
 * @returns {{ on: Function, push: (text: unknown) => void, flush: () => void }}
 *   `on` subscribes to events, `push` feeds a chunk, `flush` emits whatever is left.
 */
export function createSentencer({ minChars = 40, maxLatencyMs = 800 } = {}) {
  const ee = new EventEmitter();
  let buffer = '';
  let lastEmit = Date.now();

  /** Emits a trimmed, non-empty sentence and records the emission time. */
  function emit(text) {
    const trimmed = String(text || '').trim();
    if (!trimmed) return;
    ee.emit('sentence', trimmed);
    lastEmit = Date.now();
  }

  /** Emits every complete sentence in the buffer, then applies the latency cut if due. */
  function scan() {
    while (true) {
      const match = buffer.match(TERMINAL_RE);
      if (!match) break;
      const end = match.index + match[0].length;
      const sentence = buffer.slice(0, end);
      buffer = buffer.slice(end).replace(/^\s+/, '');
      emit(sentence);
    }
    if (buffer.length >= minChars && Date.now() - lastEmit >= maxLatencyMs) {
      const cut = buffer.lastIndexOf(' ');
      if (cut > Math.floor(minChars / 2)) {
        const head = buffer.slice(0, cut);
        // Strip only LEADING whitespace from the remainder: a trailing newline is the
        // separator for the next chunk, and removing it would glue two words together.
        // The buffer is updated before emitting so a listener that calls push() is safe.
        buffer = buffer.slice(cut).replace(/^\s+/, '');
        emit(head);
      }
    }
  }

  return {
    on: (event, fn) => ee.on(event, fn),
    push(text) {
      const cleaned = clean(text);
      if (!cleaned) return;
      buffer += cleaned;
      scan();
    },
    flush() {
      // Clear first so a listener that pushes during the emit does not lose its text.
      const rest = buffer;
      buffer = '';
      emit(rest);
    },
  };
}
@@ -0,0 +1,64 @@
1
+ import { test } from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import { createSentencer } from './stream_sentencer.mjs';
4
+
5
/**
 * Builds a sentencer whose latency cut is effectively disabled and collects every
 * emitted sentence, so each test only has to push input and inspect `sentences`.
 */
function recordSentences() {
  const sentences = [];
  const sentencer = createSentencer({ minChars: 1, maxLatencyMs: 999999 });
  sentencer.on('sentence', (text) => sentences.push(text));
  return { sentencer, sentences };
}

test('emits a sentence on terminal punctuation', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('Hello world. ');
  assert.deepEqual(sentences, ['Hello world.']);
});

test('does not emit on partial sentence', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('Reading file');
  // Nothing is terminated yet, so nothing may have been emitted.
  assert.deepEqual(sentences, []);
  sentencer.push(' main.mjs.');
  assert.deepEqual(sentences, ['Reading file main.mjs.']);
});

test('strips ANSI before emitting', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('\x1b[32mDone.\x1b[0m ');
  assert.deepEqual(sentences, ['Done.']);
});

test('filters VERBALCODING_PROGRESS lines', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('VERBALCODING_PROGRESS: reading files main.mjs\nAll set.');
  sentencer.flush();
  assert.deepEqual(sentences, ['All set.']);
});

test('flush emits residual on close', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('No terminator here');
  sentencer.flush();
  assert.deepEqual(sentences, ['No terminator here']);
});

test('strips Hermes box characters', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('│ Done.');
  sentencer.flush();
  assert.deepEqual(sentences, ['Done.']);
});

test('emits multiple sentences in one push', () => {
  const { sentencer, sentences } = recordSentences();
  sentencer.push('First. Second. Third.');
  assert.deepEqual(sentences, ['First.', 'Second.', 'Third.']);
});
@@ -0,0 +1,48 @@
1
/**
 * Creates a strictly sequential text-to-speech queue: each enqueued text is synthesized,
 * played, and cleaned up before the next one starts.
 *
 * @param {object} [options]
 * @param {(text: string) => any} options.synth - Produces a playable artifact for a text;
 *   a falsy result skips that text. Errors are logged and the text is skipped.
 * @param {(file: any) => any} options.play - Plays the artifact returned by `synth`.
 * @param {AbortSignal} [options.signal] - Once aborted, nothing new is accepted or played.
 * @param {(file: any) => any} [options.cleanup] - Best-effort disposal of an artifact.
 * @param {(...args: any[]) => void} [options.log] - Receives failure messages.
 * @returns {{ enqueue: (text: unknown) => void, drain: () => Promise<void>, size: number }}
 * @throws {Error} When `synth` or `play` is not a function.
 */
export function createStreamingTTSQueue({ synth, play, signal, cleanup, log = () => {} } = {}) {
  if (typeof synth !== 'function') throw new Error('synth is required');
  if (typeof play !== 'function') throw new Error('play is required');

  const queue = [];
  let pumping = null;

  /** Best-effort disposal: a failing cleanup is reported but never stops the queue. */
  async function discard(file) {
    try {
      await cleanup?.(file);
    } catch (e) {
      log('streaming tts cleanup failed', e?.message || e);
    }
  }

  /** Processes queued texts one at a time until the queue is empty or the signal aborts. */
  async function pump() {
    while (queue.length && !signal?.aborted) {
      const text = queue.shift();
      let file;
      try {
        file = await synth(text);
      } catch (e) {
        log('streaming tts synth failed', e?.message || e);
        continue;
      }
      if (!file) continue;
      // Abort may have happened while synthesizing: dispose the artifact and stop.
      if (signal?.aborted) {
        await discard(file);
        return;
      }
      try {
        await play(file);
      } catch (e) {
        // A playback error caused by the abort itself is expected and not logged.
        if (signal?.aborted) {
          await discard(file);
          return;
        }
        log('streaming tts play failed', e?.message || e);
      }
      await discard(file);
    }
  }

  /**
   * Starts a pump run and restarts it if work arrived in the short window between the
   * loop finishing and this callback running. Without the restart such an item would
   * stay queued with no pump, and drain() would return while it was still pending.
   */
  function startPump() {
    pumping = pump().finally(() => {
      pumping = null;
      if (queue.length && !signal?.aborted) startPump();
    });
  }

  return {
    enqueue(text) {
      const trimmed = String(text || '').trim();
      if (!trimmed || signal?.aborted) return;
      queue.push(trimmed);
      if (!pumping) startPump();
    },
    async drain() {
      while (pumping) await pumping;
    },
    get size() { return queue.length; },
  };
}
@@ -0,0 +1,58 @@
1
+ import { test } from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import { createStreamingTTSQueue } from './streaming_tts_queue.mjs';
4
+
5
/** Fake synthesizer: maps a sentence to a deterministic artifact name. */
const fakeSynth = async (sentence) => `f-${sentence}`;

test('synths and plays in enqueue order', async () => {
  const events = [];
  const ttsQueue = createStreamingTTSQueue({
    synth: async (sentence) => {
      events.push(`synth:${sentence}`);
      return `f-${sentence}`;
    },
    play: async (artifact) => {
      events.push(`play:${artifact}`);
    },
  });
  for (const sentence of ['A.', 'B.']) ttsQueue.enqueue(sentence);
  await ttsQueue.drain();
  assert.deepEqual(events, ['synth:A.', 'play:f-A.', 'synth:B.', 'play:f-B.']);
});

test('abort stops further playback', async () => {
  const controller = new AbortController();
  const events = [];
  const ttsQueue = createStreamingTTSQueue({
    synth: fakeSynth,
    // Abort while the first artifact is playing; the second must never be played.
    play: async (artifact) => {
      events.push(`play:${artifact}`);
      if (artifact === 'f-A.') controller.abort();
    },
    signal: controller.signal,
  });
  for (const sentence of ['A.', 'B.']) ttsQueue.enqueue(sentence);
  await ttsQueue.drain();
  assert.deepEqual(events, ['play:f-A.']);
});

test('cleanup runs after play', async () => {
  const disposed = [];
  const ttsQueue = createStreamingTTSQueue({
    synth: fakeSynth,
    play: async () => {},
    cleanup: async (artifact) => {
      disposed.push(artifact);
    },
  });
  ttsQueue.enqueue('A.');
  await ttsQueue.drain();
  assert.deepEqual(disposed, ['f-A.']);
});

test('synth error skips that sentence but continues', async () => {
  const heard = [];
  const ttsQueue = createStreamingTTSQueue({
    synth: async (sentence) => {
      if (sentence === 'A.') throw new Error('boom');
      return `f-${sentence}`;
    },
    play: async (artifact) => {
      heard.push(artifact);
    },
  });
  for (const sentence of ['A.', 'B.']) ttsQueue.enqueue(sentence);
  await ttsQueue.drain();
  assert.deepEqual(heard, ['f-B.']);
});

test('throws when synth or play missing', () => {
  const noop = async () => {};
  assert.throws(() => createStreamingTTSQueue({ play: noop }), /synth is required/);
  assert.throws(() => createStreamingTTSQueue({ synth: noop }), /play is required/);
});
@@ -6,3 +6,23 @@ export function shouldRouteDiscordTextToAgent({ content = '', channelId = '', tr
6
6
  if (!target) return true;
7
7
  return String(channelId || '') === target;
8
8
  }
9
+
10
/**
 * Records one Discord text message in the per-channel history kept in `state`.
 *
 * Messages with an empty channel id, empty content, or content starting with `!`
 * (bot commands) are ignored. Stored content is capped at 500 characters and each
 * channel keeps at most `maxEntries` of its newest messages.
 *
 * @param {Map<string, Array<{at: number, authorLabel: string, content: string}>>} state
 *   History keyed by channel id; updated in place.
 * @param {object} [message]
 * @param {string} [message.channelId=''] - Channel the message was posted in.
 * @param {string} [message.authorLabel='user'] - Label shown for the author.
 * @param {string} [message.content=''] - Message text.
 * @param {number} [message.now=Date.now()] - Timestamp in ms; falsy or invalid values use the current time.
 * @param {number} [message.maxEntries=12] - History cap; 0 keeps nothing, negatives count as 0.
 * @returns {void}
 */
export function appendRecentDiscordText(state, { channelId = '', authorLabel = 'user', content = '', now = Date.now(), maxEntries = 12 } = {}) {
  const id = String(channelId || '').trim();
  const text = String(content || '').trim();
  if (!id || !text || text.startsWith('!')) return;

  // slice(-0) is slice(0) and would keep everything, and a negative count would drop the
  // OLDEST entries instead of capping, so normalize the limit before slicing.
  const requested = Number(maxEntries);
  const limit = Number.isNaN(requested) ? 12 : Math.max(0, Math.trunc(requested));

  const entries = [
    ...(state.get(id) || []),
    { at: Number(now) || Date.now(), authorLabel: String(authorLabel || 'user'), content: text.slice(0, 500) },
  ];
  state.set(id, limit > 0 ? entries.slice(-limit) : []);
}
18
+
19
/**
 * Renders the recent text-channel history of one channel as a short bulleted block.
 *
 * Only entries no older than `maxAgeMs` (relative to `now`) are considered, and at most
 * the newest `maxEntries` of those are rendered.
 *
 * @param {Map<string, Array<{at: number, authorLabel: string, content: string}>>} state
 *   History keyed by channel id; not modified.
 * @param {object} [options]
 * @param {string} [options.channelId=''] - Channel whose history is rendered.
 * @param {number} [options.now=Date.now()] - Reference time in ms; falsy or invalid values use the current time.
 * @param {number} [options.maxAgeMs=600000] - Maximum entry age in ms.
 * @param {number} [options.maxEntries=6] - Maximum lines rendered; 0 or a negative value renders nothing.
 * @returns {string} Header line plus one `- author: content` line per entry, or `''`
 *   when there is no channel id or nothing to show.
 */
export function formatRecentDiscordContext(state, { channelId = '', now = Date.now(), maxAgeMs = 10 * 60 * 1000, maxEntries = 6 } = {}) {
  const id = String(channelId || '').trim();
  if (!id) return '';

  // slice(-0) is slice(0) and would return every entry, and a negative count would drop
  // the oldest entries instead of limiting, so normalize the limit first.
  const requested = Number(maxEntries);
  const limit = Number.isNaN(requested) ? 6 : Math.max(0, Math.trunc(requested));
  if (limit === 0) return '';

  const cutoff = (Number(now) || Date.now()) - maxAgeMs;
  const entries = (state.get(id) || [])
    .filter((entry) => Number(entry.at) >= cutoff)
    .slice(-limit);
  if (!entries.length) return '';
  return ['최근 텍스트 채널 메시지:', ...entries.map((entry) => `- ${entry.authorLabel}: ${entry.content}`)].join('\n');
}
@@ -1,7 +1,11 @@
1
1
  import test from 'node:test';
2
2
  import assert from 'node:assert/strict';
3
3
 
4
- import { shouldRouteDiscordTextToAgent } from './text_routing.mjs';
4
+ import {
5
+ appendRecentDiscordText,
6
+ formatRecentDiscordContext,
7
+ shouldRouteDiscordTextToAgent,
8
+ } from './text_routing.mjs';
5
9
 
6
10
  test('routes normal transcript-channel text to the shared agent session', () => {
7
11
  assert.equal(shouldRouteDiscordTextToAgent({
@@ -16,3 +20,21 @@ test('does not route commands or other channels to the shared agent session', ()
16
20
  assert.equal(shouldRouteDiscordTextToAgent({ content: '다른 채널 말', channelId: 'other', transcriptChannelId: 'transcript' }), false);
17
21
  assert.equal(shouldRouteDiscordTextToAgent({ content: ' ', channelId: 'transcript', transcriptChannelId: 'transcript' }), false);
18
22
  });
23
+
24
test('formats recent Discord text context for voice turns without commands', () => {
  const history = new Map();
  // The middle message is a bot command and must be left out of the stored history.
  const messages = [
    { authorLabel: 'user', content: '음성채널에서만 나가줘', now: 1000 },
    { authorLabel: 'user', content: '!ping', now: 1100 },
    { authorLabel: 'assistant', content: '알겠어', now: 1200 },
  ];
  for (const message of messages) {
    appendRecentDiscordText(history, { channelId: 'thread', ...message });
  }

  const rendered = formatRecentDiscordContext(history, { channelId: 'thread', now: 2000, maxAgeMs: 5000 });

  assert.match(rendered, /최근 텍스트 채널 메시지/);
  assert.match(rendered, /user: 음성채널에서만 나가줘/);
  assert.doesNotMatch(rendered, /!ping/);
  assert.match(rendered, /assistant: 알겠어/);
});
@@ -1,32 +1,70 @@
1
1
  # VerbalCoding Configuration
2
2
 
3
- ## Setup Wizard
3
+ <!-- readme-glow-up:intro -->
4
+ <p align="center">
5
+ <a href="../README.md">README</a> ·
6
+ <a href="README.md">Docs hub</a> ·
7
+ <a href="FRESH_INSTALL.md">Fresh Install</a> ·
8
+ <a href="USAGE.md">Usage</a> ·
9
+ <a href="CONFIGURATION.md">Configuration</a> ·
10
+ <a href="TROUBLESHOOTING.md">Troubleshooting</a> ·
11
+ <a href="MULTI_INSTANCE.md">Multi-Instance</a>
12
+ </p>
13
+
14
+ > Settings reference for Discord, agents, TTS, MCP, and runtime behavior.
15
+ >
16
+ > Fast path: `vc setup` handles normal config; edit `.env` only for advanced overrides.
17
+ <!-- /readme-glow-up:intro -->
18
+
19
+ ## Setup Command Map
20
+
21
+ For npm/global installs, use `vc` commands instead of manually editing `.env`:
4
22
 
5
- Discord bot/application setup is intentionally not re-explained from scratch here. Use these upstream guides for the Discord-side steps, then return to VerbalCoding setup:
23
+ ```bash
24
+ vc setup # guided setup: prerequisites, Discord token, voice channels
25
+ vc setup --yes # non-interactive bootstrap/starter config
26
+ vc setup token # later update Discord bot token
27
+ vc setup channels "General,Team Voice" # later update auto-join voice channel names
28
+ vc setup channel "General" # alias
29
+ vc setup voice "General" # alias
30
+ vc doctor # redacted health check and supported auto-fixes
31
+ vc start # run the default bridge
32
+ ```
6
33
 
7
- - Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
8
- - Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
9
- - Discord official quick start: <https://docs.discord.com/developers/quick-start/getting-started>
34
+ Clone-only setup remains available:
10
35
 
11
36
  ```bash
12
- ./scripts/install.sh
37
+ ./scripts/install.sh --yes
13
38
  ```
14
39
 
15
- The installer asks for Discord token, allowed users, auto-join voice channel names, transcript channel/thread, CLI harness backend, default voice language, TTS settings, and wake-word behavior. It writes `.env` with mode `0600`; `.env` is ignored by git. It also links the short shell command `vc`.
40
+ `vc setup token` updates `DISCORD_BOT_TOKEN` and optional `DISCORD_CLIENT_ID`. `vc setup channels` updates `AUTO_JOIN_VOICE_CHANNELS`. Both preserve unrelated `.env` values, write the file with mode `0600`, and avoid printing token values.
41
+
42
+ ## Discord Bot/Application Setup
43
+
44
+ Use these upstream guides for the Discord-side steps, then return to VerbalCoding setup:
16
45
 
17
- If you only need the shell command after manual install:
46
+ - Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
47
+ - Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
48
+ - Discord official quick start: <https://docs.discord.com/developers/quick-start/getting-started>
49
+
50
+ Minimum flow:
18
51
 
19
52
  ```bash
20
- npm link
53
+ vc bot invite <discord-client-id>
54
+ vc setup token <bot-token> --client-id <discord-client-id>
55
+ vc setup channels "VerbalCoding,General"
56
+ vc doctor
21
57
  ```
22
58
 
59
+ The bot needs the Message Content privileged intent, plus text and voice permissions for the target channels.
60
+
23
61
  ## Supported Agent Backends
24
62
 
25
63
  Set `AGENT_BACKEND` in `.env`.
26
64
 
27
65
  | Backend | Default command | Notes |
28
66
  |---|---|---|
29
- | `hermes` | `hermes chat -Q -q` | Default. Preserves `.verbalcoding-session` resume behavior. |
67
+ | `hermes` | `hermes chat -Q -q` | Default. Preserves `.verbalcoding-session` resume behavior. `vc doctor` can auto-install the Hermes CLI on supported macOS/Linux installs. |
30
68
  | `claude-code` / `claude` | `claude -p` | Override with `CLAUDE_COMMAND` or `AGENT_COMMAND`. |
31
69
  | `codex` | `codex exec` | Override with `CODEX_COMMAND` or `AGENT_COMMAND`. |
32
70
  | `gemini` | `gemini -p` | Override with `GEMINI_COMMAND` or `AGENT_COMMAND`. |
@@ -62,8 +100,9 @@ New backends should implement the same contract and keep voice/STT/TTS behavior
62
100
 
63
101
  ```bash
64
102
  DISCORD_BOT_TOKEN="***"
103
+ DISCORD_CLIENT_ID="123456789012345678"
65
104
  DISCORD_ALLOWED_USERS="123456789012345678"
66
- AUTO_JOIN_VOICE_CHANNELS="일반,General,general"
105
+ AUTO_JOIN_VOICE_CHANNELS="VerbalCoding,General"
67
106
  TRANSCRIPT_CHANNEL_ID="123456789012345678"
68
107
 
69
108
  AGENT_BACKEND="hermes"
@@ -95,9 +134,7 @@ Language presets and voice selection are separate:
95
134
  - Live voice commands such as “남자 한국어 목소리로 바꿔”, “여자 한국어 목소리로 바꿔”, `change voice to Korean female`, and `switch speaker to English` change only the speaker/voice type.
96
135
  - `!voice-test <text>` plays a quick sample with the currently selected backend and voice.
97
136
 
98
- Voice selection is stored in `config/tts-voices.json` by default. Override the path with `TTS_VOICE_CONFIG`. The running bridge re-reads/applies voice selection before synthesis, so voice commands take effect without a full restart.
99
-
100
- Default Edge catalog:
137
+ Voice selection is stored in `config/tts-voices.json` by default. Override the path with `TTS_VOICE_CONFIG`.
101
138
 
102
139
  | `TTS_VOICE_TYPE` | `TTS_VOICE` | Language |
103
140
  |---|---|---|
@@ -107,29 +144,9 @@ Default Edge catalog:
107
144
  | `english_male` | `en-US-GuyNeural` | English |
108
145
  | `english_female` | `en-US-AriaNeural` | English |
109
146
 
110
- Manual persistent override:
111
-
112
- ```bash
113
- TTS_BACKEND="edge"
114
- TTS_VOICE_TYPE="korean_male"
115
- TTS_VOICE="ko-KR-InJoonNeural"
116
- TTS_VOICE_CONFIG="config/tts-voices.json"
117
- ```
118
-
119
- For OpenVoice, SpeechSwift, or Supertonic, keep the backend-specific voice/reference settings in the sections below; the same voice catalog file can still track the active voice type.
120
-
121
- Backend-specific voice options:
122
-
123
- | Backend | Settings | Voice choices |
124
- |---|---|---|
125
- | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | Built-in types above, plus any voice returned by `edge-tts --list-voices` |
126
- | Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; language `ko`, `en`, `es`, `pt`, `fr` |
127
- | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | User-provided permitted reference WAV; style defaults to `default` |
128
- | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | Reference-sample voices for CosyVoice, or backend-supported speaker/model IDs |
129
-
130
147
  ## Utterance Segmentation
131
148
 
132
- `UTTERANCE_IDLE_MS` controls how long the bridge waits after a speech segment before it decides the user is done and starts STT. The default is `4500` ms to preserve longer spoken instructions with natural pauses. Lower values feel faster for short commands but can split long dictation; higher values are safer for thoughtful speech.
149
+ `UTTERANCE_IDLE_MS` controls how long the bridge waits after a speech segment before it decides the user is done and starts STT.
133
150
 
134
151
  ```bash
135
152
  UTTERANCE_IDLE_MS="4500" # balanced default
@@ -138,7 +155,7 @@ UTTERANCE_IDLE_MS="6000" # safer for long dictation with pauses
138
155
 
139
156
  ## MCP Server
140
157
 
141
- VerbalCoding ships a stdio MCP server so Hermes Agent or any MCP client can control the bridge through tools instead of relying on skills or free-form shell commands.
158
+ VerbalCoding ships a stdio MCP server so Hermes Agent or any MCP client can control the bridge through tools.
142
159
 
143
160
  Hermes config example:
144
161
 
@@ -161,74 +178,30 @@ Exposed MCP tools:
161
178
  | `set_language` | Update STT/progress/TTS language together |
162
179
  | `start`, `stop`, `restart` | Control the Discord voice bridge |
163
180
 
164
- ## Optional OpenVoice TTS
181
+ ## Docker / Container Networking
165
182
 
166
- Edge TTS remains the default and fallback. To try local voice cloning with OpenVoice V2:
183
+ Discord voice needs outbound UDP. If Docker logs show `Cannot perform IP discovery - socket closed`, try Linux host networking:
167
184
 
168
- ```bash
169
- ./scripts/setup_openvoice.sh
170
- # Download checkpoints_v2_0417.zip from OpenVoice docs and extract under vendor/OpenVoice/checkpoints_v2/
171
- mkdir -p voice-samples
172
- # Put a permitted reference sample at voice-samples/user-reference.wav,
173
- # or capture one from Discord with !voice-clone capture.
174
- python3 integrations/openvoice/synth.py --openvoice-dir vendor/OpenVoice --ref-audio voice-samples/user-reference.wav --text '안녕하세요. 버벌코딩 목소리 복제 테스트입니다.' --output /tmp/verbalcoding-openvoice-smoke.wav
175
- ```
176
-
177
- Then set:
178
-
179
- ```bash
180
- TTS_BACKEND="openvoice"
181
- OPENVOICE_REF_AUDIO="./voice-samples/user-reference.wav"
182
- OPENVOICE_PROGRESS="0"
183
- ```
184
-
185
- Only clone voices you own or have permission to use. If OpenVoice fails or times out, VerbalCoding falls back to Edge TTS.
186
-
187
- ## Optional Supertonic TTS
188
-
189
- ```bash
190
- ./scripts/setup_supertonic.sh
191
- supertonic tts '안녕하세요. 수퍼토닉 테스트입니다.' --lang ko --voice M1 --steps 2 --speed 1.0 -o /tmp/verbalcoding-supertonic.wav
192
- ```
193
-
194
- Then set:
195
-
196
- ```bash
197
- TTS_BACKEND="supertonic"
198
- SUPERTONIC_COMMAND="./.venv-supertonic/bin/supertonic"
199
- SUPERTONIC_VOICE="M1"
200
- SUPERTONIC_LANGUAGE="ko"
201
- SUPERTONIC_STEPS="2"
202
- SUPERTONIC_SPEED="1.0"
203
- SUPERTONIC_PROGRESS="0"
185
+ ```yaml
186
+ services:
187
+ verbalcoding:
188
+ network_mode: "host"
204
189
  ```
205
190
 
206
- If Supertonic is missing, fails, or times out, VerbalCoding falls back to Edge TTS.
207
-
208
- ## Optional SpeechSwift / CosyVoice TTS
191
+ Remove `ports:` from that Compose service. On Docker Desktop for macOS/Windows, host networking may not expose UDP the same way; run on the host or a Linux VM if voice still fails.
209
192
 
210
- On Apple Silicon, `speech-swift` is a local backend for Korean voice cloning with MLX-native CosyVoice/Qwen3-TTS.
193
+ ## Optional TTS Backends
211
194
 
212
- ```bash
213
- brew tap soniqo/speech https://github.com/soniqo/speech-swift
214
- brew install speech
215
- ```
195
+ Edge TTS remains the default and fallback. Optional local backends are configured with their own env vars:
216
196
 
217
- Recommended env:
218
-
219
- ```bash
220
- TTS_BACKEND="speechswift"
221
- SPEECHSWIFT_MODE="server"
222
- SPEECHSWIFT_ENGINE="cosyvoice"
223
- SPEECHSWIFT_LANGUAGE="korean"
224
- SPEECHSWIFT_REF_AUDIO="./voice-samples/user-reference.wav"
225
- SPEECHSWIFT_SERVER_HOST="127.0.0.1"
226
- SPEECHSWIFT_SERVER_PORT="18080"
227
- SPEECHSWIFT_SERVER_URL="http://127.0.0.1:18080"
228
- SPEECHSWIFT_PROGRESS="0"
229
- ```
197
+ | Backend | Settings | Voice choices |
198
+ |---|---|---|
199
+ | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | Built-in types above, plus any voice returned by `edge-tts --list-voices` |
200
+ | Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; language `ko`, `en`, `es`, `pt`, `fr` |
201
+ | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | User-provided permitted reference WAV; style defaults to `default` |
202
+ | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | Reference-sample voices for CosyVoice, or backend-supported speaker/model IDs |
230
203
 
231
- Keep Edge for quick progress/backchannel prompts.
204
+ Only clone voices you own or have permission to use. If a local backend fails or times out, VerbalCoding falls back to Edge TTS.
232
205
 
233
206
  ## Operational Notes
234
207