verbalcoding 0.2.11 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.env.example +27 -1
  2. package/README.es.md +132 -0
  3. package/README.fr.md +132 -0
  4. package/README.ja.md +132 -0
  5. package/README.ko.md +132 -0
  6. package/README.md +116 -74
  7. package/README.ru.md +132 -0
  8. package/README.zh.md +131 -0
  9. package/app-node/agent_adapters.mjs +37 -5
  10. package/app-node/agent_adapters.test.mjs +13 -1
  11. package/app-node/agent_detect.mjs +73 -0
  12. package/app-node/agent_detect.test.mjs +77 -0
  13. package/app-node/install_config.mjs +3 -0
  14. package/app-node/main.mjs +339 -4
  15. package/app-node/notify.mjs +73 -0
  16. package/app-node/notify.test.mjs +68 -0
  17. package/app-node/plan_mode.mjs +174 -0
  18. package/app-node/plan_mode.test.mjs +153 -0
  19. package/app-node/smart_progress.mjs +94 -0
  20. package/app-node/smart_progress.test.mjs +66 -0
  21. package/app-node/stream_sentencer.mjs +61 -0
  22. package/app-node/stream_sentencer.test.mjs +64 -0
  23. package/app-node/streaming_tts_queue.mjs +48 -0
  24. package/app-node/streaming_tts_queue.test.mjs +58 -0
  25. package/app-node/text_routing.mjs +20 -0
  26. package/app-node/text_routing.test.mjs +23 -1
  27. package/docs/CONFIGURATION.md +69 -96
  28. package/docs/FRESH_INSTALL.md +105 -63
  29. package/docs/HERMES_VOICE.md +65 -0
  30. package/docs/MULTI_INSTANCE.md +16 -0
  31. package/docs/README.md +49 -0
  32. package/docs/RELEASE.md +42 -19
  33. package/docs/ROADMAP.md +38 -0
  34. package/docs/TROUBLESHOOTING.md +126 -0
  35. package/docs/USAGE.md +72 -40
  36. package/docs/assets/figures/verbalcoding-flow.svg +1 -1
  37. package/docs/i18n/CONFIGURATION.es.md +25 -0
  38. package/docs/i18n/CONFIGURATION.fr.md +25 -0
  39. package/docs/i18n/CONFIGURATION.ja.md +25 -0
  40. package/docs/i18n/CONFIGURATION.ko.md +25 -0
  41. package/docs/i18n/CONFIGURATION.ru.md +25 -0
  42. package/docs/i18n/CONFIGURATION.zh.md +25 -0
  43. package/docs/i18n/FRESH_INSTALL.es.md +27 -2
  44. package/docs/i18n/FRESH_INSTALL.fr.md +27 -2
  45. package/docs/i18n/FRESH_INSTALL.ja.md +27 -2
  46. package/docs/i18n/FRESH_INSTALL.ko.md +27 -2
  47. package/docs/i18n/FRESH_INSTALL.ru.md +27 -2
  48. package/docs/i18n/FRESH_INSTALL.zh.md +27 -2
  49. package/docs/i18n/HERMES_VOICE.es.md +46 -0
  50. package/docs/i18n/HERMES_VOICE.fr.md +46 -0
  51. package/docs/i18n/HERMES_VOICE.ja.md +46 -0
  52. package/docs/i18n/HERMES_VOICE.ko.md +65 -0
  53. package/docs/i18n/HERMES_VOICE.ru.md +46 -0
  54. package/docs/i18n/HERMES_VOICE.zh.md +46 -0
  55. package/docs/i18n/MULTI_INSTANCE.es.md +25 -0
  56. package/docs/i18n/MULTI_INSTANCE.fr.md +25 -0
  57. package/docs/i18n/MULTI_INSTANCE.ja.md +25 -0
  58. package/docs/i18n/MULTI_INSTANCE.ko.md +25 -0
  59. package/docs/i18n/MULTI_INSTANCE.ru.md +25 -0
  60. package/docs/i18n/MULTI_INSTANCE.zh.md +25 -0
  61. package/docs/i18n/README.es.md +20 -134
  62. package/docs/i18n/README.fr.md +20 -134
  63. package/docs/i18n/README.ja.md +20 -134
  64. package/docs/i18n/README.ko.md +20 -133
  65. package/docs/i18n/README.ru.md +20 -134
  66. package/docs/i18n/README.zh.md +20 -133
  67. package/docs/i18n/RELEASE.es.md +26 -1
  68. package/docs/i18n/RELEASE.fr.md +26 -1
  69. package/docs/i18n/RELEASE.ja.md +26 -1
  70. package/docs/i18n/RELEASE.ko.md +26 -1
  71. package/docs/i18n/RELEASE.ru.md +26 -1
  72. package/docs/i18n/RELEASE.zh.md +26 -1
  73. package/docs/i18n/TROUBLESHOOTING.es.md +39 -0
  74. package/docs/i18n/TROUBLESHOOTING.fr.md +39 -0
  75. package/docs/i18n/TROUBLESHOOTING.ja.md +39 -0
  76. package/docs/i18n/TROUBLESHOOTING.ko.md +39 -0
  77. package/docs/i18n/TROUBLESHOOTING.ru.md +39 -0
  78. package/docs/i18n/TROUBLESHOOTING.zh.md +39 -0
  79. package/docs/i18n/USAGE.es.md +25 -0
  80. package/docs/i18n/USAGE.fr.md +25 -0
  81. package/docs/i18n/USAGE.ja.md +25 -0
  82. package/docs/i18n/USAGE.ko.md +25 -0
  83. package/docs/i18n/USAGE.ru.md +25 -0
  84. package/docs/i18n/USAGE.zh.md +25 -0
  85. package/docs/superpowers/plans/2026-05-13-phase1-streaming-pipeline.md +122 -0
  86. package/docs/superpowers/plans/2026-05-13-phase10-push-notifications.md +152 -0
  87. package/docs/superpowers/plans/2026-05-13-phase2-agent-adapters.md +242 -0
  88. package/docs/superpowers/plans/2026-05-13-phase6-smart-progress.md +172 -0
  89. package/docs/superpowers/plans/2026-05-13-phase7-voice-plan-mode.md +108 -0
  90. package/package.json +2 -1
  91. package/scripts/cli.mjs +4 -3
  92. package/scripts/doctor.mjs +11 -0
  93. package/scripts/install.mjs +15 -1
package/README.md CHANGED
@@ -1,148 +1,190 @@
1
1
  # VerbalCoding
2
2
 
3
3
  <p align="center">
4
- <strong>Talk to your CLI coding agents through Discord voice — like a phone call for software work.</strong>
4
+ <strong>The voice layer for any coding agent — real barge-in, streaming latency, and the agents you already use.</strong>
5
5
  </p>
6
6
 
7
7
  <p align="center">
8
- <a href="docs/i18n/README.ko.md">한국어</a> ·
9
- <a href="docs/i18n/README.ja.md">日本語</a> ·
10
- <a href="docs/i18n/README.zh.md">中文</a> ·
11
- <a href="docs/i18n/README.es.md">Español</a> ·
12
- <a href="docs/i18n/README.fr.md">Français</a> ·
13
- <a href="docs/i18n/README.ru.md">Русский</a>
8
+ <a href="./README.ko.md">한국어</a> ·
9
+ <a href="./README.ja.md">日本語</a> ·
10
+ <a href="./README.zh.md">中文</a> ·
11
+ <a href="./README.es.md">Español</a> ·
12
+ <a href="./README.fr.md">Français</a> ·
13
+ <a href="./README.ru.md">Русский</a>
14
14
  </p>
15
15
 
16
16
  <p align="center">
17
+ <img alt="npm" src="https://img.shields.io/npm/v/verbalcoding?color=CB3837&logo=npm&logoColor=white">
17
18
  <img alt="Node.js" src="https://img.shields.io/badge/Node.js-20%2B-339933?logo=node.js&logoColor=white">
18
19
  <img alt="Discord" src="https://img.shields.io/badge/Discord-voice%20bridge-5865F2?logo=discord&logoColor=white">
19
20
  <img alt="STT" src="https://img.shields.io/badge/STT-whisper.cpp-7C3AED">
20
- <img alt="TTS" src="https://img.shields.io/badge/TTS-Edge%20%7C%20OpenVoice%20%7C%20Supertonic%20%7C%20SpeechSwift-0EA5E9">
21
- <img alt="Agents" src="https://img.shields.io/badge/Agents-Hermes%20%7C%20Claude%20%7C%20Codex%20%7C%20Gemini%20%7C%20OpenCode-111827">
21
+ <img alt="TTS" src="https://img.shields.io/badge/TTS-Edge%20%7C%20OpenVoice%20%7C%20SpeechSwift-0EA5E9">
22
+ <img alt="License" src="https://img.shields.io/github/license/ca1773130n/VerbalCoding">
22
23
  </p>
23
24
 
24
25
  <p align="center">
25
26
  <img src="docs/assets/figures/verbalcoding-flow.svg" alt="VerbalCoding voice-to-agent flow" width="860">
26
27
  </p>
27
28
 
28
- ## Why
29
+ ## Why it exists
29
30
 
30
- VerbalCoding turns a Discord voice channel into a hands-free control surface for coding agents. Speak a request, let your CLI agent work, and hear a concise answer back — with text transcripts, progress events, and guardrails for noisy code/log output.
31
+ VerbalCoding turns a Discord voice channel into a hands-free cockpit for **any** CLI coding agent. Hermes ships its own `/voice join`, but only for Hermes; VerbalCoding is a thin, agent-agnostic layer that puts the same loop on top of Hermes, Claude Code, Codex, Gemini, OpenCode, OpenClaw, Aider, Cursor CLI, or any non-interactive shell command — while smoothing out the rough edges other voice frontends still list on their roadmap:
31
32
 
32
- ## Highlights
33
+ - **True audio barge-in** — interrupt the agent mid-sentence; Hermes' built-in voice pauses its listener during TTS.
34
+ - **Streaming pipeline** — first sentence plays while the agent is still writing (Hermes lists this as a future Phase-4 item).
35
+ - **Smart progress narration** — describes intent ("wiring the new login route"), not file lists.
36
+ - **Voice plan mode** — say "plan it first", edit by voice ("skip step 3"), say "approve" to execute.
37
+ - **Phone-down mode** — push notification with a voice summary when a long task completes and the room is empty.
33
38
 
34
- | What you get | Why it feels good |
39
+ ## What feels different
40
+
41
+ | Capability | Why it matters |
35
42
  |---|---|
36
- | Voice-first agent control | Talk to Hermes Agent, Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, or any custom CLI harness. |
37
- | On-device speech loop | Discord voice capture → local `whisper-cli` transcription → agent → chunked TTS playback. |
38
- | Shared voice + text context | Voice turns and `!ask` text commands can reuse the same supported agent session. |
39
- | Barge-in and sensitivity modes | Interrupt playback naturally and switch between normal and conservative/noisy environments. |
40
- | Multilingual voice presets | Switch STT, progress language, and TTS voice together with `vc language ko/en/auto`. |
41
- | Multi-room project isolation | Run one bot per project room with isolated Hermes profiles, sessions, memory, and logs. |
43
+ | Agent choice, first-class | Hermes Agent, Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Aider, Cursor CLI, or any custom command. `vc setup` auto-detects what's installed. |
44
+ | Real barge-in | VAD thresholds tuned for indoor and noisy rooms; cut in mid-utterance and resume the conversation. |
45
+ | Streaming end-to-end | `STREAMING_TTS=1` plays sentences as the agent produces them; first audio in well under a second on a warm cache. |
46
+ | Smart progress | Optional LLM summarizer collapses raw events into one human sentence; falls back to the existing regex labels when no key is set. |
47
+ | Plan-mode by voice | Narrated, editable, voice-driven plans without touching the keyboard. |
48
+ | Phone-down handoff | Long task + empty VC = push notification (`ntfy`/`pushover`) with a redacted one-line summary and tap-to-rejoin link. |
49
+ | Local speech loop | Discord audio is transcribed by local `whisper-cli`; TTS via Edge, OpenVoice, SpeechSwift/CosyVoice, or Supertonic. |
50
+ | Real operations support | Doctor auto-fixes, Docker UDP guidance, latency metrics, multi-instance project rooms, redacted config checks. |
42
51
 
43
- ## Quick Start
52
+ > **Already using Hermes Agent?** Hermes itself has a working Discord voice loop via `/voice join` / `/voice channel`. Use VerbalCoding when you want it agent-agnostic, want barge-in and streaming today, or want plan-mode, push handoff, and smart narration on top of the same loop. The two coexist — VerbalCoding can drive Hermes as its backend.
44
53
 
45
- Fastest path with npm:
54
+ ## Quick Start
46
55
 
47
56
  ```bash
48
- npm install -g verbalcoding
49
- vc setup --yes
57
+ npm install -g verbalcoding@latest
58
+ vc setup # detects installed agents and lets you pick
50
59
  vc doctor
51
60
  vc start
52
61
  ```
53
62
 
54
- Or run directly without a permanent global install:
63
+ `vc setup` is the normal human path. Keep Discord Developer Portal open while it asks for your bot token, application/client ID, transcript target, and voice channel names.
64
+
65
+ Automation can skip prompts, then fill Discord details later:
55
66
 
56
67
  ```bash
57
- npx verbalcoding setup --yes
68
+ vc setup --yes
69
+ vc setup token <bot-token> --client-id <discord-client-id>
70
+ vc setup channels "General,Team Voice"
58
71
  vc doctor
59
- vc start
60
72
  ```
61
73
 
62
- GitHub clone path for contributors:
74
+ Contributor clone path:
63
75
 
64
76
  ```bash
65
77
  git clone https://github.com/ca1773130n/VerbalCoding.git
66
78
  cd VerbalCoding
67
- ./scripts/install.sh --yes
79
+ ./scripts/install.sh
68
80
  vc doctor
69
81
  ./run.sh
70
82
  ```
71
83
 
72
- `vc setup --yes` bootstraps local prerequisites from the npm package. `./scripts/install.sh --yes` does the same for GitHub clone installs. Both cover Node/npm dependencies, `ffmpeg`, `whisper-cli`, the default whisper.cpp model, a local `.venv-tts` Edge TTS helper, and setup wizard configuration where possible. They support macOS/Homebrew plus common Linux package managers (`apt`, `dnf`, `pacman`); rerun with `--no-wizard` for dependency-only setup or `--skip-system` if you want to install OS packages yourself.
73
-
74
- Need a clean install walkthrough? Start with [Fresh Install](docs/FRESH_INSTALL.md).
75
-
76
- ## Supported Agent Backends
84
+ ## Discord setup in one minute
77
85
 
78
- | Backend | Default command | Session support |
79
- |---|---:|---|
80
- | Hermes Agent | `hermes chat -Q -q` | Resume, verbose progress, cancellation, final-answer recovery |
81
- | Claude Code | `claude -p` | CLI session file support through adapter defaults |
82
- | Codex CLI | `codex exec` | CLI session file support through adapter defaults |
83
- | Gemini CLI | `gemini -p` | CLI session file support through adapter defaults |
84
- | OpenCode | `opencode run` | CLI session file support through adapter defaults |
85
- | OpenClaw | `openclaw run` | CLI session file support through adapter defaults |
86
- | Custom | `AGENT_COMMAND` | Bring your own non-interactive command |
86
+ 1. Create a Discord application and bot in <https://discord.com/developers/applications>.
87
+ 2. Enable the Message Content privileged intent.
88
+ 3. Run `vc setup` and paste the bot token plus application/client ID when prompted.
89
+ 4. Enter exact voice channel names for auto-join.
90
+ 5. Invite the bot with:
87
91
 
88
- ## Learn More
92
+ ```bash
93
+ vc bot invite <discord-client-id>
94
+ vc bot invite <discord-client-id> --guild <guild-id>
95
+ ```
89
96
 
90
- | Guide | What you get |
91
- |---|---|
92
- | [Fresh Install](docs/FRESH_INSTALL.md) | Clean clone setup, model download, first run |
93
- | [Usage Guide](docs/USAGE.md) | CLI commands, Discord commands, progress mode, latency metrics |
94
- | [Configuration](docs/CONFIGURATION.md) | `.env`, agent backends, MCP, TTS backends, operational notes |
95
- | [Multi-Instance](docs/MULTI_INSTANCE.md) | One permanent Discord voice room per project |
96
- | [Release Notes](docs/RELEASE.md) | Current capabilities and pre-release checklist |
97
+ Secrets are stored in ignored local env files with mode `0600` and are not printed back by `vc doctor`.
97
98
 
98
- ## Tiny Command Map
99
+ ## Tiny command map
99
100
 
100
101
  ```bash
101
- vc status # current language, TTS, and bridge settings
102
- vc language ko|en|auto # switch STT/progress/TTS language preset
103
- vc bot invite CLIENT_ID # generate the Discord bot invite URL
104
- vc instance setup NAME # create an isolated project voice bot
105
- vc instance start NAME # run that bot in the background
106
- vc doctor # redacted health check
107
- vc start # start the default bridge
102
+ vc setup # guided setup with agent auto-detection
103
+ vc setup --yes # non-interactive bootstrap/starter config
104
+ vc setup token # rotate or add Discord bot token/client ID later
105
+ vc setup channels "General,Team Voice" # update auto-join voice channel names
106
+ vc bot invite CLIENT_ID # generate a Discord bot invite URL
107
+ vc status # show active language, TTS, bridge settings, and resolved backend
108
+ vc language ko|en|auto # switch STT/progress/TTS language preset
109
+ vc doctor # redacted health check with auto-fix suggestions
110
+ vc start # start the default bridge
111
+ vc instance setup NAME # create an isolated project voice bot
112
+ vc instance start NAME # run that bot in the background
108
113
  ```
109
114
 
110
115
  In Discord:
111
116
 
112
117
  | Command | What it does |
113
118
  |---|---|
114
- | `!join` | Join your current voice channel. |
115
- | `!ask <prompt>` | Send text to the same agent backend. |
116
- | `!verbose on\|off` | Show/speak short progress updates. |
117
- | `!latency` | Summarize recent voice/STT/agent/TTS latency. |
118
- | `!sensitivity normal` | Use normal indoor barge-in sensitivity. |
119
- | `!sensitivity conservative` | Use stricter noisy/outdoor sensitivity. |
119
+ | `!join` / `!leave` | Join or leave your current voice channel. |
120
+ | `!ask <prompt>` | Send text to the same selected agent backend. |
121
+ | `!verbose on\|off` | Toggle short progress updates. |
122
+ | `!latency` / `!metrics` | Summarize recent STT/agent/TTS latency. |
123
+ | `!sensitivity normal\|conservative` | Tune barge-in for indoor or noisy environments. |
120
124
  | `!session new <name> <workdir> [context] --voice <voice-channel>` | Bind a project session to a voice room. |
121
125
 
126
+ ## Roadmap
127
+
128
+ The differentiation push is tracked in [docs/ROADMAP.md](./docs/ROADMAP.md). Five phases land the claims above:
129
+
130
+ | # | Phase | What it adds |
131
+ |---|---|---|
132
+ | 1 | Streaming pipeline | Sentence-by-sentence TTS while the agent is still writing. |
133
+ | 2 | Agent-agnostic adapters | First-class Aider + Cursor CLI; `vc setup` auto-detects. |
134
+ | 6 | Smart progress | LLM-summarized narration. Falls back to today's regex labels. |
135
+ | 7 | Voice plan mode | Narrate plan, voice-edit, approve to execute. |
136
+ | 10 | Push notification handoff | ntfy/Pushover when a long task ends and the room is empty. |
137
+
138
+ ## Learn more
139
+
140
+ | Guide | What you get |
141
+ |---|---|
142
+ | [Docs hub](docs/README.md) | One page linking every guide and localized doc set. |
143
+ | [Roadmap](docs/ROADMAP.md) | Differentiation plan and per-phase implementation plans. |
144
+ | [Fresh Install](docs/FRESH_INSTALL.md) | npm/global setup, Discord app setup, token/channel commands, first run. |
145
+ | [Usage Guide](docs/USAGE.md) | CLI commands, Discord commands, run modes, voice changes, latency metrics. |
146
+ | [Hermes Built-in Voice vs VerbalCoding](docs/HERMES_VOICE.md) | What Hermes already supports and when VerbalCoding is worth adding. |
147
+ | [Configuration](docs/CONFIGURATION.md) | `.env`, agent backends, MCP server, TTS backends, operational notes. |
148
+ | [Troubleshooting](docs/TROUBLESHOOTING.md) | Docker host networking, UDP voice failures, missing token/channel diagnostics. |
149
+ | [Multi-Instance](docs/MULTI_INSTANCE.md) | One permanent Discord voice room per project. |
150
+ | [Release Notes](docs/RELEASE.md) | Current capabilities, checks, and public-release gaps. |
151
+
122
152
  ## Requirements
123
153
 
124
154
  | Layer | Default |
125
155
  |---|---|
126
- | Runtime | Node.js 20+, npm; install script can install via Homebrew/apt/dnf/pacman |
127
- | Audio | `ffmpeg`; install script can install it |
128
- | Speech recognition | Local `whisper-cli` from whisper.cpp; install script uses Homebrew on macOS or local Linux build fallback |
129
- | TTS | Edge TTS CLI; install script creates `.venv-tts` if needed |
130
- | Discord | Bot token, Message Content intent, voice permissions |
131
- | Agent | At least one authenticated CLI harness, Hermes Agent by default |
132
- | Platform focus | macOS / Apple Silicon most tested; Linux bootstrap is best-effort and documented |
156
+ | Runtime | Node.js 20+ and npm; setup can install via Homebrew/apt/dnf/pacman where supported. |
157
+ | Audio | `ffmpeg`; setup/doctor can install it on supported OSes. |
158
+ | Speech recognition | Local `whisper-cli` from whisper.cpp plus `models/ggml-small-q5_1.bin`. |
159
+ | TTS | Edge TTS by default; optional OpenVoice, SpeechSwift/CosyVoice, and Supertonic paths. |
160
+ | Discord | Bot token, Message Content intent, voice permissions, matching auto-join channel names. |
161
+ | Agent | At least one CLI harness installed; `vc setup` auto-detects Hermes, Claude Code, Codex, Gemini, OpenCode, OpenClaw, Aider, Cursor CLI. |
162
+ | Platform focus | macOS / Apple Silicon most tested; Linux bootstrap is best-effort; Windows unsupported for now. |
163
+
164
+ ## Docker / container note
165
+
166
+ Discord text login can work while voice join fails if outbound UDP is blocked. If logs show `Cannot perform IP discovery - socket closed`, use Linux host networking for the service that runs `vc start`:
167
+
168
+ ```yaml
169
+ services:
170
+   verbalcoding:
171
+     network_mode: "host"
172
+ ```
173
+
174
+ Do not combine `network_mode: "host"` with `ports:`. Docker Desktop for macOS/Windows behaves differently; if UDP still fails there, run VerbalCoding directly on the host or a Linux VM.
133
175
 
134
176
  ## Contributing
135
177
 
136
- Run the lightweight checks before sending changes:
178
+ Run lightweight checks before sending changes:
137
179
 
138
180
  ```bash
139
181
  node --check app-node/main.mjs
140
182
  npm test
141
- bash -n run.sh scripts/install.sh
183
+ bash -n run.sh scripts/install.sh scripts/bootstrap_prereqs.sh
142
184
  npm pack --dry-run
143
185
  vc doctor
144
186
  ```
145
187
 
146
188
  ## Status
147
189
 
148
- VerbalCoding is public-release oriented but still early. Demo video/GIF, broader Linux validation, CI, and deeper security review are still TODOs.
190
+ VerbalCoding is public-release oriented but still early. The roadmap above tracks live differentiation work. Demo video/GIF, broader Linux validation, CI, and deeper security review are still TODOs.
package/README.ru.md ADDED
@@ -0,0 +1,132 @@
1
+ # VerbalCoding
2
+
3
+ <p align="center"><strong>Общайтесь с CLI-агентами для разработки голосом в Discord, как по телефону.</strong></p>
4
+
5
+ <p align="center"><a href="./README.md">English</a> · <a href="./README.ko.md">한국어</a> · <a href="./README.ja.md">日本語</a> · <a href="./README.zh.md">中文</a> · <a href="./README.es.md">Español</a> · <a href="./README.fr.md">Français</a></p>
6
+
7
+ <p align="center">
8
+ <img alt="npm" src="https://img.shields.io/npm/v/verbalcoding?color=CB3837&logo=npm&logoColor=white">
9
+ <img alt="Node.js" src="https://img.shields.io/badge/Node.js-20%2B-339933?logo=node.js&logoColor=white">
10
+ <img alt="Discord" src="https://img.shields.io/badge/Discord-voice%20bridge-5865F2?logo=discord&logoColor=white">
11
+ <img alt="STT" src="https://img.shields.io/badge/STT-whisper.cpp-7C3AED">
12
+ <img alt="TTS" src="https://img.shields.io/badge/TTS-Edge%20%7C%20OpenVoice%20%7C%20SpeechSwift-0EA5E9">
13
+ <img alt="License" src="https://img.shields.io/github/license/ca1773130n/VerbalCoding">
14
+ </p>
15
+
16
+ <p align="center">
17
+ <img src="docs/assets/figures/verbalcoding-flow.svg" alt="VerbalCoding voice-to-agent flow" width="860">
18
+ </p>
19
+
20
+ ## Зачем это нужно
21
+
22
+ VerbalCoding превращает голосовую комнату Discord в hands-free кабину для coding agents. Вы произносите задачу, CLI-агент работает, а в ответ получаете короткую озвучку, текстовую расшифровку и события прогресса. Diffs и logs не зачитываются длинным TTS.
23
+
24
+ > **Уже используете Hermes Agent?** В Hermes уже есть встроенная поддержка голосовых каналов Discord через `/voice join` / `/voice channel`: бот может зайти в текущий VC, распознать речь через Whisper и ответить TTS. Для этого базового цикла VerbalCoding не обязателен. VerbalCoding добавляет workflow-слой: маршрутизацию проектов/сессий, общий контекст голоса+текста, правила прерывания, голосовой прогресс, языковые пресеты, метрики задержки и переключение CLI-бэкендов помимо Hermes.
25
+
26
+ ## Что ощущается иначе
27
+
28
+ | Возможность | Зачем это важно |
29
+ |---|---|
30
+ | Работа как звонок | Говорите, слушайте, перебивайте и продолжайте в одном голосовом канале Discord. |
31
+ | Пошаговая настройка | `vc setup` проводит через prerequisites, Discord token/client ID, voice channel, transcript target, backend и TTS settings за один проход. |
32
+ | Локальный голосовой цикл | Discord audio → local `whisper-cli` → selected CLI agent → TTS reply. |
33
+ | Выбор агента | Hermes Agent, Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw или custom command. |
34
+ | Больше, чем встроенный голос Hermes | Сохраняет тот же VC-голосовой цикл и добавляет проектные комнаты, общий контекст `!ask`, тонкую обработку прерываний, голос прогресса/статуса и управление multi-agent бэкендами. |
35
+ | Готовность к эксплуатации | doctor auto-fix, Docker UDP guide, latency metrics, multi-instance rooms и redacted config checks встроены. |
36
+
37
+ ## Быстрый старт
38
+
39
+ ```bash
40
+ npm install -g verbalcoding@latest
41
+ vc setup
42
+ vc doctor
43
+ vc start
44
+ ```
45
+
46
+ `vc setup` — обычный путь для человека. Держите Discord Developer Portal открытым и введите bot token, application/client ID, transcript target и voice channel names.
47
+
48
+ Для автоматизации можно пропустить prompts и добавить Discord-данные позже.
49
+
50
+ ```bash
51
+ vc setup --yes
52
+ vc setup token <bot-token> --client-id <discord-client-id>
53
+ vc setup channels "General,Team Voice"
54
+ vc doctor
55
+ ```
56
+
57
+ ## Discord за одну минуту
58
+
59
+ 1. Создайте application и bot в Discord Developer Portal.
60
+ 2. Включите Message Content privileged intent.
61
+ 3. Запустите `vc setup` и вставьте bot token и application/client ID.
62
+ 4. Введите точные имена voice channels для auto-join.
63
+ 5. Пригласите bot этими командами.
64
+
65
+ ```bash
66
+ vc bot invite <discord-client-id>
67
+ vc bot invite <discord-client-id> --guild <guild-id>
68
+ ```
69
+
70
+ ## Краткая карта команд
71
+
72
+ ```bash
73
+ vc setup # пошаговая настройка: prerequisites, Discord, backend, voice
74
+ vc setup --yes # неинтерактивный bootstrap/starter config
75
+ vc setup token # позже обновить или добавить Discord bot token/client ID
76
+ vc setup channels "General,Team Voice" # обновить auto-join voice channel names
77
+ vc bot invite CLIENT_ID # сгенерировать Discord bot invite URL
78
+ vc status # показать текущие настройки
79
+ vc language ko|en|auto # переключить language preset
80
+ vc doctor # redacted health check и auto-fixes
81
+ vc start # запустить bridge по умолчанию
82
+ vc instance setup NAME # создать изолированный project voice bot
83
+ vc instance start NAME # запустить этот bot в background
84
+ ```
85
+
86
+ ## Подробнее
87
+
88
+ | Гайд | Что внутри |
89
+ |---|---|
90
+ | [Центр документации](docs/i18n/README.ru.md) | Индекс локализованных гайдов. |
91
+ | [Fresh Install](docs/i18n/FRESH_INSTALL.ru.md) | npm/global setup, настройка Discord и первый запуск. |
92
+ | [Usage](docs/i18n/USAGE.ru.md) | CLI-команды, Discord-команды, режимы запуска и latency. |
93
+ | [Встроенный голос Hermes vs VerbalCoding](docs/i18n/HERMES_VOICE.ru.md) | Что Hermes уже умеет в Discord voice и чем отличается VerbalCoding. |
94
+ | [Configuration](docs/i18n/CONFIGURATION.ru.md) | .env, agent backends, MCP, TTS и эксплуатация. |
95
+ | [Troubleshooting](docs/i18n/TROUBLESHOOTING.ru.md) | Docker UDP и проверки token/channel. |
96
+ | [Multi-Instance](docs/i18n/MULTI_INSTANCE.ru.md) | Одна постоянная voice room на проект. |
97
+
98
+ ## Требования
99
+
100
+ | Слой | По умолчанию |
101
+ |---|---|
102
+ | Runtime | Node.js 20+ и npm. |
103
+ | Audio | `ffmpeg` и local `whisper-cli`. |
104
+ | TTS | По умолчанию Edge TTS; опционально OpenVoice, SpeechSwift/CosyVoice, Supertonic. |
105
+ | Discord | Bot token, Message Content intent, voice permissions и совпадающие channel names. |
106
+ | Agent | Минимум один аутентифицированный CLI harness; по умолчанию Hermes Agent. |
107
+
108
+ ## Docker / контейнеры
109
+
110
+ Если в logs видно `Cannot perform IP discovery - socket closed`, Discord voice UDP заблокирован. В Linux Docker Compose используйте:
111
+
112
+ ```yaml
113
+ services:
114
+   verbalcoding:
115
+     network_mode: "host"
116
+ ```
117
+
118
+ Не совмещайте `network_mode: "host"` с `ports:`.
119
+
120
+ ## Участие
121
+
122
+ ```bash
123
+ node --check app-node/main.mjs
124
+ npm test
125
+ bash -n run.sh scripts/install.sh scripts/bootstrap_prereqs.sh
126
+ npm pack --dry-run
127
+ vc doctor
128
+ ```
129
+
130
+ ## Статус
131
+
132
+ VerbalCoding ориентирован на публичный релиз, но проект ещё ранний. Demo video/GIF, более широкая Linux validation, CI и security review остаются TODO.
package/README.zh.md ADDED
@@ -0,0 +1,131 @@
1
+ # VerbalCoding
2
+
3
+ <p align="center"><strong>像打电话一样,通过 Discord 语音控制 CLI 编程代理。</strong></p>
4
+
5
+ <p align="center"><a href="./README.md">English</a> · <a href="./README.ko.md">한국어</a> · <a href="./README.ja.md">日本語</a> · <a href="./README.es.md">Español</a> · <a href="./README.fr.md">Français</a> · <a href="./README.ru.md">Русский</a></p>
6
+
7
+ <p align="center">
8
+ <img alt="npm" src="https://img.shields.io/npm/v/verbalcoding?color=CB3837&logo=npm&logoColor=white">
9
+ <img alt="Node.js" src="https://img.shields.io/badge/Node.js-20%2B-339933?logo=node.js&logoColor=white">
10
+ <img alt="Discord" src="https://img.shields.io/badge/Discord-voice%20bridge-5865F2?logo=discord&logoColor=white">
11
+ <img alt="STT" src="https://img.shields.io/badge/STT-whisper.cpp-7C3AED">
12
+ <img alt="TTS" src="https://img.shields.io/badge/TTS-Edge%20%7C%20OpenVoice%20%7C%20SpeechSwift-0EA5E9">
13
+ <img alt="License" src="https://img.shields.io/github/license/ca1773130n/VerbalCoding">
14
+ </p>
15
+
16
+ <p align="center">
17
+ <img src="docs/assets/figures/verbalcoding-flow.svg" alt="VerbalCoding voice-to-agent flow" width="860">
18
+ </p>
19
+
20
+ ## 为什么需要它
21
+
22
+ VerbalCoding 把 Discord 语音房间变成编码代理的免提驾驶舱。你说出需求,让 CLI 代理工作,并收到简短语音回复和文本记录;diff 和日志不会被 TTS 长篇朗读。
23
+
24
+ > **已经在用 Hermes Agent?** Hermes 本身已经通过 `/voice join` / `/voice channel` 支持 Discord 语音频道:它可以加入你当前所在的 VC,用 Whisper 做语音转文字,并用 TTS 回答。只需要这个基础闭环时,VerbalCoding 不是必需的。VerbalCoding 是加在上面的工作流层:项目/会话路由、语音+文本共享上下文、插话规则、进度语音、语言预设、延迟指标,以及 Hermes 之外的 CLI 后端切换。
25
+
26
+ ## 体验亮点
27
+
28
+ | 能力 | 价值 |
29
+ |---|---|
30
+ | 电话式工作流 | 在同一个 Discord 语音频道里说话、收听、打断、继续。 |
31
+ | 面向人的引导设置 | `vc setup` 一次引导 prerequisites、Discord token/client ID、voice channel、transcript target、backend 和 TTS 设置。 |
32
+ | 本地语音闭环 | Discord audio → local `whisper-cli` → selected CLI agent → TTS reply。 |
33
+ | 可选代理 | 支持 Hermes Agent、Claude Code、Codex、Gemini CLI、OpenCode、OpenClaw 或 custom command。 |
34
+ | 超越 Hermes 内置语音 | 在同一个 VC 语音闭环上增加项目房间、`!ask` 共享上下文、细粒度打断处理、进度/状态语音和多代理后端控制。 |
35
+ | 真实运维支持 | 内置 doctor auto-fix、Docker UDP 指南、latency metrics、multi-instance rooms 和 redacted config checks。 |
36
+
37
+ ## 快速开始
38
+
39
+ ```bash
40
+ npm install -g verbalcoding@latest
41
+ vc setup
42
+ vc doctor
43
+ vc start
44
+ ```
45
+
46
+ 普通用户路径是 `vc setup`。运行时请打开 Discord Developer Portal,并按提示输入 bot token、application/client ID、transcript target 和 voice channel names。
47
+
48
+ 自动化场景可以跳过提示,然后再补充 Discord 信息。
49
+
50
+ ```bash
51
+ vc setup --yes
52
+ vc setup token <bot-token> --client-id <discord-client-id>
53
+ vc setup channels "General,Team Voice"
54
+ vc doctor
55
+ ```
56
+
57
+ ## 一分钟完成 Discord 设置
58
+
59
+ 1. 在 Discord Developer Portal 创建 application 和 bot。
60
+ 2. 启用 Message Content privileged intent。
61
+ 3. 运行 `vc setup`,粘贴 bot token 和 application/client ID。
62
+ 4. 输入要自动加入的精确 voice channel 名称。
63
+ 5. 用下面的命令邀请 bot。
64
+
65
+ ```bash
66
+ vc bot invite <discord-client-id>
67
+ vc bot invite <discord-client-id> --guild <guild-id>
68
+ ```
69
+
70
+ ## 迷你命令地图
71
+
72
+ ```bash
73
+ vc setup # 引导式设置: prerequisites, Discord, backend, voice
74
+ vc setup --yes # 非交互 bootstrap/starter config
75
+ vc setup token # 稍后轮换或添加 Discord bot token/client ID
76
+ vc setup channels "General,Team Voice" # 更新 auto-join voice channel names
77
+ vc bot invite CLIENT_ID # 生成 Discord bot invite URL
78
+ vc status # 显示当前设置
79
+ vc language ko|en|auto # 切换 language preset
80
+ vc doctor # redacted health check 和 auto-fix
81
+ vc start # 启动默认 bridge
82
+ vc instance setup NAME # 创建隔离的 project voice bot
83
+ vc instance start NAME # 后台运行该 bot
84
+ ```
85
+
86
+ ## 了解更多
87
+
88
+ | 指南 | 内容 |
89
+ |---|---|
90
+ | [文档中心](docs/i18n/README.zh.md) | 本地化指南索引。 |
91
+ | [Fresh Install](docs/i18n/FRESH_INSTALL.zh.md) | npm/global setup、Discord 设置、首次运行。 |
92
+ | [Usage](docs/i18n/USAGE.zh.md) | CLI 命令、Discord 命令、运行模式、latency。 |
93
+ | [Configuration](docs/i18n/CONFIGURATION.zh.md) | .env、agent backends、MCP、TTS、运维。 |
94
+ | [Troubleshooting](docs/i18n/TROUBLESHOOTING.zh.md) | Docker UDP、token/channel 缺失检查。 |
95
+ | [Multi-Instance](docs/i18n/MULTI_INSTANCE.zh.md) | 每个项目一个固定语音房间。 |
96
+
97
+ ## 要求
98
+
99
+ | 层级 | 默认 |
100
+ |---|---|
101
+ | Runtime | Node.js 20+ 和 npm。 |
102
+ | Audio | `ffmpeg` 和 local `whisper-cli`。 |
103
+ | TTS | 默认 Edge TTS;可选 OpenVoice、SpeechSwift/CosyVoice、Supertonic。 |
104
+ | Discord | Bot token、Message Content intent、voice permissions、匹配的 channel names。 |
105
+ | Agent | 至少一个已认证 CLI harness;默认 Hermes Agent。 |
106
+
107
+ ## Docker / 容器说明
108
+
109
+ 如果日志出现 `Cannot perform IP discovery - socket closed`,说明 Discord voice UDP 被阻断。在 Linux Docker Compose 中使用:
110
+
111
+ ```yaml
112
+ services:
113
+   verbalcoding:
114
+     network_mode: "host"
115
+ ```
116
+
117
+ 不要同时使用 `network_mode: "host"` 和 `ports:`。
118
+
119
+ ## 贡献
120
+
121
+ ```bash
122
+ node --check app-node/main.mjs
123
+ npm test
124
+ bash -n run.sh scripts/install.sh scripts/bootstrap_prereqs.sh
125
+ npm pack --dry-run
126
+ vc doctor
127
+ ```
128
+
129
+ ## 状态
130
+
131
+ VerbalCoding 面向公开发布,但仍处于早期阶段。演示视频/GIF、更广泛的 Linux 验证、CI 和安全审查仍是 TODO。
@@ -23,12 +23,14 @@ export function voiceBridgePrompt(text, options = {}) {
23
23
  const english = /^en/i.test(String(options.language || ''));
24
24
  const lines = english ? [
25
25
  'This is a user utterance from a Discord voice call.',
26
+ 'Consider Discord voice-channel speech and text-channel messages as one shared conversation context when inferring intent.',
26
27
  'Answer in English. For simple conversation/status questions, do not use tools; answer directly in 1-3 sentences.',
27
28
  'Use tools only for real work requests such as file edits, command execution, log checks, or web/search tasks.',
28
29
  'If code changes are made, do not read diffs or full code aloud; summarize outcome and next checks briefly.',
29
30
  'Do not include CLI metadata or session_id in the answer.',
30
31
  ] : [
31
32
  'Discord 음성 대화로 들어온 사용자 발화다.',
33
+ '의도를 판단할 때 음성 채널 발화와 텍스트 채널 메시지를 같은 대화 맥락으로 함께 고려해라.',
32
34
  '단순 대화/상태 질문이면 도구를 쓰지 말고 1~3문장으로 바로 한국어 답변해라.',
33
35
  '파일 수정, 실행, 로그 확인, 검색 같은 실제 작업 지시일 때만 필요한 도구를 사용해라.',
34
36
  '코드 변경을 수행했다면 음성 답변에는 diff나 코드 전문을 읽지 말고, 작업 결과와 다음 확인 사항만 짧게 말해라.',
@@ -57,6 +59,10 @@ export function voiceBridgePrompt(text, options = {}) {
57
59
  lines.push(english ? 'Route this turn through the following project/session context:' : '이 턴은 아래 프로젝트/세션 컨텍스트로 처리해라.');
58
60
  lines.push(String(options.projectContext).trim());
59
61
  }
62
+ if (options.recentDiscordContext) {
63
+ lines.push(english ? 'Recent Discord text-channel context to consider with this voice utterance:' : '이 음성 발화와 함께 고려할 최근 Discord 텍스트 채널 맥락:');
64
+ lines.push(String(options.recentDiscordContext).trim());
65
+ }
60
66
  return lines.concat(['', text]).join('\n');
61
67
  }
62
68
 
@@ -251,6 +257,24 @@ export function buildAgentSettings({ ROOT, env = process.env } = {}) {
251
257
  sessionFile: env.AGENT_SESSION_FILE || path.join(root, '.agent-sessions', 'openclaw'),
252
258
  supportsHermesSession: false,
253
259
  },
260
+ aider: {
261
+ label: 'Aider',
262
+ command: env.AIDER_COMMAND || 'aider --no-pretty --yes-always --message',
263
+ sessionFile: env.AGENT_SESSION_FILE || path.join(root, '.agent-sessions', 'aider'),
264
+ supportsHermesSession: false,
265
+ },
266
+ cursor: {
267
+ label: 'Cursor CLI',
268
+ command: env.CURSOR_COMMAND || 'cursor-agent --print --prompt',
269
+ sessionFile: env.AGENT_SESSION_FILE || path.join(root, '.agent-sessions', 'cursor'),
270
+ supportsHermesSession: false,
271
+ },
272
+ 'cursor-cli': {
273
+ label: 'Cursor CLI',
274
+ command: env.CURSOR_COMMAND || 'cursor-agent --print --prompt',
275
+ sessionFile: env.AGENT_SESSION_FILE || path.join(root, '.agent-sessions', 'cursor'),
276
+ supportsHermesSession: false,
277
+ },
254
278
  custom: {
255
279
  label: env.AGENT_LABEL || 'Custom Agent',
256
280
  command: env.AGENT_COMMAND || '',
@@ -294,6 +318,7 @@ export function createAgentAdapter(settings, deps = {}) {
294
318
  const hermesSessionsDir = deps.hermesSessionsDir || path.join(os.homedir(), '.hermes', 'sessions');
295
319
  const spawnProcess = deps.spawn;
296
320
  const onProgress = deps.onProgress || (() => {});
321
+ const onStdoutChunk = deps.onStdoutChunk || null;
297
322
  const emittedProgress = new Set();
298
323
  let activeProgressLanguage = settings.language;
299
324
  const capabilities = agentAdapterCapabilities(settings);
@@ -308,7 +333,7 @@ export function createAgentAdapter(settings, deps = {}) {
308
333
  }
309
334
 
310
335
  function execWithOptionalProgress(cmd, args, options, verbose) {
311
- if (!verbose || !spawnProcess) return execFileAsync(cmd, args, options);
336
+ if ((!verbose && !onStdoutChunk) || !spawnProcess) return execFileAsync(cmd, args, options);
312
337
  return new Promise((resolve, reject) => {
313
338
  const child = spawnProcess(cmd, args, {
314
339
  env: options.env,
@@ -353,7 +378,8 @@ export function createAgentAdapter(settings, deps = {}) {
353
378
  child.stdout?.on('data', chunk => {
354
379
  const s = chunk.toString();
355
380
  stdout += s;
356
- emitVerboseProgress(s);
381
+ if (onStdoutChunk) { try { onStdoutChunk(s); } catch (e) { warn('onStdoutChunk failed', e?.stack || e); } }
382
+ if (verbose) emitVerboseProgress(s);
357
383
  if (stdout.length + stderr.length > options.maxBuffer) {
358
384
  const err = new Error('maxBuffer exceeded');
359
385
  err.code = 'ERR_CHILD_PROCESS_STDIO_MAXBUFFER';
@@ -364,7 +390,7 @@ export function createAgentAdapter(settings, deps = {}) {
364
390
  child.stderr?.on('data', chunk => {
365
391
  const s = chunk.toString();
366
392
  stderr += s;
367
- emitVerboseProgress(s);
393
+ if (verbose) emitVerboseProgress(s);
368
394
  if (stdout.length + stderr.length > options.maxBuffer) {
369
395
  const err = new Error('maxBuffer exceeded');
370
396
  err.code = 'ERR_CHILD_PROCESS_STDIO_MAXBUFFER';
@@ -467,7 +493,12 @@ export function createAgentAdapter(settings, deps = {}) {
467
493
  function buildArgs(text, options = {}) {
468
494
  const argv = shellSplit(settings.command);
469
495
  const cmd = argv[0];
470
- const query = voiceBridgePrompt(text, { verboseProgress: options.verboseProgress, language: options.language, projectContext: options.projectContext });
496
+ const query = voiceBridgePrompt(text, {
497
+ verboseProgress: options.verboseProgress,
498
+ language: options.language,
499
+ projectContext: options.projectContext,
500
+ recentDiscordContext: options.recentDiscordContext,
501
+ });
471
502
  let args = argv.slice(1);
472
503
  if (settings.backend === 'hermes' && options.verboseProgress) {
473
504
  // Hermes quiet mode intentionally suppresses tool previews. In verbose
@@ -491,8 +522,9 @@ export function createAgentAdapter(settings, deps = {}) {
491
522
  const language = plan.language || settings.language;
492
523
  activeProgressLanguage = language;
493
524
  const projectContext = plan.projectContext || settings.projectContext || '';
525
+ const recentDiscordContext = plan.recentDiscordContext || '';
494
526
  emittedProgress.clear();
495
- const { cmd, args, sessionId } = buildArgs(text, { verboseProgress, language, projectContext });
527
+ const { cmd, args, sessionId } = buildArgs(text, { verboseProgress, language, projectContext, recentDiscordContext });
496
528
  const start = Date.now();
497
529
  const label = plan.label || settings.label;
498
530
  const { args: finalArgs, outputPath } = addCodexOutputCapture(args);