verbalcoding 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/.env.example +83 -0
  2. package/LICENSE +21 -0
  3. package/README.md +157 -0
  4. package/app-node/agent_adapters.mjs +576 -0
  5. package/app-node/agent_adapters.test.mjs +455 -0
  6. package/app-node/agent_contract.mjs +45 -0
  7. package/app-node/barge_in.mjs +148 -0
  8. package/app-node/barge_in.test.mjs +179 -0
  9. package/app-node/bridge_logger.mjs +66 -0
  10. package/app-node/bridge_logger.test.mjs +73 -0
  11. package/app-node/bridge_state.mjs +104 -0
  12. package/app-node/bridge_state.test.mjs +64 -0
  13. package/app-node/cli_install.test.mjs +97 -0
  14. package/app-node/deferred_queue.mjs +12 -0
  15. package/app-node/deferred_queue.test.mjs +20 -0
  16. package/app-node/discord_invite_cli.test.mjs +31 -0
  17. package/app-node/discord_text.mjs +29 -0
  18. package/app-node/discord_text.test.mjs +32 -0
  19. package/app-node/hermes_profiles.mjs +164 -0
  20. package/app-node/hermes_profiles.test.mjs +276 -0
  21. package/app-node/install_config.mjs +263 -0
  22. package/app-node/install_config.test.mjs +205 -0
  23. package/app-node/instance_doctor.mjs +137 -0
  24. package/app-node/instance_doctor.test.mjs +128 -0
  25. package/app-node/instance_profile_lifecycle.mjs +16 -0
  26. package/app-node/instances.mjs +153 -0
  27. package/app-node/instances.test.mjs +102 -0
  28. package/app-node/language_config.mjs +73 -0
  29. package/app-node/language_config.test.mjs +51 -0
  30. package/app-node/latency_metrics.mjs +133 -0
  31. package/app-node/latency_metrics.test.mjs +71 -0
  32. package/app-node/main.mjs +1771 -0
  33. package/app-node/mcp_tools.mjs +198 -0
  34. package/app-node/mcp_tools.test.mjs +39 -0
  35. package/app-node/progress_cache.mjs +7 -0
  36. package/app-node/progress_cache.test.mjs +23 -0
  37. package/app-node/progress_speech.mjs +102 -0
  38. package/app-node/progress_speech.test.mjs +48 -0
  39. package/app-node/project_sessions.mjs +148 -0
  40. package/app-node/project_sessions.test.mjs +77 -0
  41. package/app-node/restart_notice.mjs +57 -0
  42. package/app-node/restart_notice.test.mjs +37 -0
  43. package/app-node/restart_policy.mjs +27 -0
  44. package/app-node/restart_policy.test.mjs +33 -0
  45. package/app-node/text_routing.mjs +8 -0
  46. package/app-node/text_routing.test.mjs +18 -0
  47. package/app-node/tts_backends.mjs +251 -0
  48. package/app-node/tts_backends.test.mjs +400 -0
  49. package/app-node/tts_chunks.mjs +57 -0
  50. package/app-node/tts_chunks.test.mjs +35 -0
  51. package/app-node/tts_prefetch.mjs +38 -0
  52. package/app-node/tts_prefetch.test.mjs +49 -0
  53. package/app-node/tts_settings.mjs +72 -0
  54. package/app-node/tts_settings.test.mjs +127 -0
  55. package/app-node/tts_voice_config.mjs +127 -0
  56. package/app-node/tts_voice_config.test.mjs +64 -0
  57. package/app-node/voice_clone_capture.mjs +76 -0
  58. package/app-node/voice_clone_capture.test.mjs +51 -0
  59. package/app-node/voice_messages.mjs +62 -0
  60. package/app-node/voice_messages.test.mjs +33 -0
  61. package/docs/CONFIGURATION.md +183 -0
  62. package/docs/FRESH_INSTALL.md +193 -0
  63. package/docs/MULTI_INSTANCE.md +183 -0
  64. package/docs/RELEASE.md +72 -0
  65. package/docs/USAGE.md +108 -0
  66. package/docs/assets/figures/verbalcoding-flow.svg +63 -0
  67. package/docs/i18n/README.es.md +121 -0
  68. package/docs/i18n/README.fr.md +121 -0
  69. package/docs/i18n/README.ja.md +121 -0
  70. package/docs/i18n/README.ko.md +121 -0
  71. package/docs/i18n/README.ru.md +121 -0
  72. package/docs/i18n/README.zh.md +121 -0
  73. package/package.json +58 -0
  74. package/run.sh +82 -0
  75. package/scripts/bootstrap_prereqs.sh +193 -0
  76. package/scripts/cli.mjs +369 -0
  77. package/scripts/docker_ubuntu_smoke.sh +76 -0
  78. package/scripts/doctor.mjs +134 -0
  79. package/scripts/install.mjs +108 -0
  80. package/scripts/install.sh +44 -0
  81. package/scripts/mcp-server.mjs +84 -0
  82. package/scripts/openvoice_smoke.py +34 -0
  83. package/scripts/openvoice_synth.py +103 -0
  84. package/scripts/setup_openvoice.sh +34 -0
  85. package/scripts/setup_supertonic.sh +18 -0
package/.env.example ADDED
@@ -0,0 +1,83 @@
1
+ # Copy to .env and fill local values. Do not commit .env.
2
+
3
+ DISCORD_BOT_TOKEN=""
4
+ DISCORD_ALLOWED_USERS=""
5
+ AUTO_JOIN_VOICE_CHANNELS="일반,General,general"
6
+ TRANSCRIPT_CHANNEL_ID=""
7
+
8
+ # Agent harness: hermes, claude-code, claude, codex, gemini, opencode, openclaw, custom
9
+ AGENT_BACKEND="hermes"
10
+ # AGENT_LABEL="My Harness"
11
+ # AGENT_COMMAND="my-harness run --non-interactive"
12
+ AGENT_TASK_TIMEOUT_MS="0"
13
+ AGENT_CHAT_TIMEOUT_MS="45000"
14
+ AGENT_VERBOSE_PROGRESS="0" # default off; toggle in Discord with !verbose on/off
15
+ LATENCY_LOG_PATH="./.logs/latency.jsonl"
16
+ PROJECT_SESSIONS_FILE="./config/project-sessions.json"
17
+ # Agent workflow helper: off by default. Toggle with `vc restart auto on|off`.
18
+ VERBALCODING_AUTO_RESTART_VOICE_BOT="0"
19
+
20
+ STT_ENGINE="whisper_cpp"
21
+ WHISPER_CPP_BIN="whisper-cli"
22
+ WHISPER_CPP_MODEL="./models/ggml-small-q5_1.bin"
23
+ VOICE_LANGUAGE="ko" # ko | en | auto; controls progress/status language
24
+ WHISPER_CPP_LANGUAGE="ko" # ko | en | auto; auto omits forced whisper language
25
+ STT_LANGUAGE="ko"
26
+
27
+ TTS_BACKEND="edge" # edge | openvoice | speechswift | supertonic
28
+ EDGE_TTS_COMMAND="edge-tts"
29
+ TTS_VOICE="ko-KR-SunHiNeural"
30
+ TTS_RATE="+10%"
31
+ TTS_MAX_CHARS="495"
32
+ TTS_VOLUME="1.0" # Discord playback gain; 1.0 = unchanged, 1.6 = louder
33
+
34
+ # Optional local Supertonic TTS backend. Install with: python3 -m pip install supertonic
35
+ # First run downloads the model to SUPERTONIC_CACHE_DIR or ~/.cache/supertonic.
36
+ SUPERTONIC_COMMAND="supertonic"
37
+ SUPERTONIC_VOICE="M1" # M1-M5, F1-F5
38
+ SUPERTONIC_LANGUAGE="ko" # en | ko | es | pt | fr
39
+ SUPERTONIC_STEPS="2" # 2 is fastest; higher values give better quality but are slower
40
+ SUPERTONIC_SPEED="1.0"
41
+ SUPERTONIC_MAX_CHUNK_LENGTH="300"
42
+ SUPERTONIC_SILENCE_DURATION="0.15"
43
+ SUPERTONIC_PROGRESS="0" # keep progress prompts fast via Edge unless set to 1
44
+ # SUPERTONIC_CACHE_DIR="./.cache/supertonic"
45
+ # SUPERTONIC_INTRA_OP_THREADS="4"
46
+ # SUPERTONIC_INTER_OP_THREADS="1"
47
+
48
+ # Optional local speech-swift / CosyVoice backend.
49
+ # Server mode keeps audio-server warm and avoids launching the audio CLI per TTS request.
50
+ SPEECHSWIFT_MODE="server" # cli | server
51
+ SPEECHSWIFT_ENGINE="cosyvoice"
52
+ SPEECHSWIFT_LANGUAGE="korean"
53
+ SPEECHSWIFT_REF_AUDIO="./voice-samples/user-reference.wav"
54
+ SPEECHSWIFT_SERVER_HOST="127.0.0.1"
55
+ SPEECHSWIFT_SERVER_PORT="18080"
56
+ SPEECHSWIFT_SERVER_URL="http://127.0.0.1:18080"
57
+ SPEECHSWIFT_PROGRESS="0" # keep progress prompts fast via Edge unless set to 1
58
+
59
+ # Optional local voice cloning with myshell-ai/OpenVoice V2.
60
+ # Use only a reference sample you own or have permission to clone.
61
+ OPENVOICE_DIR="./vendor/OpenVoice"
62
+ OPENVOICE_VENV="./.venv-openvoice"
63
+ OPENVOICE_REF_AUDIO="./voice-samples/user-reference.wav"
64
+ OPENVOICE_LANGUAGE="KR"
65
+ OPENVOICE_STYLE="default"
66
+ OPENVOICE_TIMEOUT_MS="90000"
67
+ OPENVOICE_PROGRESS="0" # keep progress prompts fast via Edge unless set to 1
68
+ REQUIRE_WAKE_WORD="0"
69
+ MIN_UTTERANCE_SECONDS="1.4"
70
+ UTTERANCE_IDLE_MS="2000"
71
+ MIN_MEAN_VOLUME_DB="-35"
72
+ MIN_MAX_VOLUME_DB="-12"
73
+ BARGE_IN_MIN_SECONDS="1.4"
74
+ BARGE_IN_MIN_MEAN_VOLUME_DB="-30"
75
+ BARGE_IN_MIN_MAX_VOLUME_DB="-14"
76
+ PLAYBACK_BARGE_IN_MIN_SECONDS="0.9"
77
+ PLAYBACK_BARGE_IN_MIN_MEAN_VOLUME_DB="-36"
78
+ PLAYBACK_BARGE_IN_MIN_MAX_VOLUME_DB="-18"
79
+ PLAYBACK_BARGE_IN_REQUIRE_BOTH="1"
80
+ BARGE_IN_CONSERVATIVE_MIN_SECONDS="1.8"
81
+ BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB="-27"
82
+ BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB="-12"
83
+ MAX_DEFERRED_PROCESSING_UTTERANCES="3"
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 VerbalCoding contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,157 @@
1
+ # VerbalCoding
2
+
3
+ <p align="center">
4
+ <strong>Talk to your CLI coding agents through Discord voice — like a phone call for software work.</strong>
5
+ </p>
6
+
7
+ <p align="center">
8
+ <a href="docs/i18n/README.ko.md">한국어</a> ·
9
+ <a href="docs/i18n/README.ja.md">日本語</a> ·
10
+ <a href="docs/i18n/README.zh.md">中文</a> ·
11
+ <a href="docs/i18n/README.es.md">Español</a> ·
12
+ <a href="docs/i18n/README.fr.md">Français</a> ·
13
+ <a href="docs/i18n/README.ru.md">Русский</a>
14
+ </p>
15
+
16
+ <p align="center">
17
+ <img alt="Node.js" src="https://img.shields.io/badge/Node.js-20%2B-339933?logo=node.js&logoColor=white">
18
+ <img alt="Discord" src="https://img.shields.io/badge/Discord-voice%20bridge-5865F2?logo=discord&logoColor=white">
19
+ <img alt="STT" src="https://img.shields.io/badge/STT-whisper.cpp-7C3AED">
20
+ <img alt="TTS" src="https://img.shields.io/badge/TTS-Edge%20%7C%20OpenVoice%20%7C%20Supertonic%20%7C%20SpeechSwift-0EA5E9">
21
+ <img alt="Agents" src="https://img.shields.io/badge/Agents-Hermes%20%7C%20Claude%20%7C%20Codex%20%7C%20Gemini%20%7C%20OpenCode-111827">
22
+ </p>
23
+
24
+ <p align="center">
25
+ <img src="docs/assets/figures/verbalcoding-flow.svg" alt="VerbalCoding voice-to-agent flow" width="860">
26
+ </p>
27
+
28
+ ## Why
29
+
30
+ VerbalCoding turns a Discord voice channel into a hands-free control surface for coding agents. Speak a request, let your CLI agent work, and hear a concise answer back — with text transcripts, progress events, and guardrails for noisy code/log output.
31
+
32
+ ## Highlights
33
+
34
+ | What you get | Why it feels good |
35
+ |---|---|
36
+ | Voice-first agent control | Talk to Hermes Agent, Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, or any custom CLI harness. |
37
+ | Local-first speech loop | Discord voice capture → `whisper.cpp` STT → agent → chunked TTS playback. |
38
+ | Shared voice + text context | Voice turns and `!ask` text commands can reuse the same supported agent session. |
39
+ | Barge-in and sensitivity modes | Interrupt playback naturally, and switch between normal sensitivity and a conservative mode for noisy environments. |
40
+ | Multilingual voice presets | Switch STT, progress language, and TTS voice together with `vc language ko/en/auto`. |
41
+ | Multi-room project isolation | Run one bot per project room with isolated Hermes profiles, sessions, memory, and logs. |
42
+
43
+ ## Quick Start
44
+
45
+ Fastest path with npm:
46
+
47
+ ```bash
48
+ npm install -g verbalcoding
49
+ vc setup --yes
50
+ vc doctor
51
+ vc start
52
+ ```
53
+
54
+ Or run directly without a permanent global install:
55
+
56
+ ```bash
57
+ npx verbalcoding setup --yes
58
+ npx verbalcoding doctor
59
+ npx verbalcoding start
60
+ ```
61
+
62
+ GitHub clone path for contributors:
63
+
64
+ ```bash
65
+ git clone https://github.com/ca1773130n/VerbalCoding.git
66
+ cd VerbalCoding
67
+ ./scripts/install.sh --yes
68
+ vc doctor
69
+ ./run.sh
70
+ ```
71
+
72
+ `vc setup --yes` and `./scripts/install.sh --yes` bootstrap local prerequisites where possible: Node/npm dependencies, `ffmpeg`, `whisper-cli`, the default whisper.cpp model, a local `.venv-tts` Edge TTS helper, and the short `vc` shell command for clone installs. They support macOS/Homebrew plus common Linux package managers (`apt`, `dnf`, `pacman`); rerun with `--no-wizard` for dependency-only setup or `--skip-system` if you want to install OS packages yourself.
73
+
74
+ Need a clean install walkthrough? Start with [Fresh Install](docs/FRESH_INSTALL.md).
75
+
76
+ ## How It Works
77
+
78
+ ```mermaid
79
+ flowchart LR
80
+ A[Discord voice] --> B["@discordjs/voice"]
81
+ B --> C[PCM cleanup + gates]
82
+ C --> D["whisper.cpp STT"]
83
+ D --> E["CLI agent adapter"]
84
+ E --> F["Concise answer"]
85
+ F --> G["Chunked TTS"]
86
+ G --> H["Discord playback"]
87
+ ```
88
+
89
+ ## Supported Agent Backends
90
+
91
+ | Backend | Default command | Session support |
92
+ |---|---:|---|
93
+ | Hermes Agent | `hermes chat -Q -q` | Resume, verbose progress, cancellation, final-answer recovery |
94
+ | Claude Code | `claude -p` | CLI session file support through adapter defaults |
95
+ | Codex CLI | `codex exec` | CLI session file support through adapter defaults |
96
+ | Gemini CLI | `gemini -p` | CLI session file support through adapter defaults |
97
+ | OpenCode | `opencode run` | CLI session file support through adapter defaults |
98
+ | OpenClaw | `openclaw run` | CLI session file support through adapter defaults |
99
+ | Custom | `AGENT_COMMAND` | Bring your own non-interactive command |
100
+
101
+ ## Learn More
102
+
103
+ | Guide | What you get |
104
+ |---|---|
105
+ | [Fresh Install](docs/FRESH_INSTALL.md) | Clean clone setup, model download, first run |
106
+ | [Usage Guide](docs/USAGE.md) | CLI commands, Discord commands, progress mode, latency metrics |
107
+ | [Configuration](docs/CONFIGURATION.md) | `.env`, agent backends, MCP, TTS backends, operational notes |
108
+ | [Multi-Instance](docs/MULTI_INSTANCE.md) | One permanent Discord voice room per project |
109
+ | [Release Notes](docs/RELEASE.md) | Current capabilities and pre-release checklist |
110
+
111
+ ## Tiny Command Map
112
+
113
+ ```bash
114
+ vc status # current language, TTS, and bridge settings
115
+ vc language ko|en|auto # switch STT/progress/TTS language preset
116
+ vc bot invite CLIENT_ID # generate the Discord bot invite URL
117
+ vc instance setup NAME # create an isolated project voice bot
118
+ vc instance start NAME # run that bot in the background
119
+ vc doctor # redacted health check
120
+ vc start # start the default bridge
121
+ ```
122
+
123
+ In Discord:
124
+
125
+ ```text
126
+ !join !ask <prompt> !verbose on/off
127
+ !latency !sensitivity normal !sensitivity conservative
128
+ !session new <name> <workdir> [context] --voice <voice-channel>
129
+ ```
130
+
131
+ ## Requirements
132
+
133
+ | Layer | Default |
134
+ |---|---|
135
+ | Runtime | Node.js 20+, npm; install script can install via Homebrew/apt/dnf/pacman |
136
+ | Audio | `ffmpeg`; install script can install it |
137
+ | STT | `whisper.cpp` / `whisper-cli`; install script uses Homebrew on macOS or local Linux build fallback |
138
+ | TTS | Edge TTS CLI; install script creates `.venv-tts` if needed |
139
+ | Discord | Bot token, Message Content intent, voice permissions |
140
+ | Agent | At least one authenticated CLI harness, Hermes Agent by default |
141
+ | Platform focus | macOS / Apple Silicon most tested; Linux bootstrap is best-effort and documented |
142
+
143
+ ## Contributing
144
+
145
+ Run the lightweight checks before sending changes:
146
+
147
+ ```bash
148
+ node --check app-node/main.mjs
149
+ npm test
150
+ bash -n run.sh && bash -n scripts/install.sh
151
+ npm pack --dry-run
152
+ vc doctor
153
+ ```
154
+
155
+ ## Status
156
+
157
+ VerbalCoding is public-release oriented but still early. It is released under the MIT License (see `LICENSE`). A demo video/GIF and broader Linux notes are still TODOs.