verbalcoding 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +5 -3
- package/app-node/discord_text.test.mjs +14 -0
- package/app-node/install_config.mjs +1 -1
- package/app-node/install_config.test.mjs +1 -1
- package/app-node/main.mjs +19 -10
- package/docs/CONFIGURATION.md +26 -2
- package/docs/FRESH_INSTALL.md +8 -0
- package/docs/RELEASE.md +1 -1
- package/docs/USAGE.md +28 -0
- package/docs/i18n/CONFIGURATION.ko.md +26 -2
- package/docs/i18n/FRESH_INSTALL.ko.md +8 -0
- package/docs/i18n/RELEASE.ko.md +1 -1
- package/docs/i18n/USAGE.ko.md +28 -0
- package/package.json +1 -1
- package/run.sh +1 -1
- package/scripts/docker_ubuntu_smoke.sh +1 -1
- package/scripts/doctor.mjs +1 -1
- package/scripts/install.mjs +1 -1
package/.env.example
CHANGED
|
@@ -26,6 +26,7 @@ STT_LANGUAGE="ko"
|
|
|
26
26
|
|
|
27
27
|
TTS_BACKEND="edge" # edge | openvoice | speechswift | supertonic
|
|
28
28
|
EDGE_TTS_COMMAND="edge-tts"
|
|
29
|
+
TTS_VOICE_TYPE="korean_female" # edge: korean_male | korean_female | korean_multilingual_male | english_male | english_female
|
|
29
30
|
TTS_VOICE="ko-KR-SunHiNeural"
|
|
30
31
|
TTS_RATE="+10%"
|
|
31
32
|
TTS_MAX_CHARS="495"
|
|
@@ -66,8 +67,9 @@ OPENVOICE_STYLE="default"
|
|
|
66
67
|
OPENVOICE_TIMEOUT_MS="90000"
|
|
67
68
|
OPENVOICE_PROGRESS="0" # keep progress prompts fast via Edge unless set to 1
|
|
68
69
|
REQUIRE_WAKE_WORD="0"
|
|
69
|
-
MIN_UTTERANCE_SECONDS="1.
|
|
70
|
-
|
|
70
|
+
MIN_UTTERANCE_SECONDS="1.0"
|
|
71
|
+
# Wait for natural thinking pauses before STT. Lower for faster but more fragmented turns.
|
|
72
|
+
UTTERANCE_IDLE_MS="4500"
|
|
71
73
|
MIN_MEAN_VOLUME_DB="-35"
|
|
72
74
|
MIN_MAX_VOLUME_DB="-12"
|
|
73
75
|
BARGE_IN_MIN_SECONDS="1.4"
|
|
@@ -80,4 +82,4 @@ PLAYBACK_BARGE_IN_REQUIRE_BOTH="1"
|
|
|
80
82
|
BARGE_IN_CONSERVATIVE_MIN_SECONDS="1.8"
|
|
81
83
|
BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB="-27"
|
|
82
84
|
BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB="-12"
|
|
83
|
-
MAX_DEFERRED_PROCESSING_UTTERANCES="
|
|
85
|
+
MAX_DEFERRED_PROCESSING_UTTERANCES="0"
|
|
@@ -8,6 +8,20 @@ test('splitDiscordMessage chunks long text for Discord', () => {
|
|
|
8
8
|
assert.deepEqual(chunks.map(c => c.length), [1900, 1900, 201]);
|
|
9
9
|
});
|
|
10
10
|
|
|
11
|
+
test('sendDiscordText returns false without fetching when transcript channel id is missing', async () => {
|
|
12
|
+
const warnings = [];
|
|
13
|
+
let fetched = false;
|
|
14
|
+
const delivered = await sendDiscordText({
|
|
15
|
+
channelId: '',
|
|
16
|
+
text: 'restart complete',
|
|
17
|
+
client: { channels: { fetch: async () => { fetched = true; } } },
|
|
18
|
+
warn: (...args) => warnings.push(args.join(' ')),
|
|
19
|
+
});
|
|
20
|
+
assert.equal(delivered, false);
|
|
21
|
+
assert.equal(fetched, false);
|
|
22
|
+
assert.match(warnings.join('\n'), /missing transcript channel id/);
|
|
23
|
+
});
|
|
24
|
+
|
|
11
25
|
test('sendDiscordText returns false when target is not text based', async () => {
|
|
12
26
|
const warnings = [];
|
|
13
27
|
const delivered = await sendDiscordText({
|
|
@@ -57,7 +57,7 @@ export function normalizeInstallAnswers(input = {}) {
|
|
|
57
57
|
OPENVOICE_PROGRESS: input.openvoiceProgress === true || input.OPENVOICE_PROGRESS === '1' ? '1' : '0',
|
|
58
58
|
REQUIRE_WAKE_WORD: input.requireWakeWord === true || input.REQUIRE_WAKE_WORD === '1' ? '1' : '0',
|
|
59
59
|
MIN_UTTERANCE_SECONDS: clean(input.minUtteranceSeconds || input.MIN_UTTERANCE_SECONDS, '1.0'),
|
|
60
|
-
UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '
|
|
60
|
+
UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '4500'),
|
|
61
61
|
HERMES_TASK_TIMEOUT_MS: clean(input.taskTimeoutMs || input.HERMES_TASK_TIMEOUT_MS, '0'),
|
|
62
62
|
HERMES_CHAT_TIMEOUT_MS: clean(input.chatTimeoutMs || input.HERMES_CHAT_TIMEOUT_MS, '45000'),
|
|
63
63
|
AGENT_VERBOSE_PROGRESS: input.verboseProgress === true || input.AGENT_VERBOSE_PROGRESS === '1' ? '1' : '0',
|
|
@@ -63,7 +63,7 @@ test('normalizeInstallAnswers maps supported harnesses to backend env', () => {
|
|
|
63
63
|
assert.equal(answers.SUPERTONIC_LANGUAGE, 'ko');
|
|
64
64
|
assert.equal(answers.OPENVOICE_LANGUAGE, 'KR');
|
|
65
65
|
assert.equal(answers.REQUIRE_WAKE_WORD, '0');
|
|
66
|
-
assert.equal(answers.UTTERANCE_IDLE_MS, '
|
|
66
|
+
assert.equal(answers.UTTERANCE_IDLE_MS, '4500');
|
|
67
67
|
});
|
|
68
68
|
|
|
69
69
|
test('buildEnvFile writes configurable CLI harness and Discord settings without comments leaking into values', () => {
|
package/app-node/main.mjs
CHANGED
|
@@ -166,7 +166,7 @@ const settings = {
|
|
|
166
166
|
token: process.env.DISCORD_BOT_TOKEN || process.env.DISCORD_TOKEN,
|
|
167
167
|
allowedUsers: new Set((process.env.DISCORD_ALLOWED_USERS || '').split(/[;,]/).map(s => s.trim()).filter(Boolean)),
|
|
168
168
|
autoJoinVoiceChannels: (process.env.AUTO_JOIN_VOICE_CHANNELS || '일반,General,general').split(',').map(s => s.trim().toLowerCase()).filter(Boolean),
|
|
169
|
-
transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '
|
|
169
|
+
transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '').trim(),
|
|
170
170
|
whisperBin: process.env.WHISPER_CPP_BIN || 'whisper-cli',
|
|
171
171
|
whisperModel: process.env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
|
|
172
172
|
whisperLanguage: process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
|
|
@@ -217,7 +217,10 @@ const BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB = Number(process.env.BARGE_IN_CONS
|
|
|
217
217
|
const SENSITIVITY_MODE_DEFAULT = (process.env.BARGE_IN_SENSITIVITY_MODE || 'normal').toLowerCase() === 'conservative' ? 'conservative' : 'normal';
|
|
218
218
|
const SENSITIVITY_OUTDOOR_SECONDS = Number(process.env.BARGE_IN_OUTDOOR_SECONDS || '900');
|
|
219
219
|
const SUBSCRIBE_AFTER_SILENCE_MS = Number(process.env.SUBSCRIBE_AFTER_SILENCE_MS || '2200');
|
|
220
|
-
|
|
220
|
+
// Wait long enough for natural mid-sentence pauses before sending audio to STT.
|
|
221
|
+
// If this is too short, a long thought gets split: the first fragment starts an
|
|
222
|
+
// agent turn and the rest is treated as barge-in/processing speech.
|
|
223
|
+
const UTTERANCE_IDLE_MS = Number(process.env.UTTERANCE_IDLE_MS || '4500');
|
|
221
224
|
const MIN_MEAN_VOLUME_DB = Number(process.env.MIN_MEAN_VOLUME_DB || '-35');
|
|
222
225
|
const MIN_MAX_VOLUME_DB = Number(process.env.MIN_MAX_VOLUME_DB || '-12');
|
|
223
226
|
const STT_START_VOICE_NOTICE = !['0', 'false', 'no', 'off'].includes((process.env.STT_START_VOICE_NOTICE || '1').toLowerCase());
|
|
@@ -1399,26 +1402,32 @@ async function connectTo(channel) {
|
|
|
1399
1402
|
selfDeaf: false,
|
|
1400
1403
|
selfMute: false,
|
|
1401
1404
|
});
|
|
1402
|
-
connection
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
+
const voiceConnection = connection;
|
|
1406
|
+
voiceConnection.subscribe(player);
|
|
1407
|
+
voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
|
|
1408
|
+
voiceConnection.on('stateChange', async (oldState, newState) => {
|
|
1405
1409
|
log('voice connection state', oldState.status, '->', newState.status);
|
|
1410
|
+
if (connection !== voiceConnection) {
|
|
1411
|
+
log('ignore stale voice connection state', oldState.status, '->', newState.status);
|
|
1412
|
+
return;
|
|
1413
|
+
}
|
|
1406
1414
|
if (newState.status === VoiceConnectionStatus.Disconnected) {
|
|
1407
1415
|
try {
|
|
1408
1416
|
await Promise.race([
|
|
1409
|
-
entersState(
|
|
1410
|
-
entersState(
|
|
1417
|
+
entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
|
|
1418
|
+
entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
|
|
1411
1419
|
]);
|
|
1412
1420
|
} catch (e) {
|
|
1421
|
+
if (connection !== voiceConnection) return;
|
|
1413
1422
|
warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
|
|
1414
|
-
try {
|
|
1423
|
+
try { voiceConnection.destroy(); } catch {}
|
|
1415
1424
|
connection = null;
|
|
1416
1425
|
setTimeout(() => connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err)), 1500);
|
|
1417
1426
|
}
|
|
1418
1427
|
}
|
|
1419
1428
|
});
|
|
1420
|
-
await entersState(
|
|
1421
|
-
|
|
1429
|
+
await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
|
|
1430
|
+
voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
|
|
1422
1431
|
log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
|
|
1423
1432
|
}
|
|
1424
1433
|
|
package/docs/CONFIGURATION.md
CHANGED
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
## Setup Wizard
|
|
4
4
|
|
|
5
|
+
Discord bot/application setup is intentionally not re-explained from scratch here. Use these upstream guides for the Discord-side steps, then return to VerbalCoding setup:
|
|
6
|
+
|
|
7
|
+
- Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
8
|
+
- Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
|
|
9
|
+
- Discord official quick start: <https://docs.discord.com/developers/quick-start/getting-started>
|
|
10
|
+
|
|
5
11
|
```bash
|
|
6
12
|
./scripts/install.sh
|
|
7
13
|
```
|
|
@@ -37,7 +43,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
|
|
|
37
43
|
AGENT_TASK_TIMEOUT_MS=0
|
|
38
44
|
AGENT_CHAT_TIMEOUT_MS=45000
|
|
39
45
|
AGENT_VERBOSE_PROGRESS=0
|
|
40
|
-
UTTERANCE_IDLE_MS=
|
|
46
|
+
UTTERANCE_IDLE_MS=4500
|
|
41
47
|
LATENCY_LOG_PATH=./.logs/latency.jsonl
|
|
42
48
|
```
|
|
43
49
|
|
|
@@ -74,7 +80,7 @@ TTS_VOLUME="1.0"
|
|
|
74
80
|
|
|
75
81
|
REQUIRE_WAKE_WORD="0"
|
|
76
82
|
MIN_UTTERANCE_SECONDS="1.0"
|
|
77
|
-
UTTERANCE_IDLE_MS="
|
|
83
|
+
UTTERANCE_IDLE_MS="4500"
|
|
78
84
|
HERMES_TASK_TIMEOUT_MS="0"
|
|
79
85
|
HERMES_CHAT_TIMEOUT_MS="45000"
|
|
80
86
|
AGENT_VERBOSE_PROGRESS="0"
|
|
@@ -112,6 +118,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
|
|
|
112
118
|
|
|
113
119
|
For OpenVoice, SpeechSwift, or Supertonic, keep the backend-specific voice/reference settings in the sections below; the same voice catalog file can still track the active voice type.
|
|
114
120
|
|
|
121
|
+
Backend-specific voice options:
|
|
122
|
+
|
|
123
|
+
| Backend | Settings | Voice choices |
|
|
124
|
+
|---|---|---|
|
|
125
|
+
| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | Built-in types above, plus any voice returned by `edge-tts --list-voices` |
|
|
126
|
+
| Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; language `ko`, `en`, `es`, `pt`, `fr` |
|
|
127
|
+
| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | User-provided permitted reference WAV; style defaults to `default` |
|
|
128
|
+
| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | Reference-sample voices for CosyVoice, or backend-supported speaker/model IDs |
|
|
129
|
+
|
|
130
|
+
## Utterance Segmentation
|
|
131
|
+
|
|
132
|
+
`UTTERANCE_IDLE_MS` controls how long the bridge waits after a speech segment before it decides the user is done and starts STT. The default is `4500` ms to preserve longer spoken instructions with natural pauses. Lower values feel faster for short commands but can split long dictation; higher values are safer for thoughtful speech.
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
UTTERANCE_IDLE_MS="4500" # balanced default
|
|
136
|
+
UTTERANCE_IDLE_MS="6000" # safer for long dictation with pauses
|
|
137
|
+
```
|
|
138
|
+
|
|
115
139
|
## MCP Server
|
|
116
140
|
|
|
117
141
|
VerbalCoding ships a stdio MCP server so Hermes Agent or any MCP client can control the bridge through tools instead of relying on skills or free-form shell commands.
|
package/docs/FRESH_INSTALL.md
CHANGED
|
@@ -77,6 +77,14 @@ If your OS is unsupported, install these manually before rerunning:
|
|
|
77
77
|
|
|
78
78
|
## 3. Discord application setup
|
|
79
79
|
|
|
80
|
+
Read the upstream Discord bot setup guides first if this is your first bot:
|
|
81
|
+
|
|
82
|
+
- Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
83
|
+
- Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
|
|
84
|
+
- Discord official getting started guide: <https://docs.discord.com/developers/quick-start/getting-started>
|
|
85
|
+
|
|
86
|
+
Those pages show how to create a Discord application, add a bot user, enable privileged intents, and invite it to a server. VerbalCoding uses the same Discord bot setup, then adds voice receive, STT, CLI-agent execution, and TTS playback on top.
|
|
87
|
+
|
|
80
88
|
1. Create a Discord application and bot in the Discord Developer Portal.
|
|
81
89
|
2. Enable the Message Content privileged intent.
|
|
82
90
|
3. Copy the bot token into the installer prompt or `.env` as `DISCORD_BOT_TOKEN`.
|
package/docs/RELEASE.md
CHANGED
|
@@ -25,7 +25,7 @@ VerbalCoding is a Discord voice bridge for controlling CLI-based coding agents b
|
|
|
25
25
|
- npm package install path: `npm install -g verbalcoding`, `vc setup --yes`, and `vc start`.
|
|
26
26
|
- Optional verbose progress mode for text-only middle-step updates during long agent work.
|
|
27
27
|
- Always-on JSONL latency metrics plus `!latency` / `!metrics` summary for pipeline optimization.
|
|
28
|
-
-
|
|
28
|
+
- More patient utterance idle wait (`UTTERANCE_IDLE_MS=4500`), so that long spoken instructions with natural pauses are no longer split into a partial prompt followed by ignored processing-time speech.
|
|
29
29
|
- Multi-instance Hermes profile isolation: `vc instance setup <name>` auto-clones a Hermes profile to `~/.hermes/profiles/<name>` with the instance workdir, seeds SOUL.md, and writes `HERMES_HOME` into the instance env so per-project memory and skills stay separate; `vc instance start` self-heals a missing profile, and `vc doctor` checks profile-dir presence and `terminal.cwd` consistency.
|
|
30
30
|
|
|
31
31
|
### Pre-release checklist
|
package/docs/USAGE.md
CHANGED
|
@@ -43,6 +43,13 @@ The bot auto-joins the first configured channel name, defaulting to `일반,Gene
|
|
|
43
43
|
|
|
44
44
|
## Discord Commands
|
|
45
45
|
|
|
46
|
+
Before wiring commands, set up the Discord application/bot using the upstream guides:
|
|
47
|
+
|
|
48
|
+
- Hermes Agent Discord guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
49
|
+
- Discord official bot docs: <https://docs.discord.com/developers/bots/overview>
|
|
50
|
+
|
|
51
|
+
Then use `vc bot invite CLIENT_ID` to generate the VerbalCoding-specific invite URL with text and voice permissions.
|
|
52
|
+
|
|
46
53
|
| Command | Purpose |
|
|
47
54
|
|---|---|
|
|
48
55
|
| `!ping` | Basic bot check |
|
|
@@ -88,6 +95,27 @@ Built-in Edge voice types:
|
|
|
88
95
|
|
|
89
96
|
For persistent manual config, set `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, and optionally `TTS_VOICE=<edge-voice>` in `.env`, or edit `config/tts-voices.json` for custom voice catalogs.
|
|
90
97
|
|
|
98
|
+
Backend-specific voice knobs:
|
|
99
|
+
|
|
100
|
+
| Backend | Voice setting | Common choices |
|
|
101
|
+
|---|---|---|
|
|
102
|
+
| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; any Edge voice from `edge-tts --list-voices` |
|
|
103
|
+
| Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; set `SUPERTONIC_LANGUAGE` to `ko`, `en`, `es`, `pt`, or `fr` |
|
|
104
|
+
| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | a permitted reference WAV plus style such as `default` |
|
|
105
|
+
| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | reference WAV for CosyVoice, or backend-supported speaker/model values |
|
|
106
|
+
|
|
107
|
+
For Supertonic and local clone backends, use the backend env vars above plus `!voice-test <text>` to audition changes. Voice-command switching currently maps the built-in Edge-style voice types; richer backend catalogs can be added in `config/tts-voices.json`.
|
|
108
|
+
|
|
109
|
+
## Long Dictation and Pauses
|
|
110
|
+
|
|
111
|
+
VerbalCoding waits for an idle window before sending speech to STT. The default `UTTERANCE_IDLE_MS=4500` is intentionally patient: with a shorter window, a natural pause in a long instruction can split the sentence, start an agent turn too early, and cause the rest of the speech to be treated as a processing-time interruption.
|
|
112
|
+
|
|
113
|
+
If you prefer faster short commands, lower it in `.env`; if long Korean dictation is still being split, raise it:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
UTTERANCE_IDLE_MS="6000"
|
|
117
|
+
```
|
|
118
|
+
|
|
91
119
|
## Verbose Progress Mode
|
|
92
120
|
|
|
93
121
|
Verbose progress is off by default unless `AGENT_VERBOSE_PROGRESS=1` is set. Enable it with `!verbose on` or a voice command like “상세 진행 켜”. It can emit short progress lines such as:
|
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
## 설정 마법사
|
|
4
4
|
|
|
5
|
+
Discord 봇/애플리케이션 생성 절차는 여기에서 처음부터 반복 설명하지 않습니다. Discord 쪽 설정은 아래 상위 문서를 보고 진행한 뒤 VerbalCoding 설정으로 돌아오세요.
|
|
6
|
+
|
|
7
|
+
- Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
8
|
+
- Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
|
|
9
|
+
- Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
|
|
10
|
+
|
|
5
11
|
npm으로 설치한 경우:
|
|
6
12
|
|
|
7
13
|
```bash
|
|
@@ -45,7 +51,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
|
|
|
45
51
|
AGENT_TASK_TIMEOUT_MS=0
|
|
46
52
|
AGENT_CHAT_TIMEOUT_MS=45000
|
|
47
53
|
AGENT_VERBOSE_PROGRESS=0
|
|
48
|
-
UTTERANCE_IDLE_MS=
|
|
54
|
+
UTTERANCE_IDLE_MS=4500
|
|
49
55
|
LATENCY_LOG_PATH=./.logs/latency.jsonl
|
|
50
56
|
```
|
|
51
57
|
|
|
@@ -82,7 +88,7 @@ TTS_VOLUME="1.0"
|
|
|
82
88
|
|
|
83
89
|
REQUIRE_WAKE_WORD="0"
|
|
84
90
|
MIN_UTTERANCE_SECONDS="1.0"
|
|
85
|
-
UTTERANCE_IDLE_MS="
|
|
91
|
+
UTTERANCE_IDLE_MS="4500"
|
|
86
92
|
HERMES_TASK_TIMEOUT_MS="0"
|
|
87
93
|
HERMES_CHAT_TIMEOUT_MS="45000"
|
|
88
94
|
AGENT_VERBOSE_PROGRESS="0"
|
|
@@ -120,6 +126,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
|
|
|
120
126
|
|
|
121
127
|
OpenVoice, SpeechSwift, Supertonic을 쓸 때는 아래 백엔드별 reference/voice 설정을 유지하세요. 같은 voice catalog 파일에서 현재 voice type을 추적할 수 있습니다.
|
|
122
128
|
|
|
129
|
+
백엔드별 목소리 옵션:
|
|
130
|
+
|
|
131
|
+
| 백엔드 | 설정 | 목소리 선택지 |
|
|
132
|
+
|---|---|---|
|
|
133
|
+
| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | 위 기본 타입, 또는 `edge-tts --list-voices`가 반환하는 모든 voice |
|
|
134
|
+
| Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; 언어 `ko`, `en`, `es`, `pt`, `fr` |
|
|
135
|
+
| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | 사용자가 제공한 허가된 reference WAV; style 기본값은 `default` |
|
|
136
|
+
| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | CosyVoice reference sample voice 또는 백엔드가 지원하는 speaker/model ID |
|
|
137
|
+
|
|
138
|
+
## 발화 분리 설정
|
|
139
|
+
|
|
140
|
+
`UTTERANCE_IDLE_MS`는 음성 segment가 끝난 뒤 사용자의 말이 끝났다고 판단하고 STT를 시작하기 전까지 기다리는 시간입니다. 기본값은 `4500` ms입니다. 긴 지시 중 자연스러운 멈춤을 보존하기 위한 값입니다. 낮추면 짧은 명령 반응은 빨라지지만 긴 발화가 잘릴 수 있고, 높이면 생각하면서 말하는 긴 dictation에 더 안전합니다.
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
UTTERANCE_IDLE_MS="4500" # 균형 잡힌 기본값
|
|
144
|
+
UTTERANCE_IDLE_MS="6000" # 중간 멈춤이 있는 긴 발화에 더 안전
|
|
145
|
+
```
|
|
146
|
+
|
|
123
147
|
## MCP 서버
|
|
124
148
|
|
|
125
149
|
VerbalCoding은 stdio MCP 서버를 포함합니다. Hermes Agent 또는 MCP client는 자유 형식 shell 명령 대신 도구로 브릿지를 제어할 수 있습니다.
|
|
@@ -77,6 +77,14 @@ OS가 지원되지 않으면 아래를 직접 설치한 뒤 다시 실행하세
|
|
|
77
77
|
|
|
78
78
|
## 3. Discord 애플리케이션 설정
|
|
79
79
|
|
|
80
|
+
Discord 봇을 처음 만든다면 먼저 공식/상위 문서를 확인하세요.
|
|
81
|
+
|
|
82
|
+
- Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
83
|
+
- Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
|
|
84
|
+
- Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
|
|
85
|
+
|
|
86
|
+
위 문서에는 Discord 애플리케이션 생성, bot user 추가, privileged intent 활성화, 서버 초대 방법이 설명되어 있습니다. VerbalCoding도 같은 Discord bot 설정을 사용하고, 그 위에 음성 수신, STT, CLI 에이전트 실행, TTS 재생을 얹습니다.
|
|
87
|
+
|
|
80
88
|
1. Discord Developer Portal에서 애플리케이션과 봇을 만듭니다.
|
|
81
89
|
2. Message Content privileged intent를 켭니다.
|
|
82
90
|
3. 봇 토큰을 설치 프롬프트 또는 `.env`의 `DISCORD_BOT_TOKEN`에 넣습니다.
|
package/docs/i18n/RELEASE.ko.md
CHANGED
|
@@ -24,7 +24,7 @@ VerbalCoding은 음성으로 CLI 기반 코딩 에이전트를 제어하기 위
|
|
|
24
24
|
- 설정 마법사, `.env.example`, `vc doctor` prerequisite checker, OS 패키지/npm 의존성/Edge TTS helper/기본 whisper.cpp 모델을 준비하는 `./scripts/install.sh --yes` 부트스트랩.
|
|
25
25
|
- 긴 에이전트 작업 중 텍스트 전용 중간 단계 업데이트를 위한 선택적 verbose progress mode.
|
|
26
26
|
- 파이프라인 최적화를 위한 JSONL latency metrics와 `!latency` / `!metrics` 요약.
|
|
27
|
-
-
|
|
27
|
+
- 더 여유 있는 utterance idle wait (`UTTERANCE_IDLE_MS=4500`)로 자연스러운 중간 멈춤이 있는 긴 지시가 앞부분 prompt와 무시되는 processing-time speech로 쪼개지지 않도록 개선.
|
|
28
28
|
- 멀티 인스턴스 Hermes 프로필 격리: `vc instance setup <name>`이 자동으로 Hermes 프로필을 `~/.hermes/profiles/<name>`에 clone하고, instance workdir을 설정하고, SOUL.md를 초기화하고, instance env에 `HERMES_HOME`을 기록합니다. `vc instance start`는 누락된 profile을 self-heal하고, `vc doctor`는 profile-dir 존재와 `terminal.cwd` 일관성을 검사합니다.
|
|
29
29
|
- npm 공개 패키지: `npm install -g verbalcoding`, `vc setup --yes`, `vc start` 경로 지원.
|
|
30
30
|
|
package/docs/i18n/USAGE.ko.md
CHANGED
|
@@ -51,6 +51,13 @@ VERBALCODING_INSTANCE_ENV=instances/my-project.env ./run.sh
|
|
|
51
51
|
|
|
52
52
|
## Discord 명령
|
|
53
53
|
|
|
54
|
+
명령을 연결하기 전에 먼저 상위 문서대로 Discord 애플리케이션/봇을 설정하세요.
|
|
55
|
+
|
|
56
|
+
- Hermes Agent Discord 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
|
|
57
|
+
- Discord 공식 봇 문서: <https://docs.discord.com/developers/bots/overview>
|
|
58
|
+
|
|
59
|
+
그 다음 `vc bot invite CLIENT_ID`를 사용하면 VerbalCoding에 필요한 텍스트/음성 권한이 포함된 초대 URL을 만들 수 있습니다.
|
|
60
|
+
|
|
54
61
|
| 명령 | 용도 |
|
|
55
62
|
|---|---|
|
|
56
63
|
| `!ping` | 봇 연결 기본 확인 |
|
|
@@ -96,6 +103,27 @@ switch speaker to English
|
|
|
96
103
|
|
|
97
104
|
영구 수동 설정이 필요하면 `.env`에 `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, 필요 시 `TTS_VOICE=<edge-voice>`를 설정하세요. 더 많은 커스텀 목소리 카탈로그는 `config/tts-voices.json`에서 관리할 수 있습니다.
|
|
98
105
|
|
|
106
|
+
백엔드별 목소리 설정:
|
|
107
|
+
|
|
108
|
+
| 백엔드 | 목소리 설정 | 자주 쓰는 선택지 |
|
|
109
|
+
|---|---|---|
|
|
110
|
+
| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; `edge-tts --list-voices`의 모든 Edge voice |
|
|
111
|
+
| Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; `SUPERTONIC_LANGUAGE`는 `ko`, `en`, `es`, `pt`, `fr` 중 하나 |
|
|
112
|
+
| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | 사용 허가가 있는 reference WAV와 `default` 같은 style |
|
|
113
|
+
| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | CosyVoice reference WAV 또는 백엔드가 지원하는 speaker/model 값 |
|
|
114
|
+
|
|
115
|
+
Supertonic과 로컬 clone 백엔드는 위 env를 바꾼 뒤 `!voice-test <text>`로 바로 들어보세요. 현재 음성 명령 기반 전환은 기본 Edge-style voice type에 매핑되어 있고, 더 풍부한 백엔드 카탈로그는 `config/tts-voices.json`에 추가할 수 있습니다.
|
|
116
|
+
|
|
117
|
+
## 긴 발화와 중간 멈춤
|
|
118
|
+
|
|
119
|
+
VerbalCoding은 말을 STT로 보내기 전에 idle window를 기다립니다. 기본값 `UTTERANCE_IDLE_MS=4500`은 일부러 조금 여유 있게 잡혀 있습니다. 긴 지시 중 자연스러운 멈춤을 문장 끝으로 오해해 앞부분만 에이전트에 보내고, 뒷부분을 processing 중 끼어들기로 처리하는 문제를 줄이기 위해서입니다.
|
|
120
|
+
|
|
121
|
+
짧은 명령 반응을 더 빠르게 하고 싶다면 `.env`에서 낮추고, 긴 한국어 dictation이 여전히 잘리면 더 올리세요.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
UTTERANCE_IDLE_MS="6000"
|
|
125
|
+
```
|
|
126
|
+
|
|
99
127
|
## 자세한 진행 모드
|
|
100
128
|
|
|
101
129
|
자세한 진행은 기본적으로 꺼져 있습니다. `.env`에 `AGENT_VERBOSE_PROGRESS=1`을 설정하거나 Discord에서 `!verbose on`, 또는 음성으로 “상세 진행 켜”라고 말해 켤 수 있습니다.
|
package/package.json
CHANGED
package/run.sh
CHANGED
|
@@ -8,7 +8,7 @@ mkdir -p /tmp/verbalcoding-node-debug
|
|
|
8
8
|
export NODE_AUDIO_DEBUG_DIR="${NODE_AUDIO_DEBUG_DIR:-/tmp/verbalcoding-node-debug}"
|
|
9
9
|
export MIN_UTTERANCE_SECONDS="${MIN_UTTERANCE_SECONDS:-1.0}"
|
|
10
10
|
export SUBSCRIBE_AFTER_SILENCE_MS="${SUBSCRIBE_AFTER_SILENCE_MS:-2200}"
|
|
11
|
-
export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-
|
|
11
|
+
export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-4500}"
|
|
12
12
|
export MIN_MEAN_VOLUME_DB="${MIN_MEAN_VOLUME_DB:--35}"
|
|
13
13
|
export MIN_MAX_VOLUME_DB="${MIN_MAX_VOLUME_DB:--18}"
|
|
14
14
|
export TTS_RATE="${TTS_RATE:-+10%}"
|
package/scripts/doctor.mjs
CHANGED
|
@@ -66,7 +66,7 @@ note('Allowed users configured', env.DISCORD_ALLOWED_USERS ? '[REDACTED]' : 'not
|
|
|
66
66
|
note('Auto-join channels', env.AUTO_JOIN_VOICE_CHANNELS || 'default: 일반,General,general');
|
|
67
67
|
note('Verbose progress default', ['1', 'true', 'yes', 'on'].includes(String(env.AGENT_VERBOSE_PROGRESS || env.VERBALCODING_VERBOSE_PROGRESS || '0').toLowerCase()) ? 'on' : 'off');
|
|
68
68
|
note('Auto restart voice bot after commits', autoRestartVoiceBotEnabled(env) ? 'on' : 'off');
|
|
69
|
-
note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '
|
|
69
|
+
note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '4500'} ms`);
|
|
70
70
|
note('STT language', env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
|
|
71
71
|
note('Progress/voice language', env.VOICE_LANGUAGE || env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
|
|
72
72
|
note('Latency log path', env.LATENCY_LOG_PATH || './.logs/latency.jsonl');
|
package/scripts/install.mjs
CHANGED
|
@@ -58,7 +58,7 @@ async function main() {
|
|
|
58
58
|
const openvoiceRefAudio = await ask('OpenVoice reference audio path', process.env.OPENVOICE_REF_AUDIO || './voice-samples/user-reference.wav');
|
|
59
59
|
const requireWake = (await ask('Require wake word? 1/0', process.env.REQUIRE_WAKE_WORD || '0')) === '1';
|
|
60
60
|
const verboseProgress = (await ask('Verbose progress by default? 1/0', process.env.AGENT_VERBOSE_PROGRESS || process.env.VERBALCODING_VERBOSE_PROGRESS || '0')) === '1';
|
|
61
|
-
const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '
|
|
61
|
+
const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '4500');
|
|
62
62
|
const latencyLogPath = await ask('Latency JSONL log path', process.env.LATENCY_LOG_PATH || './.logs/latency.jsonl');
|
|
63
63
|
|
|
64
64
|
const values = normalizeInstallAnswers({
|