verbalcoding 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -26,6 +26,7 @@ STT_LANGUAGE="ko"
26
26
 
27
27
  TTS_BACKEND="edge" # edge | openvoice | speechswift | supertonic
28
28
  EDGE_TTS_COMMAND="edge-tts"
29
+ TTS_VOICE_TYPE="korean_female" # edge: korean_male | korean_female | korean_multilingual_male | english_male | english_female
29
30
  TTS_VOICE="ko-KR-SunHiNeural"
30
31
  TTS_RATE="+10%"
31
32
  TTS_MAX_CHARS="495"
@@ -66,8 +67,9 @@ OPENVOICE_STYLE="default"
66
67
  OPENVOICE_TIMEOUT_MS="90000"
67
68
  OPENVOICE_PROGRESS="0" # keep progress prompts fast via Edge unless set to 1
68
69
  REQUIRE_WAKE_WORD="0"
69
- MIN_UTTERANCE_SECONDS="1.4"
70
- UTTERANCE_IDLE_MS="2000"
70
+ MIN_UTTERANCE_SECONDS="1.0"
71
+ # Wait for natural thinking pauses before STT. Lower for faster but more fragmented turns.
72
+ UTTERANCE_IDLE_MS="4500"
71
73
  MIN_MEAN_VOLUME_DB="-35"
72
74
  MIN_MAX_VOLUME_DB="-12"
73
75
  BARGE_IN_MIN_SECONDS="1.4"
@@ -80,4 +82,4 @@ PLAYBACK_BARGE_IN_REQUIRE_BOTH="1"
80
82
  BARGE_IN_CONSERVATIVE_MIN_SECONDS="1.8"
81
83
  BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB="-27"
82
84
  BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB="-12"
83
- MAX_DEFERRED_PROCESSING_UTTERANCES="3"
85
+ MAX_DEFERRED_PROCESSING_UTTERANCES="0"
@@ -8,6 +8,20 @@ test('splitDiscordMessage chunks long text for Discord', () => {
8
8
  assert.deepEqual(chunks.map(c => c.length), [1900, 1900, 201]);
9
9
  });
10
10
 
11
+ test('sendDiscordText returns false without fetching when transcript channel id is missing', async () => {
12
+ const warnings = [];
13
+ let fetched = false;
14
+ const delivered = await sendDiscordText({
15
+ channelId: '',
16
+ text: 'restart complete',
17
+ client: { channels: { fetch: async () => { fetched = true; } } },
18
+ warn: (...args) => warnings.push(args.join(' ')),
19
+ });
20
+ assert.equal(delivered, false);
21
+ assert.equal(fetched, false);
22
+ assert.match(warnings.join('\n'), /missing transcript channel id/);
23
+ });
24
+
11
25
  test('sendDiscordText returns false when target is not text based', async () => {
12
26
  const warnings = [];
13
27
  const delivered = await sendDiscordText({
@@ -57,7 +57,7 @@ export function normalizeInstallAnswers(input = {}) {
57
57
  OPENVOICE_PROGRESS: input.openvoiceProgress === true || input.OPENVOICE_PROGRESS === '1' ? '1' : '0',
58
58
  REQUIRE_WAKE_WORD: input.requireWakeWord === true || input.REQUIRE_WAKE_WORD === '1' ? '1' : '0',
59
59
  MIN_UTTERANCE_SECONDS: clean(input.minUtteranceSeconds || input.MIN_UTTERANCE_SECONDS, '1.0'),
60
- UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '2000'),
60
+ UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '4500'),
61
61
  HERMES_TASK_TIMEOUT_MS: clean(input.taskTimeoutMs || input.HERMES_TASK_TIMEOUT_MS, '0'),
62
62
  HERMES_CHAT_TIMEOUT_MS: clean(input.chatTimeoutMs || input.HERMES_CHAT_TIMEOUT_MS, '45000'),
63
63
  AGENT_VERBOSE_PROGRESS: input.verboseProgress === true || input.AGENT_VERBOSE_PROGRESS === '1' ? '1' : '0',
@@ -63,7 +63,7 @@ test('normalizeInstallAnswers maps supported harnesses to backend env', () => {
63
63
  assert.equal(answers.SUPERTONIC_LANGUAGE, 'ko');
64
64
  assert.equal(answers.OPENVOICE_LANGUAGE, 'KR');
65
65
  assert.equal(answers.REQUIRE_WAKE_WORD, '0');
66
- assert.equal(answers.UTTERANCE_IDLE_MS, '2000');
66
+ assert.equal(answers.UTTERANCE_IDLE_MS, '4500');
67
67
  });
68
68
 
69
69
  test('buildEnvFile writes configurable CLI harness and Discord settings without comments leaking into values', () => {
package/app-node/main.mjs CHANGED
@@ -166,7 +166,7 @@ const settings = {
166
166
  token: process.env.DISCORD_BOT_TOKEN || process.env.DISCORD_TOKEN,
167
167
  allowedUsers: new Set((process.env.DISCORD_ALLOWED_USERS || '').split(/[;,]/).map(s => s.trim()).filter(Boolean)),
168
168
  autoJoinVoiceChannels: (process.env.AUTO_JOIN_VOICE_CHANNELS || '일반,General,general').split(',').map(s => s.trim().toLowerCase()).filter(Boolean),
169
- transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '123456789012345678').trim(),
169
+ transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '').trim(),
170
170
  whisperBin: process.env.WHISPER_CPP_BIN || 'whisper-cli',
171
171
  whisperModel: process.env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
172
172
  whisperLanguage: process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
@@ -217,7 +217,10 @@ const BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB = Number(process.env.BARGE_IN_CONS
217
217
  const SENSITIVITY_MODE_DEFAULT = (process.env.BARGE_IN_SENSITIVITY_MODE || 'normal').toLowerCase() === 'conservative' ? 'conservative' : 'normal';
218
218
  const SENSITIVITY_OUTDOOR_SECONDS = Number(process.env.BARGE_IN_OUTDOOR_SECONDS || '900');
219
219
  const SUBSCRIBE_AFTER_SILENCE_MS = Number(process.env.SUBSCRIBE_AFTER_SILENCE_MS || '2200');
220
- const UTTERANCE_IDLE_MS = Number(process.env.UTTERANCE_IDLE_MS || '2000');
220
+ // Wait long enough for natural mid-sentence pauses before sending audio to STT.
221
+ // If this is too short, a long thought gets split: the first fragment starts an
222
+ // agent turn and the rest is treated as barge-in/processing speech.
223
+ const UTTERANCE_IDLE_MS = Number(process.env.UTTERANCE_IDLE_MS || '4500');
221
224
  const MIN_MEAN_VOLUME_DB = Number(process.env.MIN_MEAN_VOLUME_DB || '-35');
222
225
  const MIN_MAX_VOLUME_DB = Number(process.env.MIN_MAX_VOLUME_DB || '-12');
223
226
  const STT_START_VOICE_NOTICE = !['0', 'false', 'no', 'off'].includes((process.env.STT_START_VOICE_NOTICE || '1').toLowerCase());
@@ -1399,26 +1402,32 @@ async function connectTo(channel) {
1399
1402
  selfDeaf: false,
1400
1403
  selfMute: false,
1401
1404
  });
1402
- connection.subscribe(player);
1403
- connection.on('error', e => warn('voice connection error', e?.stack || e));
1404
- connection.on('stateChange', async (oldState, newState) => {
1405
+ const voiceConnection = connection;
1406
+ voiceConnection.subscribe(player);
1407
+ voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
1408
+ voiceConnection.on('stateChange', async (oldState, newState) => {
1405
1409
  log('voice connection state', oldState.status, '->', newState.status);
1410
+ if (connection !== voiceConnection) {
1411
+ log('ignore stale voice connection state', oldState.status, '->', newState.status);
1412
+ return;
1413
+ }
1406
1414
  if (newState.status === VoiceConnectionStatus.Disconnected) {
1407
1415
  try {
1408
1416
  await Promise.race([
1409
- entersState(connection, VoiceConnectionStatus.Signalling, 5000),
1410
- entersState(connection, VoiceConnectionStatus.Connecting, 5000),
1417
+ entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
1418
+ entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
1411
1419
  ]);
1412
1420
  } catch (e) {
1421
+ if (connection !== voiceConnection) return;
1413
1422
  warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
1414
- try { connection?.destroy(); } catch {}
1423
+ try { voiceConnection.destroy(); } catch {}
1415
1424
  connection = null;
1416
1425
  setTimeout(() => connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err)), 1500);
1417
1426
  }
1418
1427
  }
1419
1428
  });
1420
- await entersState(connection, VoiceConnectionStatus.Ready, 30000);
1421
- connection.receiver.speaking.on('start', userId => subscribeUser(connection.receiver, userId));
1429
+ await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
1430
+ voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
1422
1431
  log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
1423
1432
  }
1424
1433
 
@@ -2,6 +2,12 @@
2
2
 
3
3
  ## Setup Wizard
4
4
 
5
+ Discord bot/application setup is intentionally not re-explained from scratch here. Use these upstream guides for the Discord-side steps, then return to VerbalCoding setup:
6
+
7
+ - Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
8
+ - Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
9
+ - Discord official quick start: <https://docs.discord.com/developers/quick-start/getting-started>
10
+
5
11
  ```bash
6
12
  ./scripts/install.sh
7
13
  ```
@@ -37,7 +43,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
37
43
  AGENT_TASK_TIMEOUT_MS=0
38
44
  AGENT_CHAT_TIMEOUT_MS=45000
39
45
  AGENT_VERBOSE_PROGRESS=0
40
- UTTERANCE_IDLE_MS=2000
46
+ UTTERANCE_IDLE_MS=4500
41
47
  LATENCY_LOG_PATH=./.logs/latency.jsonl
42
48
  ```
43
49
 
@@ -74,7 +80,7 @@ TTS_VOLUME="1.0"
74
80
 
75
81
  REQUIRE_WAKE_WORD="0"
76
82
  MIN_UTTERANCE_SECONDS="1.0"
77
- UTTERANCE_IDLE_MS="2000"
83
+ UTTERANCE_IDLE_MS="4500"
78
84
  HERMES_TASK_TIMEOUT_MS="0"
79
85
  HERMES_CHAT_TIMEOUT_MS="45000"
80
86
  AGENT_VERBOSE_PROGRESS="0"
@@ -112,6 +118,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
112
118
 
113
119
  For OpenVoice, SpeechSwift, or Supertonic, keep the backend-specific voice/reference settings in the sections below; the same voice catalog file can still track the active voice type.
114
120
 
121
+ Backend-specific voice options:
122
+
123
+ | Backend | Settings | Voice choices |
124
+ |---|---|---|
125
+ | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | Built-in types above, plus any voice returned by `edge-tts --list-voices` |
126
+ | Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; language `ko`, `en`, `es`, `pt`, `fr` |
127
+ | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | User-provided permitted reference WAV; style defaults to `default` |
128
+ | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | Reference-sample voices for CosyVoice, or backend-supported speaker/model IDs |
129
+
130
+ ## Utterance Segmentation
131
+
132
+ `UTTERANCE_IDLE_MS` controls how long the bridge waits after a speech segment before it decides the user is done and starts STT. The default is `4500` ms to preserve longer spoken instructions with natural pauses. Lower values feel faster for short commands but can split long dictation; higher values are safer for thoughtful speech.
133
+
134
+ ```bash
135
+ UTTERANCE_IDLE_MS="4500" # balanced default
136
+ UTTERANCE_IDLE_MS="6000" # safer for long dictation with pauses
137
+ ```
138
+
115
139
  ## MCP Server
116
140
 
117
141
  VerbalCoding ships a stdio MCP server so Hermes Agent or any MCP client can control the bridge through tools instead of relying on skills or free-form shell commands.
@@ -77,6 +77,14 @@ If your OS is unsupported, install these manually before rerunning:
77
77
 
78
78
  ## 3. Discord application setup
79
79
 
80
+ Read the upstream Discord bot setup guides first if this is your first bot:
81
+
82
+ - Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
83
+ - Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
84
+ - Discord official getting started guide: <https://docs.discord.com/developers/quick-start/getting-started>
85
+
86
+ Those pages show how to create a Discord application, add a bot user, enable privileged intents, and invite it to a server. VerbalCoding uses the same Discord bot setup, then adds voice receive, STT, CLI-agent execution, and TTS playback on top.
87
+
80
88
  1. Create a Discord application and bot in the Discord Developer Portal.
81
89
  2. Enable the Message Content privileged intent.
82
90
  3. Copy the bot token into the installer prompt or `.env` as `DISCORD_BOT_TOKEN`.
package/docs/RELEASE.md CHANGED
@@ -25,7 +25,7 @@ VerbalCoding is a Discord voice bridge for controlling CLI-based coding agents b
25
25
  - npm package install path: `npm install -g verbalcoding`, `vc setup --yes`, and `vc start`.
26
26
  - Optional verbose progress mode for text-only middle-step updates during long agent work.
27
27
  - Always-on JSONL latency metrics plus `!latency` / `!metrics` summary for pipeline optimization.
28
- - Lower default utterance idle wait (`UTTERANCE_IDLE_MS=2000`) so STT starts about 0.6s sooner after speech ends.
28
+ - More patient utterance idle wait (`UTTERANCE_IDLE_MS=4500`) so long spoken instructions with natural pauses are not split into a partial prompt plus ignored processing-time speech.
29
29
  - Multi-instance Hermes profile isolation: `vc instance setup <name>` auto-clones a Hermes profile to `~/.hermes/profiles/<name>` with the instance workdir, seeds SOUL.md, and writes `HERMES_HOME` into the instance env so per-project memory and skills stay separate; `vc instance start` self-heals a missing profile, and `vc doctor` checks profile-dir presence and `terminal.cwd` consistency.
30
30
 
31
31
  ### Pre-release checklist
package/docs/USAGE.md CHANGED
@@ -43,6 +43,13 @@ The bot auto-joins the first configured channel name, defaulting to `일반,Gene
43
43
 
44
44
  ## Discord Commands
45
45
 
46
+ Before wiring commands, set up the Discord application/bot using the upstream guides:
47
+
48
+ - Hermes Agent Discord guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
49
+ - Discord official bot docs: <https://docs.discord.com/developers/bots/overview>
50
+
51
+ Then use `vc bot invite CLIENT_ID` to generate the VerbalCoding-specific invite URL with text and voice permissions.
52
+
46
53
  | Command | Purpose |
47
54
  |---|---|
48
55
  | `!ping` | Basic bot check |
@@ -88,6 +95,27 @@ Built-in Edge voice types:
88
95
 
89
96
  For persistent manual config, set `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, and optionally `TTS_VOICE=<edge-voice>` in `.env`, or edit `config/tts-voices.json` for custom voice catalogs.
90
97
 
98
+ Backend-specific voice knobs:
99
+
100
+ | Backend | Voice setting | Common choices |
101
+ |---|---|---|
102
+ | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; any Edge voice from `edge-tts --list-voices` |
103
+ | Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; set `SUPERTONIC_LANGUAGE` to `ko`, `en`, `es`, `pt`, or `fr` |
104
+ | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | a permitted reference WAV plus style such as `default` |
105
+ | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | reference WAV for CosyVoice, or backend-supported speaker/model values |
106
+
107
+ For Supertonic and local clone backends, use the backend env vars above plus `!voice-test <text>` to audition changes. Voice-command switching currently maps to the built-in Edge-style voice types; richer backend catalogs can be added in `config/tts-voices.json`.
108
+
109
+ ## Long Dictation and Pauses
110
+
111
+ VerbalCoding waits for an idle window before sending speech to STT. The default `UTTERANCE_IDLE_MS=4500` is intentionally a bit patient: with a shorter window, a natural pause in a long instruction can split the sentence, so the first fragment starts an agent turn too early and the rest is treated as a processing-time interruption.
112
+
113
+ If you prefer faster short commands, lower it in `.env`; if long Korean dictation is still being split, raise it:
114
+
115
+ ```bash
116
+ UTTERANCE_IDLE_MS="6000"
117
+ ```
118
+
91
119
  ## Verbose Progress Mode
92
120
 
93
121
  Verbose progress is off by default unless `AGENT_VERBOSE_PROGRESS=1` is set. Enable it with `!verbose on` or a voice command like “상세 진행 켜”. It can emit short progress lines such as:
@@ -2,6 +2,12 @@
2
2
 
3
3
  ## 설정 마법사
4
4
 
5
+ Discord 봇/애플리케이션 생성 절차는 여기에서 처음부터 반복 설명하지 않습니다. Discord 쪽 설정은 아래 상위 문서를 보고 진행한 뒤 VerbalCoding 설정으로 돌아오세요.
6
+
7
+ - Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
8
+ - Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
9
+ - Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
10
+
5
11
  npm으로 설치한 경우:
6
12
 
7
13
  ```bash
@@ -45,7 +51,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
45
51
  AGENT_TASK_TIMEOUT_MS=0
46
52
  AGENT_CHAT_TIMEOUT_MS=45000
47
53
  AGENT_VERBOSE_PROGRESS=0
48
- UTTERANCE_IDLE_MS=2000
54
+ UTTERANCE_IDLE_MS=4500
49
55
  LATENCY_LOG_PATH=./.logs/latency.jsonl
50
56
  ```
51
57
 
@@ -82,7 +88,7 @@ TTS_VOLUME="1.0"
82
88
 
83
89
  REQUIRE_WAKE_WORD="0"
84
90
  MIN_UTTERANCE_SECONDS="1.0"
85
- UTTERANCE_IDLE_MS="2000"
91
+ UTTERANCE_IDLE_MS="4500"
86
92
  HERMES_TASK_TIMEOUT_MS="0"
87
93
  HERMES_CHAT_TIMEOUT_MS="45000"
88
94
  AGENT_VERBOSE_PROGRESS="0"
@@ -120,6 +126,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
120
126
 
121
127
  OpenVoice, SpeechSwift, Supertonic을 쓸 때는 아래 백엔드별 reference/voice 설정을 유지하세요. 같은 voice catalog 파일에서 현재 voice type을 추적할 수 있습니다.
122
128
 
129
+ 백엔드별 목소리 옵션:
130
+
131
+ | 백엔드 | 설정 | 목소리 선택지 |
132
+ |---|---|---|
133
+ | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | 위 기본 타입, 또는 `edge-tts --list-voices`가 반환하는 모든 voice |
134
+ | Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; 언어 `ko`, `en`, `es`, `pt`, `fr` |
135
+ | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | 사용자가 제공한 허가된 reference WAV; style 기본값은 `default` |
136
+ | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | CosyVoice reference sample voice 또는 백엔드가 지원하는 speaker/model ID |
137
+
138
+ ## 발화 분리 설정
139
+
140
+ `UTTERANCE_IDLE_MS`는 음성 segment가 끝난 뒤 사용자의 말이 끝났다고 판단하고 STT를 시작하기 전까지 기다리는 시간입니다. 기본값은 `4500` ms입니다. 긴 지시 중 자연스러운 멈춤을 보존하기 위한 값입니다. 낮추면 짧은 명령 반응은 빨라지지만 긴 발화가 잘릴 수 있고, 높이면 생각하면서 말하는 긴 dictation에 더 안전합니다.
141
+
142
+ ```bash
143
+ UTTERANCE_IDLE_MS="4500" # 균형 잡힌 기본값
144
+ UTTERANCE_IDLE_MS="6000" # 중간 멈춤이 있는 긴 발화에 더 안전
145
+ ```
146
+
123
147
  ## MCP 서버
124
148
 
125
149
  VerbalCoding은 stdio MCP 서버를 포함합니다. Hermes Agent 또는 MCP client는 자유 형식 shell 명령 대신 도구로 브릿지를 제어할 수 있습니다.
@@ -77,6 +77,14 @@ OS가 지원되지 않으면 아래를 직접 설치한 뒤 다시 실행하세
77
77
 
78
78
  ## 3. Discord 애플리케이션 설정
79
79
 
80
+ Discord 봇을 처음 만든다면 먼저 공식/상위 문서를 확인하세요.
81
+
82
+ - Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
83
+ - Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
84
+ - Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
85
+
86
+ 위 문서에는 Discord 애플리케이션 생성, bot user 추가, privileged intent 활성화, 서버 초대 방법이 설명되어 있습니다. VerbalCoding도 같은 Discord bot 설정을 사용하고, 그 위에 음성 수신, STT, CLI 에이전트 실행, TTS 재생을 얹습니다.
87
+
80
88
  1. Discord Developer Portal에서 애플리케이션과 봇을 만듭니다.
81
89
  2. Message Content privileged intent를 켭니다.
82
90
  3. 봇 토큰을 설치 프롬프트 또는 `.env`의 `DISCORD_BOT_TOKEN`에 넣습니다.
@@ -24,7 +24,7 @@ VerbalCoding은 음성으로 CLI 기반 코딩 에이전트를 제어하기 위
24
24
  - 설정 마법사, `.env.example`, `vc doctor` prerequisite checker, OS 패키지/npm 의존성/Edge TTS helper/기본 whisper.cpp 모델을 준비하는 `./scripts/install.sh --yes` 부트스트랩.
25
25
  - 긴 에이전트 작업 중 텍스트 전용 중간 단계 업데이트를 위한 선택적 verbose progress mode.
26
26
  - 파이프라인 최적화를 위한 JSONL latency metrics와 `!latency` / `!metrics` 요약.
27
- - 낮아진 기본 utterance idle wait (`UTTERANCE_IDLE_MS=2000`)로 사용자가 말한 STT가 0.6초 빨리 시작.
27
+ - 여유 있는 utterance idle wait (`UTTERANCE_IDLE_MS=4500`)로 자연스러운 중간 멈춤이 있는 지시가 앞부분 prompt와 무시되는 processing-time speech로 쪼개지지 않도록 개선.
28
28
  - 멀티 인스턴스 Hermes 프로필 격리: `vc instance setup <name>`이 자동으로 Hermes 프로필을 `~/.hermes/profiles/<name>`에 clone하고, instance workdir을 설정하고, SOUL.md를 초기화하고, instance env에 `HERMES_HOME`을 기록합니다. `vc instance start`는 누락된 profile을 self-heal하고, `vc doctor`는 profile-dir 존재와 `terminal.cwd` 일관성을 검사합니다.
29
29
  - npm 공개 패키지: `npm install -g verbalcoding`, `vc setup --yes`, `vc start` 경로 지원.
30
30
 
@@ -51,6 +51,13 @@ VERBALCODING_INSTANCE_ENV=instances/my-project.env ./run.sh
51
51
 
52
52
  ## Discord 명령
53
53
 
54
+ 명령을 연결하기 전에 먼저 상위 문서대로 Discord 애플리케이션/봇을 설정하세요.
55
+
56
+ - Hermes Agent Discord 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
57
+ - Discord 공식 봇 문서: <https://docs.discord.com/developers/bots/overview>
58
+
59
+ 그 다음 `vc bot invite CLIENT_ID`를 사용하면 VerbalCoding에 필요한 텍스트/음성 권한이 포함된 초대 URL을 만들 수 있습니다.
60
+
54
61
  | 명령 | 용도 |
55
62
  |---|---|
56
63
  | `!ping` | 봇 연결 기본 확인 |
@@ -96,6 +103,27 @@ switch speaker to English
96
103
 
97
104
  영구 수동 설정이 필요하면 `.env`에 `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, 필요 시 `TTS_VOICE=<edge-voice>`를 설정하세요. 더 많은 커스텀 목소리 카탈로그는 `config/tts-voices.json`에서 관리할 수 있습니다.
98
105
 
106
+ 백엔드별 목소리 설정:
107
+
108
+ | 백엔드 | 목소리 설정 | 자주 쓰는 선택지 |
109
+ |---|---|---|
110
+ | Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; `edge-tts --list-voices`의 모든 Edge voice |
111
+ | Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; `SUPERTONIC_LANGUAGE`는 `ko`, `en`, `es`, `pt`, `fr` 중 선택 |
112
+ | OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | 사용 허가가 있는 reference WAV와 `default` 같은 style |
113
+ | SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | CosyVoice reference WAV 또는 백엔드가 지원하는 speaker/model 값 |
114
+
115
+ Supertonic과 로컬 clone 백엔드는 위 env를 바꾼 뒤 `!voice-test <text>`로 바로 들어보세요. 현재 음성 명령 기반 전환은 기본 Edge-style voice type에 매핑되어 있고, 더 풍부한 백엔드 카탈로그는 `config/tts-voices.json`에 추가할 수 있습니다.
116
+
117
+ ## 긴 발화와 중간 멈춤
118
+
119
+ VerbalCoding은 말을 STT로 보내기 전에 idle window를 기다립니다. 기본값 `UTTERANCE_IDLE_MS=4500`은 일부러 조금 여유 있게 잡혀 있습니다. 긴 지시 중 자연스러운 멈춤을 문장 끝으로 오해해 앞부분만 에이전트에 보내고, 뒷부분을 processing 중 끼어들기로 처리하는 문제를 줄이기 위해서입니다.
120
+
121
+ 짧은 명령 반응을 더 빠르게 하고 싶다면 `.env`에서 낮추고, 긴 한국어 dictation이 여전히 잘리면 더 올리세요.
122
+
123
+ ```bash
124
+ UTTERANCE_IDLE_MS="6000"
125
+ ```
126
+
99
127
  ## 자세한 진행 모드
100
128
 
101
129
  자세한 진행은 기본적으로 꺼져 있습니다. `.env`에 `AGENT_VERBOSE_PROGRESS=1`을 설정하거나 Discord에서 `!verbose on`, 또는 음성으로 “상세 진행 켜”라고 말해 켤 수 있습니다.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "verbalcoding",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "description": "Discord voice bridge for CLI coding agents.",
5
5
  "license": "MIT",
6
6
  "repository": {
package/run.sh CHANGED
@@ -8,7 +8,7 @@ mkdir -p /tmp/verbalcoding-node-debug
8
8
  export NODE_AUDIO_DEBUG_DIR="${NODE_AUDIO_DEBUG_DIR:-/tmp/verbalcoding-node-debug}"
9
9
  export MIN_UTTERANCE_SECONDS="${MIN_UTTERANCE_SECONDS:-1.0}"
10
10
  export SUBSCRIBE_AFTER_SILENCE_MS="${SUBSCRIBE_AFTER_SILENCE_MS:-2200}"
11
- export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-2600}"
11
+ export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-4500}"
12
12
  export MIN_MEAN_VOLUME_DB="${MIN_MEAN_VOLUME_DB:--35}"
13
13
  export MIN_MAX_VOLUME_DB="${MIN_MAX_VOLUME_DB:--18}"
14
14
  export TTS_RATE="${TTS_RATE:-+10%}"
@@ -53,7 +53,7 @@ TTS_VOICE="en-US-GuyNeural"
53
53
  TTS_RATE="+0%"
54
54
  TTS_VOLUME="1.0"
55
55
  REQUIRE_WAKE_WORD="0"
56
- UTTERANCE_IDLE_MS="2000"
56
+ UTTERANCE_IDLE_MS="4500"
57
57
  LATENCY_LOG_PATH="./.logs/latency.jsonl"
58
58
  ENV
59
59
  chmod 600 .env
@@ -66,7 +66,7 @@ note('Allowed users configured', env.DISCORD_ALLOWED_USERS ? '[REDACTED]' : 'not
66
66
  note('Auto-join channels', env.AUTO_JOIN_VOICE_CHANNELS || 'default: 일반,General,general');
67
67
  note('Verbose progress default', ['1', 'true', 'yes', 'on'].includes(String(env.AGENT_VERBOSE_PROGRESS || env.VERBALCODING_VERBOSE_PROGRESS || '0').toLowerCase()) ? 'on' : 'off');
68
68
  note('Auto restart voice bot after commits', autoRestartVoiceBotEnabled(env) ? 'on' : 'off');
69
- note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '2000'} ms`);
69
+ note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '4500'} ms`);
70
70
  note('STT language', env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
71
71
  note('Progress/voice language', env.VOICE_LANGUAGE || env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
72
72
  note('Latency log path', env.LATENCY_LOG_PATH || './.logs/latency.jsonl');
@@ -58,7 +58,7 @@ async function main() {
58
58
  const openvoiceRefAudio = await ask('OpenVoice reference audio path', process.env.OPENVOICE_REF_AUDIO || './voice-samples/user-reference.wav');
59
59
  const requireWake = (await ask('Require wake word? 1/0', process.env.REQUIRE_WAKE_WORD || '0')) === '1';
60
60
  const verboseProgress = (await ask('Verbose progress by default? 1/0', process.env.AGENT_VERBOSE_PROGRESS || process.env.VERBALCODING_VERBOSE_PROGRESS || '0')) === '1';
61
- const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '2000');
61
+ const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '4500');
62
62
  const latencyLogPath = await ask('Latency JSONL log path', process.env.LATENCY_LOG_PATH || './.logs/latency.jsonl');
63
63
 
64
64
  const values = normalizeInstallAnswers({