npm - verbalcoding - Versions diffs - 0.2.2 → 0.2.4 - Mend

verbalcoding 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/.env.example +5 -3
package/app-node/discord_text.test.mjs +14 -0
package/app-node/install_config.mjs +1 -1
package/app-node/install_config.test.mjs +1 -1
package/app-node/main.mjs +19 -10
package/docs/CONFIGURATION.md +26 -2
package/docs/FRESH_INSTALL.md +8 -0
package/docs/RELEASE.md +1 -1
package/docs/USAGE.md +28 -0
package/docs/i18n/CONFIGURATION.ko.md +26 -2
package/docs/i18n/FRESH_INSTALL.ko.md +8 -0
package/docs/i18n/RELEASE.ko.md +1 -1
package/docs/i18n/USAGE.ko.md +28 -0
package/package.json +1 -1
package/run.sh +1 -1
package/scripts/docker_ubuntu_smoke.sh +1 -1
package/scripts/doctor.mjs +1 -1
package/scripts/install.mjs +1 -1

package/.env.example CHANGED Viewed

@@ -26,6 +26,7 @@ STT_LANGUAGE="ko"
 TTS_BACKEND="edge"   # edge | openvoice | speechswift | supertonic
 EDGE_TTS_COMMAND="edge-tts"
+TTS_VOICE_TYPE="korean_female"  # edge: korean_male | korean_female | korean_multilingual_male | english_male | english_female
 TTS_VOICE="ko-KR-SunHiNeural"
 TTS_RATE="+10%"
 TTS_MAX_CHARS="495"
@@ -66,8 +67,9 @@ OPENVOICE_STYLE="default"
 OPENVOICE_TIMEOUT_MS="90000"
 OPENVOICE_PROGRESS="0"   # keep progress prompts fast via Edge unless set to 1
 REQUIRE_WAKE_WORD="0"
-MIN_UTTERANCE_SECONDS="1.4"
-UTTERANCE_IDLE_MS="2000"
+MIN_UTTERANCE_SECONDS="1.0"
+# Wait for natural thinking pauses before STT. Lower for faster but more fragmented turns.
+UTTERANCE_IDLE_MS="4500"
 MIN_MEAN_VOLUME_DB="-35"
 MIN_MAX_VOLUME_DB="-12"
 BARGE_IN_MIN_SECONDS="1.4"
@@ -80,4 +82,4 @@ PLAYBACK_BARGE_IN_REQUIRE_BOTH="1"
 BARGE_IN_CONSERVATIVE_MIN_SECONDS="1.8"
 BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB="-27"
 BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB="-12"
-MAX_DEFERRED_PROCESSING_UTTERANCES="3"
+MAX_DEFERRED_PROCESSING_UTTERANCES="0"

package/app-node/discord_text.test.mjs CHANGED Viewed

@@ -8,6 +8,20 @@ test('splitDiscordMessage chunks long text for Discord', () => {
   assert.deepEqual(chunks.map(c => c.length), [1900, 1900, 201]);
 });
+test('sendDiscordText returns false without fetching when transcript channel id is missing', async () => {
+  const warnings = [];
+  let fetched = false;
+  const delivered = await sendDiscordText({
+    channelId: '',
+    text: 'restart complete',
+    client: { channels: { fetch: async () => { fetched = true; } } },
+    warn: (...args) => warnings.push(args.join(' ')),
+  });
+  assert.equal(delivered, false);
+  assert.equal(fetched, false);
+  assert.match(warnings.join('\n'), /missing transcript channel id/);
+});
 test('sendDiscordText returns false when target is not text based', async () => {
   const warnings = [];
   const delivered = await sendDiscordText({

package/app-node/install_config.mjs CHANGED Viewed

@@ -57,7 +57,7 @@ export function normalizeInstallAnswers(input = {}) {
     OPENVOICE_PROGRESS: input.openvoiceProgress === true || input.OPENVOICE_PROGRESS === '1' ? '1' : '0',
     REQUIRE_WAKE_WORD: input.requireWakeWord === true || input.REQUIRE_WAKE_WORD === '1' ? '1' : '0',
     MIN_UTTERANCE_SECONDS: clean(input.minUtteranceSeconds || input.MIN_UTTERANCE_SECONDS, '1.0'),
-    UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '2000'),
+    UTTERANCE_IDLE_MS: clean(input.utteranceIdleMs || input.UTTERANCE_IDLE_MS, '4500'),
     HERMES_TASK_TIMEOUT_MS: clean(input.taskTimeoutMs || input.HERMES_TASK_TIMEOUT_MS, '0'),
     HERMES_CHAT_TIMEOUT_MS: clean(input.chatTimeoutMs || input.HERMES_CHAT_TIMEOUT_MS, '45000'),
     AGENT_VERBOSE_PROGRESS: input.verboseProgress === true || input.AGENT_VERBOSE_PROGRESS === '1' ? '1' : '0',

package/app-node/install_config.test.mjs CHANGED Viewed

@@ -63,7 +63,7 @@ test('normalizeInstallAnswers maps supported harnesses to backend env', () => {
   assert.equal(answers.SUPERTONIC_LANGUAGE, 'ko');
   assert.equal(answers.OPENVOICE_LANGUAGE, 'KR');
   assert.equal(answers.REQUIRE_WAKE_WORD, '0');
-  assert.equal(answers.UTTERANCE_IDLE_MS, '2000');
+  assert.equal(answers.UTTERANCE_IDLE_MS, '4500');
 });
 test('buildEnvFile writes configurable CLI harness and Discord settings without comments leaking into values', () => {

package/app-node/main.mjs CHANGED Viewed

@@ -166,7 +166,7 @@ const settings = {
   token: process.env.DISCORD_BOT_TOKEN || process.env.DISCORD_TOKEN,
   allowedUsers: new Set((process.env.DISCORD_ALLOWED_USERS || '').split(/[;,]/).map(s => s.trim()).filter(Boolean)),
   autoJoinVoiceChannels: (process.env.AUTO_JOIN_VOICE_CHANNELS || '일반,General,general').split(',').map(s => s.trim().toLowerCase()).filter(Boolean),
-  transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '123456789012345678').trim(),
+  transcriptChannelId: (process.env.TRANSCRIPT_CHANNEL_ID || '').trim(),
   whisperBin: process.env.WHISPER_CPP_BIN || 'whisper-cli',
   whisperModel: process.env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
   whisperLanguage: process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
@@ -217,7 +217,10 @@ const BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB = Number(process.env.BARGE_IN_CONS
 const SENSITIVITY_MODE_DEFAULT = (process.env.BARGE_IN_SENSITIVITY_MODE || 'normal').toLowerCase() === 'conservative' ? 'conservative' : 'normal';
 const SENSITIVITY_OUTDOOR_SECONDS = Number(process.env.BARGE_IN_OUTDOOR_SECONDS || '900');
 const SUBSCRIBE_AFTER_SILENCE_MS = Number(process.env.SUBSCRIBE_AFTER_SILENCE_MS || '2200');
-const UTTERANCE_IDLE_MS = Number(process.env.UTTERANCE_IDLE_MS || '2000');
+// Wait long enough for natural mid-sentence pauses before sending audio to STT.
+// If this is too short, a long thought gets split: the first fragment starts an
+// agent turn and the rest is treated as barge-in/processing speech.
+const UTTERANCE_IDLE_MS = Number(process.env.UTTERANCE_IDLE_MS || '4500');
 const MIN_MEAN_VOLUME_DB = Number(process.env.MIN_MEAN_VOLUME_DB || '-35');
 const MIN_MAX_VOLUME_DB = Number(process.env.MIN_MAX_VOLUME_DB || '-12');
 const STT_START_VOICE_NOTICE = !['0', 'false', 'no', 'off'].includes((process.env.STT_START_VOICE_NOTICE || '1').toLowerCase());
@@ -1399,26 +1402,32 @@ async function connectTo(channel) {
     selfDeaf: false,
     selfMute: false,
   });
-  connection.subscribe(player);
-  connection.on('error', e => warn('voice connection error', e?.stack || e));
-  connection.on('stateChange', async (oldState, newState) => {
+  const voiceConnection = connection;
+  voiceConnection.subscribe(player);
+  voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
+  voiceConnection.on('stateChange', async (oldState, newState) => {
     log('voice connection state', oldState.status, '->', newState.status);
+    if (connection !== voiceConnection) {
+      log('ignore stale voice connection state', oldState.status, '->', newState.status);
+      return;
+    }
     if (newState.status === VoiceConnectionStatus.Disconnected) {
       try {
         await Promise.race([
-          entersState(connection, VoiceConnectionStatus.Signalling, 5000),
-          entersState(connection, VoiceConnectionStatus.Connecting, 5000),
+          entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
+          entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
         ]);
       } catch (e) {
+        if (connection !== voiceConnection) return;
         warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
-        try { connection?.destroy(); } catch {}
+        try { voiceConnection.destroy(); } catch {}
         connection = null;
         setTimeout(() => connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err)), 1500);
       }
     }
   });
-  await entersState(connection, VoiceConnectionStatus.Ready, 30000);
-  connection.receiver.speaking.on('start', userId => subscribeUser(connection.receiver, userId));
+  await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
+  voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
   log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
 }

package/docs/CONFIGURATION.md CHANGED Viewed

@@ -2,6 +2,12 @@
 ## Setup Wizard
+Discord bot/application setup is intentionally not re-explained from scratch here. Use these upstream guides for the Discord-side steps, then return to VerbalCoding setup:
+- Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
+- Discord official quick start: <https://docs.discord.com/developers/quick-start/getting-started>
 ```bash
 ./scripts/install.sh
 ```
@@ -37,7 +43,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
 AGENT_TASK_TIMEOUT_MS=0
 AGENT_CHAT_TIMEOUT_MS=45000
 AGENT_VERBOSE_PROGRESS=0
-UTTERANCE_IDLE_MS=2000
+UTTERANCE_IDLE_MS=4500
 LATENCY_LOG_PATH=./.logs/latency.jsonl
 ```
@@ -74,7 +80,7 @@ TTS_VOLUME="1.0"
 REQUIRE_WAKE_WORD="0"
 MIN_UTTERANCE_SECONDS="1.0"
-UTTERANCE_IDLE_MS="2000"
+UTTERANCE_IDLE_MS="4500"
 HERMES_TASK_TIMEOUT_MS="0"
 HERMES_CHAT_TIMEOUT_MS="45000"
 AGENT_VERBOSE_PROGRESS="0"
@@ -112,6 +118,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
 For OpenVoice, SpeechSwift, or Supertonic, keep the backend-specific voice/reference settings in the sections below; the same voice catalog file can still track the active voice type.
+Backend-specific voice options:
+| Backend | Settings | Voice choices |
+|---|---|---|
+| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | Built-in types above, plus any voice returned by `edge-tts --list-voices` |
+| Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; language `ko`, `en`, `es`, `pt`, `fr` |
+| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | User-provided permitted reference WAV; style defaults to `default` |
+| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | Reference-sample voices for CosyVoice, or backend-supported speaker/model IDs |
+## Utterance Segmentation
+`UTTERANCE_IDLE_MS` controls how long the bridge waits after a speech segment before it decides the user is done and starts STT. The default is `4500` ms to preserve longer spoken instructions with natural pauses. Lower values feel faster for short commands but can split long dictation; higher values are safer for thoughtful speech.
+```bash
+UTTERANCE_IDLE_MS="4500"  # balanced default
+UTTERANCE_IDLE_MS="6000"  # safer for long dictation with pauses
+```
 ## MCP Server
 VerbalCoding ships a stdio MCP server so Hermes Agent or any MCP client can control the bridge through tools instead of relying on skills or free-form shell commands.

package/docs/FRESH_INSTALL.md CHANGED Viewed

@@ -77,6 +77,14 @@ If your OS is unsupported, install these manually before rerunning:
 ## 3. Discord application setup
+Read the upstream Discord bot setup guides first if this is your first bot:
+- Hermes Agent Discord messaging guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord official bot overview: <https://docs.discord.com/developers/bots/overview>
+- Discord official getting started guide: <https://docs.discord.com/developers/quick-start/getting-started>
+Those pages show how to create a Discord application, add a bot user, enable privileged intents, and invite it to a server. VerbalCoding uses the same Discord bot setup, then adds voice receive, STT, CLI-agent execution, and TTS playback on top.
 1. Create a Discord application and bot in the Discord Developer Portal.
 2. Enable the Message Content privileged intent.
 3. Copy the bot token into the installer prompt or `.env` as `DISCORD_BOT_TOKEN`.

package/docs/RELEASE.md CHANGED Viewed

@@ -25,7 +25,7 @@ VerbalCoding is a Discord voice bridge for controlling CLI-based coding agents b
 - npm package install path: `npm install -g verbalcoding`, `vc setup --yes`, and `vc start`.
 - Optional verbose progress mode for text-only middle-step updates during long agent work.
 - Always-on JSONL latency metrics plus `!latency` / `!metrics` summary for pipeline optimization.
-- Lower default utterance idle wait (`UTTERANCE_IDLE_MS=2000`) so STT starts about 0.6s sooner after speech ends.
+- More patient utterance idle wait (`UTTERANCE_IDLE_MS=4500`) so long spoken instructions with natural pauses are not split into a partial prompt plus ignored processing-time speech.
 - Multi-instance Hermes profile isolation: `vc instance setup <name>` auto-clones a Hermes profile to `~/.hermes/profiles/<name>` with the instance workdir, seeds SOUL.md, and writes `HERMES_HOME` into the instance env so per-project memory and skills stay separate; `vc instance start` self-heals a missing profile, and `vc doctor` checks profile-dir presence and `terminal.cwd` consistency.
 ### Pre-release checklist

package/docs/USAGE.md CHANGED Viewed

@@ -43,6 +43,13 @@ The bot auto-joins the first configured channel name, defaulting to `일반,Gene
 ## Discord Commands
+Before wiring commands, set up the Discord application/bot using the upstream guides:
+- Hermes Agent Discord guide: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord official bot docs: <https://docs.discord.com/developers/bots/overview>
+Then use `vc bot invite CLIENT_ID` to generate the VerbalCoding-specific invite URL with text and voice permissions.
 | Command | Purpose |
 |---|---|
 | `!ping` | Basic bot check |
@@ -88,6 +95,27 @@ Built-in Edge voice types:
 For persistent manual config, set `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, and optionally `TTS_VOICE=<edge-voice>` in `.env`, or edit `config/tts-voices.json` for custom voice catalogs.
+Backend-specific voice knobs:
+| Backend | Voice setting | Common choices |
+|---|---|---|
+| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; any Edge voice from `edge-tts --list-voices` |
+| Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; set `SUPERTONIC_LANGUAGE` to `ko`, `en`, `es`, `pt`, or `fr` |
+| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | a permitted reference WAV plus style such as `default` |
+| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | reference WAV for CosyVoice, or backend-supported speaker/model values |
+For Supertonic and local clone backends, use the backend env vars above plus `!voice-test <text>` to audition changes. Voice-command switching currently maps only to the built-in Edge-style voice types; richer backend catalogs can be added in `config/tts-voices.json`.
+## Long Dictation and Pauses
+VerbalCoding waits for an idle window before sending speech to STT. The default `UTTERANCE_IDLE_MS=4500` is intentionally a bit patient so a natural pause in a long instruction does not split the sentence, start an agent turn too early, and then treat the rest as a processing-time interruption.
+If you prefer faster short commands, lower it in `.env`; if long Korean dictation is still being split, raise it:
+```bash
+UTTERANCE_IDLE_MS="6000"
+```
 ## Verbose Progress Mode
 Verbose progress is off by default unless `AGENT_VERBOSE_PROGRESS=1` is set. Enable it with `!verbose on` or a voice command like “상세 진행 켜”. It can emit short progress lines such as:

package/docs/i18n/CONFIGURATION.ko.md CHANGED Viewed

@@ -2,6 +2,12 @@
 ## 설정 마법사
+Discord 봇/애플리케이션 생성 절차는 여기에서 처음부터 반복 설명하지 않습니다. Discord 쪽 설정은 아래 상위 문서를 보고 진행한 뒤 VerbalCoding 설정으로 돌아오세요.
+- Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
+- Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
 npm으로 설치한 경우:
 ```bash
@@ -45,7 +51,7 @@ AGENT_COMMAND="my-harness run --non-interactive"
 AGENT_TASK_TIMEOUT_MS=0
 AGENT_CHAT_TIMEOUT_MS=45000
 AGENT_VERBOSE_PROGRESS=0
-UTTERANCE_IDLE_MS=2000
+UTTERANCE_IDLE_MS=4500
 LATENCY_LOG_PATH=./.logs/latency.jsonl
 ```
@@ -82,7 +88,7 @@ TTS_VOLUME="1.0"
 REQUIRE_WAKE_WORD="0"
 MIN_UTTERANCE_SECONDS="1.0"
-UTTERANCE_IDLE_MS="2000"
+UTTERANCE_IDLE_MS="4500"
 HERMES_TASK_TIMEOUT_MS="0"
 HERMES_CHAT_TIMEOUT_MS="45000"
 AGENT_VERBOSE_PROGRESS="0"
@@ -120,6 +126,24 @@ TTS_VOICE_CONFIG="config/tts-voices.json"
 OpenVoice, SpeechSwift, Supertonic을 쓸 때는 아래 백엔드별 reference/voice 설정을 유지하세요. 같은 voice catalog 파일에서 현재 voice type을 추적할 수 있습니다.
+백엔드별 목소리 옵션:
+| 백엔드 | 설정 | 목소리 선택지 |
+|---|---|---|
+| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | 위 기본 타입, 또는 `edge-tts --list-voices`가 반환하는 모든 voice |
+| Supertonic | `SUPERTONIC_VOICE`, `SUPERTONIC_LANGUAGE` | `M1`–`M5`, `F1`–`F5`; 언어 `ko`, `en`, `es`, `pt`, `fr` |
+| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE`, `OPENVOICE_LANGUAGE` | 사용자가 제공한 허가된 reference WAV; style 기본값은 `default` |
+| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER`, `SPEECHSWIFT_MODEL_ID` | CosyVoice reference sample voice 또는 백엔드가 지원하는 speaker/model ID |
+## 발화 분리 설정
+`UTTERANCE_IDLE_MS`는 음성 segment가 끝난 뒤 사용자의 말이 끝났다고 판단하고 STT를 시작하기 전까지 기다리는 시간입니다. 기본값은 `4500` ms입니다. 긴 지시 중 자연스러운 멈춤을 보존하기 위한 값입니다. 낮추면 짧은 명령 반응은 빨라지지만 긴 발화가 잘릴 수 있고, 높이면 생각하면서 말하는 긴 dictation에 더 안전합니다.
+```bash
+UTTERANCE_IDLE_MS="4500"  # 균형 잡힌 기본값
+UTTERANCE_IDLE_MS="6000"  # 중간 멈춤이 있는 긴 발화에 더 안전
+```
 ## MCP 서버
 VerbalCoding은 stdio MCP 서버를 포함합니다. Hermes Agent 또는 MCP client는 자유 형식 shell 명령 대신 도구로 브릿지를 제어할 수 있습니다.

package/docs/i18n/FRESH_INSTALL.ko.md CHANGED Viewed

@@ -77,6 +77,14 @@ OS가 지원되지 않으면 아래를 직접 설치한 뒤 다시 실행하세
 ## 3. Discord 애플리케이션 설정
+Discord 봇을 처음 만든다면 먼저 공식/상위 문서를 확인하세요.
+- Hermes Agent Discord 메시징 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord 공식 봇 개요: <https://docs.discord.com/developers/bots/overview>
+- Discord 공식 시작 가이드: <https://docs.discord.com/developers/quick-start/getting-started>
+위 문서에는 Discord 애플리케이션 생성, bot user 추가, privileged intent 활성화, 서버 초대 방법이 설명되어 있습니다. VerbalCoding도 같은 Discord bot 설정을 사용하고, 그 위에 음성 수신, STT, CLI 에이전트 실행, TTS 재생을 얹습니다.
 1. Discord Developer Portal에서 애플리케이션과 봇을 만듭니다.
 2. Message Content privileged intent를 켭니다.
 3. 봇 토큰을 설치 프롬프트 또는 `.env`의 `DISCORD_BOT_TOKEN`에 넣습니다.

package/docs/i18n/RELEASE.ko.md CHANGED Viewed

@@ -24,7 +24,7 @@ VerbalCoding은 음성으로 CLI 기반 코딩 에이전트를 제어하기 위
 - 설정 마법사, `.env.example`, `vc doctor` prerequisite checker, OS 패키지/npm 의존성/Edge TTS helper/기본 whisper.cpp 모델을 준비하는 `./scripts/install.sh --yes` 부트스트랩.
 - 긴 에이전트 작업 중 텍스트 전용 중간 단계 업데이트를 위한 선택적 verbose progress mode.
 - 파이프라인 최적화를 위한 JSONL latency metrics와 `!latency` / `!metrics` 요약.
-- 낮아진 기본 utterance idle wait (`UTTERANCE_IDLE_MS=2000`)로 사용자가 말한 뒤 STT가 약 0.6초 더 빨리 시작.
+- 더 여유 있는 utterance idle wait (`UTTERANCE_IDLE_MS=4500`)로 자연스러운 중간 멈춤이 있는 긴 지시가 앞부분 prompt와 무시되는 processing-time speech로 쪼개지지 않도록 개선.
 - 멀티 인스턴스 Hermes 프로필 격리: `vc instance setup <name>`이 자동으로 Hermes 프로필을 `~/.hermes/profiles/<name>`에 clone하고, instance workdir을 설정하고, SOUL.md를 초기화하고, instance env에 `HERMES_HOME`을 기록합니다. `vc instance start`는 누락된 profile을 self-heal하고, `vc doctor`는 profile-dir 존재와 `terminal.cwd` 일관성을 검사합니다.
 - npm 공개 패키지: `npm install -g verbalcoding`, `vc setup --yes`, `vc start` 경로 지원.

package/docs/i18n/USAGE.ko.md CHANGED Viewed

@@ -51,6 +51,13 @@ VERBALCODING_INSTANCE_ENV=instances/my-project.env ./run.sh
 ## Discord 명령
+명령을 연결하기 전에 먼저 상위 문서대로 Discord 애플리케이션/봇을 설정하세요.
+- Hermes Agent Discord 가이드: <https://hermes-agent.nousresearch.com/docs/user-guide/messaging/discord>
+- Discord 공식 봇 문서: <https://docs.discord.com/developers/bots/overview>
+그 다음 `vc bot invite CLIENT_ID`를 사용하면 VerbalCoding에 필요한 텍스트/음성 권한이 포함된 초대 URL을 만들 수 있습니다.
 | 명령 | 용도 |
 |---|---|
 | `!ping` | 봇 연결 기본 확인 |
@@ -96,6 +103,27 @@ switch speaker to English
 영구 수동 설정이 필요하면 `.env`에 `TTS_BACKEND=edge`, `TTS_VOICE_TYPE=<voice-type>`, 필요 시 `TTS_VOICE=<edge-voice>`를 설정하세요. 더 많은 커스텀 목소리 카탈로그는 `config/tts-voices.json`에서 관리할 수 있습니다.
+백엔드별 목소리 설정:
+| 백엔드 | 목소리 설정 | 자주 쓰는 선택지 |
+|---|---|---|
+| Edge | `TTS_VOICE_TYPE`, `TTS_VOICE` | `korean_male`, `korean_female`, `korean_multilingual_male`, `english_male`, `english_female`; `edge-tts --list-voices`의 모든 Edge voice |
+| Supertonic | `SUPERTONIC_VOICE` | `M1`–`M5`, `F1`–`F5`; `SUPERTONIC_LANGUAGE`는 `ko`, `en`, `es`, `pt`, `fr` 중 하나 |
+| OpenVoice | `OPENVOICE_REF_AUDIO`, `OPENVOICE_STYLE` | 사용 허가가 있는 reference WAV와 `default` 같은 style |
+| SpeechSwift / CosyVoice | `SPEECHSWIFT_REF_AUDIO`, `SPEECHSWIFT_ENGINE`, `SPEECHSWIFT_SPEAKER` | CosyVoice reference WAV 또는 백엔드가 지원하는 speaker/model 값 |
+Supertonic과 로컬 clone 백엔드는 위 env를 바꾼 뒤 `!voice-test <text>`로 바로 들어보세요. 현재 음성 명령 기반 전환은 기본 Edge-style voice type에 매핑되어 있고, 더 풍부한 백엔드 카탈로그는 `config/tts-voices.json`에 추가할 수 있습니다.
+## 긴 발화와 중간 멈춤
+VerbalCoding은 말을 STT로 보내기 전에 idle window를 기다립니다. 기본값 `UTTERANCE_IDLE_MS=4500`은 일부러 조금 여유 있게 잡혀 있습니다. 긴 지시 중 자연스러운 멈춤을 문장 끝으로 오해해 앞부분만 에이전트에 보내고, 뒷부분을 processing 중 끼어들기로 처리하는 문제를 줄이기 위해서입니다.
+짧은 명령 반응을 더 빠르게 하고 싶다면 `.env`에서 낮추고, 긴 한국어 dictation이 여전히 잘리면 더 올리세요.
+```bash
+UTTERANCE_IDLE_MS="6000"
+```
 ## 자세한 진행 모드
 자세한 진행은 기본적으로 꺼져 있습니다. `.env`에 `AGENT_VERBOSE_PROGRESS=1`을 설정하거나 Discord에서 `!verbose on`, 또는 음성으로 “상세 진행 켜”라고 말해 켤 수 있습니다.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "verbalcoding",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "Discord voice bridge for CLI coding agents.",
   "license": "MIT",
   "repository": {

package/run.sh CHANGED Viewed

@@ -8,7 +8,7 @@ mkdir -p /tmp/verbalcoding-node-debug
 export NODE_AUDIO_DEBUG_DIR="${NODE_AUDIO_DEBUG_DIR:-/tmp/verbalcoding-node-debug}"
 export MIN_UTTERANCE_SECONDS="${MIN_UTTERANCE_SECONDS:-1.0}"
 export SUBSCRIBE_AFTER_SILENCE_MS="${SUBSCRIBE_AFTER_SILENCE_MS:-2200}"
-export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-2600}"
+export UTTERANCE_IDLE_MS="${UTTERANCE_IDLE_MS:-4500}"
 export MIN_MEAN_VOLUME_DB="${MIN_MEAN_VOLUME_DB:--35}"
 export MIN_MAX_VOLUME_DB="${MIN_MAX_VOLUME_DB:--18}"
 export TTS_RATE="${TTS_RATE:-+10%}"

package/scripts/docker_ubuntu_smoke.sh CHANGED Viewed

@@ -53,7 +53,7 @@ TTS_VOICE="en-US-GuyNeural"
 TTS_RATE="+0%"
 TTS_VOLUME="1.0"
 REQUIRE_WAKE_WORD="0"
-UTTERANCE_IDLE_MS="2000"
+UTTERANCE_IDLE_MS="4500"
 LATENCY_LOG_PATH="./.logs/latency.jsonl"
 ENV
     chmod 600 .env

package/scripts/doctor.mjs CHANGED Viewed

@@ -66,7 +66,7 @@ note('Allowed users configured', env.DISCORD_ALLOWED_USERS ? '[REDACTED]' : 'not
 note('Auto-join channels', env.AUTO_JOIN_VOICE_CHANNELS || 'default: 일반,General,general');
 note('Verbose progress default', ['1', 'true', 'yes', 'on'].includes(String(env.AGENT_VERBOSE_PROGRESS || env.VERBALCODING_VERBOSE_PROGRESS || '0').toLowerCase()) ? 'on' : 'off');
 note('Auto restart voice bot after commits', autoRestartVoiceBotEnabled(env) ? 'on' : 'off');
-note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '2000'} ms`);
+note('Utterance idle wait before STT', `${env.UTTERANCE_IDLE_MS || '4500'} ms`);
 note('STT language', env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
 note('Progress/voice language', env.VOICE_LANGUAGE || env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko');
 note('Latency log path', env.LATENCY_LOG_PATH || './.logs/latency.jsonl');

package/scripts/install.mjs CHANGED Viewed

@@ -58,7 +58,7 @@ async function main() {
     const openvoiceRefAudio = await ask('OpenVoice reference audio path', process.env.OPENVOICE_REF_AUDIO || './voice-samples/user-reference.wav');
     const requireWake = (await ask('Require wake word? 1/0', process.env.REQUIRE_WAKE_WORD || '0')) === '1';
     const verboseProgress = (await ask('Verbose progress by default? 1/0', process.env.AGENT_VERBOSE_PROGRESS || process.env.VERBALCODING_VERBOSE_PROGRESS || '0')) === '1';
-    const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '2000');
+    const utteranceIdleMs = await ask('Utterance idle wait before STT, ms', process.env.UTTERANCE_IDLE_MS || '4500');
     const latencyLogPath = await ask('Latency JSONL log path', process.env.LATENCY_LOG_PATH || './.logs/latency.jsonl');
     const values = normalizeInstallAnswers({