verbalcoding 0.2.11 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. package/.env.example +98 -2
  2. package/README.es.md +134 -0
  3. package/README.fr.md +134 -0
  4. package/README.ja.md +134 -0
  5. package/README.ko.md +134 -0
  6. package/README.md +118 -74
  7. package/README.ru.md +134 -0
  8. package/README.zh.md +133 -0
  9. package/app-node/agent_adapters.mjs +37 -5
  10. package/app-node/agent_adapters.test.mjs +27 -1
  11. package/app-node/agent_detect.mjs +73 -0
  12. package/app-node/agent_detect.test.mjs +77 -0
  13. package/app-node/agent_routing.mjs +148 -0
  14. package/app-node/agent_routing.test.mjs +138 -0
  15. package/app-node/agent_turn.mjs +86 -0
  16. package/app-node/agent_turn.test.mjs +109 -0
  17. package/app-node/bridge_context.mjs +73 -0
  18. package/app-node/bridge_context.test.mjs +54 -0
  19. package/app-node/bridge_state.mjs +4 -0
  20. package/app-node/bridge_wireup.test.mjs +462 -0
  21. package/app-node/cli_install.test.mjs +31 -0
  22. package/app-node/cross_agent_routing.test.mjs +78 -0
  23. package/app-node/discord_command_router.mjs +204 -0
  24. package/app-node/discord_command_router.test.mjs +311 -0
  25. package/app-node/discord_voice_setup.mjs +251 -0
  26. package/app-node/discord_voice_setup.test.mjs +86 -0
  27. package/app-node/hermes_profiles.test.mjs +12 -1
  28. package/app-node/install_config.mjs +113 -3
  29. package/app-node/install_config.test.mjs +8 -0
  30. package/app-node/instance_doctor.test.mjs +9 -0
  31. package/app-node/instances.test.mjs +8 -1
  32. package/app-node/main.mjs +513 -1058
  33. package/app-node/mcp_tools.test.mjs +7 -0
  34. package/app-node/notification_handler.mjs +89 -0
  35. package/app-node/notification_handler.test.mjs +187 -0
  36. package/app-node/notify.mjs +73 -0
  37. package/app-node/notify.test.mjs +68 -0
  38. package/app-node/plan_dispatcher.mjs +215 -0
  39. package/app-node/plan_dispatcher.test.mjs +101 -0
  40. package/app-node/plan_mode.mjs +203 -0
  41. package/app-node/plan_mode.test.mjs +231 -0
  42. package/app-node/progress_handler.mjs +220 -0
  43. package/app-node/progress_handler.test.mjs +193 -0
  44. package/app-node/progress_speech.mjs +54 -32
  45. package/app-node/progress_speech.test.mjs +12 -3
  46. package/app-node/project_sessions.mjs +5 -2
  47. package/app-node/project_sessions.test.mjs +7 -0
  48. package/app-node/research_mode.mjs +282 -0
  49. package/app-node/research_mode.test.mjs +264 -0
  50. package/app-node/restart_notice.mjs +3 -0
  51. package/app-node/restart_notice.test.mjs +11 -0
  52. package/app-node/session_ontology.mjs +271 -0
  53. package/app-node/session_ontology.test.mjs +130 -0
  54. package/app-node/smart_progress.mjs +94 -0
  55. package/app-node/smart_progress.test.mjs +66 -0
  56. package/app-node/stream_sentencer.mjs +91 -0
  57. package/app-node/stream_sentencer.test.mjs +129 -0
  58. package/app-node/streaming_tts_queue.mjs +52 -0
  59. package/app-node/streaming_tts_queue.test.mjs +64 -0
  60. package/app-node/stt_whisper.mjs +24 -0
  61. package/app-node/stt_whisper.test.mjs +32 -0
  62. package/app-node/text_routing.mjs +22 -0
  63. package/app-node/text_routing.test.mjs +23 -1
  64. package/app-node/tts_backends.mjs +537 -3
  65. package/app-node/tts_backends.test.mjs +454 -0
  66. package/app-node/tts_player.mjs +164 -0
  67. package/app-node/tts_player.test.mjs +202 -0
  68. package/app-node/tts_runtime.mjs +134 -0
  69. package/app-node/tts_runtime.test.mjs +89 -0
  70. package/app-node/tts_settings.mjs +150 -3
  71. package/app-node/tts_settings.test.mjs +204 -0
  72. package/app-node/tts_voice_config.mjs +136 -2
  73. package/app-node/tts_voice_config.test.mjs +94 -0
  74. package/app-node/utterance_router.mjs +216 -0
  75. package/app-node/utterance_router.test.mjs +236 -0
  76. package/app-node/voice_autojoin.mjs +37 -0
  77. package/app-node/voice_autojoin.test.mjs +59 -0
  78. package/app-node/voice_io.mjs +272 -0
  79. package/app-node/voice_io.test.mjs +102 -0
  80. package/app-node/voice_turn_runner.mjs +449 -0
  81. package/app-node/voice_turn_runner.test.mjs +289 -0
  82. package/docs/CONFIGURATION.md +79 -96
  83. package/docs/FRESH_INSTALL.md +105 -63
  84. package/docs/HARNESSES.md +58 -0
  85. package/docs/HARNESS_AIDER.md +50 -0
  86. package/docs/HARNESS_CLAUDE.md +56 -0
  87. package/docs/HARNESS_CODEX.md +56 -0
  88. package/docs/HARNESS_CURSOR.md +45 -0
  89. package/docs/HARNESS_GEMINI.md +45 -0
  90. package/docs/HARNESS_HERMES.md +57 -0
  91. package/docs/HARNESS_OPENCLAW.md +44 -0
  92. package/docs/HARNESS_OPENCODE.md +44 -0
  93. package/docs/HERMES_VOICE.md +65 -0
  94. package/docs/MULTI_INSTANCE.md +16 -0
  95. package/docs/README.md +50 -0
  96. package/docs/RELEASE.md +42 -19
  97. package/docs/ROADMAP.md +53 -0
  98. package/docs/TROUBLESHOOTING.md +126 -0
  99. package/docs/TTS_BACKENDS.md +227 -0
  100. package/docs/USAGE.md +94 -40
  101. package/docs/assets/figures/verbalcoding-flow.svg +1 -1
  102. package/docs/i18n/AGENTS.es.md +34 -0
  103. package/docs/i18n/AGENTS.fr.md +34 -0
  104. package/docs/i18n/AGENTS.ja.md +34 -0
  105. package/docs/i18n/AGENTS.ko.md +34 -0
  106. package/docs/i18n/AGENTS.ru.md +34 -0
  107. package/docs/i18n/AGENTS.zh.md +34 -0
  108. package/docs/i18n/CONFIGURATION.es.md +25 -0
  109. package/docs/i18n/CONFIGURATION.fr.md +25 -0
  110. package/docs/i18n/CONFIGURATION.ja.md +25 -0
  111. package/docs/i18n/CONFIGURATION.ko.md +25 -0
  112. package/docs/i18n/CONFIGURATION.ru.md +25 -0
  113. package/docs/i18n/CONFIGURATION.zh.md +25 -0
  114. package/docs/i18n/FRESH_INSTALL.es.md +27 -2
  115. package/docs/i18n/FRESH_INSTALL.fr.md +27 -2
  116. package/docs/i18n/FRESH_INSTALL.ja.md +27 -2
  117. package/docs/i18n/FRESH_INSTALL.ko.md +27 -2
  118. package/docs/i18n/FRESH_INSTALL.ru.md +27 -2
  119. package/docs/i18n/FRESH_INSTALL.zh.md +27 -2
  120. package/docs/i18n/HARNESSES.es.md +58 -0
  121. package/docs/i18n/HARNESSES.fr.md +58 -0
  122. package/docs/i18n/HARNESSES.ja.md +58 -0
  123. package/docs/i18n/HARNESSES.ko.md +58 -0
  124. package/docs/i18n/HARNESSES.ru.md +58 -0
  125. package/docs/i18n/HARNESSES.zh.md +58 -0
  126. package/docs/i18n/HARNESS_AIDER.es.md +48 -0
  127. package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
  128. package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
  129. package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
  130. package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
  131. package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
  132. package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
  133. package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
  134. package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
  135. package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
  136. package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
  137. package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
  138. package/docs/i18n/HARNESS_CODEX.es.md +55 -0
  139. package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
  140. package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
  141. package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
  142. package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
  143. package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
  144. package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
  145. package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
  146. package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
  147. package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
  148. package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
  149. package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
  150. package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
  151. package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
  152. package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
  153. package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
  154. package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
  155. package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
  156. package/docs/i18n/HARNESS_HERMES.es.md +54 -0
  157. package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
  158. package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
  159. package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
  160. package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
  161. package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
  162. package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
  163. package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
  164. package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
  165. package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
  166. package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
  167. package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
  168. package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
  169. package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
  170. package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
  171. package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
  172. package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
  173. package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
  174. package/docs/i18n/HERMES_VOICE.es.md +46 -0
  175. package/docs/i18n/HERMES_VOICE.fr.md +46 -0
  176. package/docs/i18n/HERMES_VOICE.ja.md +46 -0
  177. package/docs/i18n/HERMES_VOICE.ko.md +65 -0
  178. package/docs/i18n/HERMES_VOICE.ru.md +46 -0
  179. package/docs/i18n/HERMES_VOICE.zh.md +46 -0
  180. package/docs/i18n/MULTI_INSTANCE.es.md +25 -0
  181. package/docs/i18n/MULTI_INSTANCE.fr.md +25 -0
  182. package/docs/i18n/MULTI_INSTANCE.ja.md +25 -0
  183. package/docs/i18n/MULTI_INSTANCE.ko.md +25 -0
  184. package/docs/i18n/MULTI_INSTANCE.ru.md +25 -0
  185. package/docs/i18n/MULTI_INSTANCE.zh.md +25 -0
  186. package/docs/i18n/README.es.md +20 -134
  187. package/docs/i18n/README.fr.md +20 -134
  188. package/docs/i18n/README.ja.md +20 -134
  189. package/docs/i18n/README.ko.md +20 -133
  190. package/docs/i18n/README.ru.md +20 -134
  191. package/docs/i18n/README.zh.md +20 -133
  192. package/docs/i18n/RELEASE.es.md +26 -1
  193. package/docs/i18n/RELEASE.fr.md +26 -1
  194. package/docs/i18n/RELEASE.ja.md +26 -1
  195. package/docs/i18n/RELEASE.ko.md +26 -1
  196. package/docs/i18n/RELEASE.ru.md +26 -1
  197. package/docs/i18n/RELEASE.zh.md +26 -1
  198. package/docs/i18n/TROUBLESHOOTING.es.md +39 -0
  199. package/docs/i18n/TROUBLESHOOTING.fr.md +39 -0
  200. package/docs/i18n/TROUBLESHOOTING.ja.md +39 -0
  201. package/docs/i18n/TROUBLESHOOTING.ko.md +39 -0
  202. package/docs/i18n/TROUBLESHOOTING.ru.md +39 -0
  203. package/docs/i18n/TROUBLESHOOTING.zh.md +39 -0
  204. package/docs/i18n/USAGE.es.md +25 -0
  205. package/docs/i18n/USAGE.fr.md +25 -0
  206. package/docs/i18n/USAGE.ja.md +25 -0
  207. package/docs/i18n/USAGE.ko.md +25 -0
  208. package/docs/i18n/USAGE.ru.md +25 -0
  209. package/docs/i18n/USAGE.zh.md +25 -0
  210. package/docs/superpowers/plans/2026-05-13-phase1-streaming-pipeline.md +122 -0
  211. package/docs/superpowers/plans/2026-05-13-phase10-push-notifications.md +152 -0
  212. package/docs/superpowers/plans/2026-05-13-phase2-agent-adapters.md +242 -0
  213. package/docs/superpowers/plans/2026-05-13-phase6-smart-progress.md +172 -0
  214. package/docs/superpowers/plans/2026-05-13-phase7-voice-plan-mode.md +108 -0
  215. package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
  216. package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
  217. package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
  218. package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
  219. package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
  220. package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
  221. package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
  222. package/integrations/fireredtts2/mlx_llm.py +183 -0
  223. package/integrations/fireredtts2/synth.py +156 -0
  224. package/integrations/fireredtts2/synth_mlx.py +196 -0
  225. package/integrations/mlxaudio/synth.py +74 -0
  226. package/integrations/neuttsair/synth.py +104 -0
  227. package/integrations/omnivoice/synth.py +110 -0
  228. package/package.json +7 -1
  229. package/scripts/cli.mjs +88 -3
  230. package/scripts/doctor.mjs +115 -4
  231. package/scripts/install.mjs +20 -2
  232. package/scripts/install_fireredtts2.sh +109 -0
  233. package/scripts/install_mlxaudio.sh +34 -0
  234. package/scripts/install_mossttsnano.sh +46 -0
  235. package/scripts/postinstall.mjs +34 -0
package/app-node/main.mjs CHANGED
@@ -6,19 +6,8 @@ import { spawn, execFile } from 'node:child_process';
6
6
  import { promisify } from 'node:util';
7
7
 
8
8
  import { Client, GatewayIntentBits, Partials } from 'discord.js';
9
- import {
10
- AudioPlayerStatus,
11
- EndBehaviorType,
12
- StreamType,
13
- VoiceConnectionStatus,
14
- createAudioPlayer,
15
- createAudioResource,
16
- entersState,
17
- joinVoiceChannel,
18
- } from '@discordjs/voice';
19
- import prism from 'prism-media';
20
- import wav from 'wav';
21
- import { buildAgentSettings, createAgentAdapter, isPatchLikeOutput } from './agent_adapters.mjs';
9
+ import { createAudioPlayer } from '@discordjs/voice';
10
+ import { buildAgentSettings, createAgentAdapter, isPatchLikeOutput, shellSplit } from './agent_adapters.mjs';
22
11
  import {
23
12
  appendJsonl,
24
13
  createLatencyTurn,
@@ -26,9 +15,28 @@ import {
26
15
  readJsonlRecords,
27
16
  summarizeLatencyRecords,
28
17
  } from './latency_metrics.mjs';
29
- import { splitForTTS } from './tts_chunks.mjs';
30
- import { playChunkedTTSWithPrefetch } from './tts_prefetch.mjs';
31
- import { progressCategory, summarizeProgressEvents, formatProgressMessage } from './progress_speech.mjs';
18
+ import {
19
+ isPlanEntryUtterance,
20
+ parsePlanOutput,
21
+ parseVoiceCommand as parsePlanVoiceCommand,
22
+ applyCommand as applyPlanCommand,
23
+ renderFinalPlan,
24
+ planModePreamble,
25
+ planExecutionPreamble,
26
+ parseDecisionAnswer,
27
+ renderDecisionPrompt,
28
+ renderResolvedDecisions,
29
+ } from './plan_mode.mjs';
30
+ import {
31
+ parseAgentRoutingCommand,
32
+ renderAgentPrefix,
33
+ buildCrossAgentPrompt,
34
+ isAgentRoutingDecision,
35
+ buildFallbackDecision,
36
+ isRoutingOnlyUtterance,
37
+ } from './agent_routing.mjs';
38
+ import { createSessionOntology } from './session_ontology.mjs';
39
+ import { parseResearchCommand, runResearchTurn } from './research_mode.mjs';
32
40
  import { buildTtsSettings } from './tts_settings.mjs';
33
41
  import { createTtsBackend } from './tts_backends.mjs';
34
42
  import {
@@ -43,17 +51,30 @@ import {
43
51
  } from './tts_voice_config.mjs';
44
52
  import { createBridgeLogger, createTransientErrorReporter, isTransientNetworkError } from './bridge_logger.mjs';
45
53
  import { createBridgeState } from './bridge_state.mjs';
54
+ import { createBridge } from './bridge_context.mjs';
55
+ import { createVoiceIO } from './voice_io.mjs';
56
+ import { createTtsPlayer } from './tts_player.mjs';
57
+ import { createUtteranceRouter } from './utterance_router.mjs';
58
+ import { createProgressHandler } from './progress_handler.mjs';
59
+ import { createNotificationHandler } from './notification_handler.mjs';
60
+ import { createTtsRuntime } from './tts_runtime.mjs';
61
+ import { createDiscordVoiceSetup } from './discord_voice_setup.mjs';
62
+ import { createAgentTurnLifecycle } from './agent_turn.mjs';
63
+ import { createDiscordCommandRouter } from './discord_command_router.mjs';
64
+ import { createVoiceTurnRunner } from './voice_turn_runner.mjs';
65
+ import { createPlanDispatcher } from './plan_dispatcher.mjs';
46
66
  import { sendDiscordText, splitDiscordMessage } from './discord_text.mjs';
47
- import { progressTtsCacheFileName } from './progress_cache.mjs';
48
67
  import { shouldPassWhisperLanguage, voiceLanguageCommandFromTranscript, languagePreset } from './language_config.mjs';
49
- import { formatRestartCompleteNotice, formatRestartShutdownNotice } from './restart_notice.mjs';
50
- import { shouldRouteDiscordTextToAgent } from './text_routing.mjs';
68
+ import { whisperFailureMessage, whisperTimeoutMs } from './stt_whisper.mjs';
69
+ import { formatRestartCompleteNotice } from './restart_notice.mjs';
70
+ import {
71
+ formatRecentDiscordContext,
72
+ } from './text_routing.mjs';
51
73
  import {
52
74
  bindProjectSessionToChannel,
53
75
  createProjectSession,
54
76
  listProjectSessions,
55
77
  loadProjectSessions,
56
- parseProjectSessionCommand,
57
78
  projectSessionContextText,
58
79
  projectSessionForChannel,
59
80
  saveProjectSessions,
@@ -141,13 +162,20 @@ function ensureTtsVoiceConfig() {
141
162
  return readTtsVoiceConfig(TTS_VOICE_CONFIG_PATH);
142
163
  }
143
164
  function applyVoiceConfigToProcessEnv(config = ensureTtsVoiceConfig()) {
144
- const selection = effectiveTtsVoiceSelection(config, {});
165
+ const selection = effectiveTtsVoiceSelection(config, process.env);
145
166
  const configuredVoiceLanguage = process.env.VOICE_LANGUAGE;
146
167
  const nextEnv = applyTtsVoiceSelectionToEnv(process.env, selection);
147
168
  if (configuredVoiceLanguage) nextEnv.VOICE_LANGUAGE = configuredVoiceLanguage;
148
169
  for (const [key, value] of Object.entries(nextEnv)) process.env[key] = value;
149
170
  return { config, selection };
150
171
  }
172
+ function rebuildTtsRuntimeSettings(selection = null) {
173
+ settings.tts = buildTtsSettings(process.env, ROOT);
174
+ if (selection?.backend === 'edge' && selection.voice?.voice) settings.tts.edge.voice = selection.voice.voice;
175
+ try { bridge.ttsBackend?.close?.(); } catch (e) { warn('tts backend close failed', e?.message || e); }
176
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
177
+ return settings.tts;
178
+ }
151
179
  function reloadRuntimeLanguageFromEnv() {
152
180
  const previousWhisperLanguage = settings?.whisperLanguage;
153
181
  const previousVoiceLanguage = settings?.voiceLanguage;
@@ -170,6 +198,7 @@ const settings = {
170
198
  whisperBin: process.env.WHISPER_CPP_BIN || 'whisper-cli',
171
199
  whisperModel: process.env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
172
200
  whisperLanguage: process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
201
+ whisperTimeoutMs: whisperTimeoutMs(process.env),
173
202
  voiceLanguage: process.env.VOICE_LANGUAGE || process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
174
203
  tts: buildTtsSettings(process.env, ROOT),
175
204
  requireWakeWord: ['1', 'true', 'yes'].includes((process.env.REQUIRE_WAKE_WORD || '0').toLowerCase()),
@@ -187,20 +216,33 @@ const client = new Client({
187
216
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates, GatewayIntentBits.GuildMessages, GatewayIntentBits.MessageContent],
188
217
  partials: [Partials.Channel],
189
218
  });
190
- let ttsBackend = createTtsBackend(settings.tts, { execFileAsync, log, warn, voiceProvider: () => settings.tts.edge.voice });
219
+ const announcedTtsFallbacks = new Set();
220
+ const pendingFallbackNoticePromises = new Set();
221
+ function ttsFallbackNotice({ backend } = {}) {
222
+ if (!backend || backend === 'edge') return;
223
+ if (announcedTtsFallbacks.has(backend)) return;
224
+ announcedTtsFallbacks.add(backend);
225
+ const en = /^en/i.test(String(settings.voiceLanguage || ''));
226
+ const msg = en
227
+ ? `${backend} synthesis failed; using Edge for the rest of this session.`
228
+ : `${backend} 음성 생성에 실패해서 이번 세션은 Edge로 진행할게.`;
229
+ const textPromise = sendText(`⚠️ ${msg}`)
230
+ .catch(e => warn('tts fallback notice send failed', e?.message || e));
231
+ pendingFallbackNoticePromises.add(textPromise);
232
+ textPromise.finally(() => pendingFallbackNoticePromises.delete(textPromise));
233
+ const speakPromise = new Promise(resolve => queueMicrotask(() => {
234
+ speakText(msg, null, null, { mirrorText: false })
235
+ .catch(e => warn('tts fallback notice speak failed', e?.message || e))
236
+ .finally(resolve);
237
+ }));
238
+ pendingFallbackNoticePromises.add(speakPromise);
239
+ speakPromise.finally(() => pendingFallbackNoticePromises.delete(speakPromise));
240
+ }
241
+ const bridge = createBridge();
242
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
191
243
  const voiceCloneCapture = createVoiceCloneCaptureState({ defaultTargetPath: settings.tts.openvoice.refAudio });
192
244
 
193
- let connection = null;
194
- let activeVoiceChannelId = '';
195
- let activeTranscriptChannelId = '';
196
- let player = createAudioPlayer();
197
- let speaking = false;
198
- let processing = false;
199
- let activeTurnId = 0;
200
- let currentAbortController = null;
201
- const interruptedTurns = new Set();
202
- const activeStreams = new Map();
203
- let bridgeState = null;
245
+ bridge.player = createAudioPlayer();
204
246
  const MAX_DEFERRED_PROCESSING_UTTERANCES = Number(process.env.MAX_DEFERRED_PROCESSING_UTTERANCES || '0');
205
247
  const MIN_UTTERANCE_SECONDS = Number(process.env.MIN_UTTERANCE_SECONDS || '1.4');
206
248
  const MIN_UTTERANCE_BYTES = 48000 * 2 * 2 * MIN_UTTERANCE_SECONDS;
@@ -233,7 +275,7 @@ const bridgeLogger = createBridgeLogger({
233
275
  });
234
276
  function log(...args) { bridgeLogger.log(...args); }
235
277
  function warn(...args) { bridgeLogger.warn(...args); }
236
- bridgeState = createBridgeState({ log, cleanupFile: file => fs.rm(file, { force: true }, () => {}) });
278
+ bridge.bridgeState = createBridgeState({ log, cleanupFile: file => fs.rm(file, { force: true }, () => {}) });
237
279
  const reportTransientProcessError = createTransientErrorReporter({ warn });
238
280
  function isBenignTransientNetworkError(error) {
239
281
  return isTransientNetworkError(error);
@@ -252,27 +294,80 @@ function newLatencyTurn(userId, startedAtMs) {
252
294
  }
253
295
 
254
296
  function discardVoiceInputQueues(reason = 'config-change') {
255
- return bridgeState?.discardQueues(reason) || 0;
297
+ return bridge.bridgeState?.discardQueues(reason) || 0;
256
298
  }
257
- let verboseProgress = Boolean(settings.agent.verboseProgress);
258
- let activeProgressSignal = null;
259
- let verboseProgressSpeechQueue = Promise.resolve();
260
- let activeProgressAbortController = null;
261
- let speechPlaybackGeneration = 0;
262
- let progressSpeechBatch = [];
263
- let progressSpeechBatchTimer = null;
264
- let progressSpeechBatchSignal = null;
265
- let progressSpeechBatchStartedAt = 0;
266
- let activeProgressLastEventAt = 0;
267
- let lastVerboseProgressText = '';
268
- let lastVerboseProgressTextAt = 0;
299
+ bridge.verboseProgress = Boolean(settings.agent.verboseProgress);
300
+
301
+ const STREAMING_TTS_ENABLED = ['1', 'true', 'yes', 'on'].includes(String(process.env.STREAMING_TTS || '1').toLowerCase());
302
+
303
+ bridge.smartProgressEnabled = Boolean(process.env.SMART_PROGRESS_API_KEY);
304
+ const VOICE_CONNECT_TIMEOUT_MS = Number(process.env.VOICE_CONNECT_TIMEOUT_MS || '60000');
269
305
  const PROGRESS_IDLE_NOTICE_INITIAL_MS = Number(process.env.PROGRESS_IDLE_NOTICE_INITIAL_MS || process.env.PROGRESS_IDLE_NOTICE_MS || '10000');
270
306
  const PROGRESS_IDLE_NOTICE_MAX_MS = Number(process.env.PROGRESS_IDLE_NOTICE_MAX_MS || '30000');
271
307
  const PROGRESS_IDLE_NOTICE_MULTIPLIER = Number(process.env.PROGRESS_IDLE_NOTICE_MULTIPLIER || '1.8');
272
308
  const PROGRESS_IDLE_CHECK_MS = Number(process.env.PROGRESS_IDLE_CHECK_MS || '5000');
273
309
  const PROGRESS_IDLE_NOTICE_LIMIT = Number(process.env.PROGRESS_IDLE_NOTICE_LIMIT || '20');
274
310
  const projectSessionsState = loadProjectSessions(settings.projectSessionsPath);
275
- const agentAdaptersBySession = new Map();
311
+ const ttsPlayer = createTtsPlayer({
312
+ bridge,
313
+ settings,
314
+ log,
315
+ warn,
316
+ sleep,
317
+ sendText,
318
+ refreshTtsRuntimeConfig,
319
+ waitEvent,
320
+ isAbortError,
321
+ STREAMING_TTS_ENABLED,
322
+ });
323
+ const { synthTTS, playAudio, speakText, beginStreamingTurn, endStreamingTurn, stopPlaybackForBargeIn } = ttsPlayer;
324
+
325
+ const progressHandler = createProgressHandler({
326
+ bridge,
327
+ settings,
328
+ log,
329
+ warn,
330
+ isAbortError,
331
+ playAudio,
332
+ sendText,
333
+ refreshTtsRuntimeConfig,
334
+ });
335
+ const {
336
+ ensureSmartProgressSummarizer,
337
+ smartProgressStatusText,
338
+ progressEmoji,
339
+ formatProgressText,
340
+ sendVerboseProgressText,
341
+ synthProgressTTS,
342
+ speakProgress,
343
+ speakImmediateNotice,
344
+ queueProgressSpeechText,
345
+ flushProgressSpeechBatch,
346
+ queueVerboseProgressSpeech,
347
+ clearProgressSpeechBatch,
348
+ stopProgressSpeech,
349
+ } = progressHandler;
350
+
351
+ const agentTurnLifecycle = createAgentTurnLifecycle({ bridge, warn });
352
+
353
+ const notificationHandler = createNotificationHandler({ bridge, client, log, warn });
354
+ const {
355
+ ensureNotifier,
356
+ notifyStatusText,
357
+ getVoiceChannelHumanCount,
358
+ maybeNotifyTaskComplete,
359
+ } = notificationHandler;
360
+
361
+ const ttsRuntime = createTtsRuntime({
362
+ bridge,
363
+ ROOT,
364
+ execFileAsync,
365
+ speakText,
366
+ warn,
367
+ persistEnvValues,
368
+ });
369
+ const { ensureSelectedTtsBackendInstalled, commandIsInstalled } = ttsRuntime;
370
+
276
371
  function createBridgeAgentAdapter(agentSettings) {
277
372
  return createAgentAdapter(agentSettings, {
278
373
  execFileAsync,
@@ -280,43 +375,72 @@ function createBridgeAgentAdapter(agentSettings) {
280
375
  log,
281
376
  warn,
282
377
  onProgress: event => {
283
- if (!verboseProgress) return;
284
- activeProgressLastEventAt = Date.now();
285
- sendVerboseProgressText(event, activeProgressSignal);
286
- queueVerboseProgressSpeech(event, activeProgressSignal);
378
+ if (!bridge.verboseProgress) return;
379
+ bridge.activeProgressLastEventAt = Date.now();
380
+ sendVerboseProgressText(event, bridge.activeProgressSignal);
381
+ if (bridge.smartProgressEnabled && process.env.SMART_PROGRESS_API_KEY) {
382
+ try { ensureSmartProgressSummarizer().ingest(event); }
383
+ catch (e) { warn('smart progress ingest failed', e?.stack || e); queueVerboseProgressSpeech(event, bridge.activeProgressSignal); }
384
+ } else {
385
+ queueVerboseProgressSpeech(event, bridge.activeProgressSignal);
386
+ }
387
+ },
388
+ onStdoutChunk: chunk => {
389
+ if (bridge.activeSentencer) {
390
+ try { bridge.activeSentencer.push(chunk); } catch (e) { warn('streaming sentencer push failed', e?.stack || e); }
391
+ }
287
392
  },
288
393
  });
289
394
  }
290
395
  const agentAdapter = createBridgeAgentAdapter(settings.agent);
291
- function adapterForProjectSession(session) {
292
- if (!session) return agentAdapter;
293
- const key = session.slug || session.name;
294
- if (!agentAdaptersBySession.has(key)) {
295
- agentAdaptersBySession.set(key, createBridgeAgentAdapter({
296
- ...settings.agent,
297
- label: `${settings.agent.label} · ${session.name}`,
298
- sessionFile: session.sessionFile,
299
- cwd: session.workdir,
300
- projectContext: projectSessionContextText(session),
301
- }));
302
- }
303
- return agentAdaptersBySession.get(key);
304
- }
305
396
  function resolveProjectSessionForChannel(channelId) {
306
397
  return projectSessionForChannel(projectSessionsState, channelId) || null;
307
398
  }
399
+
400
+ function ontologyStateFor(channelKey) {
401
+ const key = String(channelKey || 'default');
402
+ let store = bridge.ontologyByChannel.get(key);
403
+ if (!store) {
404
+ store = createSessionOntology({ channelKey: key });
405
+ try { store.load(); } catch {}
406
+ bridge.ontologyByChannel.set(key, store);
407
+ }
408
+ return store;
409
+ }
410
+ function captureOntologyFromTurn(channelKey, { prompt, answer, backend }) {
411
+ try {
412
+ const store = ontologyStateFor(channelKey);
413
+ const promptEntities = store.entitiesFromText(String(prompt || ''), { by: backend, kind: 'utterance' });
414
+ const answerEntities = store.entitiesFromText(String(answer || ''), { by: backend, kind: 'result' });
415
+ store.add(promptEntities);
416
+ store.add(answerEntities);
417
+ store.save();
418
+ } catch (e) {
419
+ warn('ontology capture failed', e?.message || e);
420
+ }
421
+ }
422
+ function resetRoutingState(channelKey) {
423
+ const state = routingStateFor(channelKey);
424
+ state.activeRouting = { backend: settings.agent.backend, sticky: false };
425
+ state.pendingFallbackPrompt = null;
426
+ }
427
+ function invalidateBackendAdaptersForSession(sessionSlug) {
428
+ if (!sessionSlug) return;
429
+ for (const key of Array.from(bridge.agentAdaptersByBackend.keys())) {
430
+ if (key.endsWith(`::${sessionSlug}`)) bridge.agentAdaptersByBackend.delete(key);
431
+ }
432
+ }
308
433
  function saveProjectSessionsState() {
309
434
  saveProjectSessions(settings.projectSessionsPath, projectSessionsState);
310
435
  }
311
- let sensitivityMode = SENSITIVITY_MODE_DEFAULT;
312
- let sensitivityModeExpiresAt = 0;
436
+ bridge.sensitivityMode = SENSITIVITY_MODE_DEFAULT;
313
437
  function currentBargeInThresholds() {
314
- if (sensitivityModeExpiresAt && Date.now() > sensitivityModeExpiresAt) {
315
- sensitivityMode = SENSITIVITY_MODE_DEFAULT;
316
- sensitivityModeExpiresAt = 0;
317
- log('barge-in sensitivity mode expired; restored', sensitivityMode);
438
+ if (bridge.sensitivityModeExpiresAt && Date.now() > bridge.sensitivityModeExpiresAt) {
439
+ bridge.sensitivityMode = SENSITIVITY_MODE_DEFAULT;
440
+ bridge.sensitivityModeExpiresAt = 0;
441
+ log('barge-in sensitivity mode expired; restored', bridge.sensitivityMode);
318
442
  }
319
- return bargeInThresholdsForMode(sensitivityMode, {
443
+ return bargeInThresholdsForMode(bridge.sensitivityMode, {
320
444
  minSeconds: BARGE_IN_MIN_SECONDS,
321
445
  minMeanDb: BARGE_IN_MIN_MEAN_VOLUME_DB,
322
446
  minMaxDb: BARGE_IN_MIN_MAX_VOLUME_DB,
@@ -336,48 +460,28 @@ function currentPlaybackBargeInThresholds() {
336
460
  };
337
461
  }
338
462
  function setSensitivityMode(mode, reason = 'manual') {
339
- sensitivityMode = mode === 'conservative' ? 'conservative' : 'normal';
340
- sensitivityModeExpiresAt = sensitivityMode === 'conservative' && SENSITIVITY_OUTDOOR_SECONDS > 0
463
+ bridge.sensitivityMode = mode === 'conservative' ? 'conservative' : 'normal';
464
+ bridge.sensitivityModeExpiresAt = bridge.sensitivityMode === 'conservative' && SENSITIVITY_OUTDOOR_SECONDS > 0
341
465
  ? Date.now() + SENSITIVITY_OUTDOOR_SECONDS * 1000
342
466
  : 0;
343
467
  const thresholds = currentBargeInThresholds();
344
- log('barge-in sensitivity mode set', sensitivityMode, 'reason', reason, 'expiresAt', sensitivityModeExpiresAt || 'never', 'thresholds', thresholds);
468
+ log('barge-in sensitivity mode set', bridge.sensitivityMode, 'reason', reason, 'expiresAt', bridge.sensitivityModeExpiresAt || 'never', 'thresholds', thresholds);
345
469
  return thresholds;
346
470
  }
347
471
  function sensitivityStatusText() {
348
472
  const thresholds = currentBargeInThresholds();
349
- const ttl = sensitivityModeExpiresAt ? Math.max(0, Math.round((sensitivityModeExpiresAt - Date.now()) / 1000)) : 0;
473
+ const ttl = bridge.sensitivityModeExpiresAt ? Math.max(0, Math.round((bridge.sensitivityModeExpiresAt - Date.now()) / 1000)) : 0;
350
474
  return sensitivityStatusTextForLanguage(thresholds, ttl, settings.voiceLanguage);
351
475
  }
352
476
 
353
477
  function verboseStatusText() {
354
- return verboseStatusTextForLanguage(verboseProgress, settings.voiceLanguage);
355
- }
356
-
357
- function progressEmoji(event) {
358
- const category = progressCategory(event, { language: settings.voiceLanguage })?.key;
359
- return {
360
- test: '🧪',
361
- edit: '✏️',
362
- read: '📖',
363
- search: '🔎',
364
- terminal: '⌨️',
365
- skill: '🧰',
366
- browser: '🌐',
367
- tool: '🛠️',
368
- agent: '🤖',
369
- work: '⚙️',
370
- }[category] || '⚙️';
371
- }
372
-
373
- function formatProgressText(event) {
374
- return formatProgressMessage(event, { language: settings.voiceLanguage });
478
+ return verboseStatusTextForLanguage(bridge.verboseProgress, settings.voiceLanguage);
375
479
  }
376
480
 
377
481
  function setVerboseProgress(enabled, reason = 'manual') {
378
- verboseProgress = Boolean(enabled);
379
- log('verbose progress mode set', verboseProgress, 'reason', reason);
380
- return verboseProgress;
482
+ bridge.verboseProgress = Boolean(enabled);
483
+ log('verbose progress mode set', bridge.verboseProgress, 'reason', reason);
484
+ return bridge.verboseProgress;
381
485
  }
382
486
 
383
487
  function persistEnvValues(values) {
@@ -388,7 +492,7 @@ function persistEnvValues(values) {
388
492
  } catch (e) {
389
493
  warn('read .env for update failed', e?.stack || e);
390
494
  }
391
- const pending = new Map(Object.entries(values));
495
+ const pending = new Map(Object.entries(values).filter(([, value]) => value !== undefined));
392
496
  const updated = lines.map(line => {
393
497
  const match = line.match(/^\s*([A-Za-z_][A-Za-z0-9_]*)\s*=.*$/);
394
498
  if (!match || !pending.has(match[1])) return line;
@@ -410,8 +514,8 @@ function applyRuntimeLanguage(language) {
410
514
  config = updateTtsVoiceConfig(config, { voiceType: preferredVoiceTypeForLanguage(config, preset.voiceLanguage) });
411
515
  writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, config);
412
516
  const { selection } = applyVoiceConfigToProcessEnv(config);
413
- settings.tts.backend = selection.backend;
414
- settings.tts.edge.voice = selection.backend === 'edge' ? selection.voice.voice : preset.ttsVoice;
517
+ rebuildTtsRuntimeSettings(selection);
518
+ if (selection.backend !== 'edge') settings.tts.edge.voice = preset.ttsVoice;
415
519
  process.env.VOICE_LANGUAGE = preset.voiceLanguage;
416
520
  process.env.WHISPER_CPP_LANGUAGE = preset.sttLanguage;
417
521
  process.env.STT_LANGUAGE = preset.sttLanguage;
@@ -440,33 +544,23 @@ function voiceChangedText(selection) {
440
544
  return `Voice changed to ${selection.voice?.label || selection.voiceType}.`;
441
545
  }
442
546
 
443
- async function handleTtsVoiceCommand(prompt, signal) {
444
- const request = voiceCommandFromTranscript(prompt);
445
- if (!request) return false;
446
- discardVoiceInputQueues('voice-change');
447
- let config = ensureTtsVoiceConfig();
448
- config = updateTtsVoiceConfig(config, request);
449
- writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, config);
450
- const { selection } = applyVoiceConfigToProcessEnv(config);
451
- settings.tts.backend = selection.backend;
452
- if (selection.backend === 'edge') settings.tts.edge.voice = selection.voice.voice;
453
- if (selection.voice?.language) settings.voiceLanguage = selection.voice.language;
454
- persistEnvValues({
455
- TTS_BACKEND: selection.backend,
456
- TTS_VOICE_TYPE: selection.voiceType,
457
- TTS_VOICE: selection.backend === 'edge' ? selection.voice.voice : process.env.TTS_VOICE,
458
- VOICE_LANGUAGE: settings.voiceLanguage,
459
- });
460
- await speakText(voiceChangedText(selection), signal);
461
- return true;
547
+ function isCloneVoiceType(voiceType) {
548
+ return /^(cloned_reference|prompt_reference|cosyvoice_reference)$/i.test(String(voiceType || ''));
462
549
  }
463
550
 
464
- async function handleLanguageCommand(prompt, signal) {
465
- const request = voiceLanguageCommandFromTranscript(prompt);
466
- if (!request) return false;
467
- const preset = applyRuntimeLanguage(request.language);
468
- await speakText(languageChangedText(preset), signal);
469
- return true;
551
+ async function notifyVoiceCloneSampleGapIfNeeded(selection, signal) {
552
+ if (!selection || selection.backend === 'edge') return;
553
+ if (!isCloneVoiceType(selection.voiceType)) return;
554
+ const ref = String(selection.voice?.voice || '').trim();
555
+ if (!ref) return;
556
+ const candidatePath = path.isAbsolute(ref) ? ref : path.resolve(ROOT, ref);
557
+ if (fs.existsSync(candidatePath)) return;
558
+ const en = /^en/i.test(String(settings.voiceLanguage || ''));
559
+ const msg = en
560
+ ? `${selection.backend} needs a voice clone sample at ${ref}. Say "voice clone capture" to record one, or pick a non-clone voice.`
561
+ : `${selection.backend} 백엔드는 음성 클론 샘플(${ref})이 필요해. "보이스 클로닝 캡처"라고 하거나 다른 보이스를 골라줘.`;
562
+ await sendText(`🎙️ ${msg}`);
563
+ await speakText(msg, signal, null);
470
564
  }
471
565
 
472
566
  function isAllowed(userId) { return settings.allowedUsers.size === 0 || settings.allowedUsers.has(String(userId)); }
@@ -512,13 +606,28 @@ function spokenResultOnly(userPrompt, answer, language = settings.voiceLanguage)
512
606
  async function sendText(text) {
513
607
  return sendDiscordText({
514
608
  client,
515
- channelId: activeTranscriptChannelId || settings.transcriptChannelId,
609
+ channelId: bridge.activeTranscriptChannelId || settings.transcriptChannelId,
516
610
  text,
517
611
  log,
518
612
  warn,
519
613
  });
520
614
  }
521
615
 
616
+ async function sendEmbed(embed, { content = '' } = {}) {
617
+ if (!embed) return false;
618
+ try {
619
+ const channelId = bridge.activeTranscriptChannelId || settings.transcriptChannelId;
620
+ if (!channelId) return false;
621
+ const channel = await client.channels.fetch(channelId).catch(() => null);
622
+ if (!channel?.send) return false;
623
+ await channel.send(content ? { content, embeds: [embed] } : { embeds: [embed] });
624
+ return true;
625
+ } catch (e) {
626
+ warn('sendEmbed failed', e?.message || e);
627
+ return false;
628
+ }
629
+ }
630
+
522
631
  async function sendChannelText(channel, text) {
523
632
  const body = String(text || '');
524
633
  const chunks = splitDiscordMessage(body);
@@ -526,18 +635,6 @@ async function sendChannelText(channel, text) {
526
635
  return true;
527
636
  }
528
637
 
529
- function sendVerboseProgressText(event, signal) {
530
- if (!verboseProgress || !signal || signal.aborted || activeProgressSignal !== signal) return;
531
- const formatted = formatProgressText(event).replace(/\s+/g, ' ').trim();
532
- if (!formatted) return;
533
- const message = formatted.slice(0, 1900);
534
- const now = Date.now();
535
- if (message === lastVerboseProgressText && now - lastVerboseProgressTextAt < 2000) return;
536
- lastVerboseProgressText = message;
537
- lastVerboseProgressTextAt = now;
538
- void sendText(message).catch(e => warn('verbose progress text delivery failed', e?.stack || e));
539
- }
540
-
541
638
  function sleep(ms) {
542
639
  return new Promise(resolve => setTimeout(resolve, ms));
543
640
  }
@@ -553,83 +650,240 @@ function waitEvent(emitter, event, timeoutMs = 60000) {
553
650
  });
554
651
  }
555
652
 
556
- async function transcribeOnce(wavPath, input16k, outBase) {
557
- const args = ['-m', settings.whisperModel, '-f', input16k];
558
- if (shouldPassWhisperLanguage(settings.whisperLanguage)) args.push('-l', settings.whisperLanguage);
559
- args.push('-nt', '-otxt', '-of', outBase, '-sns', '-nf', '-nth', '0.35', '-et', '2.2', '-lpt', '-0.8');
560
- try {
561
- await execFileAsync(settings.whisperBin, args, { timeout: 25000, maxBuffer: 2 * 1024 * 1024 });
562
- } catch (e) {
563
- throw new Error(`whisper failed: ${e.stderr || e.message}`);
564
- }
565
- const txtPath = `${outBase}.txt`;
566
- const raw = fs.existsSync(txtPath) ? fs.readFileSync(txtPath, 'utf8') : '';
567
- return { raw, txtPath };
568
- }
569
-
570
- async function transcribe(wavPath) {
571
- const tmpBase = path.join(os.tmpdir(), `hermes-node-stt-${Date.now()}`);
572
- const input16k = `${tmpBase}.16k.wav`;
573
- const outBase = `${tmpBase}.out`;
574
- // whisper.cpp can read WAV, but Discord receiver output is 48 kHz stereo.
575
- // Convert explicitly to the 16 kHz mono PCM shape Whisper expects.
576
- await execFileAsync('ffmpeg', ['-y', '-hide_banner', '-loglevel', 'error', '-i', wavPath, '-ac', '1', '-ar', '16000', '-sample_fmt', 's16', input16k], {
577
- timeout: 20000,
578
- maxBuffer: 1024 * 1024,
579
- });
580
-
581
- let raw = '';
582
- let txtPath = '';
583
- try {
584
- ({ raw, txtPath } = await transcribeOnce(wavPath, input16k, outBase));
585
- let cleaned = cleanTranscript(raw);
586
- log('stt raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
587
- if (!cleaned) {
588
- await sleep(300);
589
- const retryBase = `${tmpBase}.retry`;
590
- const retry = await transcribeOnce(wavPath, input16k, retryBase);
591
- raw = retry.raw;
592
- txtPath = retry.txtPath;
593
- cleaned = cleanTranscript(raw);
594
- log('stt retry raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
595
- }
596
- return cleaned;
597
- } finally {
598
- if (settings.debugDir) {
599
- const debug16k = path.join(settings.debugDir, `stt-input-${stamp()}.wav`);
600
- fs.copyFile(input16k, debug16k, () => {});
601
- if (raw) fs.writeFile(path.join(settings.debugDir, `stt-raw-${stamp()}.txt`), raw, () => {});
602
- }
603
- fs.rm(input16k, { force: true }, () => {});
604
- if (txtPath) fs.rm(txtPath, { force: true }, () => {});
605
- }
606
- }
607
-
608
- function cleanTranscript(raw) {
609
- const bad = [
610
- '구독', '좋아요', '알림설정', '시청해주셔서', '시청해주신', '다음영상', '영상에서만나요',
611
- '부탁드려요', '큰힘이됩니다',
612
- 'mbc뉴스', '이준범기자입니다', '뉴스입니다', '기자입니다', '앵커', '속보', '보도입니다', '전해드립니다',
613
- ];
614
- const lines = raw
615
- .split(/\r?\n/)
616
- .map(l => l.trim())
617
- .filter(Boolean)
618
- .map(l => l.replace(/^\[[^\]]+\]\s*/, '').trim());
619
- const kept = [];
620
- for (const line of lines) {
621
- const compact = line
622
- .replace(/\s+/g, '')
623
- .replace(/[\p{P}\p{S}_]+/gu, '');
624
- if (!compact) continue;
625
- if (/^[\(\[(【].*[\)\])】]$/.test(line.replace(/\s+/g, ''))) continue;
626
- if (['끄덕', '끄덕끄덕', '박수', '웃음', '음악', '자막', '침묵', '무음'].includes(compact)) continue;
627
- if (bad.some(b => compact.toLowerCase().includes(b))) continue;
628
- if (isRepeatedNoiseTranscript(compact)) continue;
629
- kept.push(line);
630
- }
631
- return kept.join(' ').trim();
632
- }
653
+ // handleRecording lives inside utteranceRouter (extracted in Phase 4b) but
654
+ // voiceIO.flushUtterance needs to call it. Use a forward-declared `let` plus
655
+ // a thunk so the deps for createVoiceIO resolve before createUtteranceRouter
656
+ // is constructed.
657
+ let utteranceRouter;
658
+ let voiceTurnRunner;
659
+ const voiceIO = createVoiceIO({
660
+ bridge,
661
+ settings,
662
+ client,
663
+ execFileAsync,
664
+ log,
665
+ warn,
666
+ stamp,
667
+ sleep,
668
+ isAllowed,
669
+ UTTERANCE_IDLE_MS,
670
+ SUBSCRIBE_AFTER_SILENCE_MS,
671
+ MIN_UTTERANCE_BYTES,
672
+ MIN_MEAN_VOLUME_DB,
673
+ MIN_MAX_VOLUME_DB,
674
+ currentBargeInThresholds,
675
+ currentPlaybackBargeInThresholds,
676
+ createLiveBargeInMonitor,
677
+ shouldUseLivePlaybackBargeIn,
678
+ stopPlaybackForBargeIn,
679
+ analyzeAudio,
680
+ concatWavs,
681
+ saveCapturedVoiceCloneSample,
682
+ isBargeInCandidate,
683
+ validateProcessingBargeIn,
684
+ enqueueDeferredProcessingUtterance,
685
+ newLatencyTurn,
686
+ handleRecording: (...args) => voiceTurnRunner.handleRecording(...args),
687
+ });
688
+ const { transcribeOnce, transcribe, cleanTranscript, queueSegment, flushUtterance, subscribeUser } = voiceIO;
689
+
690
+ const discordVoiceSetup = createDiscordVoiceSetup({
691
+ bridge,
692
+ client,
693
+ settings,
694
+ ROOT,
695
+ log,
696
+ warn,
697
+ speakText,
698
+ waitEvent,
699
+ subscribeUser,
700
+ pendingFallbackNoticePromises,
701
+ bindProjectSessionToChannel,
702
+ createProjectSession,
703
+ resolveProjectSessionForChannel,
704
+ saveProjectSessionsState,
705
+ projectSessionsState,
706
+ invalidateBackendAdaptersForSession,
707
+ VOICE_CONNECT_TIMEOUT_MS,
708
+ });
709
+ const {
710
+ connectTo,
711
+ autoJoin,
712
+ findVoiceChannelBySelector,
713
+ voiceChannelLabel,
714
+ resolveVoiceChannelForAttach,
715
+ attachVoiceChannelToTextSession,
716
+ gracefulShutdown,
717
+ } = discordVoiceSetup;
718
+ utteranceRouter = createUtteranceRouter({
719
+ bridge,
720
+ agentTurnLifecycle,
721
+ log,
722
+ warn,
723
+ path,
724
+ fs,
725
+ ROOT,
726
+ TTS_VOICE_CONFIG_PATH,
727
+ agentAdapter,
728
+ settings,
729
+ isPlanEntryUtterance,
730
+ parsePlanOutput,
731
+ parsePlanVoiceCommand,
732
+ applyPlanCommand,
733
+ renderFinalPlan,
734
+ planModePreamble,
735
+ planExecutionPreamble,
736
+ parseDecisionAnswer,
737
+ renderDecisionPrompt,
738
+ renderResolvedDecisions,
739
+ isAgentRoutingDecision,
740
+ projectSessionContextText,
741
+ resolveProjectSessionForChannel,
742
+ createBridgeAgentAdapter,
743
+ buildAgentSettings,
744
+ commandIsInstalled,
745
+ shellSplit,
746
+ sendText,
747
+ speakText,
748
+ ensureTtsVoiceConfig,
749
+ updateTtsVoiceConfig,
750
+ writeTtsVoiceConfig,
751
+ applyVoiceConfigToProcessEnv,
752
+ ensureSelectedTtsBackendInstalled,
753
+ rebuildTtsRuntimeSettings,
754
+ voiceCommandFromTranscript,
755
+ voiceChangedText,
756
+ voiceLanguageCommandFromTranscript,
757
+ voiceCloneCommandFromText,
758
+ voiceCloneCapture,
759
+ notifyVoiceCloneSampleGapIfNeeded,
760
+ languageChangedText,
761
+ applyRuntimeLanguage,
762
+ persistEnvValues,
763
+ discardVoiceInputQueues,
764
+ // Phase 4b deps
765
+ transcribe,
766
+ beginStreamingTurn,
767
+ endStreamingTurn,
768
+ client,
769
+ isAllowed,
770
+ isAbortError,
771
+ sleep,
772
+ sendEmbed,
773
+ speakImmediateNotice,
774
+ reloadRuntimeLanguageFromEnv,
775
+ drainDeferredProcessingUtterances,
776
+ maybeNotifyTaskComplete,
777
+ ontologyStateFor,
778
+ captureOntologyFromTurn,
779
+ queueProgressSpeechText,
780
+ stopProgressSpeech,
781
+ agentAnswerHeader,
782
+ emptyAgentAnswer,
783
+ formatRecentDiscordContext,
784
+ formatSttResultMessage,
785
+ formatSttStartMessage,
786
+ formatVoiceErrorMessage,
787
+ formatWakeRejectedMessage,
788
+ spokenResultOnly,
789
+ stripWake,
790
+ acceptsWake,
791
+ sensitivityChangedSpeech,
792
+ sensitivityModeFromTranscript,
793
+ sensitivityStatusText,
794
+ setSensitivityMode,
795
+ isSensitivityOnlyRequest,
796
+ verboseChangedSpeech,
797
+ verboseModeFromTranscript,
798
+ verboseStatusText,
799
+ setVerboseProgress,
800
+ isVerboseOnlyRequest,
801
+ isRoutingOnlyUtterance,
802
+ parseAgentRoutingCommand,
803
+ renderAgentPrefix,
804
+ buildCrossAgentPrompt,
805
+ buildFallbackDecision,
806
+ parseResearchCommand,
807
+ runResearchTurn,
808
+ PROGRESS_IDLE_CHECK_MS,
809
+ PROGRESS_IDLE_NOTICE_INITIAL_MS,
810
+ PROGRESS_IDLE_NOTICE_LIMIT,
811
+ PROGRESS_IDLE_NOTICE_MAX_MS,
812
+ PROGRESS_IDLE_NOTICE_MULTIPLIER,
813
+ STT_START_VOICE_NOTICE,
814
+ });
815
+ const {
816
+ adapterForProjectSession,
817
+ routingStateFor,
818
+ recordUtterance,
819
+ clearTransientRouting,
820
+ adapterForBackend,
821
+ handleTtsVoiceCommand,
822
+ handleLanguageCommand,
823
+ handleVoiceCloneCommand,
824
+ interruptCurrentResponse,
825
+ } = utteranceRouter;
826
+
827
+ const planDispatcher = createPlanDispatcher({
828
+ bridge, settings,
829
+ sendText, speakText,
830
+ routingStateFor, adapterForBackend, adapterForProjectSession,
831
+ resolveProjectSessionForChannel,
832
+ isAgentRoutingDecision,
833
+ parseDecisionAnswer, parsePlanVoiceCommand: parsePlanVoiceCommand,
834
+ applyPlanCommand: applyPlanCommand,
835
+ parsePlanOutput,
836
+ renderDecisionPrompt, renderResolvedDecisions, renderFinalPlan,
837
+ planModePreamble, planExecutionPreamble, isPlanEntryUtterance,
838
+ });
839
+ const {
840
+ planChannelKey,
841
+ askNextDecision,
842
+ finalizePlanReady,
843
+ dispatchPlanModeUtterance,
844
+ planNarrationLines,
845
+ } = planDispatcher;
846
+
847
+ voiceTurnRunner = createVoiceTurnRunner({
848
+ bridge,
849
+ agentTurnLifecycle,
850
+ settings, client, log, warn, fs,
851
+ // From voice_io
852
+ transcribe,
853
+ // From tts_player
854
+ beginStreamingTurn, endStreamingTurn, speakText,
855
+ // From progress_handler
856
+ queueProgressSpeechText, stopProgressSpeech, speakImmediateNotice,
857
+ // From notification_handler
858
+ maybeNotifyTaskComplete,
859
+ // From utterance_router (sibling-module dispatch + adapter selection)
860
+ handleLanguageCommand, handleTtsVoiceCommand, handleVoiceCloneCommand,
861
+ dispatchPlanModeUtterance,
862
+ adapterForBackend, adapterForProjectSession,
863
+ planChannelKey, routingStateFor, recordUtterance, clearTransientRouting,
864
+ // Direct (imported in main or hoisted helpers)
865
+ isAllowed, isAbortError, sleep, sendText, sendEmbed,
866
+ reloadRuntimeLanguageFromEnv, drainDeferredProcessingUtterances,
867
+ resolveProjectSessionForChannel, projectSessionContextText,
868
+ ontologyStateFor, captureOntologyFromTurn,
869
+ formatRecentDiscordContext,
870
+ formatSttResultMessage, formatSttStartMessage,
871
+ formatVoiceErrorMessage, formatWakeRejectedMessage,
872
+ agentAnswerHeader, emptyAgentAnswer, spokenResultOnly,
873
+ stripWake, acceptsWake,
874
+ sensitivityChangedSpeech, sensitivityModeFromTranscript, sensitivityStatusText,
875
+ setSensitivityMode, isSensitivityOnlyRequest,
876
+ verboseChangedSpeech, verboseModeFromTranscript, verboseStatusText,
877
+ setVerboseProgress, isVerboseOnlyRequest,
878
+ isRoutingOnlyUtterance, parseAgentRoutingCommand, renderAgentPrefix,
879
+ buildCrossAgentPrompt, buildFallbackDecision,
880
+ parseDecisionAnswer,
881
+ parseResearchCommand, runResearchTurn,
882
+ PROGRESS_IDLE_CHECK_MS, PROGRESS_IDLE_NOTICE_INITIAL_MS,
883
+ PROGRESS_IDLE_NOTICE_LIMIT, PROGRESS_IDLE_NOTICE_MAX_MS,
884
+ PROGRESS_IDLE_NOTICE_MULTIPLIER, STT_START_VOICE_NOTICE,
885
+ });
886
+ const { handleRecording } = voiceTurnRunner;
633
887
 
634
888
  function isAbortError(e) {
635
889
  return e?.name === 'AbortError' || e?.code === 'ABORT_ERR';
@@ -669,237 +923,45 @@ async function refreshTtsRuntimeConfig() {
669
923
  if (previousBackend !== settings.tts.backend) {
670
924
  const rebuilt = buildTtsSettings(process.env, ROOT);
671
925
  Object.assign(settings.tts, rebuilt);
672
- ttsBackend = createTtsBackend(settings.tts, { execFileAsync, log, warn, voiceProvider: () => settings.tts.edge.voice });
926
+ try { bridge.ttsBackend?.close?.(); } catch (e) { warn('tts backend close failed', e?.message || e); }
927
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
673
928
  log('tts backend reloaded from voice config', settings.tts.backend, 'voiceType', selection.voiceType);
674
929
  }
675
930
  return selection;
676
931
  }
677
932
 
678
- async function synthTTS(text, signal) {
679
- await refreshTtsRuntimeConfig();
680
- let lastError = null;
681
- for (let attempt = 1; attempt <= 3; attempt += 1) {
682
- try {
683
- log('final tts synth start', 'backend', ttsBackend.name, 'attempt', attempt, 'chars', String(text || '').length);
684
- const out = await ttsBackend.synthesize(text, { signal, kind: 'final' });
685
- log('final tts synth done', 'backend', ttsBackend.name, 'attempt', attempt, out, fs.statSync(out).size);
686
- return out;
687
- } catch (e) {
688
- lastError = e;
689
- if (isAbortError(e) || signal?.aborted) throw e;
690
- warn('final tts synth failed', 'attempt', attempt, e?.stderr?.toString?.().slice(-500) || e?.message || e);
691
- await sleep(1000 * attempt);
692
- }
693
- }
694
- throw lastError;
695
- }
696
-
697
- async function synthProgressTTS(text, signal) {
698
- await refreshTtsRuntimeConfig();
699
- const ext = ttsBackend.outputExtension || 'mp3';
700
- const cachePath = path.join(settings.tts.progressCacheDir, progressTtsCacheFileName({
701
- backendKeyParts: ttsBackend.cacheKeyParts(),
702
- text,
703
- ext,
704
- }));
705
- if (fs.existsSync(cachePath) && fs.statSync(cachePath).size > 0) {
706
- log('progress tts cache hit', text, cachePath);
707
- return cachePath;
708
- }
709
- log('progress tts cache miss', text);
710
- const tmp = await ttsBackend.synthesize(text, { signal, kind: 'progress' });
711
- fs.renameSync(tmp, cachePath);
712
- return cachePath;
713
- }
714
-
715
- async function playAudio(file, { deleteAfter = true } = {}) {
716
- if (!connection) return;
717
- speaking = true;
718
- try {
719
- const resource = createAudioResource(file, { inputType: StreamType.Arbitrary, inlineVolume: true });
720
- resource.volume?.setVolume(settings.tts.volume);
721
- player.play(resource);
722
- connection.subscribe(player);
723
- await waitEvent(player, AudioPlayerStatus.Idle, 120000).catch(() => {});
724
- } finally {
725
- speaking = false;
726
- if (deleteAfter) fs.rm(file, { force: true }, () => {});
727
- }
728
- }
729
-
730
- async function speakText(text, signal, metricsTurn = null, options = {}) {
731
- const chunks = splitForTTS(text, settings.tts.maxChars);
732
- if (!chunks.length) return;
733
- if (options.mirrorText !== false) {
734
- await sendText(`${options.mirrorPrefix || '🔊 음성으로 읽는 내용'}:\n${String(text || '')}`);
735
- }
736
- log('TTS chunks', chunks.length, 'maxChars', settings.tts.maxChars, 'backend', ttsBackend.name);
737
- const playbackGeneration = speechPlaybackGeneration;
738
- const playbackStopped = () => playbackGeneration !== speechPlaybackGeneration;
739
- let synthMs = 0;
740
- let playMs = 0;
741
- const ttsStart = Date.now();
742
- await playChunkedTTSWithPrefetch(chunks, {
743
- signal,
744
- log,
745
- synth: async chunk => {
746
- if (playbackStopped()) return null;
747
- const start = Date.now();
748
- try { return await synthTTS(chunk, signal); }
749
- finally { synthMs += Date.now() - start; }
750
- },
751
- play: async file => {
752
- if (playbackStopped()) {
753
- await fs.promises.rm(file, { force: true }).catch(() => {});
754
- return;
755
- }
756
- const start = Date.now();
757
- try { return await playAudio(file); }
758
- finally { playMs += Date.now() - start; }
759
- },
760
- cleanup: file => fs.promises.rm(file, { force: true }),
761
- });
762
- metricsTurn?.stage('tts_synth', synthMs, { ttsChunks: chunks.length, spokenChars: String(text || '').length });
763
- metricsTurn?.stage('tts_play', playMs);
764
- metricsTurn?.stage('tts_total', Date.now() - ttsStart);
765
- }
766
-
767
- async function speakProgress(text, signal) {
768
- if (signal?.aborted) return;
769
- try {
770
- const mp3 = await synthProgressTTS(text, signal);
771
- if (signal?.aborted) return;
772
- await playAudio(mp3, { deleteAfter: false });
773
- } catch (e) {
774
- if (!isAbortError(e)) warn('progress tts failed', e?.stack || e);
775
- }
776
- }
777
-
778
- async function speakImmediateNotice(text, signal, reason = 'notice') {
779
- if (signal?.aborted) return;
780
- try {
781
- log('immediate notice speech', reason, 'text', String(text || '').slice(0, 80));
782
- const mp3 = await synthProgressTTS(text, signal);
783
- if (signal?.aborted) return;
784
- await playAudio(mp3, { deleteAfter: false });
785
- } catch (e) {
786
- if (!isAbortError(e)) warn('immediate notice speech failed', reason, e?.stack || e);
787
- }
788
- }
789
-
790
- function queueProgressSpeechText(text, signal, reason = 'status') {
791
- const spoken = String(text || '').replace(/\s+/g, ' ').trim();
792
- if (!spoken || !signal || signal.aborted || activeProgressSignal !== signal) return;
793
- verboseProgressSpeechQueue = verboseProgressSpeechQueue
794
- .catch(() => {})
795
- .then(async () => {
796
- if (signal.aborted || activeProgressSignal !== signal || !processing) return;
797
- log('progress speech queued', reason, 'text', spoken);
798
- await speakProgress(spoken, signal);
799
- });
800
- }
801
-
802
- function flushProgressSpeechBatch(signal, reason = 'timer') {
803
- if (!signal || signal.aborted || activeProgressSignal !== signal) return;
804
- if (progressSpeechBatchTimer) {
805
- clearTimeout(progressSpeechBatchTimer);
806
- progressSpeechBatchTimer = null;
807
- }
808
- const events = progressSpeechBatch;
809
- progressSpeechBatch = [];
810
- progressSpeechBatchSignal = null;
811
- progressSpeechBatchStartedAt = 0;
812
- const text = summarizeProgressEvents(events, { maxCategories: 3, language: settings.voiceLanguage });
813
- if (!text) return;
814
- queueProgressSpeechText(text, signal, `batch-${reason}-${events.length}`);
815
- }
816
-
817
- function queueVerboseProgressSpeech(event, signal) {
818
- if (!verboseProgress || !signal || signal.aborted || activeProgressSignal !== signal) return;
819
- const text = String(event || '').replace(/\s+/g, ' ').trim().slice(0, 120);
820
- if (!text) return;
821
- if (progressSpeechBatchSignal && progressSpeechBatchSignal !== signal) {
822
- progressSpeechBatch = [];
823
- if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
824
- progressSpeechBatchTimer = null;
825
- progressSpeechBatchStartedAt = 0;
826
- }
827
- progressSpeechBatchSignal = signal;
828
- if (!progressSpeechBatchStartedAt) progressSpeechBatchStartedAt = Date.now();
829
- progressSpeechBatch.push(text);
830
- const elapsedMs = Date.now() - progressSpeechBatchStartedAt;
831
- const ratePerSecond = progressSpeechBatch.length / Math.max(0.2, elapsedMs / 1000);
832
- const maxBatchEvents = ratePerSecond >= 6 ? 5 : ratePerSecond >= 3 ? 4 : 3;
833
- const batchDelayMs = ratePerSecond >= 6 ? 650 : ratePerSecond >= 3 ? 550 : 450;
834
- if (progressSpeechBatch.length >= maxBatchEvents) {
835
- flushProgressSpeechBatch(signal, 'full');
836
- return;
837
- }
838
- if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
839
- progressSpeechBatchTimer = setTimeout(() => flushProgressSpeechBatch(signal, 'timer'), batchDelayMs);
840
- }
841
-
842
- function clearProgressSpeechBatch(signal = activeProgressSignal) {
843
- if (progressSpeechBatchTimer) {
844
- clearTimeout(progressSpeechBatchTimer);
845
- progressSpeechBatchTimer = null;
846
- }
847
- if (!signal || progressSpeechBatchSignal === signal) {
848
- progressSpeechBatch = [];
849
- progressSpeechBatchSignal = null;
850
- progressSpeechBatchStartedAt = 0;
851
- }
852
- }
853
-
854
- function stopProgressSpeech(signal, reason = 'final-answer') {
855
- if (activeProgressSignal !== signal) return;
856
- clearProgressSpeechBatch(signal);
857
- activeProgressSignal = null;
858
- if (activeProgressAbortController && !activeProgressAbortController.signal.aborted) {
859
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort progress speech failed', e?.stack || e); }
860
- }
861
- if (speaking) {
862
- log('stop progress speech before final answer', reason);
863
- try { player.stop(true); } catch (e) { warn('stop progress speech failed', e?.stack || e); }
864
- speaking = false;
865
- }
866
- }
867
-
868
933
  async function handleTextAgentMessage(msg, text, { speakResponse = false } = {}) {
869
- if (processing) {
934
+ if (bridge.processing) {
870
935
  await msg.reply('지금 이전 작업을 처리 중이야. 끝나면 다시 보내줘.');
871
936
  return;
872
937
  }
873
- processing = true;
874
- const controller = new AbortController();
875
- currentAbortController = controller;
876
- const signal = controller.signal;
877
- const progressController = new AbortController();
878
- activeProgressAbortController = progressController;
879
- activeProgressSignal = progressController.signal;
880
- activeProgressLastEventAt = Date.now();
881
- const previousTranscriptChannelId = activeTranscriptChannelId;
938
+ const turn = agentTurnLifecycle.start();
939
+ const { controller, signal, progressController } = turn;
882
940
  const session = resolveProjectSessionForChannel(msg.channelId);
883
- activeTranscriptChannelId = session?.transcriptChannelId || msg.channelId;
941
+ bridge.activeTranscriptChannelId = session?.transcriptChannelId || msg.channelId;
884
942
  const selectedAgentAdapter = adapterForProjectSession(session);
885
943
  const projectContext = projectSessionContextText(session);
944
+ const recentDiscordContext = formatRecentDiscordContext(bridge.recentDiscordTextByChannel, {
945
+ channelId: bridge.activeTranscriptChannelId,
946
+ });
886
947
  const plan = {
887
948
  task: true,
888
949
  label: selectedAgentAdapter.label,
889
- verboseProgress,
950
+ verboseProgress: bridge.verboseProgress,
890
951
  language: settings.voiceLanguage,
891
952
  cwd: session?.workdir,
892
953
  projectContext,
954
+ recentDiscordContext,
893
955
  };
894
956
  const sessionBefore = selectedAgentAdapter.readSessionId?.();
895
- log('text agent request start', selectedAgentAdapter.label, sessionBefore ? 'resume-existing-session' : 'new-session', 'verbose', verboseProgress, session ? `project=${session.slug}` : 'project=default');
957
+ log('text agent request start', selectedAgentAdapter.label, sessionBefore ? 'resume-existing-session' : 'new-session', 'verbose', bridge.verboseProgress, session ? `project=${session.slug}` : 'project=default');
896
958
  try {
897
959
  const result = await selectedAgentAdapter.run(text, signal, plan);
898
960
  const answer = result.answer || emptyAgentAnswer(settings.voiceLanguage);
899
961
  const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer}`;
900
962
  await sendChannelText(msg.channel, fullAnswerText);
901
963
  stopProgressSpeech(progressController.signal, 'text-agent-answer-ready');
902
- if (speakResponse && connection) {
964
+ if (speakResponse && bridge.connection) {
903
965
  const spokenAnswer = spokenResultOnly(text, answer, settings.voiceLanguage);
904
966
  await speakText(spokenAnswer, signal, null, { mirrorText: false });
905
967
  }
@@ -908,15 +970,11 @@ async function handleTextAgentMessage(msg, text, { speakResponse = false } = {})
908
970
  warn('text agent request failed', e?.stack || e);
909
971
  await sendChannelText(msg.channel, formatVoiceErrorMessage(settings.voiceLanguage, String(e?.message || e).slice(0, 800)));
910
972
  } finally {
911
- if (activeProgressAbortController && activeProgressAbortController.signal === progressController.signal && !activeProgressAbortController.signal.aborted) {
912
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort text progress speech failed', e?.stack || e); }
913
- }
914
- if (activeProgressSignal === progressController.signal) activeProgressSignal = null;
915
- if (activeProgressAbortController?.signal === progressController.signal) activeProgressAbortController = null;
973
+ // Text-path-only behaviour pre-refactor: drain the verbose-progress batch
974
+ // before tearing the controllers down. Kept explicit so the lifecycle's
975
+ // finish() can stay path-agnostic.
916
976
  clearProgressSpeechBatch(progressController.signal);
917
- if (currentAbortController === controller) currentAbortController = null;
918
- activeTranscriptChannelId = previousTranscriptChannelId;
919
- processing = false;
977
+ agentTurnLifecycle.finish(turn);
920
978
  }
921
979
  }
922
980
 
@@ -940,53 +998,6 @@ async function saveCapturedVoiceCloneSample(userId, wavPath, pcmBytes, segments,
940
998
  return true;
941
999
  }
942
1000
 
943
- async function handleVoiceCloneCommand(userId, prompt, signal = null) {
944
- const command = voiceCloneCommandFromText(prompt);
945
- if (!command) return false;
946
- if (command.action === 'cancel') {
947
- const cancelled = voiceCloneCapture.cancel(userId);
948
- await sendText(cancelled ? '🎙️ 보이스 클로닝 샘플 캡처를 취소했어.' : '🎙️ 대기 중인 보이스 클로닝 샘플 캡처가 없어.');
949
- await speakText(cancelled ? '목소리 샘플 녹음 대기를 취소했어.' : '대기 중인 목소리 샘플 녹음은 없어.', signal);
950
- return true;
951
- }
952
- if (command.action === 'status') {
953
- const current = voiceCloneCapture.current();
954
- const status = current?.userId === String(userId)
955
- ? `🎙️ 다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`
956
- : '🎙️ 지금 대기 중인 보이스 클로닝 샘플 캡처는 없어.';
957
- await sendText(status);
958
- await speakText(current?.userId === String(userId) ? '다음에 말하는 목소리를 샘플로 저장할게.' : '대기 중인 목소리 샘플 녹음은 없어.', signal);
959
- return true;
960
- }
961
- const armed = voiceCloneCapture.arm({ userId, source: 'voice-command' });
962
- await sendText(`🎙️ 보이스 클로닝 샘플 캡처 대기 중. 다음 10초에서 30초 정도 말하면 ${path.relative(ROOT, armed.targetPath)}에 저장할게.`);
963
- await speakText('좋아. 다음에 10초에서 30초 정도 말하면 그 음성을 목소리 샘플로 저장할게.', signal);
964
- return true;
965
- }
966
-
967
- function stopPlaybackForBargeIn(userId, reason = 'playback-barge-in') {
968
- if (!speaking) return false;
969
- log('stop playback for barge-in', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', activeTurnId);
970
- speechPlaybackGeneration += 1;
971
- try { player.stop(true); } catch (e) { warn('stop playback failed', e?.stack || e); }
972
- speaking = false;
973
- return true;
974
- }
975
-
976
- function interruptCurrentResponse(userId, reason = 'barge-in') {
977
- if (!speaking && !processing) return false;
978
- const turnId = activeTurnId;
979
- if (turnId) interruptedTurns.add(turnId);
980
- log('interrupt current response', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', turnId);
981
- if (currentAbortController && !currentAbortController.signal.aborted) {
982
- try { currentAbortController.abort(); } catch (e) { warn('abort current response failed', e?.stack || e); }
983
- }
984
- try { player.stop(true); } catch (e) { warn('stop playback failed', e?.stack || e); }
985
- speaking = false;
986
- processing = false;
987
- return true;
988
- }
989
-
990
1001
  function acceptsWake(text) {
991
1002
  if (!settings.requireWakeWord) return true;
992
1003
  const low = text.toLowerCase();
@@ -1034,17 +1045,6 @@ async function concatWavs(files, output) {
1034
1045
  }
1035
1046
  }
1036
1047
 
1037
- function queueSegment(userId, file, pcmBytes, startedAtMs = Date.now(), endedAtMs = Date.now()) {
1038
- const pending = bridgeState.appendSegment(userId, {
1039
- file,
1040
- pcmBytes,
1041
- startedAtMs,
1042
- endedAtMs,
1043
- timerFactory: () => setTimeout(() => flushUtterance(userId).catch(e => warn('flushUtterance failed', userId, e?.stack || e)), UTTERANCE_IDLE_MS),
1044
- });
1045
- log('queued segment', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes, 'idleMs', UTTERANCE_IDLE_MS, 'epoch', pending.epoch);
1046
- }
1047
-
1048
1048
  function isBargeInCandidate(pcmBytes, levels) {
1049
1049
  const thresholds = currentBargeInThresholds();
1050
1050
  return isValidatedBargeInCandidate(pcmBytes, levels, thresholds);
@@ -1052,7 +1052,7 @@ function isBargeInCandidate(pcmBytes, levels) {
1052
1052
 
1053
1053
  function enqueueDeferredProcessingUtterance({ userId, wavPath, pcmBytes, segments, startedAtMs = Date.now() }) {
1054
1054
  const item = { userId, wavPath, pcmBytes, segments, startedAtMs };
1055
- const result = bridgeState.enqueueDeferred(item, enqueueDeferredUtterance, MAX_DEFERRED_PROCESSING_UTTERANCES);
1055
+ const result = bridge.bridgeState.enqueueDeferred(item, enqueueDeferredUtterance, MAX_DEFERRED_PROCESSING_UTTERANCES);
1056
1056
  if (!result.queued) {
1057
1057
  log('drop deferred utterance because queue disabled', userId, wavPath, 'max', MAX_DEFERRED_PROCESSING_UTTERANCES);
1058
1058
  return false;
@@ -1060,15 +1060,15 @@ function enqueueDeferredProcessingUtterance({ userId, wavPath, pcmBytes, segment
1060
1060
  if (result.dropped) {
1061
1061
  log('drop oldest deferred utterance because queue is full', result.dropped?.userId, result.dropped?.wavPath);
1062
1062
  }
1063
- log('queued deferred utterance while processing', userId, wavPath, 'queueSize', bridgeState.deferredSize(), 'epoch', bridgeState.currentEpoch());
1063
+ log('queued deferred utterance while processing', userId, wavPath, 'queueSize', bridge.bridgeState.deferredSize(), 'epoch', bridge.bridgeState.currentEpoch());
1064
1064
  return true;
1065
1065
  }
1066
1066
 
1067
1067
  async function drainDeferredProcessingUtterances() {
1068
- if (processing || bridgeState.deferredSize() === 0) return;
1069
- const next = bridgeState.shiftDeferred();
1068
+ if (bridge.processing || bridge.bridgeState.deferredSize() === 0) return;
1069
+ const next = bridge.bridgeState.shiftDeferred();
1070
1070
  if (!next) return;
1071
- log('drain deferred utterance', next.userId, next.wavPath, 'remaining', bridgeState.deferredSize());
1071
+ log('drain deferred utterance', next.userId, next.wavPath, 'remaining', bridge.bridgeState.deferredSize());
1072
1072
  const metricsTurn = newLatencyTurn(next.userId, next.startedAtMs || Date.now());
1073
1073
  metricsTurn.mark('voice_first_packet', next.startedAtMs || Date.now());
1074
1074
  metricsTurn.mark('utterance_flush');
@@ -1092,368 +1092,6 @@ async function validateProcessingBargeIn(userId, wavPath, pcmBytes, segments) {
1092
1092
  return { action: 'interrupt', text };
1093
1093
  }
1094
1094
 
1095
- async function flushUtterance(userId) {
1096
- const pending = bridgeState.deletePending(userId);
1097
- if (!pending) return;
1098
- if (pending.timer) clearTimeout(pending.timer);
1099
- const files = pending.files;
1100
- const pcmBytes = pending.pcmBytes;
1101
- const metricsTurn = newLatencyTurn(userId, pending.firstPacketAt || Date.now());
1102
- metricsTurn.mark('voice_first_packet', pending.firstPacketAt || Date.now());
1103
- metricsTurn.mark('voice_segment_end', pending.lastSegmentEndAt || Date.now());
1104
- metricsTurn.mark('utterance_flush');
1105
- metricsTurn.addMeta({ segments: files.length, pcmBytes, epoch: pending.epoch });
1106
- if (pending.epoch !== bridgeState.currentEpoch()) {
1107
- log('drop stale utterance after voice input queue reset', userId, 'utteranceEpoch', pending.epoch, 'currentEpoch', bridgeState.currentEpoch());
1108
- for (const file of files) fs.rm(file, { force: true }, () => {});
1109
- metricsTurn.finish({ status: 'stale_after_config_change' });
1110
- return;
1111
- }
1112
- if (pcmBytes < MIN_UTTERANCE_BYTES) {
1113
- log('skip short utterance', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'minBytes', MIN_UTTERANCE_BYTES);
1114
- metricsTurn.finish({ status: 'skip_short' });
1115
- return;
1116
- }
1117
- const merged = path.join(settings.debugDir, `utterance-merged-${stamp()}-${userId}.wav`);
1118
- await concatWavs(files, merged);
1119
- const levels = await analyzeAudio(merged);
1120
- log('utterance levels', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
1121
- if (await saveCapturedVoiceCloneSample(userId, merged, pcmBytes, files.length)) {
1122
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1123
- metricsTurn.finish({ status: 'voice_clone_sample_saved' });
1124
- return;
1125
- }
1126
- const candidate = isBargeInCandidate(pcmBytes, levels);
1127
- if (speaking || processing) {
1128
- const thresholds = currentBargeInThresholds();
1129
- if (!candidate) {
1130
- log('check weak barge-in for explicit stop transcript', userId, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholdBytes', thresholds.minBytes, 'thresholds', thresholds.minMeanDb, thresholds.minMaxDb, 'mode', thresholds.mode);
1131
- }
1132
- const validation = await validateProcessingBargeIn(userId, merged, pcmBytes, files.length);
1133
- if (validation?.action === 'interrupt') {
1134
- metricsTurn.finish({ status: processing ? 'barge_in_processing_interrupt' : 'barge_in_playback_interrupt' });
1135
- return;
1136
- }
1137
- if (processing && validation?.action === 'defer') {
1138
- const queued = enqueueDeferredProcessingUtterance({
1139
- userId,
1140
- wavPath: merged,
1141
- pcmBytes,
1142
- segments: files.length,
1143
- startedAtMs: pending.firstPacketAt || Date.now(),
1144
- });
1145
- metricsTurn.finish({ status: queued ? 'deferred_during_processing' : 'drop_deferred_during_processing' });
1146
- return;
1147
- }
1148
- metricsTurn.finish({ status: speaking ? 'barge_in_playback_ignored' : 'barge_in_processing_ignored' });
1149
- return;
1150
- }
1151
- // Drop only when BOTH overall energy and peak are low. Real Discord speech from this
1152
- // mic can have low mean volume while still carrying intelligible peaks; using OR here
1153
- // caused valid Korean utterances to be discarded as "low-energy".
1154
- if (levels.meanDb < MIN_MEAN_VOLUME_DB && levels.maxDb < MIN_MAX_VOLUME_DB) {
1155
- log('skip low-energy utterance', userId, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholds', MIN_MEAN_VOLUME_DB, MIN_MAX_VOLUME_DB, 'mode', 'both-below');
1156
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1157
- metricsTurn.finish({ status: 'skip_low_energy' });
1158
- return;
1159
- }
1160
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1161
- await handleRecording(userId, merged, pcmBytes, files.length, metricsTurn);
1162
- }
1163
-
1164
- async function handleRecording(userId, wavPath, pcmBytes, segments = 1, metricsTurn = null) {
1165
- if (processing) { log('drop while processing', userId); metricsTurn?.finish({ status: 'drop_processing' }); return; }
1166
- if (!isAllowed(userId)) { warn('ignore unauthorized', userId); metricsTurn?.finish({ status: 'unauthorized' }); return; }
1167
- processing = true;
1168
- const turnId = ++activeTurnId;
1169
- const controller = new AbortController();
1170
- currentAbortController = controller;
1171
- const signal = controller.signal;
1172
- const sessionForVoice = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1173
- const previousTranscriptChannelId = activeTranscriptChannelId;
1174
- activeTranscriptChannelId = sessionForVoice?.transcriptChannelId || settings.transcriptChannelId;
1175
- try {
1176
- const runtimeLanguage = reloadRuntimeLanguageFromEnv();
1177
- if (runtimeLanguage.changed) {
1178
- log('drop current utterance because language changed before STT', userId, 'turn', turnId, 'language', runtimeLanguage.voiceLanguage);
1179
- fs.rm(wavPath, { force: true }, () => {});
1180
- metricsTurn?.finish({ status: 'drop_stale_language_change' });
1181
- return;
1182
- }
1183
- const session = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1184
- activeTranscriptChannelId = session?.transcriptChannelId || settings.transcriptChannelId;
1185
- log('voice turn text target', session ? `project=${session.slug}` : 'project=default', 'channel', activeTranscriptChannelId ? 'project-or-default' : 'none');
1186
- log('transcribing', userId, wavPath, 'pcmBytes', pcmBytes, 'segments', segments, 'turn', turnId);
1187
- const sttNotice = formatSttStartMessage(settings.voiceLanguage);
1188
- await sendText(sttNotice);
1189
- const sttNoticeSpeech = STT_START_VOICE_NOTICE
1190
- ? speakImmediateNotice(sttNotice.replace(/^🎧\s*/u, ''), signal, 'stt-start')
1191
- : Promise.resolve();
1192
- const sttStart = Date.now();
1193
- const text = await transcribe(wavPath);
1194
- await sttNoticeSpeech;
1195
- metricsTurn?.stage('stt', Date.now() - sttStart, { transcriptChars: String(text || '').length });
1196
- if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_stt' }); return; }
1197
- if (!text) { log('empty transcript', userId, wavPath); metricsTurn?.finish({ status: 'empty_transcript' }); return; }
1198
- log(`user ${userId} said: ${text}`);
1199
- await sendText(formatSttResultMessage(settings.voiceLanguage, userId, text));
1200
- if (!acceptsWake(text)) { await sendText(formatWakeRejectedMessage(settings.voiceLanguage)); metricsTurn?.finish({ status: 'wake_rejected' }); return; }
1201
-
1202
- const prompt = stripWake(text);
1203
- if (await handleLanguageCommand(prompt, signal)) {
1204
- metricsTurn?.finish({ status: 'language_command' });
1205
- return;
1206
- }
1207
- if (await handleTtsVoiceCommand(prompt, signal)) {
1208
- metricsTurn?.finish({ status: 'voice_command' });
1209
- return;
1210
- }
1211
- if (await handleVoiceCloneCommand(userId, prompt, signal)) {
1212
- metricsTurn?.finish({ status: 'voice_clone_command' });
1213
- return;
1214
- }
1215
- const sensitivityRequest = sensitivityModeFromTranscript(prompt);
1216
- if (sensitivityRequest) {
1217
- const thresholds = setSensitivityMode(sensitivityRequest.mode, sensitivityRequest.reason);
1218
- await sendText(`🎚️ ${sensitivityStatusText()}`);
1219
- if (isSensitivityOnlyRequest(prompt)) {
1220
- await speakText(sensitivityChangedSpeech(thresholds.mode, settings.voiceLanguage), signal, metricsTurn);
1221
- metricsTurn?.finish({ status: 'sensitivity_only' });
1222
- return;
1223
- }
1224
- }
1225
- const verboseRequest = verboseModeFromTranscript(prompt);
1226
- if (verboseRequest !== null) {
1227
- setVerboseProgress(verboseRequest, 'voice-command');
1228
- await sendText(`🔎 ${verboseStatusText()}`);
1229
- if (isVerboseOnlyRequest(prompt)) {
1230
- await speakText(verboseChangedSpeech(verboseRequest, settings.voiceLanguage), signal, metricsTurn);
1231
- metricsTurn?.finish({ status: 'verbose_only' });
1232
- return;
1233
- }
1234
- }
1235
- const selectedAgentAdapter = adapterForProjectSession(session);
1236
- const projectContext = projectSessionContextText(session);
1237
- const plan = {
1238
- task: true,
1239
- label: selectedAgentAdapter.label,
1240
- verboseProgress,
1241
- language: settings.voiceLanguage,
1242
- cwd: session?.workdir,
1243
- projectContext,
1244
- };
1245
- log('Agent plan', plan.label, 'backend', selectedAgentAdapter.backend, 'task', plan.task, 'language', plan.language, session ? `project=${session.slug}` : 'project=default');
1246
- const agentStart = Date.now();
1247
- const progressController = new AbortController();
1248
- activeProgressAbortController = progressController;
1249
- activeProgressSignal = progressController.signal;
1250
- activeProgressLastEventAt = Date.now();
1251
- const agentPromise = selectedAgentAdapter.ask(prompt, signal, plan);
1252
- let done = false;
1253
- // Status announcements share one queue with verbose progress so they never
1254
- // talk over each other. In verbose mode, skip the generic initial prompt;
1255
- // the detailed tool/file/test events are the initial progress voice.
1256
- const progressLoop = (async () => {
1257
- if (!verboseProgress) {
1258
- await sleep(2500);
1259
- if (!done && !signal.aborted && !interruptedTurns.has(turnId)) {
1260
- const initial = /^en/i.test(String(settings.voiceLanguage || ''))
1261
- ? 'calling the agent.'
1262
- : '에이전트 호출했어. 응답 기다리는 중.';
1263
- queueProgressSpeechText(initial, progressController.signal, 'generic-initial');
1264
- }
1265
- }
1266
- let idleNotices = 0;
1267
- let nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1268
- let lastObservedProgressAt = activeProgressLastEventAt;
1269
- while (!done && !signal.aborted && !interruptedTurns.has(turnId) && idleNotices < PROGRESS_IDLE_NOTICE_LIMIT) {
1270
- await sleep(Math.min(PROGRESS_IDLE_CHECK_MS, nextIdleNoticeMs));
1271
- if (done || signal.aborted || interruptedTurns.has(turnId)) break;
1272
- if (activeProgressLastEventAt !== lastObservedProgressAt) {
1273
- lastObservedProgressAt = activeProgressLastEventAt;
1274
- nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1275
- continue;
1276
- }
1277
- const idleMs = Date.now() - activeProgressLastEventAt;
1278
- if (idleMs < nextIdleNoticeMs) continue;
1279
- idleNotices += 1;
1280
- activeProgressLastEventAt = Date.now();
1281
- lastObservedProgressAt = activeProgressLastEventAt;
1282
- const idle = /^en/i.test(String(settings.voiceLanguage || ''))
1283
- ? 'still working on that.'
1284
- : '아직 작업 중이야.';
1285
- queueProgressSpeechText(idle, progressController.signal, `idle-${idleNotices}-${Math.round(nextIdleNoticeMs / 1000)}s`);
1286
- nextIdleNoticeMs = Math.min(
1287
- PROGRESS_IDLE_NOTICE_MAX_MS,
1288
- Math.max(nextIdleNoticeMs + 1000, Math.round(nextIdleNoticeMs * PROGRESS_IDLE_NOTICE_MULTIPLIER)),
1289
- );
1290
- }
1291
- })().catch(e => {
1292
- if (!isAbortError(e)) warn('progress loop failed', e?.stack || e);
1293
- });
1294
- const answer = await agentPromise.finally(() => { done = true; });
1295
- metricsTurn?.stage('agent', Date.now() - agentStart, { answerChars: String(answer || '').length, backend: selectedAgentAdapter.backend });
1296
- void progressLoop;
1297
- if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_agent' }); return; }
1298
-
1299
- log('Agent answer', selectedAgentAdapter.label, answer.slice(0, 200));
1300
- const spokenAnswer = spokenResultOnly(prompt, answer, settings.voiceLanguage);
1301
- const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer || emptyAgentAnswer(settings.voiceLanguage)}`;
1302
- log('send agent answer text', 'chars', fullAnswerText.length);
1303
- const answerTextDelivered = await sendText(fullAnswerText);
1304
- if (!answerTextDelivered) {
1305
- warn('agent answer text delivery failed; still speaking answer');
1306
- }
1307
- log('spoken answer', spokenAnswer.slice(0, 200));
1308
- stopProgressSpeech(progressController.signal, 'agent-answer-ready');
1309
- await speakText(spokenAnswer, signal, metricsTurn, { mirrorText: !answerTextDelivered });
1310
- metricsTurn?.finish({ status: 'ok' });
1311
- } catch (e) {
1312
- if (isAbortError(e) || interruptedTurns.has(turnId)) {
1313
- log('turn aborted', userId, 'turn', turnId);
1314
- metricsTurn?.finish({ status: 'aborted' });
1315
- return;
1316
- }
1317
- warn('handleRecording failed', e?.stack || e);
1318
- const shortMsg = String(e?.message || e).slice(0, 800);
1319
- metricsTurn?.finish({ status: 'error', error: shortMsg });
1320
- await sendText(formatVoiceErrorMessage(settings.voiceLanguage, shortMsg));
1321
- } finally {
1322
- if (activeProgressAbortController && !activeProgressAbortController.signal.aborted) {
1323
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort progress speech in cleanup failed', e?.stack || e); }
1324
- }
1325
- if (activeProgressSignal === activeProgressAbortController?.signal) activeProgressSignal = null;
1326
- activeProgressAbortController = null;
1327
- if (currentAbortController === controller) currentAbortController = null;
1328
- activeTranscriptChannelId = previousTranscriptChannelId;
1329
- interruptedTurns.delete(turnId);
1330
- if (activeTurnId === turnId) activeTurnId = 0;
1331
- processing = false;
1332
- if (bridgeState.deferredSize() > 0) {
1333
- setImmediate(() => drainDeferredProcessingUtterances().catch(e => warn('drain deferred utterance failed', e?.stack || e)));
1334
- }
1335
- }
1336
- }
1337
-
1338
- function subscribeUser(receiver, userId) {
1339
- if (!isAllowed(userId)) return;
1340
- if (String(userId) === client.user?.id) return;
1341
- const wasSpeaking = speaking;
1342
- const wasProcessing = processing;
1343
- if ((wasSpeaking || wasProcessing) && !activeStreams.has(userId)) {
1344
- // Speaking-start alone is too noisy in Discord voice. Record and validate a
1345
- // real segment first; only confirmed playback barge-in stops the current
1346
- // audio chunk, and only explicit stop transcripts abort active agent work.
1347
- log('possible barge-in start; waiting for segment validation', userId, 'speaking', wasSpeaking, 'processing', wasProcessing);
1348
- }
1349
- if (activeStreams.has(userId)) return;
1350
- const pending = bridgeState.getPending(userId);
1351
- if (pending?.timer) {
1352
- bridgeState.clearPendingTimer(userId);
1353
- log('extend pending utterance because new segment started', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes);
1354
- }
1355
-
1356
- const file = path.join(settings.debugDir, `segment-${stamp()}-${userId}.wav`);
1357
- log('subscribe user', userId, file);
1358
- const opusStream = receiver.subscribe(userId, { end: { behavior: EndBehaviorType.AfterSilence, duration: SUBSCRIBE_AFTER_SILENCE_MS } });
1359
- const decoder = new prism.opus.Decoder({ rate: 48000, channels: 2, frameSize: 960 });
1360
- const writer = new wav.FileWriter(file, { sampleRate: 48000, channels: 2, bitDepth: 16 });
1361
- activeStreams.set(userId, { opusStream, decoder, writer, file, startedAtMs: Date.now() });
1362
- let pcmBytes = 0;
1363
- const liveThresholds = wasSpeaking && !wasProcessing ? currentPlaybackBargeInThresholds() : currentBargeInThresholds();
1364
- const liveBargeIn = shouldUseLivePlaybackBargeIn({ speaking: wasSpeaking, processing: wasProcessing }) ? createLiveBargeInMonitor({
1365
- minBytes: liveThresholds.minBytes,
1366
- minMeanDb: liveThresholds.minMeanDb,
1367
- minMaxDb: liveThresholds.minMaxDb,
1368
- requireBoth: liveThresholds.requireBoth,
1369
- log,
1370
- onConfirm: ({ pcmBytes: confirmedBytes, levels }) => {
1371
- log('confirmed live playback barge-in before segment end', userId, 'pcmBytes', confirmedBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
1372
- stopPlaybackForBargeIn(userId, 'confirmed-live-playback-barge-in');
1373
- },
1374
- }) : null;
1375
- decoder.on('data', chunk => {
1376
- pcmBytes += chunk.length;
1377
- liveBargeIn?.push(chunk);
1378
- });
1379
- opusStream.on('error', e => warn('opus stream error', userId, e?.stack || e));
1380
- decoder.on('error', e => warn('opus decoder error', userId, e?.stack || e));
1381
- writer.on('error', e => warn('wav writer error', userId, e?.stack || e));
1382
- opusStream.on('end', () => log('opus end', userId, 'pcmBytes', pcmBytes));
1383
- writer.on('finish', () => {
1384
- const streamState = activeStreams.get(userId);
1385
- activeStreams.delete(userId);
1386
- const endedAtMs = Date.now();
1387
- log('saved segment', userId, 'pcmBytes', pcmBytes, file);
1388
- queueSegment(userId, file, pcmBytes, streamState?.startedAtMs || endedAtMs, endedAtMs);
1389
- });
1390
- opusStream.pipe(decoder).pipe(writer);
1391
- }
1392
-
1393
- async function connectTo(channel) {
1394
- if (connection) {
1395
- try { connection.destroy(); } catch {}
1396
- }
1397
- activeVoiceChannelId = channel.id;
1398
- connection = joinVoiceChannel({
1399
- channelId: channel.id,
1400
- guildId: channel.guild.id,
1401
- adapterCreator: channel.guild.voiceAdapterCreator,
1402
- selfDeaf: false,
1403
- selfMute: false,
1404
- });
1405
- const voiceConnection = connection;
1406
- voiceConnection.subscribe(player);
1407
- voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
1408
- voiceConnection.on('stateChange', async (oldState, newState) => {
1409
- log('voice connection state', oldState.status, '->', newState.status);
1410
- if (connection !== voiceConnection) {
1411
- log('ignore stale voice connection state', oldState.status, '->', newState.status);
1412
- return;
1413
- }
1414
- if (newState.status === VoiceConnectionStatus.Disconnected) {
1415
- try {
1416
- await Promise.race([
1417
- entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
1418
- entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
1419
- ]);
1420
- } catch (e) {
1421
- if (connection !== voiceConnection) return;
1422
- warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
1423
- try { voiceConnection.destroy(); } catch {}
1424
- connection = null;
1425
- setTimeout(() => connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err)), 1500);
1426
- }
1427
- }
1428
- });
1429
- await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
1430
- voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
1431
- log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
1432
- }
1433
-
1434
- async function autoJoin() {
1435
- const attempted = [];
1436
- for (const preferredName of settings.autoJoinVoiceChannels) {
1437
- for (const guild of client.guilds.cache.values()) {
1438
- const channels = await guild.channels.fetch();
1439
- for (const ch of channels.values()) {
1440
- if (!ch?.isVoiceBased?.() || ch.name.toLowerCase() !== preferredName) continue;
1441
- attempted.push(`${guild.name}/${ch.name}`);
1442
- try {
1443
- await connectTo(ch);
1444
- return;
1445
- } catch (e) {
1446
- warn('auto-join failed; trying next configured voice channel', guild.name, ch.name, e?.stack || e);
1447
- try { connection?.destroy(); } catch {}
1448
- connection = null;
1449
- activeVoiceChannelId = '';
1450
- }
1451
- }
1452
- }
1453
- }
1454
- warn('No auto-join channel found or reachable', settings.autoJoinVoiceChannels, 'attempted', attempted);
1455
- }
1456
-
1457
1095
  function consumeRestartNotice() {
1458
1096
  const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
1459
1097
  try {
@@ -1475,76 +1113,8 @@ async function announceRestartComplete() {
1475
1113
  await speakText(speech, undefined, null, { mirrorText: false });
1476
1114
  }
1477
1115
 
1478
- async function findVoiceChannelBySelector(guild, selector) {
1479
- const wanted = String(selector || '').trim();
1480
- if (!wanted || !guild) return null;
1481
- const id = wanted.replace(/^<#(\d+)>$/, '$1');
1482
- const channels = await guild.channels.fetch();
1483
- const voiceChannels = [...channels.values()].filter(ch => ch?.isVoiceBased?.());
1484
- const byId = voiceChannels.find(ch => ch.id === id);
1485
- if (byId) return byId;
1486
- const matches = voiceChannels.filter(ch => String(ch.name || '').toLowerCase() === wanted.toLowerCase());
1487
- if (matches.length === 1) return matches[0];
1488
- if (matches.length > 1) throw new Error(`같은 이름의 음성 채널이 여러 개야. 채널 ID나 멘션으로 지정해줘: ${wanted}`);
1489
- throw new Error(`음성 채널을 찾지 못했어: ${wanted}`);
1490
- }
1491
-
1492
- async function voiceChannelLabel(guild, channelId) {
1493
- if (!channelId || !guild) return '없음';
1494
- try {
1495
- const ch = await guild.channels.fetch(channelId);
1496
- return ch?.name || '지정됨';
1497
- } catch {
1498
- return '지정됨';
1499
- }
1500
- }
1501
-
1502
- async function resolveVoiceChannelForAttach(msg, selector = '') {
1503
- if (selector) return findVoiceChannelBySelector(msg.guild, selector);
1504
- if (msg.member?.voice?.channel) return msg.member.voice.channel;
1505
- if (activeVoiceChannelId && msg.guild) {
1506
- try {
1507
- const ch = await msg.guild.channels.fetch(activeVoiceChannelId);
1508
- if (ch?.isVoiceBased?.()) return ch;
1509
- } catch {}
1510
- }
1511
- throw new Error('붙일 음성 채널을 못 찾았어. 음성채널에 들어가서 `!session attach-voice`를 치거나 `--voice "채널명"`을 붙여줘.');
1512
- }
1513
-
1514
- async function attachVoiceChannelToTextSession(msg, command) {
1515
- const voiceChannel = await resolveVoiceChannelForAttach(msg, command.voice);
1516
- let session = null;
1517
- if (command.name) {
1518
- session = bindProjectSessionToChannel({ state: projectSessionsState, nameOrSlug: command.name, channelId: msg.channelId });
1519
- } else {
1520
- session = resolveProjectSessionForChannel(msg.channelId)
1521
- || resolveProjectSessionForChannel(voiceChannel.id);
1522
- if (!session) {
1523
- const fallbackName = String(msg.channel?.name || `channel-${msg.channelId}`).trim() || `channel-${msg.channelId}`;
1524
- session = createProjectSession({
1525
- root: ROOT,
1526
- state: projectSessionsState,
1527
- name: fallbackName,
1528
- workdir: settings.agent.cwd || ROOT,
1529
- channelId: msg.channelId,
1530
- voiceChannelId: voiceChannel.id,
1531
- transcriptChannelId: msg.channelId,
1532
- mcpContext: 'Ad-hoc Discord text channel session',
1533
- });
1534
- }
1535
- }
1536
- session.transcriptChannelId = msg.channelId;
1537
- session.voiceChannelId = voiceChannel.id;
1538
- projectSessionsState.channelSessions[msg.channelId] = session.slug;
1539
- projectSessionsState.channelSessions[voiceChannel.id] = session.slug;
1540
- saveProjectSessionsState();
1541
- agentAdaptersBySession.delete(session.slug);
1542
- if (activeVoiceChannelId !== voiceChannel.id) await connectTo(voiceChannel);
1543
- return msg.reply(`${session.name} 세션을 이 텍스트 채널과 음성 채널 ${voiceChannel.name}에 붙였어. 이제 그 음성채널 발화의 STT/답변 텍스트는 이 채널로 가.`);
1544
- }
1545
-
1546
1116
  async function handleProjectSessionCommand(msg, command) {
1547
- const activeSession = resolveProjectSessionForChannel(msg.channelId) || resolveProjectSessionForChannel(activeVoiceChannelId);
1117
+ const activeSession = resolveProjectSessionForChannel(msg.channelId) || resolveProjectSessionForChannel(bridge.activeVoiceChannelId);
1548
1118
  if (command.action === 'attach-voice') return void await attachVoiceChannelToTextSession(msg, command);
1549
1119
  if (command.action === 'status') {
1550
1120
  if (!activeSession) return void msg.reply(`${agentAdapter.label} 기본 세션: ${agentAdapter.readSessionId?.() || '아직 없음'}`);
@@ -1602,7 +1172,8 @@ async function handleProjectSessionCommand(msg, command) {
1602
1172
  mcpContext: command.mcpContext,
1603
1173
  });
1604
1174
  saveProjectSessionsState();
1605
- agentAdaptersBySession.delete(session.slug);
1175
+ bridge.agentAdaptersBySession.delete(session.slug);
1176
+ invalidateBackendAdaptersForSession(session.slug);
1606
1177
  return void msg.reply(`${session.name} 프로젝트 세션 만들었어. 작업실은 ${session.workdir}이고, 이 텍스트 채널${voiceChannel ? `과 음성 채널 ${voiceChannel.name}` : ''} 입력은 별도 Hermes 세션 파일로 이어져.`);
1607
1178
  }
1608
1179
  }
@@ -1613,107 +1184,22 @@ client.once('ready', async () => {
1613
1184
  await announceRestartComplete();
1614
1185
  });
1615
1186
 
1616
- client.on('messageCreate', async msg => {
1617
- if (msg.author.bot) return;
1618
- if (!isAllowed(msg.author.id)) return;
1619
- const content = msg.content.trim();
1620
- const projectSessionCommand = parseProjectSessionCommand(content);
1621
- if (projectSessionCommand) {
1622
- try {
1623
- await handleProjectSessionCommand(msg, projectSessionCommand);
1624
- } catch (e) {
1625
- warn('project session command failed', e?.stack || e);
1626
- await msg.reply(String(e?.message || e).slice(0, 700));
1627
- }
1628
- return;
1629
- }
1630
- if (content === '!ping') return void msg.reply('pong');
1631
- if (content === '!verbose') return void msg.reply(verboseStatusText());
1632
- if (['!verbose on', '!verbose true', '!verbose 1', '!verbose 켜', '!verbose 켜줘'].includes(content.toLowerCase())) {
1633
- setVerboseProgress(true, 'discord-command');
1634
- return void msg.reply(verboseStatusText());
1635
- }
1636
- if (['!verbose off', '!verbose false', '!verbose 0', '!verbose 꺼', '!verbose 꺼줘'].includes(content.toLowerCase())) {
1637
- setVerboseProgress(false, 'discord-command');
1638
- return void msg.reply(verboseStatusText());
1639
- }
1640
- if (content === '!sensitivity') return void msg.reply(sensitivityStatusText());
1641
- if (content === '!latency' || content === '!metrics') {
1642
- const summary = summarizeLatencyRecords(readJsonlRecords(settings.latencyLogPath, { limit: 200 }));
1643
- return void msg.reply(`최근 latency 요약 (${settings.latencyLogPath}):\n${formatLatencySummary(summary)}`.slice(0, 1900));
1644
- }
1645
- if (content === '!sensitivity conservative') {
1646
- setSensitivityMode('conservative', 'discord-command');
1647
- return void msg.reply(sensitivityStatusText());
1648
- }
1649
- if (content === '!sensitivity normal') {
1650
- setSensitivityMode('normal', 'discord-command');
1651
- return void msg.reply(sensitivityStatusText());
1652
- }
1653
- if (content === '!session') return void handleProjectSessionCommand(msg, { action: 'status' });
1654
- if (content === '!reset-session') return void handleProjectSessionCommand(msg, { action: 'reset' });
1655
- if (content === '!join') {
1656
- const ch = msg.member?.voice?.channel;
1657
- if (!ch) return void msg.reply('먼저 음성 채널에 들어가줘.');
1658
- await connectTo(ch);
1659
- return void msg.reply('들어왔어. Node receiver로 듣는 중.');
1660
- }
1661
- if (content === '!leave') {
1662
- try { connection?.destroy(); } catch {}
1663
- connection = null;
1664
- activeVoiceChannelId = '';
1665
- return void msg.reply('나갈게.');
1666
- }
1667
- if (content.startsWith('!say ')) {
1668
- const text = content.slice(5).trim();
1669
- const mp3 = await synthTTS(text);
1670
- await playAudio(mp3);
1671
- return;
1672
- }
1673
- if (content.startsWith('!voice-test ')) {
1674
- const text = content.slice('!voice-test '.length).trim();
1675
- if (!text) return void msg.reply('테스트할 문장을 붙여줘.');
1676
- const started = Date.now();
1677
- try {
1678
- await msg.reply(`TTS 백엔드 ${ttsBackend.name}로 음성 테스트할게.`);
1679
- await speakText(text);
1680
- await msg.channel.send(`음성 테스트 완료: ${ttsBackend.name}, ${Date.now() - started}ms`);
1681
- } catch (e) {
1682
- warn('voice-test failed', e?.stack || e);
1683
- await msg.channel.send(`음성 테스트 실패: ${String(e?.message || e).slice(0, 700)}`);
1684
- }
1685
- return;
1686
- }
1687
- if (content === '!voice-clone' || content === '!voice-clone status') {
1688
- const current = voiceCloneCapture.current();
1689
- if (current?.userId === String(msg.author.id)) {
1690
- return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`);
1691
- }
1692
- return void msg.reply('대기 중인 보이스 클로닝 샘플 캡처가 없어. `!voice-clone capture`로 시작해.');
1693
- }
1694
- if (content === '!voice-clone cancel') {
1695
- const cancelled = voiceCloneCapture.cancel(msg.author.id);
1696
- return void msg.reply(cancelled ? '보이스 클로닝 샘플 캡처를 취소했어.' : '대기 중인 캡처가 없어.');
1697
- }
1698
- if (content === '!voice-clone capture') {
1699
- const armed = voiceCloneCapture.arm({ userId: msg.author.id, source: 'discord-command' });
1700
- return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, armed.targetPath)}에 저장할게. 음성 채널에서 10~30초 정도 말해줘.`);
1701
- }
1702
- if (content.startsWith('!ask ')) {
1703
- const text = content.slice(5).trim();
1704
- if (!text) return void msg.reply('물어볼 내용을 붙여줘.');
1705
- await handleTextAgentMessage(msg, text, { speakResponse: true });
1706
- return;
1707
- }
1708
- if (shouldRouteDiscordTextToAgent({
1709
- content,
1710
- channelId: msg.channelId,
1711
- transcriptChannelId: settings.transcriptChannelId,
1712
- }) || resolveProjectSessionForChannel(msg.channelId)) {
1713
- await handleTextAgentMessage(msg, content, { speakResponse: false });
1714
- return;
1715
- }
1187
+ const discordCommandRouter = createDiscordCommandRouter({
1188
+ bridge, settings, warn, path, ROOT,
1189
+ isAllowed,
1190
+ handleProjectSessionCommand,
1191
+ handleTextAgentMessage,
1192
+ resolveProjectSessionForChannel,
1193
+ verboseStatusText, setVerboseProgress,
1194
+ notifyStatusText,
1195
+ smartProgressStatusText,
1196
+ sensitivityStatusText, setSensitivityMode,
1197
+ summarizeLatencyRecords, readJsonlRecords, formatLatencySummary,
1198
+ connectTo,
1199
+ synthTTS, playAudio, speakText,
1200
+ voiceCloneCapture,
1716
1201
  });
1202
+ client.on('messageCreate', msg => discordCommandRouter.handleDiscordMessage(msg).catch(e => warn('discord command router failed', e?.stack || e)));
1717
1203
 
1718
1204
  process.stdout?.on?.('error', error => {
1719
1205
  if (isBenignTransientNetworkError(error)) {
@@ -1743,37 +1229,6 @@ process.on('uncaughtException', error => {
1743
1229
  client.on('error', e => warn('discord client error', e?.stack || e));
1744
1230
  client.on('shardError', e => warn('discord shard error', e?.stack || e));
1745
1231
 
1746
- let shutdownStarted = false;
1747
- async function gracefulShutdown(signalName) {
1748
- if (shutdownStarted) return;
1749
- shutdownStarted = true;
1750
- log('graceful shutdown requested', signalName, 'connection', Boolean(connection));
1751
- try {
1752
- if (currentAbortController && !currentAbortController.signal.aborted) currentAbortController.abort();
1753
- } catch (e) {
1754
- warn('abort before shutdown failed', e?.stack || e);
1755
- }
1756
- try {
1757
- if (connection) {
1758
- let detail = '';
1759
- const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
1760
- try {
1761
- if (fs.existsSync(noticePath)) {
1762
- detail = fs.readFileSync(noticePath, 'utf8').replace(/\s+/g, ' ').trim().slice(0, 120);
1763
- }
1764
- } catch (e) {
1765
- warn('read restart notice failed', e?.stack || e);
1766
- }
1767
- await speakText(formatRestartShutdownNotice(detail, settings.tts.edge.voice));
1768
- await waitEvent(player, AudioPlayerStatus.Idle, 30000).catch(() => {});
1769
- }
1770
- } catch (e) {
1771
- warn('shutdown voice notice failed', e?.stack || e);
1772
- }
1773
- try { connection?.destroy(); } catch {}
1774
- try { client.destroy(); } catch {}
1775
- process.exit(0);
1776
- }
1777
1232
  process.on('SIGTERM', () => { void gracefulShutdown('SIGTERM'); });
1778
1233
  process.on('SIGINT', () => { void gracefulShutdown('SIGINT'); });
1779
1234