verbalcoding 0.2.12 → 0.2.13

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (169)
  1. package/.env.example +74 -4
  2. package/README.es.md +3 -1
  3. package/README.fr.md +3 -1
  4. package/README.ja.md +3 -1
  5. package/README.ko.md +4 -2
  6. package/README.md +4 -2
  7. package/README.ru.md +3 -1
  8. package/README.zh.md +3 -1
  9. package/app-node/agent_adapters.test.mjs +14 -0
  10. package/app-node/agent_routing.mjs +148 -0
  11. package/app-node/agent_routing.test.mjs +138 -0
  12. package/app-node/agent_turn.mjs +86 -0
  13. package/app-node/agent_turn.test.mjs +109 -0
  14. package/app-node/bridge_context.mjs +73 -0
  15. package/app-node/bridge_context.test.mjs +54 -0
  16. package/app-node/bridge_state.mjs +4 -0
  17. package/app-node/bridge_wireup.test.mjs +462 -0
  18. package/app-node/cli_install.test.mjs +31 -0
  19. package/app-node/cross_agent_routing.test.mjs +78 -0
  20. package/app-node/discord_command_router.mjs +204 -0
  21. package/app-node/discord_command_router.test.mjs +311 -0
  22. package/app-node/discord_voice_setup.mjs +251 -0
  23. package/app-node/discord_voice_setup.test.mjs +86 -0
  24. package/app-node/hermes_profiles.test.mjs +12 -1
  25. package/app-node/install_config.mjs +110 -3
  26. package/app-node/install_config.test.mjs +8 -0
  27. package/app-node/instance_doctor.test.mjs +9 -0
  28. package/app-node/instances.test.mjs +8 -1
  29. package/app-node/main.mjs +488 -1368
  30. package/app-node/mcp_tools.test.mjs +7 -0
  31. package/app-node/notification_handler.mjs +89 -0
  32. package/app-node/notification_handler.test.mjs +187 -0
  33. package/app-node/plan_dispatcher.mjs +215 -0
  34. package/app-node/plan_dispatcher.test.mjs +101 -0
  35. package/app-node/plan_mode.mjs +36 -7
  36. package/app-node/plan_mode.test.mjs +78 -0
  37. package/app-node/progress_handler.mjs +220 -0
  38. package/app-node/progress_handler.test.mjs +193 -0
  39. package/app-node/progress_speech.mjs +54 -32
  40. package/app-node/progress_speech.test.mjs +12 -3
  41. package/app-node/project_sessions.mjs +5 -2
  42. package/app-node/project_sessions.test.mjs +7 -0
  43. package/app-node/research_mode.mjs +282 -0
  44. package/app-node/research_mode.test.mjs +264 -0
  45. package/app-node/restart_notice.mjs +3 -0
  46. package/app-node/restart_notice.test.mjs +11 -0
  47. package/app-node/session_ontology.mjs +271 -0
  48. package/app-node/session_ontology.test.mjs +130 -0
  49. package/app-node/smart_progress.mjs +1 -1
  50. package/app-node/stream_sentencer.mjs +32 -2
  51. package/app-node/stream_sentencer.test.mjs +65 -0
  52. package/app-node/streaming_tts_queue.mjs +5 -1
  53. package/app-node/streaming_tts_queue.test.mjs +7 -1
  54. package/app-node/stt_whisper.mjs +24 -0
  55. package/app-node/stt_whisper.test.mjs +32 -0
  56. package/app-node/text_routing.mjs +4 -2
  57. package/app-node/tts_backends.mjs +537 -3
  58. package/app-node/tts_backends.test.mjs +454 -0
  59. package/app-node/tts_player.mjs +164 -0
  60. package/app-node/tts_player.test.mjs +202 -0
  61. package/app-node/tts_runtime.mjs +134 -0
  62. package/app-node/tts_runtime.test.mjs +89 -0
  63. package/app-node/tts_settings.mjs +150 -3
  64. package/app-node/tts_settings.test.mjs +204 -0
  65. package/app-node/tts_voice_config.mjs +136 -2
  66. package/app-node/tts_voice_config.test.mjs +94 -0
  67. package/app-node/utterance_router.mjs +216 -0
  68. package/app-node/utterance_router.test.mjs +236 -0
  69. package/app-node/voice_autojoin.mjs +37 -0
  70. package/app-node/voice_autojoin.test.mjs +59 -0
  71. package/app-node/voice_io.mjs +272 -0
  72. package/app-node/voice_io.test.mjs +102 -0
  73. package/app-node/voice_turn_runner.mjs +449 -0
  74. package/app-node/voice_turn_runner.test.mjs +289 -0
  75. package/docs/CONFIGURATION.md +12 -2
  76. package/docs/HARNESSES.md +58 -0
  77. package/docs/HARNESS_AIDER.md +50 -0
  78. package/docs/HARNESS_CLAUDE.md +56 -0
  79. package/docs/HARNESS_CODEX.md +56 -0
  80. package/docs/HARNESS_CURSOR.md +45 -0
  81. package/docs/HARNESS_GEMINI.md +45 -0
  82. package/docs/HARNESS_HERMES.md +57 -0
  83. package/docs/HARNESS_OPENCLAW.md +44 -0
  84. package/docs/HARNESS_OPENCODE.md +44 -0
  85. package/docs/README.md +1 -0
  86. package/docs/ROADMAP.md +20 -5
  87. package/docs/TTS_BACKENDS.md +227 -0
  88. package/docs/USAGE.md +22 -0
  89. package/docs/i18n/AGENTS.es.md +34 -0
  90. package/docs/i18n/AGENTS.fr.md +34 -0
  91. package/docs/i18n/AGENTS.ja.md +34 -0
  92. package/docs/i18n/AGENTS.ko.md +34 -0
  93. package/docs/i18n/AGENTS.ru.md +34 -0
  94. package/docs/i18n/AGENTS.zh.md +34 -0
  95. package/docs/i18n/HARNESSES.es.md +58 -0
  96. package/docs/i18n/HARNESSES.fr.md +58 -0
  97. package/docs/i18n/HARNESSES.ja.md +58 -0
  98. package/docs/i18n/HARNESSES.ko.md +58 -0
  99. package/docs/i18n/HARNESSES.ru.md +58 -0
  100. package/docs/i18n/HARNESSES.zh.md +58 -0
  101. package/docs/i18n/HARNESS_AIDER.es.md +48 -0
  102. package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
  103. package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
  104. package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
  105. package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
  106. package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
  107. package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
  108. package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
  109. package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
  110. package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
  111. package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
  112. package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
  113. package/docs/i18n/HARNESS_CODEX.es.md +55 -0
  114. package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
  115. package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
  116. package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
  117. package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
  118. package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
  119. package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
  120. package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
  121. package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
  122. package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
  123. package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
  124. package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
  125. package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
  126. package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
  127. package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
  128. package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
  129. package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
  130. package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
  131. package/docs/i18n/HARNESS_HERMES.es.md +54 -0
  132. package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
  133. package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
  134. package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
  135. package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
  136. package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
  137. package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
  138. package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
  139. package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
  140. package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
  141. package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
  142. package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
  143. package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
  144. package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
  145. package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
  146. package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
  147. package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
  148. package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
  149. package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
  150. package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
  151. package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
  152. package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
  153. package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
  154. package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
  155. package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
  156. package/integrations/fireredtts2/mlx_llm.py +183 -0
  157. package/integrations/fireredtts2/synth.py +156 -0
  158. package/integrations/fireredtts2/synth_mlx.py +196 -0
  159. package/integrations/mlxaudio/synth.py +74 -0
  160. package/integrations/neuttsair/synth.py +104 -0
  161. package/integrations/omnivoice/synth.py +110 -0
  162. package/package.json +6 -1
  163. package/scripts/cli.mjs +84 -0
  164. package/scripts/doctor.mjs +104 -4
  165. package/scripts/install.mjs +5 -1
  166. package/scripts/install_fireredtts2.sh +109 -0
  167. package/scripts/install_mlxaudio.sh +34 -0
  168. package/scripts/install_mossttsnano.sh +46 -0
  169. package/scripts/postinstall.mjs +34 -0
package/app-node/main.mjs CHANGED
@@ -6,19 +6,8 @@ import { spawn, execFile } from 'node:child_process';
6
6
  import { promisify } from 'node:util';
7
7
 
8
8
  import { Client, GatewayIntentBits, Partials } from 'discord.js';
9
- import {
10
- AudioPlayerStatus,
11
- EndBehaviorType,
12
- StreamType,
13
- VoiceConnectionStatus,
14
- createAudioPlayer,
15
- createAudioResource,
16
- entersState,
17
- joinVoiceChannel,
18
- } from '@discordjs/voice';
19
- import prism from 'prism-media';
20
- import wav from 'wav';
21
- import { buildAgentSettings, createAgentAdapter, isPatchLikeOutput } from './agent_adapters.mjs';
9
+ import { createAudioPlayer } from '@discordjs/voice';
10
+ import { buildAgentSettings, createAgentAdapter, isPatchLikeOutput, shellSplit } from './agent_adapters.mjs';
22
11
  import {
23
12
  appendJsonl,
24
13
  createLatencyTurn,
@@ -26,11 +15,6 @@ import {
26
15
  readJsonlRecords,
27
16
  summarizeLatencyRecords,
28
17
  } from './latency_metrics.mjs';
29
- import { splitForTTS } from './tts_chunks.mjs';
30
- import { playChunkedTTSWithPrefetch } from './tts_prefetch.mjs';
31
- import { createSentencer } from './stream_sentencer.mjs';
32
- import { createStreamingTTSQueue } from './streaming_tts_queue.mjs';
33
- import { createSmartProgressSummarizer } from './smart_progress.mjs';
34
18
  import {
35
19
  isPlanEntryUtterance,
36
20
  parsePlanOutput,
@@ -43,8 +27,16 @@ import {
43
27
  renderDecisionPrompt,
44
28
  renderResolvedDecisions,
45
29
  } from './plan_mode.mjs';
46
- import { createNotifier, buildDiscordDeepLink } from './notify.mjs';
47
- import { progressCategory, summarizeProgressEvents, formatProgressMessage } from './progress_speech.mjs';
30
+ import {
31
+ parseAgentRoutingCommand,
32
+ renderAgentPrefix,
33
+ buildCrossAgentPrompt,
34
+ isAgentRoutingDecision,
35
+ buildFallbackDecision,
36
+ isRoutingOnlyUtterance,
37
+ } from './agent_routing.mjs';
38
+ import { createSessionOntology } from './session_ontology.mjs';
39
+ import { parseResearchCommand, runResearchTurn } from './research_mode.mjs';
48
40
  import { buildTtsSettings } from './tts_settings.mjs';
49
41
  import { createTtsBackend } from './tts_backends.mjs';
50
42
  import {
@@ -59,21 +51,30 @@ import {
59
51
  } from './tts_voice_config.mjs';
60
52
  import { createBridgeLogger, createTransientErrorReporter, isTransientNetworkError } from './bridge_logger.mjs';
61
53
  import { createBridgeState } from './bridge_state.mjs';
54
+ import { createBridge } from './bridge_context.mjs';
55
+ import { createVoiceIO } from './voice_io.mjs';
56
+ import { createTtsPlayer } from './tts_player.mjs';
57
+ import { createUtteranceRouter } from './utterance_router.mjs';
58
+ import { createProgressHandler } from './progress_handler.mjs';
59
+ import { createNotificationHandler } from './notification_handler.mjs';
60
+ import { createTtsRuntime } from './tts_runtime.mjs';
61
+ import { createDiscordVoiceSetup } from './discord_voice_setup.mjs';
62
+ import { createAgentTurnLifecycle } from './agent_turn.mjs';
63
+ import { createDiscordCommandRouter } from './discord_command_router.mjs';
64
+ import { createVoiceTurnRunner } from './voice_turn_runner.mjs';
65
+ import { createPlanDispatcher } from './plan_dispatcher.mjs';
62
66
  import { sendDiscordText, splitDiscordMessage } from './discord_text.mjs';
63
- import { progressTtsCacheFileName } from './progress_cache.mjs';
64
67
  import { shouldPassWhisperLanguage, voiceLanguageCommandFromTranscript, languagePreset } from './language_config.mjs';
65
- import { formatRestartCompleteNotice, formatRestartShutdownNotice } from './restart_notice.mjs';
68
+ import { whisperFailureMessage, whisperTimeoutMs } from './stt_whisper.mjs';
69
+ import { formatRestartCompleteNotice } from './restart_notice.mjs';
66
70
  import {
67
- appendRecentDiscordText,
68
71
  formatRecentDiscordContext,
69
- shouldRouteDiscordTextToAgent,
70
72
  } from './text_routing.mjs';
71
73
  import {
72
74
  bindProjectSessionToChannel,
73
75
  createProjectSession,
74
76
  listProjectSessions,
75
77
  loadProjectSessions,
76
- parseProjectSessionCommand,
77
78
  projectSessionContextText,
78
79
  projectSessionForChannel,
79
80
  saveProjectSessions,
@@ -161,13 +162,20 @@ function ensureTtsVoiceConfig() {
161
162
  return readTtsVoiceConfig(TTS_VOICE_CONFIG_PATH);
162
163
  }
163
164
  function applyVoiceConfigToProcessEnv(config = ensureTtsVoiceConfig()) {
164
- const selection = effectiveTtsVoiceSelection(config, {});
165
+ const selection = effectiveTtsVoiceSelection(config, process.env);
165
166
  const configuredVoiceLanguage = process.env.VOICE_LANGUAGE;
166
167
  const nextEnv = applyTtsVoiceSelectionToEnv(process.env, selection);
167
168
  if (configuredVoiceLanguage) nextEnv.VOICE_LANGUAGE = configuredVoiceLanguage;
168
169
  for (const [key, value] of Object.entries(nextEnv)) process.env[key] = value;
169
170
  return { config, selection };
170
171
  }
172
+ function rebuildTtsRuntimeSettings(selection = null) {
173
+ settings.tts = buildTtsSettings(process.env, ROOT);
174
+ if (selection?.backend === 'edge' && selection.voice?.voice) settings.tts.edge.voice = selection.voice.voice;
175
+ try { bridge.ttsBackend?.close?.(); } catch (e) { warn('tts backend close failed', e?.message || e); }
176
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
177
+ return settings.tts;
178
+ }
171
179
  function reloadRuntimeLanguageFromEnv() {
172
180
  const previousWhisperLanguage = settings?.whisperLanguage;
173
181
  const previousVoiceLanguage = settings?.voiceLanguage;
@@ -190,6 +198,7 @@ const settings = {
190
198
  whisperBin: process.env.WHISPER_CPP_BIN || 'whisper-cli',
191
199
  whisperModel: process.env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
192
200
  whisperLanguage: process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
201
+ whisperTimeoutMs: whisperTimeoutMs(process.env),
193
202
  voiceLanguage: process.env.VOICE_LANGUAGE || process.env.WHISPER_CPP_LANGUAGE || process.env.STT_LANGUAGE || 'ko',
194
203
  tts: buildTtsSettings(process.env, ROOT),
195
204
  requireWakeWord: ['1', 'true', 'yes'].includes((process.env.REQUIRE_WAKE_WORD || '0').toLowerCase()),
@@ -207,21 +216,33 @@ const client = new Client({
207
216
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates, GatewayIntentBits.GuildMessages, GatewayIntentBits.MessageContent],
208
217
  partials: [Partials.Channel],
209
218
  });
210
- let ttsBackend = createTtsBackend(settings.tts, { execFileAsync, log, warn, voiceProvider: () => settings.tts.edge.voice });
219
+ const announcedTtsFallbacks = new Set();
220
+ const pendingFallbackNoticePromises = new Set();
221
+ function ttsFallbackNotice({ backend } = {}) {
222
+ if (!backend || backend === 'edge') return;
223
+ if (announcedTtsFallbacks.has(backend)) return;
224
+ announcedTtsFallbacks.add(backend);
225
+ const en = /^en/i.test(String(settings.voiceLanguage || ''));
226
+ const msg = en
227
+ ? `${backend} synthesis failed; using Edge for the rest of this session.`
228
+ : `${backend} 음성 생성에 실패해서 이번 세션은 Edge로 진행할게.`;
229
+ const textPromise = sendText(`⚠️ ${msg}`)
230
+ .catch(e => warn('tts fallback notice send failed', e?.message || e));
231
+ pendingFallbackNoticePromises.add(textPromise);
232
+ textPromise.finally(() => pendingFallbackNoticePromises.delete(textPromise));
233
+ const speakPromise = new Promise(resolve => queueMicrotask(() => {
234
+ speakText(msg, null, null, { mirrorText: false })
235
+ .catch(e => warn('tts fallback notice speak failed', e?.message || e))
236
+ .finally(resolve);
237
+ }));
238
+ pendingFallbackNoticePromises.add(speakPromise);
239
+ speakPromise.finally(() => pendingFallbackNoticePromises.delete(speakPromise));
240
+ }
241
+ const bridge = createBridge();
242
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
211
243
  const voiceCloneCapture = createVoiceCloneCaptureState({ defaultTargetPath: settings.tts.openvoice.refAudio });
212
244
 
213
- let connection = null;
214
- let activeVoiceChannelId = '';
215
- let activeTranscriptChannelId = '';
216
- const recentDiscordTextByChannel = new Map();
217
- let player = createAudioPlayer();
218
- let speaking = false;
219
- let processing = false;
220
- let activeTurnId = 0;
221
- let currentAbortController = null;
222
- const interruptedTurns = new Set();
223
- const activeStreams = new Map();
224
- let bridgeState = null;
245
+ bridge.player = createAudioPlayer();
225
246
  const MAX_DEFERRED_PROCESSING_UTTERANCES = Number(process.env.MAX_DEFERRED_PROCESSING_UTTERANCES || '0');
226
247
  const MIN_UTTERANCE_SECONDS = Number(process.env.MIN_UTTERANCE_SECONDS || '1.4');
227
248
  const MIN_UTTERANCE_BYTES = 48000 * 2 * 2 * MIN_UTTERANCE_SECONDS;
@@ -254,7 +275,7 @@ const bridgeLogger = createBridgeLogger({
254
275
  });
255
276
  function log(...args) { bridgeLogger.log(...args); }
256
277
  function warn(...args) { bridgeLogger.warn(...args); }
257
- bridgeState = createBridgeState({ log, cleanupFile: file => fs.rm(file, { force: true }, () => {}) });
278
+ bridge.bridgeState = createBridgeState({ log, cleanupFile: file => fs.rm(file, { force: true }, () => {}) });
258
279
  const reportTransientProcessError = createTransientErrorReporter({ warn });
259
280
  function isBenignTransientNetworkError(error) {
260
281
  return isTransientNetworkError(error);
@@ -273,241 +294,80 @@ function newLatencyTurn(userId, startedAtMs) {
273
294
  }
274
295
 
275
296
  function discardVoiceInputQueues(reason = 'config-change') {
276
- return bridgeState?.discardQueues(reason) || 0;
277
- }
278
- let verboseProgress = Boolean(settings.agent.verboseProgress);
279
- let activeProgressSignal = null;
280
- let verboseProgressSpeechQueue = Promise.resolve();
281
- let activeProgressAbortController = null;
282
- let speechPlaybackGeneration = 0;
283
- let progressSpeechBatch = [];
284
- let progressSpeechBatchTimer = null;
285
- let progressSpeechBatchSignal = null;
286
- let progressSpeechBatchStartedAt = 0;
287
-
288
- const STREAMING_TTS_ENABLED = ['1', 'true', 'yes', 'on'].includes(String(process.env.STREAMING_TTS || '0').toLowerCase());
289
- let activeSentencer = null;
290
- let activeStreamingQueue = null;
291
- let streamingSpeechDelivered = false;
292
-
293
- let notifyUserOptIn = false;
294
- let notifierInstance = null;
295
- function ensureNotifier() {
296
- if (notifierInstance) return notifierInstance;
297
- notifierInstance = createNotifier({
298
- provider: (process.env.NOTIFY_PROVIDER || 'ntfy').toLowerCase(),
299
- topic: process.env.NTFY_TOPIC || '',
300
- pushoverUser: process.env.PUSHOVER_USER || '',
301
- pushoverToken: process.env.PUSHOVER_TOKEN || '',
302
- });
303
- return notifierInstance;
304
- }
305
- function notifyStatusText() {
306
- const provider = (process.env.NOTIFY_PROVIDER || 'ntfy').toLowerCase();
307
- const hasTopic = provider === 'ntfy' ? Boolean(process.env.NTFY_TOPIC) : (provider === 'pushover' ? Boolean(process.env.PUSHOVER_USER && process.env.PUSHOVER_TOKEN) : true);
308
- const mode = notifyUserOptIn ? 'always' : 'empty-channel only';
309
- const config = hasTopic ? 'configured' : 'NOT configured';
310
- return `notify: ${mode} via ${provider} (${config}). Threshold: ${process.env.NOTIFY_MIN_TASK_MS || '60000'}ms.`;
311
- }
312
- async function getVoiceChannelHumanCount() {
313
- if (!activeVoiceChannelId) return 0;
314
- try {
315
- const ch = await client.channels.fetch(activeVoiceChannelId).catch(() => null);
316
- if (!ch || !ch.members) return 0;
317
- let count = 0;
318
- for (const [, m] of ch.members) if (!m.user?.bot) count += 1;
319
- return count;
320
- } catch (e) {
321
- warn('humanCount failed', e?.message || e);
322
- return 0;
323
- }
324
- }
325
- async function maybeNotifyTaskComplete({ answer, label, elapsedMs, guildId }) {
326
- const provider = (process.env.NOTIFY_PROVIDER || '').toLowerCase();
327
- if (!provider || provider === 'noop') return;
328
- const minTaskMs = Number(process.env.NOTIFY_MIN_TASK_MS || '60000');
329
- const humanCount = await getVoiceChannelHumanCount();
330
- const notifier = ensureNotifier();
331
- if (!notifier.shouldNotify({ humanCount, taskMs: elapsedMs, minTaskMs, userOptIn: notifyUserOptIn })) return;
332
- const text = String(answer || '').trim();
333
- const lastSentence = text.split(/(?<=[.!?。!?])\s+/).filter(Boolean).pop() || text;
334
- const body = lastSentence.slice(0, 200);
335
- const title = label ? `${label} finished` : 'VerbalCoding finished';
336
- const deepLink = buildDiscordDeepLink({ guildId, channelId: activeVoiceChannelId });
337
- try {
338
- const result = await notifier.send({ title, body, deepLink });
339
- log('notify sent', 'provider', provider, 'status', result?.status || result?.ok, 'skipped', result?.skipped || false);
340
- } catch (e) {
341
- warn('notify send failed', e?.message || e);
342
- }
343
- }
344
-
345
- const planStates = new Map(); // channelId -> { steps, language }
346
-
347
- function planChannelKey() {
348
- return activeVoiceChannelId || settings.transcriptChannelId || 'default';
349
- }
350
-
351
- async function askNextDecision(state, signal) {
352
- const decision = state.decisions[state.pendingDecisionIndex];
353
- if (!decision) return;
354
- const text = renderDecisionPrompt(decision, state.language);
355
- await sendText(`❓ ${text}`);
356
- await speakText(text, signal, null);
297
+ return bridge.bridgeState?.discardQueues(reason) || 0;
357
298
  }
299
+ bridge.verboseProgress = Boolean(settings.agent.verboseProgress);
358
300
 
359
- async function finalizePlanReady(state, signal) {
360
- const language = state.language;
361
- const resolvedLine = renderResolvedDecisions(state.resolvedDecisions, language);
362
- const plan = planNarrationLines(state.steps, language);
363
- const tail = /^en/i.test(String(language || ''))
364
- ? `${plan}\n${resolvedLine}\nSay "approve" to run, or edit with skip/insert.`
365
- : `${plan}\n${resolvedLine}\n"실행"이라고 하면 시작할게. skip/insert로 수정도 돼.`;
366
- await sendText(`📝 ${tail}`);
367
- await speakText(tail, signal, null);
368
- }
369
-
370
- async function dispatchPlanModeUtterance(prompt, signal) {
371
- const language = settings.voiceLanguage;
372
- const key = planChannelKey();
373
- const existing = planStates.get(key);
374
-
375
- if (existing && existing.pendingDecisionIndex < existing.decisions.length) {
376
- const decision = existing.decisions[existing.pendingDecisionIndex];
377
- const answer = parseDecisionAnswer(prompt, decision, language);
378
- if (answer.type === 'unknown') {
379
- await sendText(/^en/i.test(String(language || ''))
380
- ? '⚠️ I did not catch that. Please pick an option.'
381
- : '⚠️ 못 알아들었어. 옵션 중에 하나 골라줘.');
382
- await askNextDecision(existing, signal);
383
- return { handled: true };
384
- }
385
- const next = {
386
- ...existing,
387
- resolvedDecisions: { ...existing.resolvedDecisions, [decision.slot]: answer.choice },
388
- pendingDecisionIndex: existing.pendingDecisionIndex + 1,
389
- };
390
- planStates.set(key, next);
391
- if (next.pendingDecisionIndex < next.decisions.length) {
392
- await askNextDecision(next, signal);
393
- } else {
394
- await finalizePlanReady(next, signal);
395
- }
396
- return { handled: true };
397
- }
301
+ const STREAMING_TTS_ENABLED = ['1', 'true', 'yes', 'on'].includes(String(process.env.STREAMING_TTS || '1').toLowerCase());
398
302
 
399
- if (existing) {
400
- const cmd = parsePlanVoiceCommand(prompt, language);
401
- if (cmd.type === 'skip' || cmd.type === 'insert') {
402
- const nextSteps = applyPlanCommand(existing.steps, cmd);
403
- planStates.set(key, { ...existing, steps: nextSteps });
404
- await finalizePlanReady({ ...existing, steps: nextSteps }, signal);
405
- return { handled: true };
406
- }
407
- if (cmd.type === 'cancel') {
408
- planStates.delete(key);
409
- const msg = /^en/i.test(String(language || '')) ? 'Plan cancelled.' : '계획을 취소했어.';
410
- await sendText(`❎ ${msg}`);
411
- await speakText(msg, signal, null);
412
- return { handled: true };
413
- }
414
- if (cmd.type === 'approve') {
415
- const finalPlan = renderFinalPlan(existing.steps);
416
- const resolvedLine = renderResolvedDecisions(existing.resolvedDecisions, language);
417
- const promptToRun = [
418
- planExecutionPreamble(language),
419
- '',
420
- finalPlan,
421
- resolvedLine,
422
- '',
423
- `Original user request: ${existing.originalPrompt}`,
424
- ].filter(Boolean).join('\n');
425
- planStates.delete(key);
426
- const note = /^en/i.test(String(language || '')) ? 'Running the plan now.' : '계획대로 실행할게.';
427
- await sendText(`▶ ${note}`);
428
- await speakText(note, signal, null);
429
- return { handled: false, prompt: promptToRun };
430
- }
431
- planStates.delete(key);
432
- return { handled: false, prompt };
433
- }
434
-
435
- if (isPlanEntryUtterance(prompt, language)) {
436
- const planPrompt = `${planModePreamble(language)}\n\nUser request: ${prompt}`;
437
- const adapter = adapterForProjectSession(resolveProjectSessionForChannel(planChannelKey()));
438
- const plan = { task: false, label: adapter.label, verboseProgress: false, language, projectContext: '' };
439
- const result = await adapter.run(planPrompt, signal, plan).catch(e => ({ answer: '', error: e }));
440
- const { steps, decisions } = parsePlanOutput(result.answer || '');
441
- if (!steps.length) {
442
- const failMsg = /^en/i.test(String(language || ''))
443
- ? 'I could not produce a plan. Continuing as a regular turn.'
444
- : '계획을 만들지 못했어. 일반 작업으로 진행할게.';
445
- await sendText(`⚠️ ${failMsg}`);
446
- return { handled: false, prompt };
447
- }
448
- const state = {
449
- steps,
450
- decisions,
451
- resolvedDecisions: {},
452
- pendingDecisionIndex: 0,
453
- originalPrompt: prompt,
454
- language,
455
- };
456
- planStates.set(planChannelKey(), state);
457
- const narration = planNarrationLines(steps, language);
458
- await sendText(`📝 ${narration}`);
459
- await speakText(narration, signal, null);
460
- if (decisions.length) {
461
- await askNextDecision(state, signal);
462
- } else {
463
- await finalizePlanReady(state, signal);
464
- }
465
- return { handled: true };
466
- }
467
- return { handled: false, prompt };
468
- }
469
-
470
- function planNarrationLines(steps, language) {
471
- const visible = steps.filter(s => s.status !== 'skipped');
472
- const header = /^en/i.test(String(language || ''))
473
- ? `Plan with ${visible.length} steps. Say "skip step N", "add X after step N", or "approve" to run.`
474
- : `${visible.length}단계 계획. "step N 건너뛰어", "step N 다음에 X 추가", "실행"이라고 말해줘.`;
475
- const body = visible.map((s, i) => `${i + 1}. ${s.text}`).join('\n');
476
- return `${header}\n${body}`;
477
- }
478
-
479
- let smartProgressEnabled = Boolean(process.env.SMART_PROGRESS_API_KEY);
480
- let smartProgressSummarizer = null;
481
- function ensureSmartProgressSummarizer() {
482
- if (smartProgressSummarizer) return smartProgressSummarizer;
483
- smartProgressSummarizer = createSmartProgressSummarizer({
484
- apiKey: process.env.SMART_PROGRESS_API_KEY || '',
485
- baseUrl: process.env.SMART_PROGRESS_BASE_URL || 'https://api.groq.com/openai/v1',
486
- model: process.env.SMART_PROGRESS_MODEL || 'llama-3.1-8b-instant',
487
- language: settings.voiceLanguage,
488
- });
489
- smartProgressSummarizer.on('summary', summary => {
490
- if (!summary || !activeProgressSignal) return;
491
- queueVerboseProgressSpeech(summary, activeProgressSignal);
492
- });
493
- return smartProgressSummarizer;
494
- }
495
- function smartProgressStatusText() {
496
- const hasKey = Boolean(process.env.SMART_PROGRESS_API_KEY);
497
- const mode = smartProgressEnabled && hasKey ? 'on' : 'off';
498
- const reason = !hasKey ? ' (no SMART_PROGRESS_API_KEY set)' : '';
499
- return `smart-progress: ${mode}${reason}`;
500
- }
501
- let activeProgressLastEventAt = 0;
502
- let lastVerboseProgressText = '';
503
- let lastVerboseProgressTextAt = 0;
303
+ bridge.smartProgressEnabled = Boolean(process.env.SMART_PROGRESS_API_KEY);
304
+ const VOICE_CONNECT_TIMEOUT_MS = Number(process.env.VOICE_CONNECT_TIMEOUT_MS || '60000');
504
305
  const PROGRESS_IDLE_NOTICE_INITIAL_MS = Number(process.env.PROGRESS_IDLE_NOTICE_INITIAL_MS || process.env.PROGRESS_IDLE_NOTICE_MS || '10000');
505
306
  const PROGRESS_IDLE_NOTICE_MAX_MS = Number(process.env.PROGRESS_IDLE_NOTICE_MAX_MS || '30000');
506
307
  const PROGRESS_IDLE_NOTICE_MULTIPLIER = Number(process.env.PROGRESS_IDLE_NOTICE_MULTIPLIER || '1.8');
507
308
  const PROGRESS_IDLE_CHECK_MS = Number(process.env.PROGRESS_IDLE_CHECK_MS || '5000');
508
309
  const PROGRESS_IDLE_NOTICE_LIMIT = Number(process.env.PROGRESS_IDLE_NOTICE_LIMIT || '20');
509
310
  const projectSessionsState = loadProjectSessions(settings.projectSessionsPath);
510
- const agentAdaptersBySession = new Map();
311
+ const ttsPlayer = createTtsPlayer({
312
+ bridge,
313
+ settings,
314
+ log,
315
+ warn,
316
+ sleep,
317
+ sendText,
318
+ refreshTtsRuntimeConfig,
319
+ waitEvent,
320
+ isAbortError,
321
+ STREAMING_TTS_ENABLED,
322
+ });
323
+ const { synthTTS, playAudio, speakText, beginStreamingTurn, endStreamingTurn, stopPlaybackForBargeIn } = ttsPlayer;
324
+
325
+ const progressHandler = createProgressHandler({
326
+ bridge,
327
+ settings,
328
+ log,
329
+ warn,
330
+ isAbortError,
331
+ playAudio,
332
+ sendText,
333
+ refreshTtsRuntimeConfig,
334
+ });
335
+ const {
336
+ ensureSmartProgressSummarizer,
337
+ smartProgressStatusText,
338
+ progressEmoji,
339
+ formatProgressText,
340
+ sendVerboseProgressText,
341
+ synthProgressTTS,
342
+ speakProgress,
343
+ speakImmediateNotice,
344
+ queueProgressSpeechText,
345
+ flushProgressSpeechBatch,
346
+ queueVerboseProgressSpeech,
347
+ clearProgressSpeechBatch,
348
+ stopProgressSpeech,
349
+ } = progressHandler;
350
+
351
+ const agentTurnLifecycle = createAgentTurnLifecycle({ bridge, warn });
352
+
353
+ const notificationHandler = createNotificationHandler({ bridge, client, log, warn });
354
+ const {
355
+ ensureNotifier,
356
+ notifyStatusText,
357
+ getVoiceChannelHumanCount,
358
+ maybeNotifyTaskComplete,
359
+ } = notificationHandler;
360
+
361
+ const ttsRuntime = createTtsRuntime({
362
+ bridge,
363
+ ROOT,
364
+ execFileAsync,
365
+ speakText,
366
+ warn,
367
+ persistEnvValues,
368
+ });
369
+ const { ensureSelectedTtsBackendInstalled, commandIsInstalled } = ttsRuntime;
370
+
511
371
  function createBridgeAgentAdapter(agentSettings) {
512
372
  return createAgentAdapter(agentSettings, {
513
373
  execFileAsync,
@@ -515,53 +375,72 @@ function createBridgeAgentAdapter(agentSettings) {
515
375
  log,
516
376
  warn,
517
377
  onProgress: event => {
518
- if (!verboseProgress) return;
519
- activeProgressLastEventAt = Date.now();
520
- sendVerboseProgressText(event, activeProgressSignal);
521
- if (smartProgressEnabled && process.env.SMART_PROGRESS_API_KEY) {
378
+ if (!bridge.verboseProgress) return;
379
+ bridge.activeProgressLastEventAt = Date.now();
380
+ sendVerboseProgressText(event, bridge.activeProgressSignal);
381
+ if (bridge.smartProgressEnabled && process.env.SMART_PROGRESS_API_KEY) {
522
382
  try { ensureSmartProgressSummarizer().ingest(event); }
523
- catch (e) { warn('smart progress ingest failed', e?.stack || e); queueVerboseProgressSpeech(event, activeProgressSignal); }
383
+ catch (e) { warn('smart progress ingest failed', e?.stack || e); queueVerboseProgressSpeech(event, bridge.activeProgressSignal); }
524
384
  } else {
525
- queueVerboseProgressSpeech(event, activeProgressSignal);
385
+ queueVerboseProgressSpeech(event, bridge.activeProgressSignal);
526
386
  }
527
387
  },
528
388
  onStdoutChunk: chunk => {
529
- if (activeSentencer) {
530
- try { activeSentencer.push(chunk); } catch (e) { warn('streaming sentencer push failed', e?.stack || e); }
389
+ if (bridge.activeSentencer) {
390
+ try { bridge.activeSentencer.push(chunk); } catch (e) { warn('streaming sentencer push failed', e?.stack || e); }
531
391
  }
532
392
  },
533
393
  });
534
394
  }
535
395
  const agentAdapter = createBridgeAgentAdapter(settings.agent);
536
- function adapterForProjectSession(session) {
537
- if (!session) return agentAdapter;
538
- const key = session.slug || session.name;
539
- if (!agentAdaptersBySession.has(key)) {
540
- agentAdaptersBySession.set(key, createBridgeAgentAdapter({
541
- ...settings.agent,
542
- label: `${settings.agent.label} · ${session.name}`,
543
- sessionFile: session.sessionFile,
544
- cwd: session.workdir,
545
- projectContext: projectSessionContextText(session),
546
- }));
547
- }
548
- return agentAdaptersBySession.get(key);
549
- }
550
396
  function resolveProjectSessionForChannel(channelId) {
551
397
  return projectSessionForChannel(projectSessionsState, channelId) || null;
552
398
  }
399
+
400
+ function ontologyStateFor(channelKey) {
401
+ const key = String(channelKey || 'default');
402
+ let store = bridge.ontologyByChannel.get(key);
403
+ if (!store) {
404
+ store = createSessionOntology({ channelKey: key });
405
+ try { store.load(); } catch {}
406
+ bridge.ontologyByChannel.set(key, store);
407
+ }
408
+ return store;
409
+ }
410
+ function captureOntologyFromTurn(channelKey, { prompt, answer, backend }) {
411
+ try {
412
+ const store = ontologyStateFor(channelKey);
413
+ const promptEntities = store.entitiesFromText(String(prompt || ''), { by: backend, kind: 'utterance' });
414
+ const answerEntities = store.entitiesFromText(String(answer || ''), { by: backend, kind: 'result' });
415
+ store.add(promptEntities);
416
+ store.add(answerEntities);
417
+ store.save();
418
+ } catch (e) {
419
+ warn('ontology capture failed', e?.message || e);
420
+ }
421
+ }
422
+ function resetRoutingState(channelKey) {
423
+ const state = routingStateFor(channelKey);
424
+ state.activeRouting = { backend: settings.agent.backend, sticky: false };
425
+ state.pendingFallbackPrompt = null;
426
+ }
427
+ function invalidateBackendAdaptersForSession(sessionSlug) {
428
+ if (!sessionSlug) return;
429
+ for (const key of Array.from(bridge.agentAdaptersByBackend.keys())) {
430
+ if (key.endsWith(`::${sessionSlug}`)) bridge.agentAdaptersByBackend.delete(key);
431
+ }
432
+ }
553
433
  function saveProjectSessionsState() {
554
434
  saveProjectSessions(settings.projectSessionsPath, projectSessionsState);
555
435
  }
556
- let sensitivityMode = SENSITIVITY_MODE_DEFAULT;
557
- let sensitivityModeExpiresAt = 0;
436
+ bridge.sensitivityMode = SENSITIVITY_MODE_DEFAULT;
558
437
  function currentBargeInThresholds() {
559
- if (sensitivityModeExpiresAt && Date.now() > sensitivityModeExpiresAt) {
560
- sensitivityMode = SENSITIVITY_MODE_DEFAULT;
561
- sensitivityModeExpiresAt = 0;
562
- log('barge-in sensitivity mode expired; restored', sensitivityMode);
438
+ if (bridge.sensitivityModeExpiresAt && Date.now() > bridge.sensitivityModeExpiresAt) {
439
+ bridge.sensitivityMode = SENSITIVITY_MODE_DEFAULT;
440
+ bridge.sensitivityModeExpiresAt = 0;
441
+ log('barge-in sensitivity mode expired; restored', bridge.sensitivityMode);
563
442
  }
564
- return bargeInThresholdsForMode(sensitivityMode, {
443
+ return bargeInThresholdsForMode(bridge.sensitivityMode, {
565
444
  minSeconds: BARGE_IN_MIN_SECONDS,
566
445
  minMeanDb: BARGE_IN_MIN_MEAN_VOLUME_DB,
567
446
  minMaxDb: BARGE_IN_MIN_MAX_VOLUME_DB,
@@ -581,48 +460,28 @@ function currentPlaybackBargeInThresholds() {
581
460
  };
582
461
  }
583
462
  function setSensitivityMode(mode, reason = 'manual') {
584
- sensitivityMode = mode === 'conservative' ? 'conservative' : 'normal';
585
- sensitivityModeExpiresAt = sensitivityMode === 'conservative' && SENSITIVITY_OUTDOOR_SECONDS > 0
463
+ bridge.sensitivityMode = mode === 'conservative' ? 'conservative' : 'normal';
464
+ bridge.sensitivityModeExpiresAt = bridge.sensitivityMode === 'conservative' && SENSITIVITY_OUTDOOR_SECONDS > 0
586
465
  ? Date.now() + SENSITIVITY_OUTDOOR_SECONDS * 1000
587
466
  : 0;
588
467
  const thresholds = currentBargeInThresholds();
589
- log('barge-in sensitivity mode set', sensitivityMode, 'reason', reason, 'expiresAt', sensitivityModeExpiresAt || 'never', 'thresholds', thresholds);
468
+ log('barge-in sensitivity mode set', bridge.sensitivityMode, 'reason', reason, 'expiresAt', bridge.sensitivityModeExpiresAt || 'never', 'thresholds', thresholds);
590
469
  return thresholds;
591
470
  }
592
471
  function sensitivityStatusText() {
593
472
  const thresholds = currentBargeInThresholds();
594
- const ttl = sensitivityModeExpiresAt ? Math.max(0, Math.round((sensitivityModeExpiresAt - Date.now()) / 1000)) : 0;
473
+ const ttl = bridge.sensitivityModeExpiresAt ? Math.max(0, Math.round((bridge.sensitivityModeExpiresAt - Date.now()) / 1000)) : 0;
595
474
  return sensitivityStatusTextForLanguage(thresholds, ttl, settings.voiceLanguage);
596
475
  }
597
476
 
598
477
  function verboseStatusText() {
599
- return verboseStatusTextForLanguage(verboseProgress, settings.voiceLanguage);
600
- }
601
-
602
- function progressEmoji(event) {
603
- const category = progressCategory(event, { language: settings.voiceLanguage })?.key;
604
- return {
605
- test: '🧪',
606
- edit: '✏️',
607
- read: '📖',
608
- search: '🔎',
609
- terminal: '⌨️',
610
- skill: '🧰',
611
- browser: '🌐',
612
- tool: '🛠️',
613
- agent: '🤖',
614
- work: '⚙️',
615
- }[category] || '⚙️';
616
- }
617
-
618
- function formatProgressText(event) {
619
- return formatProgressMessage(event, { language: settings.voiceLanguage });
478
+ return verboseStatusTextForLanguage(bridge.verboseProgress, settings.voiceLanguage);
620
479
  }
621
480
 
622
481
  function setVerboseProgress(enabled, reason = 'manual') {
623
- verboseProgress = Boolean(enabled);
624
- log('verbose progress mode set', verboseProgress, 'reason', reason);
625
- return verboseProgress;
482
+ bridge.verboseProgress = Boolean(enabled);
483
+ log('verbose progress mode set', bridge.verboseProgress, 'reason', reason);
484
+ return bridge.verboseProgress;
626
485
  }
627
486
 
628
487
  function persistEnvValues(values) {
@@ -633,7 +492,7 @@ function persistEnvValues(values) {
633
492
  } catch (e) {
634
493
  warn('read .env for update failed', e?.stack || e);
635
494
  }
636
- const pending = new Map(Object.entries(values));
495
+ const pending = new Map(Object.entries(values).filter(([, value]) => value !== undefined));
637
496
  const updated = lines.map(line => {
638
497
  const match = line.match(/^\s*([A-Za-z_][A-Za-z0-9_]*)\s*=.*$/);
639
498
  if (!match || !pending.has(match[1])) return line;
@@ -655,8 +514,8 @@ function applyRuntimeLanguage(language) {
655
514
  config = updateTtsVoiceConfig(config, { voiceType: preferredVoiceTypeForLanguage(config, preset.voiceLanguage) });
656
515
  writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, config);
657
516
  const { selection } = applyVoiceConfigToProcessEnv(config);
658
- settings.tts.backend = selection.backend;
659
- settings.tts.edge.voice = selection.backend === 'edge' ? selection.voice.voice : preset.ttsVoice;
517
+ rebuildTtsRuntimeSettings(selection);
518
+ if (selection.backend !== 'edge') settings.tts.edge.voice = preset.ttsVoice;
660
519
  process.env.VOICE_LANGUAGE = preset.voiceLanguage;
661
520
  process.env.WHISPER_CPP_LANGUAGE = preset.sttLanguage;
662
521
  process.env.STT_LANGUAGE = preset.sttLanguage;
@@ -685,33 +544,23 @@ function voiceChangedText(selection) {
685
544
  return `Voice changed to ${selection.voice?.label || selection.voiceType}.`;
686
545
  }
687
546
 
688
- async function handleTtsVoiceCommand(prompt, signal) {
689
- const request = voiceCommandFromTranscript(prompt);
690
- if (!request) return false;
691
- discardVoiceInputQueues('voice-change');
692
- let config = ensureTtsVoiceConfig();
693
- config = updateTtsVoiceConfig(config, request);
694
- writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, config);
695
- const { selection } = applyVoiceConfigToProcessEnv(config);
696
- settings.tts.backend = selection.backend;
697
- if (selection.backend === 'edge') settings.tts.edge.voice = selection.voice.voice;
698
- if (selection.voice?.language) settings.voiceLanguage = selection.voice.language;
699
- persistEnvValues({
700
- TTS_BACKEND: selection.backend,
701
- TTS_VOICE_TYPE: selection.voiceType,
702
- TTS_VOICE: selection.backend === 'edge' ? selection.voice.voice : process.env.TTS_VOICE,
703
- VOICE_LANGUAGE: settings.voiceLanguage,
704
- });
705
- await speakText(voiceChangedText(selection), signal);
706
- return true;
547
+ function isCloneVoiceType(voiceType) {
548
+ return /^(cloned_reference|prompt_reference|cosyvoice_reference)$/i.test(String(voiceType || ''));
707
549
  }
708
550
 
709
- async function handleLanguageCommand(prompt, signal) {
710
- const request = voiceLanguageCommandFromTranscript(prompt);
711
- if (!request) return false;
712
- const preset = applyRuntimeLanguage(request.language);
713
- await speakText(languageChangedText(preset), signal);
714
- return true;
551
+ async function notifyVoiceCloneSampleGapIfNeeded(selection, signal) {
552
+ if (!selection || selection.backend === 'edge') return;
553
+ if (!isCloneVoiceType(selection.voiceType)) return;
554
+ const ref = String(selection.voice?.voice || '').trim();
555
+ if (!ref) return;
556
+ const candidatePath = path.isAbsolute(ref) ? ref : path.resolve(ROOT, ref);
557
+ if (fs.existsSync(candidatePath)) return;
558
+ const en = /^en/i.test(String(settings.voiceLanguage || ''));
559
+ const msg = en
560
+ ? `${selection.backend} needs a voice clone sample at ${ref}. Say "voice clone capture" to record one, or pick a non-clone voice.`
561
+ : `${selection.backend} 백엔드는 음성 클론 샘플(${ref})이 필요해. "보이스 클로닝 캡처"라고 하거나 다른 보이스를 골라줘.`;
562
+ await sendText(`🎙️ ${msg}`);
563
+ await speakText(msg, signal, null);
715
564
  }
716
565
 
717
566
  function isAllowed(userId) { return settings.allowedUsers.size === 0 || settings.allowedUsers.has(String(userId)); }
@@ -757,13 +606,28 @@ function spokenResultOnly(userPrompt, answer, language = settings.voiceLanguage)
757
606
  async function sendText(text) {
758
607
  return sendDiscordText({
759
608
  client,
760
- channelId: activeTranscriptChannelId || settings.transcriptChannelId,
609
+ channelId: bridge.activeTranscriptChannelId || settings.transcriptChannelId,
761
610
  text,
762
611
  log,
763
612
  warn,
764
613
  });
765
614
  }
766
615
 
616
+ async function sendEmbed(embed, { content = '' } = {}) {
617
+ if (!embed) return false;
618
+ try {
619
+ const channelId = bridge.activeTranscriptChannelId || settings.transcriptChannelId;
620
+ if (!channelId) return false;
621
+ const channel = await client.channels.fetch(channelId).catch(() => null);
622
+ if (!channel?.send) return false;
623
+ await channel.send(content ? { content, embeds: [embed] } : { embeds: [embed] });
624
+ return true;
625
+ } catch (e) {
626
+ warn('sendEmbed failed', e?.message || e);
627
+ return false;
628
+ }
629
+ }
630
+
767
631
  async function sendChannelText(channel, text) {
768
632
  const body = String(text || '');
769
633
  const chunks = splitDiscordMessage(body);
@@ -771,18 +635,6 @@ async function sendChannelText(channel, text) {
771
635
  return true;
772
636
  }
773
637
 
774
- function sendVerboseProgressText(event, signal) {
775
- if (!verboseProgress || !signal || signal.aborted || activeProgressSignal !== signal) return;
776
- const formatted = formatProgressText(event).replace(/\s+/g, ' ').trim();
777
- if (!formatted) return;
778
- const message = formatted.slice(0, 1900);
779
- const now = Date.now();
780
- if (message === lastVerboseProgressText && now - lastVerboseProgressTextAt < 2000) return;
781
- lastVerboseProgressText = message;
782
- lastVerboseProgressTextAt = now;
783
- void sendText(message).catch(e => warn('verbose progress text delivery failed', e?.stack || e));
784
- }
785
-
786
638
  function sleep(ms) {
787
639
  return new Promise(resolve => setTimeout(resolve, ms));
788
640
  }
@@ -798,83 +650,240 @@ function waitEvent(emitter, event, timeoutMs = 60000) {
798
650
  });
799
651
  }
800
652
 
801
- async function transcribeOnce(wavPath, input16k, outBase) {
802
- const args = ['-m', settings.whisperModel, '-f', input16k];
803
- if (shouldPassWhisperLanguage(settings.whisperLanguage)) args.push('-l', settings.whisperLanguage);
804
- args.push('-nt', '-otxt', '-of', outBase, '-sns', '-nf', '-nth', '0.35', '-et', '2.2', '-lpt', '-0.8');
805
- try {
806
- await execFileAsync(settings.whisperBin, args, { timeout: 25000, maxBuffer: 2 * 1024 * 1024 });
807
- } catch (e) {
808
- throw new Error(`whisper failed: ${e.stderr || e.message}`);
809
- }
810
- const txtPath = `${outBase}.txt`;
811
- const raw = fs.existsSync(txtPath) ? fs.readFileSync(txtPath, 'utf8') : '';
812
- return { raw, txtPath };
813
- }
814
-
815
- async function transcribe(wavPath) {
816
- const tmpBase = path.join(os.tmpdir(), `hermes-node-stt-${Date.now()}`);
817
- const input16k = `${tmpBase}.16k.wav`;
818
- const outBase = `${tmpBase}.out`;
819
- // whisper.cpp can read WAV, but Discord receiver output is 48 kHz stereo.
820
- // Convert explicitly to the 16 kHz mono PCM shape Whisper expects.
821
- await execFileAsync('ffmpeg', ['-y', '-hide_banner', '-loglevel', 'error', '-i', wavPath, '-ac', '1', '-ar', '16000', '-sample_fmt', 's16', input16k], {
822
- timeout: 20000,
823
- maxBuffer: 1024 * 1024,
824
- });
825
-
826
- let raw = '';
827
- let txtPath = '';
828
- try {
829
- ({ raw, txtPath } = await transcribeOnce(wavPath, input16k, outBase));
830
- let cleaned = cleanTranscript(raw);
831
- log('stt raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
832
- if (!cleaned) {
833
- await sleep(300);
834
- const retryBase = `${tmpBase}.retry`;
835
- const retry = await transcribeOnce(wavPath, input16k, retryBase);
836
- raw = retry.raw;
837
- txtPath = retry.txtPath;
838
- cleaned = cleanTranscript(raw);
839
- log('stt retry raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
840
- }
841
- return cleaned;
842
- } finally {
843
- if (settings.debugDir) {
844
- const debug16k = path.join(settings.debugDir, `stt-input-${stamp()}.wav`);
845
- fs.copyFile(input16k, debug16k, () => {});
846
- if (raw) fs.writeFile(path.join(settings.debugDir, `stt-raw-${stamp()}.txt`), raw, () => {});
847
- }
848
- fs.rm(input16k, { force: true }, () => {});
849
- if (txtPath) fs.rm(txtPath, { force: true }, () => {});
850
- }
851
- }
852
-
853
- function cleanTranscript(raw) {
854
- const bad = [
855
- '구독', '좋아요', '알림설정', '시청해주셔서', '시청해주신', '다음영상', '영상에서만나요',
856
- '부탁드려요', '큰힘이됩니다',
857
- 'mbc뉴스', '이준범기자입니다', '뉴스입니다', '기자입니다', '앵커', '속보', '보도입니다', '전해드립니다',
858
- ];
859
- const lines = raw
860
- .split(/\r?\n/)
861
- .map(l => l.trim())
862
- .filter(Boolean)
863
- .map(l => l.replace(/^\[[^\]]+\]\s*/, '').trim());
864
- const kept = [];
865
- for (const line of lines) {
866
- const compact = line
867
- .replace(/\s+/g, '')
868
- .replace(/[\p{P}\p{S}_]+/gu, '');
869
- if (!compact) continue;
870
- if (/^[\(\[(【].*[\)\])】]$/.test(line.replace(/\s+/g, ''))) continue;
871
- if (['끄덕', '끄덕끄덕', '박수', '웃음', '음악', '자막', '침묵', '무음'].includes(compact)) continue;
872
- if (bad.some(b => compact.toLowerCase().includes(b))) continue;
873
- if (isRepeatedNoiseTranscript(compact)) continue;
874
- kept.push(line);
875
- }
876
- return kept.join(' ').trim();
877
- }
653
+ // handleRecording lives inside utteranceRouter (extracted in Phase 4b) but
654
+ // voiceIO.flushUtterance needs to call it. Use a forward-declared `let` plus
655
+ // a thunk so the deps for createVoiceIO resolve before createUtteranceRouter
656
+ // is constructed.
657
+ let utteranceRouter;
658
+ let voiceTurnRunner;
659
+ const voiceIO = createVoiceIO({
660
+ bridge,
661
+ settings,
662
+ client,
663
+ execFileAsync,
664
+ log,
665
+ warn,
666
+ stamp,
667
+ sleep,
668
+ isAllowed,
669
+ UTTERANCE_IDLE_MS,
670
+ SUBSCRIBE_AFTER_SILENCE_MS,
671
+ MIN_UTTERANCE_BYTES,
672
+ MIN_MEAN_VOLUME_DB,
673
+ MIN_MAX_VOLUME_DB,
674
+ currentBargeInThresholds,
675
+ currentPlaybackBargeInThresholds,
676
+ createLiveBargeInMonitor,
677
+ shouldUseLivePlaybackBargeIn,
678
+ stopPlaybackForBargeIn,
679
+ analyzeAudio,
680
+ concatWavs,
681
+ saveCapturedVoiceCloneSample,
682
+ isBargeInCandidate,
683
+ validateProcessingBargeIn,
684
+ enqueueDeferredProcessingUtterance,
685
+ newLatencyTurn,
686
+ handleRecording: (...args) => voiceTurnRunner.handleRecording(...args),
687
+ });
688
+ const { transcribeOnce, transcribe, cleanTranscript, queueSegment, flushUtterance, subscribeUser } = voiceIO;
689
+
690
+ const discordVoiceSetup = createDiscordVoiceSetup({
691
+ bridge,
692
+ client,
693
+ settings,
694
+ ROOT,
695
+ log,
696
+ warn,
697
+ speakText,
698
+ waitEvent,
699
+ subscribeUser,
700
+ pendingFallbackNoticePromises,
701
+ bindProjectSessionToChannel,
702
+ createProjectSession,
703
+ resolveProjectSessionForChannel,
704
+ saveProjectSessionsState,
705
+ projectSessionsState,
706
+ invalidateBackendAdaptersForSession,
707
+ VOICE_CONNECT_TIMEOUT_MS,
708
+ });
709
+ const {
710
+ connectTo,
711
+ autoJoin,
712
+ findVoiceChannelBySelector,
713
+ voiceChannelLabel,
714
+ resolveVoiceChannelForAttach,
715
+ attachVoiceChannelToTextSession,
716
+ gracefulShutdown,
717
+ } = discordVoiceSetup;
718
+ utteranceRouter = createUtteranceRouter({
719
+ bridge,
720
+ agentTurnLifecycle,
721
+ log,
722
+ warn,
723
+ path,
724
+ fs,
725
+ ROOT,
726
+ TTS_VOICE_CONFIG_PATH,
727
+ agentAdapter,
728
+ settings,
729
+ isPlanEntryUtterance,
730
+ parsePlanOutput,
731
+ parsePlanVoiceCommand,
732
+ applyPlanCommand,
733
+ renderFinalPlan,
734
+ planModePreamble,
735
+ planExecutionPreamble,
736
+ parseDecisionAnswer,
737
+ renderDecisionPrompt,
738
+ renderResolvedDecisions,
739
+ isAgentRoutingDecision,
740
+ projectSessionContextText,
741
+ resolveProjectSessionForChannel,
742
+ createBridgeAgentAdapter,
743
+ buildAgentSettings,
744
+ commandIsInstalled,
745
+ shellSplit,
746
+ sendText,
747
+ speakText,
748
+ ensureTtsVoiceConfig,
749
+ updateTtsVoiceConfig,
750
+ writeTtsVoiceConfig,
751
+ applyVoiceConfigToProcessEnv,
752
+ ensureSelectedTtsBackendInstalled,
753
+ rebuildTtsRuntimeSettings,
754
+ voiceCommandFromTranscript,
755
+ voiceChangedText,
756
+ voiceLanguageCommandFromTranscript,
757
+ voiceCloneCommandFromText,
758
+ voiceCloneCapture,
759
+ notifyVoiceCloneSampleGapIfNeeded,
760
+ languageChangedText,
761
+ applyRuntimeLanguage,
762
+ persistEnvValues,
763
+ discardVoiceInputQueues,
764
+ // Phase 4b deps
765
+ transcribe,
766
+ beginStreamingTurn,
767
+ endStreamingTurn,
768
+ client,
769
+ isAllowed,
770
+ isAbortError,
771
+ sleep,
772
+ sendEmbed,
773
+ speakImmediateNotice,
774
+ reloadRuntimeLanguageFromEnv,
775
+ drainDeferredProcessingUtterances,
776
+ maybeNotifyTaskComplete,
777
+ ontologyStateFor,
778
+ captureOntologyFromTurn,
779
+ queueProgressSpeechText,
780
+ stopProgressSpeech,
781
+ agentAnswerHeader,
782
+ emptyAgentAnswer,
783
+ formatRecentDiscordContext,
784
+ formatSttResultMessage,
785
+ formatSttStartMessage,
786
+ formatVoiceErrorMessage,
787
+ formatWakeRejectedMessage,
788
+ spokenResultOnly,
789
+ stripWake,
790
+ acceptsWake,
791
+ sensitivityChangedSpeech,
792
+ sensitivityModeFromTranscript,
793
+ sensitivityStatusText,
794
+ setSensitivityMode,
795
+ isSensitivityOnlyRequest,
796
+ verboseChangedSpeech,
797
+ verboseModeFromTranscript,
798
+ verboseStatusText,
799
+ setVerboseProgress,
800
+ isVerboseOnlyRequest,
801
+ isRoutingOnlyUtterance,
802
+ parseAgentRoutingCommand,
803
+ renderAgentPrefix,
804
+ buildCrossAgentPrompt,
805
+ buildFallbackDecision,
806
+ parseResearchCommand,
807
+ runResearchTurn,
808
+ PROGRESS_IDLE_CHECK_MS,
809
+ PROGRESS_IDLE_NOTICE_INITIAL_MS,
810
+ PROGRESS_IDLE_NOTICE_LIMIT,
811
+ PROGRESS_IDLE_NOTICE_MAX_MS,
812
+ PROGRESS_IDLE_NOTICE_MULTIPLIER,
813
+ STT_START_VOICE_NOTICE,
814
+ });
815
+ const {
816
+ adapterForProjectSession,
817
+ routingStateFor,
818
+ recordUtterance,
819
+ clearTransientRouting,
820
+ adapterForBackend,
821
+ handleTtsVoiceCommand,
822
+ handleLanguageCommand,
823
+ handleVoiceCloneCommand,
824
+ interruptCurrentResponse,
825
+ } = utteranceRouter;
826
+
827
+ const planDispatcher = createPlanDispatcher({
828
+ bridge, settings,
829
+ sendText, speakText,
830
+ routingStateFor, adapterForBackend, adapterForProjectSession,
831
+ resolveProjectSessionForChannel,
832
+ isAgentRoutingDecision,
833
+ parseDecisionAnswer, parsePlanVoiceCommand: parsePlanVoiceCommand,
834
+ applyPlanCommand: applyPlanCommand,
835
+ parsePlanOutput,
836
+ renderDecisionPrompt, renderResolvedDecisions, renderFinalPlan,
837
+ planModePreamble, planExecutionPreamble, isPlanEntryUtterance,
838
+ });
839
+ const {
840
+ planChannelKey,
841
+ askNextDecision,
842
+ finalizePlanReady,
843
+ dispatchPlanModeUtterance,
844
+ planNarrationLines,
845
+ } = planDispatcher;
846
+
847
+ voiceTurnRunner = createVoiceTurnRunner({
848
+ bridge,
849
+ agentTurnLifecycle,
850
+ settings, client, log, warn, fs,
851
+ // From voice_io
852
+ transcribe,
853
+ // From tts_player
854
+ beginStreamingTurn, endStreamingTurn, speakText,
855
+ // From progress_handler
856
+ queueProgressSpeechText, stopProgressSpeech, speakImmediateNotice,
857
+ // From notification_handler
858
+ maybeNotifyTaskComplete,
859
+ // From utterance_router (sibling-module dispatch + adapter selection)
860
+ handleLanguageCommand, handleTtsVoiceCommand, handleVoiceCloneCommand,
861
+ dispatchPlanModeUtterance,
862
+ adapterForBackend, adapterForProjectSession,
863
+ planChannelKey, routingStateFor, recordUtterance, clearTransientRouting,
864
+ // Direct (imported in main or hoisted helpers)
865
+ isAllowed, isAbortError, sleep, sendText, sendEmbed,
866
+ reloadRuntimeLanguageFromEnv, drainDeferredProcessingUtterances,
867
+ resolveProjectSessionForChannel, projectSessionContextText,
868
+ ontologyStateFor, captureOntologyFromTurn,
869
+ formatRecentDiscordContext,
870
+ formatSttResultMessage, formatSttStartMessage,
871
+ formatVoiceErrorMessage, formatWakeRejectedMessage,
872
+ agentAnswerHeader, emptyAgentAnswer, spokenResultOnly,
873
+ stripWake, acceptsWake,
874
+ sensitivityChangedSpeech, sensitivityModeFromTranscript, sensitivityStatusText,
875
+ setSensitivityMode, isSensitivityOnlyRequest,
876
+ verboseChangedSpeech, verboseModeFromTranscript, verboseStatusText,
877
+ setVerboseProgress, isVerboseOnlyRequest,
878
+ isRoutingOnlyUtterance, parseAgentRoutingCommand, renderAgentPrefix,
879
+ buildCrossAgentPrompt, buildFallbackDecision,
880
+ parseDecisionAnswer,
881
+ parseResearchCommand, runResearchTurn,
882
+ PROGRESS_IDLE_CHECK_MS, PROGRESS_IDLE_NOTICE_INITIAL_MS,
883
+ PROGRESS_IDLE_NOTICE_LIMIT, PROGRESS_IDLE_NOTICE_MAX_MS,
884
+ PROGRESS_IDLE_NOTICE_MULTIPLIER, STT_START_VOICE_NOTICE,
885
+ });
886
+ const { handleRecording } = voiceTurnRunner;
878
887
 
879
888
  function isAbortError(e) {
880
889
  return e?.name === 'AbortError' || e?.code === 'ABORT_ERR';
@@ -914,274 +923,45 @@ async function refreshTtsRuntimeConfig() {
914
923
  if (previousBackend !== settings.tts.backend) {
915
924
  const rebuilt = buildTtsSettings(process.env, ROOT);
916
925
  Object.assign(settings.tts, rebuilt);
917
- ttsBackend = createTtsBackend(settings.tts, { execFileAsync, log, warn, voiceProvider: () => settings.tts.edge.voice });
926
+ try { bridge.ttsBackend?.close?.(); } catch (e) { warn('tts backend close failed', e?.message || e); }
927
+ bridge.ttsBackend = createTtsBackend(settings.tts, { execFileAsync, spawn, log, warn, onFallback: ttsFallbackNotice, voiceProvider: () => settings.tts.edge.voice });
918
928
  log('tts backend reloaded from voice config', settings.tts.backend, 'voiceType', selection.voiceType);
919
929
  }
920
930
  return selection;
921
931
  }
922
932
 
923
- async function synthTTS(text, signal) {
924
- await refreshTtsRuntimeConfig();
925
- let lastError = null;
926
- for (let attempt = 1; attempt <= 3; attempt += 1) {
927
- try {
928
- log('final tts synth start', 'backend', ttsBackend.name, 'attempt', attempt, 'chars', String(text || '').length);
929
- const out = await ttsBackend.synthesize(text, { signal, kind: 'final' });
930
- log('final tts synth done', 'backend', ttsBackend.name, 'attempt', attempt, out, fs.statSync(out).size);
931
- return out;
932
- } catch (e) {
933
- lastError = e;
934
- if (isAbortError(e) || signal?.aborted) throw e;
935
- warn('final tts synth failed', 'attempt', attempt, e?.stderr?.toString?.().slice(-500) || e?.message || e);
936
- await sleep(1000 * attempt);
937
- }
938
- }
939
- throw lastError;
940
- }
941
-
942
- async function synthProgressTTS(text, signal) {
943
- await refreshTtsRuntimeConfig();
944
- const ext = ttsBackend.outputExtension || 'mp3';
945
- const cachePath = path.join(settings.tts.progressCacheDir, progressTtsCacheFileName({
946
- backendKeyParts: ttsBackend.cacheKeyParts(),
947
- text,
948
- ext,
949
- }));
950
- if (fs.existsSync(cachePath) && fs.statSync(cachePath).size > 0) {
951
- log('progress tts cache hit', text, cachePath);
952
- return cachePath;
953
- }
954
- log('progress tts cache miss', text);
955
- const tmp = await ttsBackend.synthesize(text, { signal, kind: 'progress' });
956
- fs.renameSync(tmp, cachePath);
957
- return cachePath;
958
- }
959
-
960
- async function playAudio(file, { deleteAfter = true } = {}) {
961
- if (!connection) return;
962
- speaking = true;
963
- try {
964
- const resource = createAudioResource(file, { inputType: StreamType.Arbitrary, inlineVolume: true });
965
- resource.volume?.setVolume(settings.tts.volume);
966
- player.play(resource);
967
- connection.subscribe(player);
968
- await waitEvent(player, AudioPlayerStatus.Idle, 120000).catch(() => {});
969
- } finally {
970
- speaking = false;
971
- if (deleteAfter) fs.rm(file, { force: true }, () => {});
972
- }
973
- }
974
-
975
- async function speakText(text, signal, metricsTurn = null, options = {}) {
976
- const chunks = splitForTTS(text, settings.tts.maxChars);
977
- if (!chunks.length) return;
978
- if (options.mirrorText !== false) {
979
- await sendText(`${options.mirrorPrefix || '🔊 음성으로 읽는 내용'}:\n${String(text || '')}`);
980
- }
981
- log('TTS chunks', chunks.length, 'maxChars', settings.tts.maxChars, 'backend', ttsBackend.name);
982
- const playbackGeneration = speechPlaybackGeneration;
983
- const playbackStopped = () => playbackGeneration !== speechPlaybackGeneration;
984
- let synthMs = 0;
985
- let playMs = 0;
986
- const ttsStart = Date.now();
987
- await playChunkedTTSWithPrefetch(chunks, {
988
- signal,
989
- log,
990
- synth: async chunk => {
991
- if (playbackStopped()) return null;
992
- const start = Date.now();
993
- try { return await synthTTS(chunk, signal); }
994
- finally { synthMs += Date.now() - start; }
995
- },
996
- play: async file => {
997
- if (playbackStopped()) {
998
- await fs.promises.rm(file, { force: true }).catch(() => {});
999
- return;
1000
- }
1001
- const start = Date.now();
1002
- try { return await playAudio(file); }
1003
- finally { playMs += Date.now() - start; }
1004
- },
1005
- cleanup: file => fs.promises.rm(file, { force: true }),
1006
- });
1007
- metricsTurn?.stage('tts_synth', synthMs, { ttsChunks: chunks.length, spokenChars: String(text || '').length });
1008
- metricsTurn?.stage('tts_play', playMs);
1009
- metricsTurn?.stage('tts_total', Date.now() - ttsStart);
1010
- }
1011
-
1012
- function beginStreamingTurn(signal) {
1013
- if (!STREAMING_TTS_ENABLED || !connection) return false;
1014
- streamingSpeechDelivered = false;
1015
- const sentencer = createSentencer({ minChars: 40, maxLatencyMs: 800 });
1016
- const queue = createStreamingTTSQueue({
1017
- synth: async text => synthTTS(text, signal),
1018
- play: async file => playAudio(file, { deleteAfter: false }),
1019
- cleanup: async file => { try { await fs.promises.rm(file, { force: true }); } catch {} },
1020
- signal,
1021
- log,
1022
- });
1023
- sentencer.on('sentence', text => {
1024
- if (signal?.aborted) return;
1025
- queue.enqueue(text);
1026
- });
1027
- activeSentencer = sentencer;
1028
- activeStreamingQueue = queue;
1029
- log('streaming turn begin');
1030
- return true;
1031
- }
1032
-
1033
- async function endStreamingTurn() {
1034
- const sentencer = activeSentencer;
1035
- const queue = activeStreamingQueue;
1036
- activeSentencer = null;
1037
- activeStreamingQueue = null;
1038
- if (!sentencer || !queue) return;
1039
- try { sentencer.flush(); } catch (e) { warn('streaming sentencer flush failed', e?.stack || e); }
1040
- try { await queue.drain(); } catch (e) { warn('streaming queue drain failed', e?.stack || e); }
1041
- streamingSpeechDelivered = queue.size === 0;
1042
- log('streaming turn end');
1043
- }
1044
-
1045
- async function speakProgress(text, signal) {
1046
- if (signal?.aborted) return;
1047
- try {
1048
- const mp3 = await synthProgressTTS(text, signal);
1049
- if (signal?.aborted) return;
1050
- await playAudio(mp3, { deleteAfter: false });
1051
- } catch (e) {
1052
- if (!isAbortError(e)) warn('progress tts failed', e?.stack || e);
1053
- }
1054
- }
1055
-
1056
- async function speakImmediateNotice(text, signal, reason = 'notice') {
1057
- if (signal?.aborted) return;
1058
- try {
1059
- log('immediate notice speech', reason, 'text', String(text || '').slice(0, 80));
1060
- const mp3 = await synthProgressTTS(text, signal);
1061
- if (signal?.aborted) return;
1062
- await playAudio(mp3, { deleteAfter: false });
1063
- } catch (e) {
1064
- if (!isAbortError(e)) warn('immediate notice speech failed', reason, e?.stack || e);
1065
- }
1066
- }
1067
-
1068
- function queueProgressSpeechText(text, signal, reason = 'status') {
1069
- const spoken = String(text || '').replace(/\s+/g, ' ').trim();
1070
- if (!spoken || !signal || signal.aborted || activeProgressSignal !== signal) return;
1071
- verboseProgressSpeechQueue = verboseProgressSpeechQueue
1072
- .catch(() => {})
1073
- .then(async () => {
1074
- if (signal.aborted || activeProgressSignal !== signal || !processing) return;
1075
- log('progress speech queued', reason, 'text', spoken);
1076
- await speakProgress(spoken, signal);
1077
- });
1078
- }
1079
-
1080
- function flushProgressSpeechBatch(signal, reason = 'timer') {
1081
- if (!signal || signal.aborted || activeProgressSignal !== signal) return;
1082
- if (progressSpeechBatchTimer) {
1083
- clearTimeout(progressSpeechBatchTimer);
1084
- progressSpeechBatchTimer = null;
1085
- }
1086
- const events = progressSpeechBatch;
1087
- progressSpeechBatch = [];
1088
- progressSpeechBatchSignal = null;
1089
- progressSpeechBatchStartedAt = 0;
1090
- const text = summarizeProgressEvents(events, { maxCategories: 3, language: settings.voiceLanguage });
1091
- if (!text) return;
1092
- queueProgressSpeechText(text, signal, `batch-${reason}-${events.length}`);
1093
- }
1094
-
1095
- function queueVerboseProgressSpeech(event, signal) {
1096
- if (!verboseProgress || !signal || signal.aborted || activeProgressSignal !== signal) return;
1097
- const text = String(event || '').replace(/\s+/g, ' ').trim().slice(0, 120);
1098
- if (!text) return;
1099
- if (progressSpeechBatchSignal && progressSpeechBatchSignal !== signal) {
1100
- progressSpeechBatch = [];
1101
- if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
1102
- progressSpeechBatchTimer = null;
1103
- progressSpeechBatchStartedAt = 0;
1104
- }
1105
- progressSpeechBatchSignal = signal;
1106
- if (!progressSpeechBatchStartedAt) progressSpeechBatchStartedAt = Date.now();
1107
- progressSpeechBatch.push(text);
1108
- const elapsedMs = Date.now() - progressSpeechBatchStartedAt;
1109
- const ratePerSecond = progressSpeechBatch.length / Math.max(0.2, elapsedMs / 1000);
1110
- const maxBatchEvents = ratePerSecond >= 6 ? 5 : ratePerSecond >= 3 ? 4 : 3;
1111
- const batchDelayMs = ratePerSecond >= 6 ? 650 : ratePerSecond >= 3 ? 550 : 450;
1112
- if (progressSpeechBatch.length >= maxBatchEvents) {
1113
- flushProgressSpeechBatch(signal, 'full');
1114
- return;
1115
- }
1116
- if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
1117
- progressSpeechBatchTimer = setTimeout(() => flushProgressSpeechBatch(signal, 'timer'), batchDelayMs);
1118
- }
1119
-
1120
- function clearProgressSpeechBatch(signal = activeProgressSignal) {
1121
- if (progressSpeechBatchTimer) {
1122
- clearTimeout(progressSpeechBatchTimer);
1123
- progressSpeechBatchTimer = null;
1124
- }
1125
- if (!signal || progressSpeechBatchSignal === signal) {
1126
- progressSpeechBatch = [];
1127
- progressSpeechBatchSignal = null;
1128
- progressSpeechBatchStartedAt = 0;
1129
- }
1130
- }
1131
-
1132
- function stopProgressSpeech(signal, reason = 'final-answer') {
1133
- if (activeProgressSignal !== signal) return;
1134
- clearProgressSpeechBatch(signal);
1135
- activeProgressSignal = null;
1136
- if (activeProgressAbortController && !activeProgressAbortController.signal.aborted) {
1137
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort progress speech failed', e?.stack || e); }
1138
- }
1139
- if (speaking) {
1140
- log('stop progress speech before final answer', reason);
1141
- try { player.stop(true); } catch (e) { warn('stop progress speech failed', e?.stack || e); }
1142
- speaking = false;
1143
- }
1144
- }
1145
-
1146
933
  async function handleTextAgentMessage(msg, text, { speakResponse = false } = {}) {
1147
- if (processing) {
934
+ if (bridge.processing) {
1148
935
  await msg.reply('지금 이전 작업을 처리 중이야. 끝나면 다시 보내줘.');
1149
936
  return;
1150
937
  }
1151
- processing = true;
1152
- const controller = new AbortController();
1153
- currentAbortController = controller;
1154
- const signal = controller.signal;
1155
- const progressController = new AbortController();
1156
- activeProgressAbortController = progressController;
1157
- activeProgressSignal = progressController.signal;
1158
- activeProgressLastEventAt = Date.now();
1159
- const previousTranscriptChannelId = activeTranscriptChannelId;
938
+ const turn = agentTurnLifecycle.start();
939
+ const { controller, signal, progressController } = turn;
1160
940
  const session = resolveProjectSessionForChannel(msg.channelId);
1161
- activeTranscriptChannelId = session?.transcriptChannelId || msg.channelId;
941
+ bridge.activeTranscriptChannelId = session?.transcriptChannelId || msg.channelId;
1162
942
  const selectedAgentAdapter = adapterForProjectSession(session);
1163
943
  const projectContext = projectSessionContextText(session);
1164
- const recentDiscordContext = formatRecentDiscordContext(recentDiscordTextByChannel, {
1165
- channelId: activeTranscriptChannelId,
944
+ const recentDiscordContext = formatRecentDiscordContext(bridge.recentDiscordTextByChannel, {
945
+ channelId: bridge.activeTranscriptChannelId,
1166
946
  });
1167
947
  const plan = {
1168
948
  task: true,
1169
949
  label: selectedAgentAdapter.label,
1170
- verboseProgress,
950
+ verboseProgress: bridge.verboseProgress,
1171
951
  language: settings.voiceLanguage,
1172
952
  cwd: session?.workdir,
1173
953
  projectContext,
1174
954
  recentDiscordContext,
1175
955
  };
1176
956
  const sessionBefore = selectedAgentAdapter.readSessionId?.();
1177
- log('text agent request start', selectedAgentAdapter.label, sessionBefore ? 'resume-existing-session' : 'new-session', 'verbose', verboseProgress, session ? `project=${session.slug}` : 'project=default');
957
+ log('text agent request start', selectedAgentAdapter.label, sessionBefore ? 'resume-existing-session' : 'new-session', 'verbose', bridge.verboseProgress, session ? `project=${session.slug}` : 'project=default');
1178
958
  try {
1179
959
  const result = await selectedAgentAdapter.run(text, signal, plan);
1180
960
  const answer = result.answer || emptyAgentAnswer(settings.voiceLanguage);
1181
961
  const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer}`;
1182
962
  await sendChannelText(msg.channel, fullAnswerText);
1183
963
  stopProgressSpeech(progressController.signal, 'text-agent-answer-ready');
1184
- if (speakResponse && connection) {
964
+ if (speakResponse && bridge.connection) {
1185
965
  const spokenAnswer = spokenResultOnly(text, answer, settings.voiceLanguage);
1186
966
  await speakText(spokenAnswer, signal, null, { mirrorText: false });
1187
967
  }
@@ -1190,15 +970,11 @@ async function handleTextAgentMessage(msg, text, { speakResponse = false } = {})
1190
970
  warn('text agent request failed', e?.stack || e);
1191
971
  await sendChannelText(msg.channel, formatVoiceErrorMessage(settings.voiceLanguage, String(e?.message || e).slice(0, 800)));
1192
972
  } finally {
1193
- if (activeProgressAbortController && activeProgressAbortController.signal === progressController.signal && !activeProgressAbortController.signal.aborted) {
1194
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort text progress speech failed', e?.stack || e); }
1195
- }
1196
- if (activeProgressSignal === progressController.signal) activeProgressSignal = null;
1197
- if (activeProgressAbortController?.signal === progressController.signal) activeProgressAbortController = null;
973
+ // Text-path-only behaviour pre-refactor: drain the verbose-progress batch
974
+ // before tearing the controllers down. Kept explicit so the lifecycle's
975
+ // finish() can stay path-agnostic.
1198
976
  clearProgressSpeechBatch(progressController.signal);
1199
- if (currentAbortController === controller) currentAbortController = null;
1200
- activeTranscriptChannelId = previousTranscriptChannelId;
1201
- processing = false;
977
+ agentTurnLifecycle.finish(turn);
1202
978
  }
1203
979
  }
1204
980
 
@@ -1222,53 +998,6 @@ async function saveCapturedVoiceCloneSample(userId, wavPath, pcmBytes, segments,
1222
998
  return true;
1223
999
  }
1224
1000
 
1225
- async function handleVoiceCloneCommand(userId, prompt, signal = null) {
1226
- const command = voiceCloneCommandFromText(prompt);
1227
- if (!command) return false;
1228
- if (command.action === 'cancel') {
1229
- const cancelled = voiceCloneCapture.cancel(userId);
1230
- await sendText(cancelled ? '🎙️ 보이스 클로닝 샘플 캡처를 취소했어.' : '🎙️ 대기 중인 보이스 클로닝 샘플 캡처가 없어.');
1231
- await speakText(cancelled ? '목소리 샘플 녹음 대기를 취소했어.' : '대기 중인 목소리 샘플 녹음은 없어.', signal);
1232
- return true;
1233
- }
1234
- if (command.action === 'status') {
1235
- const current = voiceCloneCapture.current();
1236
- const status = current?.userId === String(userId)
1237
- ? `🎙️ 다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`
1238
- : '🎙️ 지금 대기 중인 보이스 클로닝 샘플 캡처는 없어.';
1239
- await sendText(status);
1240
- await speakText(current?.userId === String(userId) ? '다음에 말하는 목소리를 샘플로 저장할게.' : '대기 중인 목소리 샘플 녹음은 없어.', signal);
1241
- return true;
1242
- }
1243
- const armed = voiceCloneCapture.arm({ userId, source: 'voice-command' });
1244
- await sendText(`🎙️ 보이스 클로닝 샘플 캡처 대기 중. 다음 10초에서 30초 정도 말하면 ${path.relative(ROOT, armed.targetPath)}에 저장할게.`);
1245
- await speakText('좋아. 다음에 10초에서 30초 정도 말하면 그 음성을 목소리 샘플로 저장할게.', signal);
1246
- return true;
1247
- }
1248
-
1249
- function stopPlaybackForBargeIn(userId, reason = 'playback-barge-in') {
1250
- if (!speaking) return false;
1251
- log('stop playback for barge-in', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', activeTurnId);
1252
- speechPlaybackGeneration += 1;
1253
- try { player.stop(true); } catch (e) { warn('stop playback failed', e?.stack || e); }
1254
- speaking = false;
1255
- return true;
1256
- }
1257
-
1258
- function interruptCurrentResponse(userId, reason = 'barge-in') {
1259
- if (!speaking && !processing) return false;
1260
- const turnId = activeTurnId;
1261
- if (turnId) interruptedTurns.add(turnId);
1262
- log('interrupt current response', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', turnId);
1263
- if (currentAbortController && !currentAbortController.signal.aborted) {
1264
- try { currentAbortController.abort(); } catch (e) { warn('abort current response failed', e?.stack || e); }
1265
- }
1266
- try { player.stop(true); } catch (e) { warn('stop playback failed', e?.stack || e); }
1267
- speaking = false;
1268
- processing = false;
1269
- return true;
1270
- }
1271
-
1272
1001
  function acceptsWake(text) {
1273
1002
  if (!settings.requireWakeWord) return true;
1274
1003
  const low = text.toLowerCase();
@@ -1316,17 +1045,6 @@ async function concatWavs(files, output) {
1316
1045
  }
1317
1046
  }
1318
1047
 
1319
- function queueSegment(userId, file, pcmBytes, startedAtMs = Date.now(), endedAtMs = Date.now()) {
1320
- const pending = bridgeState.appendSegment(userId, {
1321
- file,
1322
- pcmBytes,
1323
- startedAtMs,
1324
- endedAtMs,
1325
- timerFactory: () => setTimeout(() => flushUtterance(userId).catch(e => warn('flushUtterance failed', userId, e?.stack || e)), UTTERANCE_IDLE_MS),
1326
- });
1327
- log('queued segment', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes, 'idleMs', UTTERANCE_IDLE_MS, 'epoch', pending.epoch);
1328
- }
1329
-
1330
1048
  function isBargeInCandidate(pcmBytes, levels) {
1331
1049
  const thresholds = currentBargeInThresholds();
1332
1050
  return isValidatedBargeInCandidate(pcmBytes, levels, thresholds);
@@ -1334,7 +1052,7 @@ function isBargeInCandidate(pcmBytes, levels) {
1334
1052
 
1335
1053
  function enqueueDeferredProcessingUtterance({ userId, wavPath, pcmBytes, segments, startedAtMs = Date.now() }) {
1336
1054
  const item = { userId, wavPath, pcmBytes, segments, startedAtMs };
1337
- const result = bridgeState.enqueueDeferred(item, enqueueDeferredUtterance, MAX_DEFERRED_PROCESSING_UTTERANCES);
1055
+ const result = bridge.bridgeState.enqueueDeferred(item, enqueueDeferredUtterance, MAX_DEFERRED_PROCESSING_UTTERANCES);
1338
1056
  if (!result.queued) {
1339
1057
  log('drop deferred utterance because queue disabled', userId, wavPath, 'max', MAX_DEFERRED_PROCESSING_UTTERANCES);
1340
1058
  return false;
@@ -1342,15 +1060,15 @@ function enqueueDeferredProcessingUtterance({ userId, wavPath, pcmBytes, segment
1342
1060
  if (result.dropped) {
1343
1061
  log('drop oldest deferred utterance because queue is full', result.dropped?.userId, result.dropped?.wavPath);
1344
1062
  }
1345
- log('queued deferred utterance while processing', userId, wavPath, 'queueSize', bridgeState.deferredSize(), 'epoch', bridgeState.currentEpoch());
1063
+ log('queued deferred utterance while processing', userId, wavPath, 'queueSize', bridge.bridgeState.deferredSize(), 'epoch', bridge.bridgeState.currentEpoch());
1346
1064
  return true;
1347
1065
  }
1348
1066
 
1349
1067
  async function drainDeferredProcessingUtterances() {
1350
- if (processing || bridgeState.deferredSize() === 0) return;
1351
- const next = bridgeState.shiftDeferred();
1068
+ if (bridge.processing || bridge.bridgeState.deferredSize() === 0) return;
1069
+ const next = bridge.bridgeState.shiftDeferred();
1352
1070
  if (!next) return;
1353
- log('drain deferred utterance', next.userId, next.wavPath, 'remaining', bridgeState.deferredSize());
1071
+ log('drain deferred utterance', next.userId, next.wavPath, 'remaining', bridge.bridgeState.deferredSize());
1354
1072
  const metricsTurn = newLatencyTurn(next.userId, next.startedAtMs || Date.now());
1355
1073
  metricsTurn.mark('voice_first_packet', next.startedAtMs || Date.now());
1356
1074
  metricsTurn.mark('utterance_flush');
@@ -1374,398 +1092,6 @@ async function validateProcessingBargeIn(userId, wavPath, pcmBytes, segments) {
1374
1092
  return { action: 'interrupt', text };
1375
1093
  }
1376
1094
 
1377
- async function flushUtterance(userId) {
1378
- const pending = bridgeState.deletePending(userId);
1379
- if (!pending) return;
1380
- if (pending.timer) clearTimeout(pending.timer);
1381
- const files = pending.files;
1382
- const pcmBytes = pending.pcmBytes;
1383
- const metricsTurn = newLatencyTurn(userId, pending.firstPacketAt || Date.now());
1384
- metricsTurn.mark('voice_first_packet', pending.firstPacketAt || Date.now());
1385
- metricsTurn.mark('voice_segment_end', pending.lastSegmentEndAt || Date.now());
1386
- metricsTurn.mark('utterance_flush');
1387
- metricsTurn.addMeta({ segments: files.length, pcmBytes, epoch: pending.epoch });
1388
- if (pending.epoch !== bridgeState.currentEpoch()) {
1389
- log('drop stale utterance after voice input queue reset', userId, 'utteranceEpoch', pending.epoch, 'currentEpoch', bridgeState.currentEpoch());
1390
- for (const file of files) fs.rm(file, { force: true }, () => {});
1391
- metricsTurn.finish({ status: 'stale_after_config_change' });
1392
- return;
1393
- }
1394
- if (pcmBytes < MIN_UTTERANCE_BYTES) {
1395
- log('skip short utterance', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'minBytes', MIN_UTTERANCE_BYTES);
1396
- metricsTurn.finish({ status: 'skip_short' });
1397
- return;
1398
- }
1399
- const merged = path.join(settings.debugDir, `utterance-merged-${stamp()}-${userId}.wav`);
1400
- await concatWavs(files, merged);
1401
- const levels = await analyzeAudio(merged);
1402
- log('utterance levels', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
1403
- if (await saveCapturedVoiceCloneSample(userId, merged, pcmBytes, files.length)) {
1404
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1405
- metricsTurn.finish({ status: 'voice_clone_sample_saved' });
1406
- return;
1407
- }
1408
- const candidate = isBargeInCandidate(pcmBytes, levels);
1409
- if (speaking || processing) {
1410
- const thresholds = currentBargeInThresholds();
1411
- if (!candidate) {
1412
- log('check weak barge-in for explicit stop transcript', userId, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholdBytes', thresholds.minBytes, 'thresholds', thresholds.minMeanDb, thresholds.minMaxDb, 'mode', thresholds.mode);
1413
- }
1414
- const validation = await validateProcessingBargeIn(userId, merged, pcmBytes, files.length);
1415
- if (validation?.action === 'interrupt') {
1416
- metricsTurn.finish({ status: processing ? 'barge_in_processing_interrupt' : 'barge_in_playback_interrupt' });
1417
- return;
1418
- }
1419
- if (processing && validation?.action === 'defer') {
1420
- const queued = enqueueDeferredProcessingUtterance({
1421
- userId,
1422
- wavPath: merged,
1423
- pcmBytes,
1424
- segments: files.length,
1425
- startedAtMs: pending.firstPacketAt || Date.now(),
1426
- });
1427
- metricsTurn.finish({ status: queued ? 'deferred_during_processing' : 'drop_deferred_during_processing' });
1428
- return;
1429
- }
1430
- metricsTurn.finish({ status: speaking ? 'barge_in_playback_ignored' : 'barge_in_processing_ignored' });
1431
- return;
1432
- }
1433
- // Drop only when BOTH overall energy and peak are low. Real Discord speech from this
1434
- // mic can have low mean volume while still carrying intelligible peaks; using OR here
1435
- // caused valid Korean utterances to be discarded as "low-energy".
1436
- if (levels.meanDb < MIN_MEAN_VOLUME_DB && levels.maxDb < MIN_MAX_VOLUME_DB) {
1437
- log('skip low-energy utterance', userId, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholds', MIN_MEAN_VOLUME_DB, MIN_MAX_VOLUME_DB, 'mode', 'both-below');
1438
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1439
- metricsTurn.finish({ status: 'skip_low_energy' });
1440
- return;
1441
- }
1442
- metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1443
- await handleRecording(userId, merged, pcmBytes, files.length, metricsTurn);
1444
- }
1445
-
1446
- async function handleRecording(userId, wavPath, pcmBytes, segments = 1, metricsTurn = null) {
1447
- if (processing) { log('drop while processing', userId); metricsTurn?.finish({ status: 'drop_processing' }); return; }
1448
- if (!isAllowed(userId)) { warn('ignore unauthorized', userId); metricsTurn?.finish({ status: 'unauthorized' }); return; }
1449
- processing = true;
1450
- const turnId = ++activeTurnId;
1451
- const controller = new AbortController();
1452
- currentAbortController = controller;
1453
- const signal = controller.signal;
1454
- const sessionForVoice = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1455
- const previousTranscriptChannelId = activeTranscriptChannelId;
1456
- activeTranscriptChannelId = sessionForVoice?.transcriptChannelId || settings.transcriptChannelId;
1457
- try {
1458
- const runtimeLanguage = reloadRuntimeLanguageFromEnv();
1459
- if (runtimeLanguage.changed) {
1460
- log('drop current utterance because language changed before STT', userId, 'turn', turnId, 'language', runtimeLanguage.voiceLanguage);
1461
- fs.rm(wavPath, { force: true }, () => {});
1462
- metricsTurn?.finish({ status: 'drop_stale_language_change' });
1463
- return;
1464
- }
1465
- const session = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1466
- activeTranscriptChannelId = session?.transcriptChannelId || settings.transcriptChannelId;
1467
- log('voice turn text target', session ? `project=${session.slug}` : 'project=default', 'channel', activeTranscriptChannelId ? 'project-or-default' : 'none');
1468
- log('transcribing', userId, wavPath, 'pcmBytes', pcmBytes, 'segments', segments, 'turn', turnId);
1469
- const sttNotice = formatSttStartMessage(settings.voiceLanguage);
1470
- await sendText(sttNotice);
1471
- const sttNoticeSpeech = STT_START_VOICE_NOTICE
1472
- ? speakImmediateNotice(sttNotice.replace(/^🎧\s*/u, ''), signal, 'stt-start')
1473
- : Promise.resolve();
1474
- const sttStart = Date.now();
1475
- const text = await transcribe(wavPath);
1476
- await sttNoticeSpeech;
1477
- metricsTurn?.stage('stt', Date.now() - sttStart, { transcriptChars: String(text || '').length });
1478
- if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_stt' }); return; }
1479
- if (!text) { log('empty transcript', userId, wavPath); metricsTurn?.finish({ status: 'empty_transcript' }); return; }
1480
- log(`user ${userId} said: ${text}`);
1481
- await sendText(formatSttResultMessage(settings.voiceLanguage, userId, text));
1482
- if (!acceptsWake(text)) { await sendText(formatWakeRejectedMessage(settings.voiceLanguage)); metricsTurn?.finish({ status: 'wake_rejected' }); return; }
1483
-
1484
- const prompt = stripWake(text);
1485
- if (await handleLanguageCommand(prompt, signal)) {
1486
- metricsTurn?.finish({ status: 'language_command' });
1487
- return;
1488
- }
1489
- if (await handleTtsVoiceCommand(prompt, signal)) {
1490
- metricsTurn?.finish({ status: 'voice_command' });
1491
- return;
1492
- }
1493
- if (await handleVoiceCloneCommand(userId, prompt, signal)) {
1494
- metricsTurn?.finish({ status: 'voice_clone_command' });
1495
- return;
1496
- }
1497
- const sensitivityRequest = sensitivityModeFromTranscript(prompt);
1498
- if (sensitivityRequest) {
1499
- const thresholds = setSensitivityMode(sensitivityRequest.mode, sensitivityRequest.reason);
1500
- await sendText(`🎚️ ${sensitivityStatusText()}`);
1501
- if (isSensitivityOnlyRequest(prompt)) {
1502
- await speakText(sensitivityChangedSpeech(thresholds.mode, settings.voiceLanguage), signal, metricsTurn);
1503
- metricsTurn?.finish({ status: 'sensitivity_only' });
1504
- return;
1505
- }
1506
- }
1507
- const verboseRequest = verboseModeFromTranscript(prompt);
1508
- if (verboseRequest !== null) {
1509
- setVerboseProgress(verboseRequest, 'voice-command');
1510
- await sendText(`🔎 ${verboseStatusText()}`);
1511
- if (isVerboseOnlyRequest(prompt)) {
1512
- await speakText(verboseChangedSpeech(verboseRequest, settings.voiceLanguage), signal, metricsTurn);
1513
- metricsTurn?.finish({ status: 'verbose_only' });
1514
- return;
1515
- }
1516
- }
1517
- let promptForAgent = prompt;
1518
- try {
1519
- const planOutcome = await dispatchPlanModeUtterance(prompt, signal);
1520
- if (planOutcome.handled) {
1521
- metricsTurn?.finish({ status: 'plan_mode' });
1522
- return;
1523
- }
1524
- if (planOutcome.prompt) promptForAgent = planOutcome.prompt;
1525
- } catch (e) {
1526
- warn('plan mode dispatch failed', e?.stack || e);
1527
- }
1528
- const selectedAgentAdapter = adapterForProjectSession(session);
1529
- const projectContext = projectSessionContextText(session);
1530
- const recentDiscordContext = formatRecentDiscordContext(recentDiscordTextByChannel, {
1531
- channelId: activeTranscriptChannelId,
1532
- });
1533
- const plan = {
1534
- task: true,
1535
- label: selectedAgentAdapter.label,
1536
- verboseProgress,
1537
- language: settings.voiceLanguage,
1538
- cwd: session?.workdir,
1539
- projectContext,
1540
- recentDiscordContext,
1541
- };
1542
- log('Agent plan', plan.label, 'backend', selectedAgentAdapter.backend, 'task', plan.task, 'language', plan.language, session ? `project=${session.slug}` : 'project=default');
1543
- const agentStart = Date.now();
1544
- const progressController = new AbortController();
1545
- activeProgressAbortController = progressController;
1546
- activeProgressSignal = progressController.signal;
1547
- activeProgressLastEventAt = Date.now();
1548
- const streamingTurnActive = beginStreamingTurn(signal);
1549
- const agentPromise = selectedAgentAdapter.ask(promptForAgent, signal, plan);
1550
- let done = false;
1551
- // Status announcements share one queue with verbose progress so they never
1552
- // talk over each other. In verbose mode, skip the generic initial prompt;
1553
- // the detailed tool/file/test events are the initial progress voice.
1554
- const progressLoop = (async () => {
1555
- if (!verboseProgress) {
1556
- await sleep(2500);
1557
- if (!done && !signal.aborted && !interruptedTurns.has(turnId)) {
1558
- const initial = /^en/i.test(String(settings.voiceLanguage || ''))
1559
- ? 'calling the agent.'
1560
- : '에이전트 호출했어. 응답 기다리는 중.';
1561
- queueProgressSpeechText(initial, progressController.signal, 'generic-initial');
1562
- }
1563
- }
1564
- let idleNotices = 0;
1565
- let nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1566
- let lastObservedProgressAt = activeProgressLastEventAt;
1567
- while (!done && !signal.aborted && !interruptedTurns.has(turnId) && idleNotices < PROGRESS_IDLE_NOTICE_LIMIT) {
1568
- await sleep(Math.min(PROGRESS_IDLE_CHECK_MS, nextIdleNoticeMs));
1569
- if (done || signal.aborted || interruptedTurns.has(turnId)) break;
1570
- if (activeProgressLastEventAt !== lastObservedProgressAt) {
1571
- lastObservedProgressAt = activeProgressLastEventAt;
1572
- nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1573
- continue;
1574
- }
1575
- const idleMs = Date.now() - activeProgressLastEventAt;
1576
- if (idleMs < nextIdleNoticeMs) continue;
1577
- idleNotices += 1;
1578
- activeProgressLastEventAt = Date.now();
1579
- lastObservedProgressAt = activeProgressLastEventAt;
1580
- const idle = /^en/i.test(String(settings.voiceLanguage || ''))
1581
- ? 'still working on that.'
1582
- : '아직 작업 중이야.';
1583
- queueProgressSpeechText(idle, progressController.signal, `idle-${idleNotices}-${Math.round(nextIdleNoticeMs / 1000)}s`);
1584
- nextIdleNoticeMs = Math.min(
1585
- PROGRESS_IDLE_NOTICE_MAX_MS,
1586
- Math.max(nextIdleNoticeMs + 1000, Math.round(nextIdleNoticeMs * PROGRESS_IDLE_NOTICE_MULTIPLIER)),
1587
- );
1588
- }
1589
- })().catch(e => {
1590
- if (!isAbortError(e)) warn('progress loop failed', e?.stack || e);
1591
- });
1592
- const answer = await agentPromise.finally(() => { done = true; });
1593
- if (streamingTurnActive) await endStreamingTurn();
1594
- metricsTurn?.stage('agent', Date.now() - agentStart, { answerChars: String(answer || '').length, backend: selectedAgentAdapter.backend });
1595
- void progressLoop;
1596
- if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_agent' }); return; }
1597
-
1598
- log('Agent answer', selectedAgentAdapter.label, answer.slice(0, 200));
1599
- const spokenAnswer = spokenResultOnly(prompt, answer, settings.voiceLanguage);
1600
- const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer || emptyAgentAnswer(settings.voiceLanguage)}`;
1601
- log('send agent answer text', 'chars', fullAnswerText.length);
1602
- const answerTextDelivered = await sendText(fullAnswerText);
1603
- if (!answerTextDelivered) {
1604
- warn('agent answer text delivery failed; still speaking answer');
1605
- }
1606
- log('spoken answer', spokenAnswer.slice(0, 200));
1607
- stopProgressSpeech(progressController.signal, 'agent-answer-ready');
1608
- if (streamingTurnActive && streamingSpeechDelivered) {
1609
- log('skipping post-run speakText; streaming already delivered audio');
1610
- } else {
1611
- await speakText(spokenAnswer, signal, metricsTurn, { mirrorText: !answerTextDelivered });
1612
- }
1613
- try {
1614
- const guildId = client.channels.cache.get(activeVoiceChannelId)?.guild?.id || '';
1615
- await maybeNotifyTaskComplete({
1616
- answer: spokenAnswer || answer,
1617
- label: selectedAgentAdapter.label,
1618
- elapsedMs: Date.now() - agentStart,
1619
- guildId,
1620
- });
1621
- } catch (e) { warn('maybeNotifyTaskComplete failed', e?.message || e); }
1622
- metricsTurn?.finish({ status: 'ok' });
1623
- } catch (e) {
1624
- if (isAbortError(e) || interruptedTurns.has(turnId)) {
1625
- log('turn aborted', userId, 'turn', turnId);
1626
- metricsTurn?.finish({ status: 'aborted' });
1627
- return;
1628
- }
1629
- warn('handleRecording failed', e?.stack || e);
1630
- const shortMsg = String(e?.message || e).slice(0, 800);
1631
- metricsTurn?.finish({ status: 'error', error: shortMsg });
1632
- await sendText(formatVoiceErrorMessage(settings.voiceLanguage, shortMsg));
1633
- } finally {
1634
- if (activeProgressAbortController && !activeProgressAbortController.signal.aborted) {
1635
- try { activeProgressAbortController.abort(); } catch (e) { warn('abort progress speech in cleanup failed', e?.stack || e); }
1636
- }
1637
- if (activeProgressSignal === activeProgressAbortController?.signal) activeProgressSignal = null;
1638
- activeProgressAbortController = null;
1639
- if (currentAbortController === controller) currentAbortController = null;
1640
- activeTranscriptChannelId = previousTranscriptChannelId;
1641
- interruptedTurns.delete(turnId);
1642
- if (activeTurnId === turnId) activeTurnId = 0;
1643
- processing = false;
1644
- if (bridgeState.deferredSize() > 0) {
1645
- setImmediate(() => drainDeferredProcessingUtterances().catch(e => warn('drain deferred utterance failed', e?.stack || e)));
1646
- }
1647
- }
1648
- }
1649
-
1650
- function subscribeUser(receiver, userId) {
1651
- if (!isAllowed(userId)) return;
1652
- if (String(userId) === client.user?.id) return;
1653
- const wasSpeaking = speaking;
1654
- const wasProcessing = processing;
1655
- if ((wasSpeaking || wasProcessing) && !activeStreams.has(userId)) {
1656
- // Speaking-start alone is too noisy in Discord voice. Record and validate a
1657
- // real segment first; only confirmed playback barge-in stops the current
1658
- // audio chunk, and only explicit stop transcripts abort active agent work.
1659
- log('possible barge-in start; waiting for segment validation', userId, 'speaking', wasSpeaking, 'processing', wasProcessing);
1660
- }
1661
- if (activeStreams.has(userId)) return;
1662
- const pending = bridgeState.getPending(userId);
1663
- if (pending?.timer) {
1664
- bridgeState.clearPendingTimer(userId);
1665
- log('extend pending utterance because new segment started', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes);
1666
- }
1667
-
1668
- const file = path.join(settings.debugDir, `segment-${stamp()}-${userId}.wav`);
1669
- log('subscribe user', userId, file);
1670
- const opusStream = receiver.subscribe(userId, { end: { behavior: EndBehaviorType.AfterSilence, duration: SUBSCRIBE_AFTER_SILENCE_MS } });
1671
- const decoder = new prism.opus.Decoder({ rate: 48000, channels: 2, frameSize: 960 });
1672
- const writer = new wav.FileWriter(file, { sampleRate: 48000, channels: 2, bitDepth: 16 });
1673
- activeStreams.set(userId, { opusStream, decoder, writer, file, startedAtMs: Date.now() });
1674
- let pcmBytes = 0;
1675
- const liveThresholds = wasSpeaking && !wasProcessing ? currentPlaybackBargeInThresholds() : currentBargeInThresholds();
1676
- const liveBargeIn = shouldUseLivePlaybackBargeIn({ speaking: wasSpeaking, processing: wasProcessing }) ? createLiveBargeInMonitor({
1677
- minBytes: liveThresholds.minBytes,
1678
- minMeanDb: liveThresholds.minMeanDb,
1679
- minMaxDb: liveThresholds.minMaxDb,
1680
- requireBoth: liveThresholds.requireBoth,
1681
- log,
1682
- onConfirm: ({ pcmBytes: confirmedBytes, levels }) => {
1683
- log('confirmed live playback barge-in before segment end', userId, 'pcmBytes', confirmedBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
1684
- stopPlaybackForBargeIn(userId, 'confirmed-live-playback-barge-in');
1685
- },
1686
- }) : null;
1687
- decoder.on('data', chunk => {
1688
- pcmBytes += chunk.length;
1689
- liveBargeIn?.push(chunk);
1690
- });
1691
- opusStream.on('error', e => warn('opus stream error', userId, e?.stack || e));
1692
- decoder.on('error', e => warn('opus decoder error', userId, e?.stack || e));
1693
- writer.on('error', e => warn('wav writer error', userId, e?.stack || e));
1694
- opusStream.on('end', () => log('opus end', userId, 'pcmBytes', pcmBytes));
1695
- writer.on('finish', () => {
1696
- const streamState = activeStreams.get(userId);
1697
- activeStreams.delete(userId);
1698
- const endedAtMs = Date.now();
1699
- log('saved segment', userId, 'pcmBytes', pcmBytes, file);
1700
- queueSegment(userId, file, pcmBytes, streamState?.startedAtMs || endedAtMs, endedAtMs);
1701
- });
1702
- opusStream.pipe(decoder).pipe(writer);
1703
- }
1704
-
1705
- async function connectTo(channel) {
1706
- if (connection) {
1707
- try { connection.destroy(); } catch {}
1708
- }
1709
- activeVoiceChannelId = channel.id;
1710
- connection = joinVoiceChannel({
1711
- channelId: channel.id,
1712
- guildId: channel.guild.id,
1713
- adapterCreator: channel.guild.voiceAdapterCreator,
1714
- selfDeaf: false,
1715
- selfMute: false,
1716
- });
1717
- const voiceConnection = connection;
1718
- voiceConnection.subscribe(player);
1719
- voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
1720
- voiceConnection.on('stateChange', async (oldState, newState) => {
1721
- log('voice connection state', oldState.status, '->', newState.status);
1722
- if (connection !== voiceConnection) {
1723
- log('ignore stale voice connection state', oldState.status, '->', newState.status);
1724
- return;
1725
- }
1726
- if (newState.status === VoiceConnectionStatus.Disconnected) {
1727
- try {
1728
- await Promise.race([
1729
- entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
1730
- entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
1731
- ]);
1732
- } catch (e) {
1733
- if (connection !== voiceConnection) return;
1734
- warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
1735
- try { voiceConnection.destroy(); } catch {}
1736
- connection = null;
1737
- setTimeout(() => connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err)), 1500);
1738
- }
1739
- }
1740
- });
1741
- await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
1742
- voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
1743
- log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
1744
- }
1745
-
1746
- async function autoJoin() {
1747
- const attempted = [];
1748
- for (const preferredName of settings.autoJoinVoiceChannels) {
1749
- for (const guild of client.guilds.cache.values()) {
1750
- const channels = await guild.channels.fetch();
1751
- for (const ch of channels.values()) {
1752
- if (!ch?.isVoiceBased?.() || ch.name.toLowerCase() !== preferredName) continue;
1753
- attempted.push(`${guild.name}/${ch.name}`);
1754
- try {
1755
- await connectTo(ch);
1756
- return;
1757
- } catch (e) {
1758
- warn('auto-join failed; trying next configured voice channel', guild.name, ch.name, e?.stack || e);
1759
- try { connection?.destroy(); } catch {}
1760
- connection = null;
1761
- activeVoiceChannelId = '';
1762
- }
1763
- }
1764
- }
1765
- }
1766
- warn('No auto-join channel found or reachable', settings.autoJoinVoiceChannels, 'attempted', attempted);
1767
- }
1768
-
1769
1095
  function consumeRestartNotice() {
1770
1096
  const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
1771
1097
  try {
@@ -1787,76 +1113,8 @@ async function announceRestartComplete() {
1787
1113
  await speakText(speech, undefined, null, { mirrorText: false });
1788
1114
  }
1789
1115
 
1790
- async function findVoiceChannelBySelector(guild, selector) {
1791
- const wanted = String(selector || '').trim();
1792
- if (!wanted || !guild) return null;
1793
- const id = wanted.replace(/^<#(\d+)>$/, '$1');
1794
- const channels = await guild.channels.fetch();
1795
- const voiceChannels = [...channels.values()].filter(ch => ch?.isVoiceBased?.());
1796
- const byId = voiceChannels.find(ch => ch.id === id);
1797
- if (byId) return byId;
1798
- const matches = voiceChannels.filter(ch => String(ch.name || '').toLowerCase() === wanted.toLowerCase());
1799
- if (matches.length === 1) return matches[0];
1800
- if (matches.length > 1) throw new Error(`같은 이름의 음성 채널이 여러 개야. 채널 ID나 멘션으로 지정해줘: ${wanted}`);
1801
- throw new Error(`음성 채널을 찾지 못했어: ${wanted}`);
1802
- }
1803
-
1804
- async function voiceChannelLabel(guild, channelId) {
1805
- if (!channelId || !guild) return '없음';
1806
- try {
1807
- const ch = await guild.channels.fetch(channelId);
1808
- return ch?.name || '지정됨';
1809
- } catch {
1810
- return '지정됨';
1811
- }
1812
- }
1813
-
1814
- async function resolveVoiceChannelForAttach(msg, selector = '') {
1815
- if (selector) return findVoiceChannelBySelector(msg.guild, selector);
1816
- if (msg.member?.voice?.channel) return msg.member.voice.channel;
1817
- if (activeVoiceChannelId && msg.guild) {
1818
- try {
1819
- const ch = await msg.guild.channels.fetch(activeVoiceChannelId);
1820
- if (ch?.isVoiceBased?.()) return ch;
1821
- } catch {}
1822
- }
1823
- throw new Error('붙일 음성 채널을 못 찾았어. 음성채널에 들어가서 `!session attach-voice`를 치거나 `--voice "채널명"`을 붙여줘.');
1824
- }
1825
-
1826
- async function attachVoiceChannelToTextSession(msg, command) {
1827
- const voiceChannel = await resolveVoiceChannelForAttach(msg, command.voice);
1828
- let session = null;
1829
- if (command.name) {
1830
- session = bindProjectSessionToChannel({ state: projectSessionsState, nameOrSlug: command.name, channelId: msg.channelId });
1831
- } else {
1832
- session = resolveProjectSessionForChannel(msg.channelId)
1833
- || resolveProjectSessionForChannel(voiceChannel.id);
1834
- if (!session) {
1835
- const fallbackName = String(msg.channel?.name || `channel-${msg.channelId}`).trim() || `channel-${msg.channelId}`;
1836
- session = createProjectSession({
1837
- root: ROOT,
1838
- state: projectSessionsState,
1839
- name: fallbackName,
1840
- workdir: settings.agent.cwd || ROOT,
1841
- channelId: msg.channelId,
1842
- voiceChannelId: voiceChannel.id,
1843
- transcriptChannelId: msg.channelId,
1844
- mcpContext: 'Ad-hoc Discord text channel session',
1845
- });
1846
- }
1847
- }
1848
- session.transcriptChannelId = msg.channelId;
1849
- session.voiceChannelId = voiceChannel.id;
1850
- projectSessionsState.channelSessions[msg.channelId] = session.slug;
1851
- projectSessionsState.channelSessions[voiceChannel.id] = session.slug;
1852
- saveProjectSessionsState();
1853
- agentAdaptersBySession.delete(session.slug);
1854
- if (activeVoiceChannelId !== voiceChannel.id) await connectTo(voiceChannel);
1855
- return msg.reply(`${session.name} 세션을 이 텍스트 채널과 음성 채널 ${voiceChannel.name}에 붙였어. 이제 그 음성채널 발화의 STT/답변 텍스트는 이 채널로 가.`);
1856
- }
1857
-
1858
1116
  async function handleProjectSessionCommand(msg, command) {
1859
- const activeSession = resolveProjectSessionForChannel(msg.channelId) || resolveProjectSessionForChannel(activeVoiceChannelId);
1117
+ const activeSession = resolveProjectSessionForChannel(msg.channelId) || resolveProjectSessionForChannel(bridge.activeVoiceChannelId);
1860
1118
  if (command.action === 'attach-voice') return void await attachVoiceChannelToTextSession(msg, command);
1861
1119
  if (command.action === 'status') {
1862
1120
  if (!activeSession) return void msg.reply(`${agentAdapter.label} 기본 세션: ${agentAdapter.readSessionId?.() || '아직 없음'}`);
@@ -1914,7 +1172,8 @@ async function handleProjectSessionCommand(msg, command) {
1914
1172
  mcpContext: command.mcpContext,
1915
1173
  });
1916
1174
  saveProjectSessionsState();
1917
- agentAdaptersBySession.delete(session.slug);
1175
+ bridge.agentAdaptersBySession.delete(session.slug);
1176
+ invalidateBackendAdaptersForSession(session.slug);
1918
1177
  return void msg.reply(`${session.name} 프로젝트 세션 만들었어. 작업실은 ${session.workdir}이고, 이 텍스트 채널${voiceChannel ? `과 음성 채널 ${voiceChannel.name}` : ''} 입력은 별도 Hermes 세션 파일로 이어져.`);
1919
1178
  }
1920
1179
  }
@@ -1925,130 +1184,22 @@ client.once('ready', async () => {
1925
1184
  await announceRestartComplete();
1926
1185
  });
1927
1186
 
1928
- client.on('messageCreate', async msg => {
1929
- if (msg.author.bot) return;
1930
- if (!isAllowed(msg.author.id)) return;
1931
- const content = msg.content.trim();
1932
- appendRecentDiscordText(recentDiscordTextByChannel, {
1933
- channelId: msg.channelId,
1934
- authorLabel: msg.member?.displayName || msg.author?.username || 'user',
1935
- content,
1936
- });
1937
- const projectSessionCommand = parseProjectSessionCommand(content);
1938
- if (projectSessionCommand) {
1939
- try {
1940
- await handleProjectSessionCommand(msg, projectSessionCommand);
1941
- } catch (e) {
1942
- warn('project session command failed', e?.stack || e);
1943
- await msg.reply(String(e?.message || e).slice(0, 700));
1944
- }
1945
- return;
1946
- }
1947
- if (content === '!ping') return void msg.reply('pong');
1948
- if (content === '!verbose') return void msg.reply(verboseStatusText());
1949
- if (['!verbose on', '!verbose true', '!verbose 1', '!verbose 켜', '!verbose 켜줘'].includes(content.toLowerCase())) {
1950
- setVerboseProgress(true, 'discord-command');
1951
- return void msg.reply(verboseStatusText());
1952
- }
1953
- if (['!verbose off', '!verbose false', '!verbose 0', '!verbose 꺼', '!verbose 꺼줘'].includes(content.toLowerCase())) {
1954
- setVerboseProgress(false, 'discord-command');
1955
- return void msg.reply(verboseStatusText());
1956
- }
1957
- if (content === '!notify') return void msg.reply(notifyStatusText());
1958
- if (['!notify on', '!notify always', '!notify 1'].includes(content.toLowerCase())) {
1959
- notifyUserOptIn = true;
1960
- return void msg.reply(notifyStatusText());
1961
- }
1962
- if (['!notify off', '!notify auto', '!notify 0'].includes(content.toLowerCase())) {
1963
- notifyUserOptIn = false;
1964
- return void msg.reply(notifyStatusText());
1965
- }
1966
- if (content === '!smart-progress' || content === '!smart_progress') return void msg.reply(smartProgressStatusText());
1967
- if (['!smart-progress on', '!smart-progress true', '!smart-progress 1', '!smart_progress on'].includes(content.toLowerCase())) {
1968
- smartProgressEnabled = true;
1969
- return void msg.reply(smartProgressStatusText());
1970
- }
1971
- if (['!smart-progress off', '!smart-progress false', '!smart-progress 0', '!smart_progress off'].includes(content.toLowerCase())) {
1972
- smartProgressEnabled = false;
1973
- return void msg.reply(smartProgressStatusText());
1974
- }
1975
- if (content === '!sensitivity') return void msg.reply(sensitivityStatusText());
1976
- if (content === '!latency' || content === '!metrics') {
1977
- const summary = summarizeLatencyRecords(readJsonlRecords(settings.latencyLogPath, { limit: 200 }));
1978
- return void msg.reply(`최근 latency 요약 (${settings.latencyLogPath}):\n${formatLatencySummary(summary)}`.slice(0, 1900));
1979
- }
1980
- if (content === '!sensitivity conservative') {
1981
- setSensitivityMode('conservative', 'discord-command');
1982
- return void msg.reply(sensitivityStatusText());
1983
- }
1984
- if (content === '!sensitivity normal') {
1985
- setSensitivityMode('normal', 'discord-command');
1986
- return void msg.reply(sensitivityStatusText());
1987
- }
1988
- if (content === '!session') return void handleProjectSessionCommand(msg, { action: 'status' });
1989
- if (content === '!reset-session') return void handleProjectSessionCommand(msg, { action: 'reset' });
1990
- if (content === '!join') {
1991
- const ch = msg.member?.voice?.channel;
1992
- if (!ch) return void msg.reply('먼저 음성 채널에 들어가줘.');
1993
- await connectTo(ch);
1994
- return void msg.reply('들어왔어. Node receiver로 듣는 중.');
1995
- }
1996
- if (content === '!leave') {
1997
- try { connection?.destroy(); } catch {}
1998
- connection = null;
1999
- activeVoiceChannelId = '';
2000
- return void msg.reply('나갈게.');
2001
- }
2002
- if (content.startsWith('!say ')) {
2003
- const text = content.slice(5).trim();
2004
- const mp3 = await synthTTS(text);
2005
- await playAudio(mp3);
2006
- return;
2007
- }
2008
- if (content.startsWith('!voice-test ')) {
2009
- const text = content.slice('!voice-test '.length).trim();
2010
- if (!text) return void msg.reply('테스트할 문장을 붙여줘.');
2011
- const started = Date.now();
2012
- try {
2013
- await msg.reply(`TTS 백엔드 ${ttsBackend.name}로 음성 테스트할게.`);
2014
- await speakText(text);
2015
- await msg.channel.send(`음성 테스트 완료: ${ttsBackend.name}, ${Date.now() - started}ms`);
2016
- } catch (e) {
2017
- warn('voice-test failed', e?.stack || e);
2018
- await msg.channel.send(`음성 테스트 실패: ${String(e?.message || e).slice(0, 700)}`);
2019
- }
2020
- return;
2021
- }
2022
- if (content === '!voice-clone' || content === '!voice-clone status') {
2023
- const current = voiceCloneCapture.current();
2024
- if (current?.userId === String(msg.author.id)) {
2025
- return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`);
2026
- }
2027
- return void msg.reply('대기 중인 보이스 클로닝 샘플 캡처가 없어. `!voice-clone capture`로 시작해.');
2028
- }
2029
- if (content === '!voice-clone cancel') {
2030
- const cancelled = voiceCloneCapture.cancel(msg.author.id);
2031
- return void msg.reply(cancelled ? '보이스 클로닝 샘플 캡처를 취소했어.' : '대기 중인 캡처가 없어.');
2032
- }
2033
- if (content === '!voice-clone capture') {
2034
- const armed = voiceCloneCapture.arm({ userId: msg.author.id, source: 'discord-command' });
2035
- return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, armed.targetPath)}에 저장할게. 음성 채널에서 10~30초 정도 말해줘.`);
2036
- }
2037
- if (content.startsWith('!ask ')) {
2038
- const text = content.slice(5).trim();
2039
- if (!text) return void msg.reply('물어볼 내용을 붙여줘.');
2040
- await handleTextAgentMessage(msg, text, { speakResponse: true });
2041
- return;
2042
- }
2043
- if (shouldRouteDiscordTextToAgent({
2044
- content,
2045
- channelId: msg.channelId,
2046
- transcriptChannelId: settings.transcriptChannelId,
2047
- }) || resolveProjectSessionForChannel(msg.channelId)) {
2048
- await handleTextAgentMessage(msg, content, { speakResponse: false });
2049
- return;
2050
- }
1187
+ const discordCommandRouter = createDiscordCommandRouter({
1188
+ bridge, settings, warn, path, ROOT,
1189
+ isAllowed,
1190
+ handleProjectSessionCommand,
1191
+ handleTextAgentMessage,
1192
+ resolveProjectSessionForChannel,
1193
+ verboseStatusText, setVerboseProgress,
1194
+ notifyStatusText,
1195
+ smartProgressStatusText,
1196
+ sensitivityStatusText, setSensitivityMode,
1197
+ summarizeLatencyRecords, readJsonlRecords, formatLatencySummary,
1198
+ connectTo,
1199
+ synthTTS, playAudio, speakText,
1200
+ voiceCloneCapture,
2051
1201
  });
1202
+ client.on('messageCreate', msg => discordCommandRouter.handleDiscordMessage(msg).catch(e => warn('discord command router failed', e?.stack || e)));
2052
1203
 
2053
1204
  process.stdout?.on?.('error', error => {
2054
1205
  if (isBenignTransientNetworkError(error)) {
@@ -2078,37 +1229,6 @@ process.on('uncaughtException', error => {
2078
1229
  client.on('error', e => warn('discord client error', e?.stack || e));
2079
1230
  client.on('shardError', e => warn('discord shard error', e?.stack || e));
2080
1231
 
2081
- let shutdownStarted = false;
2082
- async function gracefulShutdown(signalName) {
2083
- if (shutdownStarted) return;
2084
- shutdownStarted = true;
2085
- log('graceful shutdown requested', signalName, 'connection', Boolean(connection));
2086
- try {
2087
- if (currentAbortController && !currentAbortController.signal.aborted) currentAbortController.abort();
2088
- } catch (e) {
2089
- warn('abort before shutdown failed', e?.stack || e);
2090
- }
2091
- try {
2092
- if (connection) {
2093
- let detail = '';
2094
- const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
2095
- try {
2096
- if (fs.existsSync(noticePath)) {
2097
- detail = fs.readFileSync(noticePath, 'utf8').replace(/\s+/g, ' ').trim().slice(0, 120);
2098
- }
2099
- } catch (e) {
2100
- warn('read restart notice failed', e?.stack || e);
2101
- }
2102
- await speakText(formatRestartShutdownNotice(detail, settings.tts.edge.voice));
2103
- await waitEvent(player, AudioPlayerStatus.Idle, 30000).catch(() => {});
2104
- }
2105
- } catch (e) {
2106
- warn('shutdown voice notice failed', e?.stack || e);
2107
- }
2108
- try { connection?.destroy(); } catch {}
2109
- try { client.destroy(); } catch {}
2110
- process.exit(0);
2111
- }
2112
1232
  process.on('SIGTERM', () => { void gracefulShutdown('SIGTERM'); });
2113
1233
  process.on('SIGINT', () => { void gracefulShutdown('SIGINT'); });
2114
1234