verbalcoding 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/.env.example +83 -0
  2. package/LICENSE +21 -0
  3. package/README.md +157 -0
  4. package/app-node/agent_adapters.mjs +576 -0
  5. package/app-node/agent_adapters.test.mjs +455 -0
  6. package/app-node/agent_contract.mjs +45 -0
  7. package/app-node/barge_in.mjs +148 -0
  8. package/app-node/barge_in.test.mjs +179 -0
  9. package/app-node/bridge_logger.mjs +66 -0
  10. package/app-node/bridge_logger.test.mjs +73 -0
  11. package/app-node/bridge_state.mjs +104 -0
  12. package/app-node/bridge_state.test.mjs +64 -0
  13. package/app-node/cli_install.test.mjs +97 -0
  14. package/app-node/deferred_queue.mjs +12 -0
  15. package/app-node/deferred_queue.test.mjs +20 -0
  16. package/app-node/discord_invite_cli.test.mjs +31 -0
  17. package/app-node/discord_text.mjs +29 -0
  18. package/app-node/discord_text.test.mjs +32 -0
  19. package/app-node/hermes_profiles.mjs +164 -0
  20. package/app-node/hermes_profiles.test.mjs +276 -0
  21. package/app-node/install_config.mjs +263 -0
  22. package/app-node/install_config.test.mjs +205 -0
  23. package/app-node/instance_doctor.mjs +137 -0
  24. package/app-node/instance_doctor.test.mjs +128 -0
  25. package/app-node/instance_profile_lifecycle.mjs +16 -0
  26. package/app-node/instances.mjs +153 -0
  27. package/app-node/instances.test.mjs +102 -0
  28. package/app-node/language_config.mjs +73 -0
  29. package/app-node/language_config.test.mjs +51 -0
  30. package/app-node/latency_metrics.mjs +133 -0
  31. package/app-node/latency_metrics.test.mjs +71 -0
  32. package/app-node/main.mjs +1771 -0
  33. package/app-node/mcp_tools.mjs +198 -0
  34. package/app-node/mcp_tools.test.mjs +39 -0
  35. package/app-node/progress_cache.mjs +7 -0
  36. package/app-node/progress_cache.test.mjs +23 -0
  37. package/app-node/progress_speech.mjs +102 -0
  38. package/app-node/progress_speech.test.mjs +48 -0
  39. package/app-node/project_sessions.mjs +148 -0
  40. package/app-node/project_sessions.test.mjs +77 -0
  41. package/app-node/restart_notice.mjs +57 -0
  42. package/app-node/restart_notice.test.mjs +37 -0
  43. package/app-node/restart_policy.mjs +27 -0
  44. package/app-node/restart_policy.test.mjs +33 -0
  45. package/app-node/text_routing.mjs +8 -0
  46. package/app-node/text_routing.test.mjs +18 -0
  47. package/app-node/tts_backends.mjs +251 -0
  48. package/app-node/tts_backends.test.mjs +400 -0
  49. package/app-node/tts_chunks.mjs +57 -0
  50. package/app-node/tts_chunks.test.mjs +35 -0
  51. package/app-node/tts_prefetch.mjs +38 -0
  52. package/app-node/tts_prefetch.test.mjs +49 -0
  53. package/app-node/tts_settings.mjs +72 -0
  54. package/app-node/tts_settings.test.mjs +127 -0
  55. package/app-node/tts_voice_config.mjs +127 -0
  56. package/app-node/tts_voice_config.test.mjs +64 -0
  57. package/app-node/voice_clone_capture.mjs +76 -0
  58. package/app-node/voice_clone_capture.test.mjs +51 -0
  59. package/app-node/voice_messages.mjs +62 -0
  60. package/app-node/voice_messages.test.mjs +33 -0
  61. package/docs/CONFIGURATION.md +183 -0
  62. package/docs/FRESH_INSTALL.md +193 -0
  63. package/docs/MULTI_INSTANCE.md +183 -0
  64. package/docs/RELEASE.md +72 -0
  65. package/docs/USAGE.md +108 -0
  66. package/docs/assets/figures/verbalcoding-flow.svg +63 -0
  67. package/docs/i18n/README.es.md +121 -0
  68. package/docs/i18n/README.fr.md +121 -0
  69. package/docs/i18n/README.ja.md +121 -0
  70. package/docs/i18n/README.ko.md +121 -0
  71. package/docs/i18n/README.ru.md +121 -0
  72. package/docs/i18n/README.zh.md +121 -0
  73. package/package.json +58 -0
  74. package/run.sh +82 -0
  75. package/scripts/bootstrap_prereqs.sh +193 -0
  76. package/scripts/cli.mjs +369 -0
  77. package/scripts/docker_ubuntu_smoke.sh +76 -0
  78. package/scripts/doctor.mjs +134 -0
  79. package/scripts/install.mjs +108 -0
  80. package/scripts/install.sh +44 -0
  81. package/scripts/mcp-server.mjs +84 -0
  82. package/scripts/openvoice_smoke.py +34 -0
  83. package/scripts/openvoice_synth.py +103 -0
  84. package/scripts/setup_openvoice.sh +34 -0
  85. package/scripts/setup_supertonic.sh +18 -0
@@ -0,0 +1,1771 @@
1
+ import fs from 'node:fs';
2
+ import os from 'node:os';
3
+ import path from 'node:path';
4
+ import { fileURLToPath } from 'node:url';
5
+ import { spawn, execFile } from 'node:child_process';
6
+ import { promisify } from 'node:util';
7
+
8
+ import { Client, GatewayIntentBits, Partials } from 'discord.js';
9
+ import {
10
+ AudioPlayerStatus,
11
+ EndBehaviorType,
12
+ StreamType,
13
+ VoiceConnectionStatus,
14
+ createAudioPlayer,
15
+ createAudioResource,
16
+ entersState,
17
+ joinVoiceChannel,
18
+ } from '@discordjs/voice';
19
+ import prism from 'prism-media';
20
+ import wav from 'wav';
21
+ import { buildAgentSettings, createAgentAdapter, isPatchLikeOutput } from './agent_adapters.mjs';
22
+ import {
23
+ appendJsonl,
24
+ createLatencyTurn,
25
+ formatLatencySummary,
26
+ readJsonlRecords,
27
+ summarizeLatencyRecords,
28
+ } from './latency_metrics.mjs';
29
+ import { splitForTTS } from './tts_chunks.mjs';
30
+ import { playChunkedTTSWithPrefetch } from './tts_prefetch.mjs';
31
+ import { progressCategory, summarizeProgressEvents, formatProgressMessage } from './progress_speech.mjs';
32
+ import { buildTtsSettings } from './tts_settings.mjs';
33
+ import { createTtsBackend } from './tts_backends.mjs';
34
+ import {
35
+ applyTtsVoiceSelectionToEnv,
36
+ defaultTtsVoiceConfig,
37
+ effectiveTtsVoiceSelection,
38
+ preferredVoiceTypeForLanguage,
39
+ readTtsVoiceConfig,
40
+ updateTtsVoiceConfig,
41
+ voiceCommandFromTranscript,
42
+ writeTtsVoiceConfig,
43
+ } from './tts_voice_config.mjs';
44
+ import { createBridgeLogger, createTransientErrorReporter, isTransientNetworkError } from './bridge_logger.mjs';
45
+ import { createBridgeState } from './bridge_state.mjs';
46
+ import { sendDiscordText, splitDiscordMessage } from './discord_text.mjs';
47
+ import { progressTtsCacheFileName } from './progress_cache.mjs';
48
+ import { shouldPassWhisperLanguage, voiceLanguageCommandFromTranscript, languagePreset } from './language_config.mjs';
49
+ import { formatRestartCompleteNotice, formatRestartShutdownNotice } from './restart_notice.mjs';
50
+ import { shouldRouteDiscordTextToAgent } from './text_routing.mjs';
51
+ import {
52
+ bindProjectSessionToChannel,
53
+ createProjectSession,
54
+ listProjectSessions,
55
+ loadProjectSessions,
56
+ parseProjectSessionCommand,
57
+ projectSessionContextText,
58
+ projectSessionForChannel,
59
+ saveProjectSessions,
60
+ } from './project_sessions.mjs';
61
+ import {
62
+ agentAnswerHeader,
63
+ emptyAgentAnswer,
64
+ formatSttResultMessage,
65
+ formatSttStartMessage,
66
+ formatVoiceErrorMessage,
67
+ formatWakeRejectedMessage,
68
+ sensitivityChangedSpeech,
69
+ sensitivityStatusTextForLanguage,
70
+ verboseChangedSpeech,
71
+ verboseStatusTextForLanguage,
72
+ } from './voice_messages.mjs';
73
+ import { enqueueDeferredUtterance } from './deferred_queue.mjs';
74
+ import {
75
+ createVoiceCloneCaptureState,
76
+ saveVoiceCloneReference,
77
+ voiceCloneCommandFromText,
78
+ } from './voice_clone_capture.mjs';
79
+ import {
80
+ bargeInThresholdsForMode,
81
+ createLiveBargeInMonitor,
82
+ isBargeInCandidate as isValidatedBargeInCandidate,
83
+ isExplicitBargeInTranscript,
84
+ isRepeatedNoiseTranscript,
85
+ sensitivityModeFromTranscript,
86
+ shouldUseLivePlaybackBargeIn,
87
+ } from './barge_in.mjs';
88
+
89
+ const execFileAsync = promisify(execFile);
90
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
91
+ const ROOT = path.resolve(__dirname, '..');
92
+
93
/**
 * Load KEY=VALUE pairs from a dotenv-style file into process.env.
 *
 * Blank lines, lines starting with `#`, and lines without `=` are skipped.
 * A leading `export ` on the key is removed. A value wrapped in a matching
 * pair of quotes is unquoted: it is decoded with JSON.parse when that
 * succeeds, otherwise the outer quote characters are simply dropped.
 *
 * @param {string} [file] - Env file to read; defaults to `<ROOT>/.env`.
 * @param {{ override?: boolean }} [options] - With `override: false`,
 *   variables already present in process.env are left untouched.
 * @returns {void}
 */
function loadDotEnv(file = path.join(ROOT, '.env'), { override = true } = {}) {
  if (!fs.existsSync(file)) return;
  const text = fs.readFileSync(file, 'utf8');
  for (const raw of text.split(/\r?\n/)) {
    const line = raw.trim();
    if (!line || line.startsWith('#')) continue;
    const idx = line.indexOf('=');
    if (idx === -1) continue;
    const key = line.slice(0, idx).trim().replace(/^export\s+/, '');
    let value = line.slice(idx + 1).trim();
    // Require two characters so that a lone `"` or `'` is kept as a literal
    // value instead of being treated as an (empty) quoted string.
    const quote = value.length >= 2 ? value[0] : '';
    if ((quote === '"' || quote === "'") && value.endsWith(quote)) {
      try {
        value = JSON.parse(value);
      } catch {
        value = value.slice(1, -1);
      }
    }
    if (key && (override || !(key in process.env))) process.env[key] = value;
  }
}
108
+
109
/**
 * Load the project `.env` and, when VERBALCODING_INSTANCE_ENV is set, the
 * instance env file on top of it.
 *
 * Without an instance file the project `.env` overrides existing variables.
 * With one, the project `.env` only fills in missing variables and the
 * instance file is then loaded with override enabled.
 *
 * @returns {void}
 */
function loadRuntimeEnv() {
  const instanceEnvFile = process.env.VERBALCODING_INSTANCE_ENV || '';
  const hasInstanceEnv = Boolean(instanceEnvFile);
  loadDotEnv(path.join(ROOT, '.env'), { override: !hasInstanceEnv });
  if (!hasInstanceEnv) return;
  loadDotEnv(instanceEnvFile, { override: true });
}
114
+
115
/**
 * Copy simple `export NAME=value` lines from `~/.zshrc` into process.env.
 *
 * Only variables that are currently unset or empty are filled in. Values are
 * taken literally (no shell expansion); one matching pair of outer quotes is
 * removed. Lines whose name is not a plain identifier, such as
 * `export -f NAME=...`, are ignored rather than creating an oddly named
 * environment variable.
 *
 * @returns {void}
 */
function loadZshrcExports() {
  const zshrc = path.join(os.homedir(), '.zshrc');
  if (!fs.existsSync(zshrc)) return;
  const text = fs.readFileSync(zshrc, 'utf8');
  for (const raw of text.split(/\r?\n/)) {
    const line = raw.trim();
    if (!line.startsWith('export ')) continue;
    const idx = line.indexOf('=');
    if (idx === -1) continue;
    const key = line.slice('export '.length, idx).trim();
    // Skip option flags, multi-word names and other non-identifier text.
    if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(key)) continue;
    if (process.env[key]) continue;
    let value = line.slice(idx + 1).trim();
    // Two characters minimum so a lone quote character is kept literally.
    const quote = value.length >= 2 ? value[0] : '';
    if ((quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    process.env[key] = value;
  }
}
132
+
133
// Shell exports only fill in variables that are still unset, so they are
// loaded first; the .env files are applied afterwards.
loadZshrcExports();
loadRuntimeEnv();

// Location of the persisted TTS voice configuration (overridable via env).
const TTS_VOICE_CONFIG_PATH = process.env.TTS_VOICE_CONFIG
  || path.join(ROOT, 'config', 'tts-voices.json');
137
/**
 * Return the TTS voice configuration, first writing the default
 * configuration to TTS_VOICE_CONFIG_PATH when that file does not exist.
 *
 * @returns {*} Whatever readTtsVoiceConfig returns for the config path.
 */
function ensureTtsVoiceConfig() {
  const configExists = fs.existsSync(TTS_VOICE_CONFIG_PATH);
  if (!configExists) writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, defaultTtsVoiceConfig());
  return readTtsVoiceConfig(TTS_VOICE_CONFIG_PATH);
}
143
/**
 * Resolve the effective voice selection for `config` and copy the resulting
 * environment values into process.env.
 *
 * A VOICE_LANGUAGE that is already set in process.env takes precedence over
 * the value derived from the voice selection.
 *
 * @param {*} [config] - Voice configuration; defaults to ensureTtsVoiceConfig().
 * @returns {{ config: *, selection: * }} The config used and its selection.
 */
function applyVoiceConfigToProcessEnv(config = ensureTtsVoiceConfig()) {
  const selection = effectiveTtsVoiceSelection(config, {});
  const pinnedVoiceLanguage = process.env.VOICE_LANGUAGE;
  const nextEnv = applyTtsVoiceSelectionToEnv(process.env, selection);
  if (pinnedVoiceLanguage) nextEnv.VOICE_LANGUAGE = pinnedVoiceLanguage;
  Object.entries(nextEnv).forEach(([name, value]) => {
    process.env[name] = value;
  });
  return { config, selection };
}
151
/**
 * Re-read the env files and refresh the STT and voice languages in `settings`.
 *
 * When either language differs from its previous value (and a previous
 * whisper language existed), pending voice input queues are discarded.
 *
 * @returns {{ whisperLanguage: string, voiceLanguage: string, changed: boolean }}
 */
function reloadRuntimeLanguageFromEnv() {
  const before = {
    whisper: settings?.whisperLanguage,
    voice: settings?.voiceLanguage,
  };
  loadRuntimeEnv();

  const {
    WHISPER_CPP_LANGUAGE: whisperEnv,
    STT_LANGUAGE: sttEnv,
    VOICE_LANGUAGE: voiceEnv,
  } = process.env;
  settings.whisperLanguage = whisperEnv || sttEnv || settings.whisperLanguage || 'ko';
  settings.voiceLanguage = voiceEnv || whisperEnv || sttEnv || settings.voiceLanguage || 'ko';

  const whisperChanged = before.whisper !== settings.whisperLanguage;
  const voiceChanged = before.voice !== settings.voiceLanguage;
  const changed = before.whisper !== undefined && (whisperChanged || voiceChanged);
  if (changed) discardVoiceInputQueues('external-language-change');

  return {
    whisperLanguage: settings.whisperLanguage,
    voiceLanguage: settings.voiceLanguage,
    changed,
  };
}
163
// Push the persisted TTS voice selection into process.env before the settings
// object below is built from it.
applyVoiceConfigToProcessEnv();

/** Runtime configuration resolved once at startup from process.env. */
const settings = (() => {
  const env = process.env;
  const trimmed = item => item.trim();
  const trimmedLower = item => item.trim().toLowerCase();
  const parseList = (raw, separator, normalize) => raw.split(separator).map(normalize).filter(Boolean);

  return {
    token: env.DISCORD_BOT_TOKEN || env.DISCORD_TOKEN,
    // An empty set means "no restriction" (see isAllowed).
    allowedUsers: new Set(parseList(env.DISCORD_ALLOWED_USERS || '', /[;,]/, trimmed)),
    autoJoinVoiceChannels: parseList(env.AUTO_JOIN_VOICE_CHANNELS || '일반,General,general', ',', trimmedLower),
    transcriptChannelId: (env.TRANSCRIPT_CHANNEL_ID || '123456789012345678').trim(),
    whisperBin: env.WHISPER_CPP_BIN || 'whisper-cli',
    whisperModel: env.WHISPER_CPP_MODEL || path.join(ROOT, 'models', 'ggml-small-q5_1.bin'),
    whisperLanguage: env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko',
    voiceLanguage: env.VOICE_LANGUAGE || env.WHISPER_CPP_LANGUAGE || env.STT_LANGUAGE || 'ko',
    tts: buildTtsSettings(process.env, ROOT),
    requireWakeWord: ['1', 'true', 'yes'].includes((env.REQUIRE_WAKE_WORD || '0').toLowerCase()),
    wakeWords: parseList(env.WAKE_WORDS || 'hermes,헤르메스,허미스', ',', trimmedLower),
    debugDir: env.NODE_AUDIO_DEBUG_DIR || '/tmp/verbalcoding-node-debug',
    latencyLogPath: env.LATENCY_LOG_PATH || path.join(ROOT, '.logs', 'latency.jsonl'),
    projectSessionsPath: env.PROJECT_SESSIONS_FILE || path.join(ROOT, 'config', 'project-sessions.json'),
    agent: buildAgentSettings({ ROOT, env: process.env }),
  };
})();

// Fail fast without a bot token, then make sure the working directories exist.
if (!settings.token) throw new Error('DISCORD_BOT_TOKEN or DISCORD_TOKEN is required');
fs.mkdirSync(settings.debugDir, { recursive: true });
fs.mkdirSync(settings.tts.progressCacheDir, { recursive: true });
185
+
186
// Discord client with the guild, voice-state, message and message-content intents.
const client = new Client({
  intents: [
    GatewayIntentBits.Guilds,
    GatewayIntentBits.GuildVoiceStates,
    GatewayIntentBits.GuildMessages,
    GatewayIntentBits.MessageContent,
  ],
  partials: [Partials.Channel],
});

// The voice provider is a callback so the backend reads the current edge voice
// from settings each time it asks.
let ttsBackend = createTtsBackend(settings.tts, {
  execFileAsync,
  log,
  warn,
  voiceProvider: () => settings.tts.edge.voice,
});
const voiceCloneCapture = createVoiceCloneCaptureState({
  defaultTargetPath: settings.tts.openvoice.refAudio,
});

// Mutable bridge state; declared with `let` where later code reassigns it.
let connection = null;
let activeVoiceChannelId = '';
let activeTranscriptChannelId = '';
let player = createAudioPlayer();
let speaking = false;
let processing = false;
let activeTurnId = 0;
let currentAbortController = null;
const interruptedTurns = new Set();
const activeStreams = new Map();
let bridgeState = null;
204
/**
 * Read a numeric tuning value from process.env.
 *
 * Unset, blank, or non-numeric values fall back to `fallback`, so a typo in
 * the environment cannot turn a threshold into NaN (which would make every
 * comparison against it false).
 *
 * @param {string} name - Environment variable name.
 * @param {string|number} fallback - Default used when the variable is unusable.
 * @returns {number}
 */
function envNumberOrDefault(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || String(raw).trim() === '') return Number(fallback);
  const parsed = Number(raw);
  if (Number.isFinite(parsed)) return parsed;
  console.warn(`ignoring non-numeric ${name}=${JSON.stringify(raw)}; using ${fallback}`);
  return Number(fallback);
}

const MAX_DEFERRED_PROCESSING_UTTERANCES = envNumberOrDefault('MAX_DEFERRED_PROCESSING_UTTERANCES', '0');

// Minimum utterance length; the byte size is computed as 48000 * 2 * 2 bytes per second.
const MIN_UTTERANCE_SECONDS = envNumberOrDefault('MIN_UTTERANCE_SECONDS', '1.4');
const MIN_UTTERANCE_BYTES = 48000 * 2 * 2 * MIN_UTTERANCE_SECONDS;

// Barge-in thresholds for the normal sensitivity mode.
const BARGE_IN_MIN_SECONDS = envNumberOrDefault('BARGE_IN_MIN_SECONDS', '1.4');
const BARGE_IN_MIN_MEAN_VOLUME_DB = envNumberOrDefault('BARGE_IN_MIN_MEAN_VOLUME_DB', '-30');
const BARGE_IN_MIN_MAX_VOLUME_DB = envNumberOrDefault('BARGE_IN_MIN_MAX_VOLUME_DB', '-14');

// Barge-in thresholds used for playback mode.
const PLAYBACK_BARGE_IN_MIN_SECONDS = envNumberOrDefault('PLAYBACK_BARGE_IN_MIN_SECONDS', '0.9');
const PLAYBACK_BARGE_IN_MIN_MEAN_VOLUME_DB = envNumberOrDefault('PLAYBACK_BARGE_IN_MIN_MEAN_VOLUME_DB', '-36');
const PLAYBACK_BARGE_IN_MIN_MAX_VOLUME_DB = envNumberOrDefault('PLAYBACK_BARGE_IN_MIN_MAX_VOLUME_DB', '-18');
const PLAYBACK_BARGE_IN_REQUIRE_BOTH = !['0', 'false', 'no', 'off'].includes(String(process.env.PLAYBACK_BARGE_IN_REQUIRE_BOTH || '1').toLowerCase());

// Barge-in thresholds for the conservative sensitivity mode.
const BARGE_IN_CONSERVATIVE_MIN_SECONDS = envNumberOrDefault('BARGE_IN_CONSERVATIVE_MIN_SECONDS', '1.8');
const BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB = envNumberOrDefault('BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB', '-27');
const BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB = envNumberOrDefault('BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB', '-12');
const SENSITIVITY_MODE_DEFAULT = (process.env.BARGE_IN_SENSITIVITY_MODE || 'normal').toLowerCase() === 'conservative' ? 'conservative' : 'normal';
const SENSITIVITY_OUTDOOR_SECONDS = envNumberOrDefault('BARGE_IN_OUTDOOR_SECONDS', '900');

const SUBSCRIBE_AFTER_SILENCE_MS = envNumberOrDefault('SUBSCRIBE_AFTER_SILENCE_MS', '2200');
const UTTERANCE_IDLE_MS = envNumberOrDefault('UTTERANCE_IDLE_MS', '2000');
const MIN_MEAN_VOLUME_DB = envNumberOrDefault('MIN_MEAN_VOLUME_DB', '-35');
const MIN_MAX_VOLUME_DB = envNumberOrDefault('MIN_MAX_VOLUME_DB', '-12');
const STT_START_VOICE_NOTICE = !['0', 'false', 'no', 'off'].includes((process.env.STT_START_VOICE_NOTICE || '1').toLowerCase());
224
+
225
// Logger whose lines are additionally appended to BRIDGE_LOG_PATH when that
// variable is set at the time a line is written.
const bridgeLogger = createBridgeLogger({
  appendLine(line) {
    const logPath = process.env.BRIDGE_LOG_PATH;
    if (logPath) fs.appendFileSync(logPath, `${line}\n`);
  },
});

/** Forward arguments to the bridge logger's `log`. */
function log(...args) {
  bridgeLogger.log(...args);
}

/** Forward arguments to the bridge logger's `warn`. */
function warn(...args) {
  bridgeLogger.warn(...args);
}

bridgeState = createBridgeState({
  log,
  // Best-effort removal: errors are ignored.
  cleanupFile(file) {
    fs.rm(file, { force: true }, () => {});
  },
});
const reportTransientProcessError = createTransientErrorReporter({ warn });
235
/**
 * Tell whether `error` is classified as a transient network error by the
 * bridge logger helper.
 *
 * @param {*} error - Error to classify.
 * @returns {*} The result of isTransientNetworkError(error).
 */
function isBenignTransientNetworkError(error) {
  const transient = isTransientNetworkError(error);
  return transient;
}
238
/**
 * Append a latency record to the latency log and log its headline timings.
 * Failures are reported through `warn` and never thrown.
 *
 * @param {{ status?: *, durations?: object }} record - Latency record to store.
 * @returns {void}
 */
function writeLatencyRecord(record) {
  try {
    appendJsonl(settings.latencyLogPath, record);
    const timings = record.durations;
    log(
      'latency metric',
      'status', record.status,
      'total_ms', timings?.total_ms,
      'stt_ms', timings?.stt_ms,
      'agent_ms', timings?.agent_ms,
      'tts_total_ms', timings?.tts_total_ms,
    );
  } catch (err) {
    warn('write latency metric failed', err?.stack || err);
  }
}
246
/**
 * Create a latency turn whose records are written via writeLatencyRecord.
 * The id combines the current time, the user id and a short random hex suffix.
 *
 * @param {*} userId - User the turn belongs to.
 * @param {number} startedAtMs - Start timestamp passed through to the turn.
 * @returns {*} The value returned by createLatencyTurn.
 */
function newLatencyTurn(userId, startedAtMs) {
  const createdAt = Date.now();
  const randomSuffix = Math.random().toString(16).slice(2, 8);
  const id = `${createdAt}-${userId}-${randomSuffix}`;
  return createLatencyTurn({ id, userId, startedAtMs, writeRecord: writeLatencyRecord });
}
250
+
251
/**
 * Ask the bridge state to discard its queues.
 *
 * @param {string} [reason='config-change'] - Reason forwarded to the bridge state.
 * @returns {*} The bridge state's result, or 0 when it is falsy or no bridge
 *   state exists yet.
 */
function discardVoiceInputQueues(reason = 'config-change') {
  const discarded = bridgeState?.discardQueues(reason);
  return discarded || 0;
}
254
// Verbose progress reporting state; starts from the agent settings.
let verboseProgress = Boolean(settings.agent.verboseProgress);
let activeProgressSignal = null;
let verboseProgressSpeechQueue = Promise.resolve();
let activeProgressAbortController = null;
let speechPlaybackGeneration = 0;

// Progress speech batching state.
let progressSpeechBatch = [];
let progressSpeechBatchTimer = null;
let progressSpeechBatchSignal = null;
let progressSpeechBatchStartedAt = 0;

// Timestamp of the last progress event, plus the last progress text sent and
// when it was sent (used to suppress immediate repeats).
let activeProgressLastEventAt = 0;
let lastVerboseProgressText = '';
let lastVerboseProgressTextAt = 0;

// Idle-notice tuning; PROGRESS_IDLE_NOTICE_MS is accepted as a fallback name
// for the initial delay.
const PROGRESS_IDLE_NOTICE_INITIAL_MS = Number(
  process.env.PROGRESS_IDLE_NOTICE_INITIAL_MS || process.env.PROGRESS_IDLE_NOTICE_MS || '10000',
);
const PROGRESS_IDLE_NOTICE_MAX_MS = Number(process.env.PROGRESS_IDLE_NOTICE_MAX_MS || '30000');
const PROGRESS_IDLE_NOTICE_MULTIPLIER = Number(process.env.PROGRESS_IDLE_NOTICE_MULTIPLIER || '1.8');
const PROGRESS_IDLE_CHECK_MS = Number(process.env.PROGRESS_IDLE_CHECK_MS || '5000');
const PROGRESS_IDLE_NOTICE_LIMIT = Number(process.env.PROGRESS_IDLE_NOTICE_LIMIT || '20');

// Project sessions loaded from disk, and the per-session agent adapter cache.
const projectSessionsState = loadProjectSessions(settings.projectSessionsPath);
const agentAdaptersBySession = new Map();
273
/**
 * Build an agent adapter wired to this bridge's process helpers, logger and
 * progress reporting.
 *
 * Progress events are ignored unless verbose progress is enabled; otherwise
 * they refresh the last-event timestamp and are forwarded to both the text
 * and the speech progress channels with the currently active progress signal.
 *
 * @param {object} agentSettings - Settings passed through to createAgentAdapter.
 * @returns {*} The adapter created by createAgentAdapter.
 */
function createBridgeAgentAdapter(agentSettings) {
  const forwardProgress = event => {
    if (!verboseProgress) return;
    activeProgressLastEventAt = Date.now();
    sendVerboseProgressText(event, activeProgressSignal);
    queueVerboseProgressSpeech(event, activeProgressSignal);
  };
  const dependencies = {
    execFileAsync,
    spawn,
    log,
    warn,
    onProgress: forwardProgress,
  };
  return createAgentAdapter(agentSettings, dependencies);
}
287
+ const agentAdapter = createBridgeAgentAdapter(settings.agent);
288
/**
 * Return the agent adapter for a project session, creating and caching one
 * per session key (slug, falling back to name) on first use.
 *
 * @param {object|null|undefined} session - Project session, if any.
 * @returns {*} The default adapter when there is no session, otherwise the
 *   session-specific adapter.
 */
function adapterForProjectSession(session) {
  if (!session) return agentAdapter;

  const cacheKey = session.slug || session.name;
  if (agentAdaptersBySession.has(cacheKey)) return agentAdaptersBySession.get(cacheKey);

  const sessionSettings = {
    ...settings.agent,
    label: `${settings.agent.label} · ${session.name}`,
    sessionFile: session.sessionFile,
    cwd: session.workdir,
    projectContext: projectSessionContextText(session),
  };
  agentAdaptersBySession.set(cacheKey, createBridgeAgentAdapter(sessionSettings));
  return agentAdaptersBySession.get(cacheKey);
}
302
/**
 * Look up the project session bound to a channel.
 *
 * @param {string} channelId - Discord channel id.
 * @returns {object|null} The session, or null when the lookup is falsy.
 */
function resolveProjectSessionForChannel(channelId) {
  const session = projectSessionForChannel(projectSessionsState, channelId);
  return session || null;
}
305
/**
 * Persist the in-memory project sessions state to its configured file.
 *
 * @returns {void}
 */
function saveProjectSessionsState() {
  const { projectSessionsPath } = settings;
  saveProjectSessions(projectSessionsPath, projectSessionsState);
}
308
+ let sensitivityMode = SENSITIVITY_MODE_DEFAULT;
309
+ let sensitivityModeExpiresAt = 0;
310
/**
 * Return the barge-in thresholds for the current sensitivity mode.
 *
 * If a temporary mode has passed its expiry time, the mode is first reset to
 * the configured default and the expiry is cleared.
 *
 * @returns {*} The value returned by bargeInThresholdsForMode.
 */
function currentBargeInThresholds() {
  const expired = Boolean(sensitivityModeExpiresAt) && Date.now() > sensitivityModeExpiresAt;
  if (expired) {
    sensitivityMode = SENSITIVITY_MODE_DEFAULT;
    sensitivityModeExpiresAt = 0;
    log('barge-in sensitivity mode expired; restored', sensitivityMode);
  }

  const configured = {
    minSeconds: BARGE_IN_MIN_SECONDS,
    minMeanDb: BARGE_IN_MIN_MEAN_VOLUME_DB,
    minMaxDb: BARGE_IN_MIN_MAX_VOLUME_DB,
    conservativeMinSeconds: BARGE_IN_CONSERVATIVE_MIN_SECONDS,
    conservativeMinMeanDb: BARGE_IN_CONSERVATIVE_MIN_MEAN_VOLUME_DB,
    conservativeMinMaxDb: BARGE_IN_CONSERVATIVE_MIN_MAX_VOLUME_DB,
  };
  return bargeInThresholdsForMode(sensitivityMode, configured);
}
325
/**
 * Build the barge-in thresholds used in playback mode from the PLAYBACK_*
 * constants.
 *
 * @returns {{ minBytes: number, minSeconds: number, minMeanDb: number,
 *   minMaxDb: number, requireBoth: boolean, mode: string }}
 */
function currentPlaybackBargeInThresholds() {
  const minSeconds = PLAYBACK_BARGE_IN_MIN_SECONDS;
  const thresholds = {
    minBytes: 48000 * 2 * 2 * minSeconds,
    minSeconds,
    minMeanDb: PLAYBACK_BARGE_IN_MIN_MEAN_VOLUME_DB,
    minMaxDb: PLAYBACK_BARGE_IN_MIN_MAX_VOLUME_DB,
    requireBoth: PLAYBACK_BARGE_IN_REQUIRE_BOTH,
    mode: 'playback',
  };
  return thresholds;
}
335
/**
 * Switch the barge-in sensitivity mode.
 *
 * Any mode other than 'conservative' is treated as 'normal'. Conservative
 * mode gets an expiry of SENSITIVITY_OUTDOOR_SECONDS from now when that
 * constant is positive; otherwise no expiry is set.
 *
 * @param {string} mode - Requested mode.
 * @param {string} [reason='manual'] - Reason recorded in the log.
 * @returns {*} The thresholds in effect after the change.
 */
function setSensitivityMode(mode, reason = 'manual') {
  const conservative = mode === 'conservative';
  sensitivityMode = conservative ? 'conservative' : 'normal';

  sensitivityModeExpiresAt = 0;
  if (conservative && SENSITIVITY_OUTDOOR_SECONDS > 0) {
    sensitivityModeExpiresAt = Date.now() + SENSITIVITY_OUTDOOR_SECONDS * 1000;
  }

  const thresholds = currentBargeInThresholds();
  log(
    'barge-in sensitivity mode set', sensitivityMode,
    'reason', reason,
    'expiresAt', sensitivityModeExpiresAt || 'never',
    'thresholds', thresholds,
  );
  return thresholds;
}
344
/**
 * Describe the current barge-in sensitivity in the active voice language.
 * The remaining time is passed in whole seconds, or 0 when no expiry is set.
 *
 * @returns {*} The text produced by sensitivityStatusTextForLanguage.
 */
function sensitivityStatusText() {
  const thresholds = currentBargeInThresholds();
  let remainingSeconds = 0;
  if (sensitivityModeExpiresAt) {
    remainingSeconds = Math.max(0, Math.round((sensitivityModeExpiresAt - Date.now()) / 1000));
  }
  return sensitivityStatusTextForLanguage(thresholds, remainingSeconds, settings.voiceLanguage);
}
349
+
350
/**
 * Describe whether verbose progress is on, in the active voice language.
 *
 * @returns {*} The text produced by verboseStatusTextForLanguage.
 */
function verboseStatusText() {
  const { voiceLanguage } = settings;
  return verboseStatusTextForLanguage(verboseProgress, voiceLanguage);
}
353
+
354
/**
 * Pick an emoji for a progress event based on its category key.
 *
 * @param {*} event - Progress event passed to progressCategory.
 * @returns {string} The emoji for the category, or the gear emoji when the
 *   category is missing or not in the table.
 */
function progressEmoji(event) {
  const emojiByCategory = {
    test: '🧪',
    edit: '✏️',
    read: '📖',
    search: '🔎',
    terminal: '⌨️',
    skill: '🧰',
    browser: '🌐',
    tool: '🛠️',
    agent: '🤖',
    work: '⚙️',
  };
  const category = progressCategory(event, { language: settings.voiceLanguage });
  const emoji = emojiByCategory[category?.key];
  return emoji || '⚙️';
}
369
+
370
/**
 * Format a progress event as text in the active voice language.
 *
 * @param {*} event - Progress event.
 * @returns {*} The value returned by formatProgressMessage.
 */
function formatProgressText(event) {
  const options = { language: settings.voiceLanguage };
  return formatProgressMessage(event, options);
}
373
+
374
/**
 * Turn verbose progress reporting on or off and log the change.
 *
 * @param {*} enabled - Coerced to a boolean.
 * @param {string} [reason='manual'] - Reason recorded in the log.
 * @returns {boolean} The new verbose progress state.
 */
function setVerboseProgress(enabled, reason = 'manual') {
  const next = Boolean(enabled);
  verboseProgress = next;
  log('verbose progress mode set', verboseProgress, 'reason', reason);
  return verboseProgress;
}
379
+
380
/**
 * Update or append KEY=VALUE lines in `<ROOT>/.env`.
 *
 * Existing assignments (with or without a leading `export`) are rewritten in
 * place; keys not found are appended at the end. Unrelated lines and comments
 * are kept as they are.
 *
 * - Keys whose value is `undefined` or `null` are skipped, so the file never
 *   receives a literal `KEY=undefined`.
 * - If the existing file cannot be read, nothing is written: rewriting it
 *   from an empty line list would drop every other setting in the file.
 * - Values containing line breaks or outer whitespace are written as a
 *   JSON-quoted string so they stay on one line.
 *
 * @param {Record<string, unknown>} values - Variables to persist.
 * @returns {void}
 */
function persistEnvValues(values) {
  const envPath = path.join(ROOT, '.env');
  let lines = [];
  try {
    if (fs.existsSync(envPath)) lines = fs.readFileSync(envPath, 'utf8').split(/\r?\n/);
  } catch (e) {
    warn('read .env for update failed', e?.stack || e);
    return;
  }
  // The split leaves one empty entry for the file's final newline; drop it so
  // appended keys do not end up after a blank line.
  if (lines.length > 0 && lines[lines.length - 1] === '') lines.pop();

  const pending = new Map();
  for (const [key, value] of Object.entries(values ?? {})) {
    if (value === undefined || value === null) continue;
    const text = String(value);
    const needsQuoting = /[\r\n]/.test(text) || text !== text.trim();
    pending.set(key, needsQuoting ? JSON.stringify(text) : text);
  }

  const updated = lines.map(line => {
    const match = line.match(/^\s*(export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=.*$/);
    if (!match || !pending.has(match[2])) return line;
    const key = match[2];
    const value = pending.get(key);
    pending.delete(key);
    return `${match[1] ? 'export ' : ''}${key}=${value}`;
  });
  for (const [key, value] of pending) updated.push(`${key}=${value}`);

  fs.writeFileSync(envPath, `${updated.join('\n')}\n`, { mode: 0o600 });
}
400
+
401
/**
 * Switch the bridge to a new language at runtime and persist the change.
 *
 * Discards queued voice input, updates the STT and voice languages in
 * `settings`, selects the preferred voice type for the new language in the
 * TTS voice config, mirrors the result into process.env and writes it to
 * `.env`.
 *
 * @param {string} language - Language key understood by languagePreset.
 * @returns {object} The applied language preset.
 */
function applyRuntimeLanguage(language) {
  discardVoiceInputQueues('language-change');

  const preset = languagePreset(language);
  const { sttLanguage, voiceLanguage } = preset;
  settings.whisperLanguage = sttLanguage;
  settings.voiceLanguage = voiceLanguage;

  const currentConfig = ensureTtsVoiceConfig();
  const voiceType = preferredVoiceTypeForLanguage(currentConfig, voiceLanguage);
  const nextConfig = updateTtsVoiceConfig(currentConfig, { voiceType });
  writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, nextConfig);

  const { selection } = applyVoiceConfigToProcessEnv(nextConfig);
  settings.tts.backend = selection.backend;
  // Non-edge backends keep the preset's voice as the edge voice setting.
  if (selection.backend === 'edge') {
    settings.tts.edge.voice = selection.voice.voice;
  } else {
    settings.tts.edge.voice = preset.ttsVoice;
  }

  process.env.VOICE_LANGUAGE = voiceLanguage;
  process.env.WHISPER_CPP_LANGUAGE = sttLanguage;
  process.env.STT_LANGUAGE = sttLanguage;
  process.env.TTS_VOICE = settings.tts.edge.voice;
  process.env.TTS_VOICE_TYPE = selection.voiceType;

  persistEnvValues({
    VOICE_LANGUAGE: voiceLanguage,
    WHISPER_CPP_LANGUAGE: sttLanguage,
    STT_LANGUAGE: sttLanguage,
    TTS_BACKEND: selection.backend,
    TTS_VOICE: settings.tts.edge.voice,
    TTS_VOICE_TYPE: selection.voiceType,
  });
  return preset;
}
427
+
428
/**
 * Confirmation sentence spoken after a language change.
 *
 * @param {{ key: string }} preset - Applied language preset.
 * @returns {string} Korean text for 'ko', the auto-detect text for 'auto',
 *   and the English text for anything else.
 */
function languageChangedText(preset) {
  switch (preset.key) {
    case 'ko':
      return '언어를 한국어로 바꿨어. STT, 중간 음성, 최종 음성, 목소리 타입까지 한국어 설정으로 맞췄어.';
    case 'auto':
      return 'Language set to auto-detect STT with English voice. Progress voice will stay in English.';
    default:
      return 'Language set to English. STT, progress voice, final voice, and voice type are English now.';
  }
}
433
+
434
/**
 * Confirmation sentence spoken after a voice change.
 *
 * The sentence is Korean when the selected voice's language (or, failing
 * that, the current voice language) starts with "ko"; otherwise English.
 *
 * @param {{ voice?: { label?: string, language?: string }, voiceType?: string }} selection
 * @returns {string}
 */
function voiceChangedText(selection) {
  const label = selection.voice?.label || selection.voiceType;
  const language = selection.voice?.language || settings.voiceLanguage;
  const korean = /^ko/i.test(String(language));
  return korean ? `목소리를 ${label}로 바꿨어.` : `Voice changed to ${label}.`;
}
439
+
440
/**
 * Handle a spoken "change voice" command.
 *
 * When the transcript contains a voice command, the TTS voice config is
 * updated and written, the selection is applied to process.env and
 * `settings`, the result is persisted to `.env`, and a confirmation is spoken.
 *
 * @param {string} prompt - Transcript to inspect.
 * @param {AbortSignal} [signal] - Passed through to speakText.
 * @returns {Promise<boolean>} true when the transcript was a voice command.
 */
async function handleTtsVoiceCommand(prompt, signal) {
  const request = voiceCommandFromTranscript(prompt);
  if (!request) return false;

  discardVoiceInputQueues('voice-change');
  const config = updateTtsVoiceConfig(ensureTtsVoiceConfig(), request);
  writeTtsVoiceConfig(TTS_VOICE_CONFIG_PATH, config);

  const { selection } = applyVoiceConfigToProcessEnv(config);
  settings.tts.backend = selection.backend;
  if (selection.backend === 'edge') settings.tts.edge.voice = selection.voice.voice;
  if (selection.voice?.language) settings.voiceLanguage = selection.voice.language;

  const persisted = {
    TTS_BACKEND: selection.backend,
    TTS_VOICE_TYPE: selection.voiceType,
  };
  // For non-edge backends the existing TTS_VOICE is kept; when that variable
  // is unset there is nothing to persist, and passing `undefined` on would be
  // written to .env as the literal text "undefined".
  const ttsVoice = selection.backend === 'edge' ? selection.voice.voice : process.env.TTS_VOICE;
  if (ttsVoice !== undefined && ttsVoice !== null) persisted.TTS_VOICE = ttsVoice;
  persisted.VOICE_LANGUAGE = settings.voiceLanguage;
  persistEnvValues(persisted);

  await speakText(voiceChangedText(selection), signal);
  return true;
}
460
+
461
/**
 * Handle a spoken "change language" command: apply the requested language
 * and speak a confirmation.
 *
 * @param {string} prompt - Transcript to inspect.
 * @param {AbortSignal} [signal] - Passed through to speakText.
 * @returns {Promise<boolean>} true when the transcript was a language command.
 */
async function handleLanguageCommand(prompt, signal) {
  const request = voiceLanguageCommandFromTranscript(prompt);
  if (request) {
    const preset = applyRuntimeLanguage(request.language);
    await speakText(languageChangedText(preset), signal);
    return true;
  }
  return false;
}
468
+
469
/**
 * Check whether a user may use the bridge.
 *
 * @param {string|number} userId - Discord user id.
 * @returns {boolean} true when no allow-list is configured or the id is on it.
 */
function isAllowed(userId) {
  const { allowedUsers } = settings;
  if (allowedUsers.size === 0) return true;
  return allowedUsers.has(String(userId));
}
470
/**
 * Compact UTC timestamp derived from the ISO string of the current time:
 * dashes and colons removed, fractional seconds dropped, and the `T`
 * separator replaced by `-`.
 *
 * @returns {string}
 */
function stamp() {
  const iso = new Date().toISOString();
  const compact = iso.replace(/[-:]/g, '');
  const withoutFraction = compact.replace(/\..+/, '');
  return withoutFraction.replace('T', '-');
}
471
+
472
/**
 * Reduce markdown to plain text suitable for speech.
 *
 * Fenced code blocks are replaced by a short notice (English when `language`
 * starts with "en", Korean otherwise); list bullets, heading markers and
 * inline-code backticks are removed; links keep only their label; runs of
 * three or more newlines collapse to two.
 *
 * @param {*} text - Text to clean; falsy values are treated as empty.
 * @param {string} [language] - Defaults to the current voice language.
 * @returns {string}
 */
function stripMarkdownNoise(text, language = settings.voiceLanguage) {
  const english = /^en/i.test(String(language || ''));
  const codeBlockText = english ? 'I left the code block in the text channel.' : '코드 블록은 텍스트 채널에 남겼어.';
  const linkToLabel = match => match.replace(/\]\([^\)]+\)/, '').replace('[', '');

  // Applied in order; each step sees the output of the previous one.
  const rewrites = [
    [/```[\s\S]*?```/g, codeBlockText],
    [/^\s*[-*+]\s+/gm, ''],
    [/^\s*#{1,6}\s*/gm, ''],
    [/`([^`]+)`/g, '$1'],
    [/\[[^\]]+\]\([^\)]+\)/g, linkToLabel],
    [/\n{3,}/g, '\n\n'],
  ];
  const stripped = rewrites.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    String(text || ''),
  );
  return stripped.trim();
}
483
+
484
/**
 * Produce the text to read aloud for an agent answer.
 *
 * Patch-like output is replaced by a fixed notice. Answers of at most 3000
 * characters (after markdown cleanup) are spoken in full. Longer answers are
 * shortened: when they contain a code-block notice or log-style prefixed
 * lines, only the first ten remaining lines are kept; the result is capped
 * near 1800 characters and ends with a pointer to the text channel.
 *
 * @param {*} userPrompt - Unused; kept for the existing call signature.
 * @param {string} answer - Raw agent answer.
 * @param {string} [language] - Defaults to the current voice language.
 * @returns {string}
 */
function spokenResultOnly(userPrompt, answer, language = settings.voiceLanguage) {
  const english = /^en/i.test(String(language || ''));
  const cleaned = stripMarkdownNoise(answer, language);

  if (isPatchLikeOutput(cleaned)) {
    if (english) {
      return 'The code diff is too long to read aloud. I will keep the changed files and test results in the text channel.';
    }
    return '코드 변경 diff가 길게 나와서 음성으로는 읽지 않을게. 변경 파일과 테스트 결과만 텍스트 채널에 정리할게.';
  }
  if (cleaned.length <= 3000) return cleaned;

  const hasBulkyCodeOrLogs = /I left the code block in the text channel|코드 블록은 텍스트 채널에 남겼어|^\s*(run|log|command|diff|changed files|verification log|test output|실행|로그|명령|diff|변경사항 상세|검증 로그|테스트 출력)\s*[::]/im.test(cleaned);

  let spoken = cleaned;
  if (hasBulkyCodeOrLogs) {
    const keptLines = [];
    for (const rawLine of cleaned.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (!line) continue;
      if (/^\s*(run|log|command|diff|changed files|verification log|test output|실행|로그|명령|diff|변경사항 상세|검증 로그|테스트 출력)\s*[::]/i.test(line)) continue;
      keptLines.push(line);
    }
    spoken = keptLines.slice(0, 10).join(' ');
  }

  const moreText = english ? 'I left the rest in the text channel.' : '나머지는 텍스트 채널에 남겼어.';
  if (spoken.length > 1800) {
    const head = spoken.slice(0, 1760).replace(/[\s,.;:,。]+$/u, '');
    spoken = `${head}. ${moreText}`;
  }
  const shortened = spoken.length < cleaned.length;
  const mentionsTextChannel = /(text channel|텍스트 채널)/i.test(spoken);
  if (shortened && !mentionsTextChannel) spoken += ` ${moreText}`;
  return spoken;
}
508
+
509
/**
 * Send text to the active transcript channel, falling back to the configured
 * transcript channel when none is active.
 *
 * @param {string} text - Message to send.
 * @returns {Promise<*>} The result of sendDiscordText.
 */
async function sendText(text) {
  const channelId = activeTranscriptChannelId || settings.transcriptChannelId;
  return sendDiscordText({ client, channelId, text, log, warn });
}
518
+
519
/**
 * Send text to a specific channel, split into chunks by splitDiscordMessage
 * and sent one after another.
 *
 * @param {{ send: Function }} channel - Target channel.
 * @param {*} text - Message; falsy values are treated as empty.
 * @returns {Promise<boolean>} Always true once every chunk has been sent.
 */
async function sendChannelText(channel, text) {
  const parts = splitDiscordMessage(String(text || ''));
  for (const part of parts) {
    await channel.send(part);
  }
  return true;
}
525
+
526
/**
 * Post a progress event to the text channel while verbose progress is on.
 *
 * Nothing is sent when verbose mode is off, the signal is missing, aborted
 * or not the active progress signal, or the formatted text is empty. The
 * message is whitespace-normalised, limited to 1900 characters, and an
 * identical message repeated within 2 seconds is dropped. Delivery errors
 * are logged as warnings.
 *
 * @param {*} event - Progress event.
 * @param {AbortSignal} signal - Signal of the turn that produced the event.
 * @returns {void}
 */
function sendVerboseProgressText(event, signal) {
  if (!verboseProgress) return;
  if (!signal || signal.aborted) return;
  if (activeProgressSignal !== signal) return;

  const formatted = formatProgressText(event).replace(/\s+/g, ' ').trim();
  if (!formatted) return;

  const message = formatted.slice(0, 1900);
  const now = Date.now();
  const repeatedTooSoon = message === lastVerboseProgressText && now - lastVerboseProgressTextAt < 2000;
  if (repeatedTooSoon) return;

  lastVerboseProgressText = message;
  lastVerboseProgressTextAt = now;
  void sendText(message).catch(e => warn('verbose progress text delivery failed', e?.stack || e));
}
537
+
538
/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms);
  });
}
541
+
542
/**
 * Wait for one occurrence of `event` on an emitter.
 *
 * @param {import('node:events').EventEmitter} emitter - Emitter to listen on.
 * @param {string} event - Event name to wait for.
 * @param {number} [timeoutMs=60000] - Maximum wait in milliseconds.
 * @returns {Promise<any[]>} Resolves with the event's arguments as an array.
 *   Rejects with the emitted error if 'error' fires first, or with a
 *   `timeout waiting <event>` error when the timeout elapses. All listeners
 *   and the timer are removed on every outcome.
 */
function waitEvent(emitter, event, timeoutMs = 60000) {
  return new Promise((resolve, reject) => {
    let timer = null;

    function detach() {
      clearTimeout(timer);
      emitter.off(event, handleEvent);
      emitter.off('error', handleError);
    }
    function handleEvent(...args) {
      detach();
      resolve(args);
    }
    function handleError(err) {
      detach();
      reject(err);
    }

    timer = setTimeout(() => {
      detach();
      reject(new Error(`timeout waiting ${event}`));
    }, timeoutMs);
    emitter.once(event, handleEvent);
    emitter.once('error', handleError);
  });
}
552
+
553
/**
 * Run the whisper CLI once on a prepared WAV file and read its text output.
 *
 * @param {string} wavPath - Original recording; unused here, kept for the
 *   existing call signature.
 * @param {string} input16k - WAV file handed to whisper via `-f`.
 * @param {string} outBase - Output base path; whisper writes `<outBase>.txt`.
 * @returns {Promise<{ raw: string, txtPath: string }>} The transcript text
 *   (empty when no output file exists) and the output file path.
 * @throws {Error} `whisper failed: ...` when the CLI fails; the original
 *   error is attached as `cause`.
 */
async function transcribeOnce(wavPath, input16k, outBase) {
  const args = ['-m', settings.whisperModel, '-f', input16k];
  if (shouldPassWhisperLanguage(settings.whisperLanguage)) args.push('-l', settings.whisperLanguage);
  args.push('-nt', '-otxt', '-of', outBase, '-sns', '-nf', '-nth', '0.35', '-et', '2.2', '-lpt', '-0.8');
  try {
    await execFileAsync(settings.whisperBin, args, { timeout: 25000, maxBuffer: 2 * 1024 * 1024 });
  } catch (e) {
    // Keep the original error (exit code, signal, stack) reachable for callers.
    throw new Error(`whisper failed: ${e?.stderr || e?.message}`, { cause: e });
  }
  const txtPath = `${outBase}.txt`;
  const raw = fs.existsSync(txtPath) ? fs.readFileSync(txtPath, 'utf8') : '';
  return { raw, txtPath };
}
566
+
567
/**
 * Transcribe a recorded WAV file to cleaned text.
 *
 * The recording is converted to 16 kHz mono s16 with ffmpeg, passed through
 * `transcribeOnce`, and cleaned with `cleanTranscript`. When the cleaned
 * result is empty, one retry is made after 300 ms. When `settings.debugDir`
 * is set, the converted input and the raw transcript are copied there.
 *
 * Every temporary file created here (converted WAV, first-pass and retry
 * transcript files) is removed before the function settles.
 *
 * @param {string} wavPath - Path of the recording to transcribe.
 * @returns {Promise<string>} Cleaned transcript, possibly ''.
 * @throws Whatever ffmpeg or `transcribeOnce` reject with.
 */
async function transcribe(wavPath) {
  // A timestamp alone can collide when two transcriptions start in the same
  // millisecond (e.g. barge-in validation next to a regular turn).
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
  const tmpBase = path.join(os.tmpdir(), `hermes-node-stt-${uniqueSuffix}`);
  const input16k = `${tmpBase}.16k.wav`;
  const outBase = `${tmpBase}.out`;
  const retryBase = `${tmpBase}.retry`;
  // Track both possible transcript outputs up front so they are cleaned up
  // even when whisper fails half-way or the retry replaces the first pass.
  const transcriptFiles = new Set([`${outBase}.txt`, `${retryBase}.txt`]);

  let raw = '';
  let converted = false;
  try {
    // whisper.cpp can read WAV, but Discord receiver output is 48 kHz stereo.
    // Convert explicitly to the 16 kHz mono PCM shape Whisper expects.
    await execFileAsync('ffmpeg', ['-y', '-hide_banner', '-loglevel', 'error', '-i', wavPath, '-ac', '1', '-ar', '16000', '-sample_fmt', 's16', input16k], {
      timeout: 20000,
      maxBuffer: 1024 * 1024,
    });
    converted = true;

    const first = await transcribeOnce(wavPath, input16k, outBase);
    if (first.txtPath) transcriptFiles.add(first.txtPath);
    raw = first.raw;
    let cleaned = cleanTranscript(raw);
    log('stt raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
    if (!cleaned) {
      await sleep(300);
      const retry = await transcribeOnce(wavPath, input16k, retryBase);
      if (retry.txtPath) transcriptFiles.add(retry.txtPath);
      raw = retry.raw;
      cleaned = cleanTranscript(raw);
      log('stt retry raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
    }
    return cleaned;
  } finally {
    if (converted && settings.debugDir) {
      // Debug artifacts are best-effort; failures must not mask the result.
      // The copy is awaited so it cannot race the removal below.
      const debug16k = path.join(settings.debugDir, `stt-input-${stamp()}.wav`);
      await fs.promises.copyFile(input16k, debug16k).catch(() => {});
      if (raw) {
        await fs.promises
          .writeFile(path.join(settings.debugDir, `stt-raw-${stamp()}.txt`), raw)
          .catch(() => {});
      }
    }
    // `force: true` makes removal of never-created files a no-op.
    await Promise.all(
      [input16k, ...transcriptFiles].map((file) => fs.promises.rm(file, { force: true }).catch(() => {})),
    );
  }
}
604
+
605
/**
 * Strip noise lines from raw whisper output and join the rest with spaces.
 *
 * Each line is trimmed and a leading `[…]` prefix is removed. A line is then
 * dropped when it is empty after removing whitespace, punctuation and
 * symbols; when it is wholly wrapped in brackets; when it is one of a few
 * sound-effect words; when it contains a known boilerplate fragment
 * (video outro / news sign-off phrases); or when
 * `isRepeatedNoiseTranscript` flags it.
 *
 * @param {string} raw - Raw transcript text.
 * @returns {string} Cleaned transcript, '' when nothing survives.
 */
function cleanTranscript(raw) {
  const bannedFragments = [
    '구독', '좋아요', '알림설정', '시청해주셔서', '시청해주신', '다음영상', '영상에서만나요',
    '부탁드려요', '큰힘이됩니다',
    'mbc뉴스', '이준범기자입니다', '뉴스입니다', '기자입니다', '앵커', '속보', '보도입니다', '전해드립니다',
  ];
  const soundEffectWords = ['끄덕', '끄덕끄덕', '박수', '웃음', '음악', '자막', '침묵', '무음'];
  const bracketedOnly = /^[\(\[(【].*[\)\])】]$/;

  const survivors = [];
  for (const rawLine of raw.split(/\r?\n/)) {
    const trimmed = rawLine.trim();
    if (!trimmed) continue;

    const line = trimmed.replace(/^\[[^\]]+\]\s*/, '').trim();
    const squeezed = line.replace(/\s+/g, '');
    const compact = squeezed.replace(/[\p{P}\p{S}_]+/gu, '');
    if (!compact) continue;
    if (bracketedOnly.test(squeezed)) continue;
    if (soundEffectWords.includes(compact)) continue;

    const lowered = compact.toLowerCase();
    if (bannedFragments.some((fragment) => lowered.includes(fragment))) continue;
    if (isRepeatedNoiseTranscript(compact)) continue;

    survivors.push(line);
  }
  return survivors.join(' ').trim();
}
630
+
631
/**
 * Tell whether `e` looks like an abort: its `name` is 'AbortError' or its
 * `code` is 'ABORT_ERR'. Safe to call with null or undefined.
 *
 * @param {*} e - Caught value.
 * @returns {boolean}
 */
function isAbortError(e) {
  if (e?.name === 'AbortError') return true;
  return e?.code === 'ABORT_ERR';
}
634
+
635
/**
 * Tell whether a transcript mentions task-like work (files, code, git,
 * servers, tests, …) based on a fixed keyword list.
 *
 * The input is coerced the same way as the sibling transcript predicates,
 * so null, undefined or non-string values yield `false` instead of throwing.
 *
 * @param {string} text - Transcript text.
 * @returns {boolean} True when any task keyword occurs in the text.
 */
function isTaskRequest(text) {
  const compact = String(text || '').replace(/\s+/g, '').toLowerCase();
  return /(파일|폴더|프로젝트|코드|구현|수정|고쳐|만들|생성|실행|확인|검색|설치|테스트|디버그|재시작|로그|커밋|깃|git|github|브랜치|배포|서버|프로세스|터미널|스크립트|압축|다운로드|분석해|찾아)/i.test(compact);
}
639
+
640
/**
 * Tell whether a transcript is only a sensitivity-mode command.
 *
 * True when `sensitivityModeFromTranscript` recognizes the whitespace-free,
 * lower-cased text, it is not a task request, and it contains no follow-up
 * connective ("그리고", "그다음", "다음에", "추가로", or "해줘" followed by
 * a speak/explain/answer word).
 *
 * @param {string} text - Transcript text.
 * @returns {boolean}
 */
function isSensitivityOnlyRequest(text) {
  const normalized = String(text || '').replace(/\s+/g, '').toLowerCase();
  const mode = sensitivityModeFromTranscript(normalized);
  if (!mode) return false;
  if (isTaskRequest(normalized)) return false;
  const hasFollowUp = /(그리고|그다음|다음에|추가로|해줘.*(말|설명|대답))/u.test(normalized);
  return !hasFollowUp;
}
645
+
646
/**
 * Detect a request to switch verbose progress on or off.
 *
 * The text is lower-cased and stripped of whitespace before matching. The
 * "on" pattern is checked first, so a text that matches both counts as on.
 *
 * @param {string} text - Transcript text.
 * @returns {boolean|null} `true` to enable, `false` to disable, `null`
 *   when the text is not a verbose-mode command.
 */
function verboseModeFromTranscript(text) {
  const compact = String(text || '').replace(/\s+/g, '').toLowerCase();
  // Korean STT often hears "상세" as "상쇄" or "상쇠" in noisy voice calls.
  const verboseWords = 'verbose|버보스|상세|상쇄|상쇠|상세진행|자세히알려|중간과정';
  const enablePattern = new RegExp(`(${verboseWords}).*(켜|on|시작|보여|알려|읽어|말해)|^(verbose|버보스|상세|상쇄|상쇠)모드(켜|on)?$`);
  const disablePattern = new RegExp(`(${verboseWords}).*(꺼|off|중지|그만)|^(verbose|버보스|상세|상쇄|상쇠)모드꺼$`);

  if (enablePattern.test(compact)) return true;
  return disablePattern.test(compact) ? false : null;
}
654
+
655
/**
 * Tell whether a transcript is only a verbose-mode toggle: it is recognized
 * by `verboseModeFromTranscript`, is not a task request, and contains none
 * of the connectives "그리고", "그다음", "다음에", "추가로".
 *
 * @param {string} text - Transcript text.
 * @returns {boolean}
 */
function isVerboseOnlyRequest(text) {
  const normalized = String(text || '').replace(/\s+/g, '').toLowerCase();
  if (verboseModeFromTranscript(normalized) === null) return false;
  if (isTaskRequest(normalized)) return false;
  return !/(그리고|그다음|다음에|추가로)/u.test(normalized);
}
659
+
660
/**
 * Re-read language and voice configuration and bring the TTS runtime in line.
 *
 * The selected backend is written to `settings.tts.backend`; for the 'edge'
 * backend the selected voice is written to `settings.tts.edge.voice`. When
 * the backend differs from the previous one, TTS settings are rebuilt from
 * `process.env` and `ttsBackend` is recreated.
 *
 * @returns {Promise<object>} The `selection` reported by
 *   `applyVoiceConfigToProcessEnv`.
 */
async function refreshTtsRuntimeConfig() {
  reloadRuntimeLanguageFromEnv();
  const voiceConfig = ensureTtsVoiceConfig();
  const { selection } = applyVoiceConfigToProcessEnv(voiceConfig);

  const backendBefore = settings.tts.backend;
  settings.tts.backend = selection.backend;
  if (selection.backend === 'edge') {
    settings.tts.edge.voice = selection.voice.voice;
  }

  const backendChanged = backendBefore !== settings.tts.backend;
  if (!backendChanged) return selection;

  Object.assign(settings.tts, buildTtsSettings(process.env, ROOT));
  ttsBackend = createTtsBackend(settings.tts, {
    execFileAsync,
    log,
    warn,
    // Read lazily so later voice changes are picked up without a rebuild.
    voiceProvider: () => settings.tts.edge.voice,
  });
  log('tts backend reloaded from voice config', settings.tts.backend, 'voiceType', selection.voiceType);
  return selection;
}
674
+
675
/**
 * Synthesize the final-answer speech for `text`, retrying transient failures.
 *
 * Up to three attempts are made, with a linear back-off (1 s, 2 s) between
 * them. Abort errors, and any failure while `signal` is aborted, are
 * rethrown immediately without retrying.
 *
 * @param {string} text - Text to synthesize.
 * @param {AbortSignal|null} signal - Cancels synthesis.
 * @returns {Promise<string>} Path of the synthesized audio file.
 * @throws The last synthesis error after all attempts fail.
 */
async function synthTTS(text, signal) {
  await refreshTtsRuntimeConfig();
  const maxAttempts = 3;
  let lastError = null;
  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    try {
      log('final tts synth start', 'backend', ttsBackend.name, 'attempt', attempt, 'chars', String(text || '').length);
      const out = await ttsBackend.synthesize(text, { signal, kind: 'final' });
      // statSync doubles as a sanity check: a missing output file counts as a failed attempt.
      log('final tts synth done', 'backend', ttsBackend.name, 'attempt', attempt, out, fs.statSync(out).size);
      return out;
    } catch (e) {
      lastError = e;
      if (isAbortError(e) || signal?.aborted) throw e;
      warn('final tts synth failed', 'attempt', attempt, e?.stderr?.toString?.().slice(-500) || e?.message || e);
      // Back off only when another attempt follows; sleeping after the last
      // failure would just delay the error by several seconds.
      if (attempt < maxAttempts) await sleep(1000 * attempt);
    }
  }
  throw lastError;
}
693
+
694
/**
 * Return a cached audio file for a short progress phrase, synthesizing it
 * on a cache miss.
 *
 * The cache file lives in `settings.tts.progressCacheDir` under the name
 * produced by `progressTtsCacheFileName`. A non-empty existing file is a
 * cache hit. On a miss the synthesized file is moved into the cache; the
 * cache directory is created if needed, and a cross-device move falls back
 * to copy + delete.
 *
 * @param {string} text - Phrase to speak.
 * @param {AbortSignal|null} signal - Cancels synthesis.
 * @returns {Promise<string>} Path of the cached audio file.
 */
async function synthProgressTTS(text, signal) {
  await refreshTtsRuntimeConfig();
  const ext = ttsBackend.outputExtension || 'mp3';
  const cacheDir = settings.tts.progressCacheDir;
  const cachePath = path.join(cacheDir, progressTtsCacheFileName({
    backendKeyParts: ttsBackend.cacheKeyParts(),
    text,
    ext,
  }));

  // Any stat failure (missing file, unreadable directory) is treated as a miss.
  const cached = await fs.promises.stat(cachePath).catch(() => null);
  if (cached && cached.size > 0) {
    log('progress tts cache hit', text, cachePath);
    return cachePath;
  }

  log('progress tts cache miss', text);
  const tmp = await ttsBackend.synthesize(text, { signal, kind: 'progress' });
  // A fresh install may not have the cache directory yet.
  await fs.promises.mkdir(cacheDir, { recursive: true });
  try {
    await fs.promises.rename(tmp, cachePath);
  } catch (e) {
    // rename() cannot cross filesystems (e.g. tmpfs -> project disk).
    if (e?.code !== 'EXDEV') throw e;
    await fs.promises.copyFile(tmp, cachePath);
    await fs.promises.rm(tmp, { force: true });
  }
  return cachePath;
}
711
+
712
/**
 * Play an audio file on the current voice connection and wait until the
 * player reports idle (or 120 s pass).
 *
 * `speaking` is true for the duration of playback. With `deleteAfter`
 * (default) the file is removed once this function is done with it — also
 * when there is no voice connection to play on, so files handed over by the
 * caller do not pile up while disconnected.
 *
 * @param {string} file - Path of the audio file.
 * @param {{deleteAfter?: boolean}} [options] - Pass `deleteAfter: false`
 *   for files that must survive playback (e.g. cached phrases).
 * @returns {Promise<void>}
 */
async function playAudio(file, { deleteAfter = true } = {}) {
  const discard = async () => {
    // Best-effort cleanup: a failed delete must not fail the playback call.
    if (deleteAfter) await fs.promises.rm(file, { force: true }).catch(() => {});
  };

  if (!connection) {
    await discard();
    return;
  }

  speaking = true;
  try {
    const resource = createAudioResource(file, { inputType: StreamType.Arbitrary, inlineVolume: true });
    resource.volume?.setVolume(settings.tts.volume);
    player.play(resource);
    connection.subscribe(player);
    // A timeout or player error ends the wait; it is not surfaced to the caller.
    await waitEvent(player, AudioPlayerStatus.Idle, 120000).catch(() => {});
  } finally {
    speaking = false;
    await discard();
  }
}
726
+
727
/**
 * Speak `text` on the voice connection, chunk by chunk.
 *
 * The text is split with `splitForTTS`; unless `options.mirrorText` is
 * `false` the full text is first posted to the text channel. Chunks are
 * synthesized and played through `playChunkedTTSWithPrefetch`. If
 * `speechPlaybackGeneration` changes while speaking, remaining chunks are
 * neither synthesized nor played and already synthesized files are deleted.
 *
 * @param {string} text - Text to speak.
 * @param {AbortSignal|null} signal - Cancels synthesis/playback.
 * @param {object|null} [metricsTurn=null] - Optional latency recorder; gets
 *   `tts_synth`, `tts_play` and `tts_total` stages.
 * @param {{mirrorText?: boolean, mirrorPrefix?: string}} [options={}]
 * @returns {Promise<void>}
 */
async function speakText(text, signal, metricsTurn = null, options = {}) {
  const chunks = splitForTTS(text, settings.tts.maxChars);
  if (!chunks.length) return;

  if (options.mirrorText !== false) {
    await sendText(`${options.mirrorPrefix || '🔊 음성으로 읽는 내용'}:\n${String(text || '')}`);
  }
  log('TTS chunks', chunks.length, 'maxChars', settings.tts.maxChars, 'backend', ttsBackend.name);

  // Snapshot the generation counter; a later change means playback was stopped.
  const generationAtStart = speechPlaybackGeneration;
  const wasStopped = () => generationAtStart !== speechPlaybackGeneration;

  const elapsed = { synth: 0, play: 0 };
  const timed = async (bucket, task) => {
    const begun = Date.now();
    try {
      return await task();
    } finally {
      elapsed[bucket] += Date.now() - begun;
    }
  };

  const startedAt = Date.now();
  await playChunkedTTSWithPrefetch(chunks, {
    signal,
    log,
    synth: async (chunk) => {
      if (wasStopped()) return null;
      return timed('synth', () => synthTTS(chunk, signal));
    },
    play: async (file) => {
      if (wasStopped()) {
        await fs.promises.rm(file, { force: true }).catch(() => {});
        return;
      }
      return timed('play', () => playAudio(file));
    },
    cleanup: (file) => fs.promises.rm(file, { force: true }),
  });

  metricsTurn?.stage('tts_synth', elapsed.synth, { ttsChunks: chunks.length, spokenChars: String(text || '').length });
  metricsTurn?.stage('tts_play', elapsed.play);
  metricsTurn?.stage('tts_total', Date.now() - startedAt);
}
763
+
764
/**
 * Speak a short progress phrase from the progress TTS cache.
 *
 * Does nothing when `signal` is already aborted, and skips playback when it
 * becomes aborted during synthesis. The cached file is kept after playback.
 * Abort errors are silent; other failures are logged and never rethrown.
 *
 * @param {string} text - Phrase to speak.
 * @param {AbortSignal|null} signal - Cancels the progress speech.
 * @returns {Promise<void>}
 */
async function speakProgress(text, signal) {
  if (signal?.aborted) return;
  try {
    const audioFile = await synthProgressTTS(text, signal);
    if (!signal?.aborted) {
      await playAudio(audioFile, { deleteAfter: false });
    }
  } catch (err) {
    if (isAbortError(err)) return;
    warn('progress tts failed', err?.stack || err);
  }
}
774
+
775
/**
 * Speak a short notice right away using the progress TTS cache.
 *
 * Does nothing when `signal` is already aborted, and skips playback when it
 * becomes aborted during synthesis. The cached file is kept after playback.
 * Abort errors are silent; other failures are logged with `reason` and
 * never rethrown.
 *
 * @param {string} text - Notice to speak.
 * @param {AbortSignal|null} signal - Cancels the notice.
 * @param {string} [reason='notice'] - Label used in log lines.
 * @returns {Promise<void>}
 */
async function speakImmediateNotice(text, signal, reason = 'notice') {
  if (signal?.aborted) return;
  try {
    log('immediate notice speech', reason, 'text', String(text || '').slice(0, 80));
    const audioFile = await synthProgressTTS(text, signal);
    if (!signal?.aborted) {
      await playAudio(audioFile, { deleteAfter: false });
    }
  } catch (err) {
    if (isAbortError(err)) return;
    warn('immediate notice speech failed', reason, err?.stack || err);
  }
}
786
+
787
/**
 * Append a progress phrase to the serialized progress-speech queue.
 *
 * The phrase is whitespace-collapsed. It is ignored when empty or when
 * `signal` is missing, aborted, or not the active progress signal. The same
 * conditions (plus `processing` still being true) are re-checked when the
 * phrase reaches the front of the queue.
 *
 * @param {string} text - Phrase to speak.
 * @param {AbortSignal} signal - Progress signal of the owning turn.
 * @param {string} [reason='status'] - Label used in the log line.
 * @returns {void}
 */
function queueProgressSpeechText(text, signal, reason = 'status') {
  const spoken = String(text || '').replace(/\s+/g, ' ').trim();
  if (!spoken) return;
  if (!signal || signal.aborted) return;
  if (activeProgressSignal !== signal) return;

  const speakIfStillCurrent = async () => {
    const stale = signal.aborted || activeProgressSignal !== signal || !processing;
    if (stale) return;
    log('progress speech queued', reason, 'text', spoken);
    await speakProgress(spoken, signal);
  };

  // A failure of an earlier queue item must not block later phrases.
  const settled = verboseProgressSpeechQueue.catch(() => {});
  verboseProgressSpeechQueue = settled.then(speakIfStillCurrent);
}
798
+
799
/**
 * Turn the pending batch of progress events into one spoken summary.
 *
 * Ignored unless `signal` is the active, un-aborted progress signal.
 * Otherwise the batch timer is cancelled, the batch state is reset, and the
 * summary from `summarizeProgressEvents` (if any) is queued for speech.
 *
 * @param {AbortSignal} signal - Progress signal of the owning turn.
 * @param {string} [reason='timer'] - Why the flush happened; part of the
 *   queue reason label.
 * @returns {void}
 */
function flushProgressSpeechBatch(signal, reason = 'timer') {
  if (!signal || signal.aborted) return;
  if (activeProgressSignal !== signal) return;

  if (progressSpeechBatchTimer) {
    clearTimeout(progressSpeechBatchTimer);
    progressSpeechBatchTimer = null;
  }

  // Take ownership of the batch before resetting the shared state.
  const drained = progressSpeechBatch;
  progressSpeechBatch = [];
  progressSpeechBatchSignal = null;
  progressSpeechBatchStartedAt = 0;

  const summary = summarizeProgressEvents(drained, { maxCategories: 3, language: settings.voiceLanguage });
  if (summary) {
    queueProgressSpeechText(summary, signal, `batch-${reason}-${drained.length}`);
  }
}
813
+
814
/**
 * Add a verbose progress event to the speech batch and schedule a flush.
 *
 * Ignored unless verbose mode is on and `signal` is the active, un-aborted
 * progress signal. Events are whitespace-collapsed and capped at 120
 * characters. A batch left over from another signal is discarded. The batch
 * is flushed immediately when it reaches a size limit that grows with the
 * event rate (3, 4 or 5 events); otherwise a flush timer is (re)armed with
 * a delay that also grows with the rate (450, 550 or 650 ms).
 *
 * @param {*} event - Progress event text.
 * @param {AbortSignal} signal - Progress signal of the owning turn.
 * @returns {void}
 */
function queueVerboseProgressSpeech(event, signal) {
  if (!verboseProgress) return;
  if (!signal || signal.aborted) return;
  if (activeProgressSignal !== signal) return;

  const text = String(event || '').replace(/\s+/g, ' ').trim().slice(0, 120);
  if (!text) return;

  const batchBelongsToOtherTurn = progressSpeechBatchSignal && progressSpeechBatchSignal !== signal;
  if (batchBelongsToOtherTurn) {
    progressSpeechBatch = [];
    if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
    progressSpeechBatchTimer = null;
    progressSpeechBatchStartedAt = 0;
  }

  progressSpeechBatchSignal = signal;
  if (!progressSpeechBatchStartedAt) progressSpeechBatchStartedAt = Date.now();
  progressSpeechBatch.push(text);

  // Floor the window at 0.2 s so the very first events do not report an
  // absurdly high rate.
  const elapsedSeconds = Math.max(0.2, (Date.now() - progressSpeechBatchStartedAt) / 1000);
  const ratePerSecond = progressSpeechBatch.length / elapsedSeconds;

  let maxBatchEvents = 3;
  let batchDelayMs = 450;
  if (ratePerSecond >= 6) {
    maxBatchEvents = 5;
    batchDelayMs = 650;
  } else if (ratePerSecond >= 3) {
    maxBatchEvents = 4;
    batchDelayMs = 550;
  }

  if (progressSpeechBatch.length >= maxBatchEvents) {
    flushProgressSpeechBatch(signal, 'full');
    return;
  }

  if (progressSpeechBatchTimer) clearTimeout(progressSpeechBatchTimer);
  progressSpeechBatchTimer = setTimeout(() => flushProgressSpeechBatch(signal, 'timer'), batchDelayMs);
}
838
+
839
/**
 * Cancel the pending batch flush timer and, when the batch belongs to
 * `signal` (or `signal` is falsy), drop the batched events.
 *
 * @param {AbortSignal|null} [signal=activeProgressSignal] - Owner whose
 *   batch should be discarded.
 * @returns {void}
 */
function clearProgressSpeechBatch(signal = activeProgressSignal) {
  if (progressSpeechBatchTimer) {
    clearTimeout(progressSpeechBatchTimer);
    progressSpeechBatchTimer = null;
  }

  const ownsBatch = !signal || progressSpeechBatchSignal === signal;
  if (!ownsBatch) return;

  progressSpeechBatch = [];
  progressSpeechBatchSignal = null;
  progressSpeechBatchStartedAt = 0;
}
850
+
851
/**
 * End progress speech for the turn owning `signal`.
 *
 * No-op unless `signal` is the active progress signal. Otherwise the batch
 * is cleared, the active signal is released, the progress abort controller
 * is aborted, and any audio currently playing is stopped.
 *
 * @param {AbortSignal} signal - Progress signal of the turn.
 * @param {string} [reason='final-answer'] - Label used in the log line.
 * @returns {void}
 */
function stopProgressSpeech(signal, reason = 'final-answer') {
  if (activeProgressSignal !== signal) return;

  clearProgressSpeechBatch(signal);
  activeProgressSignal = null;

  const controller = activeProgressAbortController;
  if (controller && !controller.signal.aborted) {
    try {
      controller.abort();
    } catch (err) {
      warn('abort progress speech failed', err?.stack || err);
    }
  }

  if (!speaking) return;
  log('stop progress speech before final answer', reason);
  try {
    player.stop(true);
  } catch (err) {
    warn('stop progress speech failed', err?.stack || err);
  }
  speaking = false;
}
864
+
865
// Handle an agent request that arrived as a text message.
//
// Parameters:
//   msg   - incoming message; this function uses msg.reply, msg.channel and msg.channelId.
//   text  - prompt text passed to the selected agent adapter.
//   speakResponse (option, default false) - when true and a voice connection
//           exists, the answer is also spoken after it has been posted as text.
//
// Only one request is handled at a time: while `processing` is set the
// message is answered with a "busy" reply and nothing else happens.
// Resolves to undefined; failures are reported in the channel, not thrown
// (abort errors are dropped silently).
+ async function handleTextAgentMessage(msg, text, { speakResponse = false } = {}) {
866
+ if (processing) {
867
+ await msg.reply('지금 이전 작업을 처리 중이야. 끝나면 다시 보내줘.');
868
+ return;
869
+ }
// Claim the single processing slot and publish this turn's abort controllers
// in the shared state so other code paths can cancel the turn.
870
+ processing = true;
871
+ const controller = new AbortController();
872
+ currentAbortController = controller;
873
+ const signal = controller.signal;
// Progress speech gets its own controller so it can be stopped once the
// answer is ready without aborting the turn itself.
874
+ const progressController = new AbortController();
875
+ activeProgressAbortController = progressController;
876
+ activeProgressSignal = progressController.signal;
877
+ activeProgressLastEventAt = Date.now();
// Remember the previous transcript channel; it is restored in `finally`.
878
+ const previousTranscriptChannelId = activeTranscriptChannelId;
879
+ const session = resolveProjectSessionForChannel(msg.channelId);
880
+ activeTranscriptChannelId = session?.transcriptChannelId || msg.channelId;
881
+ const selectedAgentAdapter = adapterForProjectSession(session);
882
+ const projectContext = projectSessionContextText(session);
883
+ const plan = {
884
+ task: true,
885
+ label: selectedAgentAdapter.label,
886
+ verboseProgress,
887
+ language: settings.voiceLanguage,
888
+ cwd: session?.workdir,
889
+ projectContext,
890
+ };
// The session id is read only to log whether an existing session is resumed.
891
+ const sessionBefore = selectedAgentAdapter.readSessionId?.();
892
+ log('text agent request start', selectedAgentAdapter.label, sessionBefore ? 'resume-existing-session' : 'new-session', 'verbose', verboseProgress, session ? `project=${session.slug}` : 'project=default');
893
+ try {
894
+ const result = await selectedAgentAdapter.run(text, signal, plan);
895
+ const answer = result.answer || emptyAgentAnswer(settings.voiceLanguage);
896
+ const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer}`;
897
+ await sendChannelText(msg.channel, fullAnswerText);
// The answer is posted; silence any progress speech before speaking it.
898
+ stopProgressSpeech(progressController.signal, 'text-agent-answer-ready');
899
+ if (speakResponse && connection) {
900
+ const spokenAnswer = spokenResultOnly(text, answer, settings.voiceLanguage);
// mirrorText is off because the full answer was already sent to the channel above.
901
+ await speakText(spokenAnswer, signal, null, { mirrorText: false });
902
+ }
903
+ } catch (e) {
904
+ if (isAbortError(e)) return;
905
+ warn('text agent request failed', e?.stack || e);
// The error text shown to the user is capped at 800 characters.
906
+ await sendChannelText(msg.channel, formatVoiceErrorMessage(settings.voiceLanguage, String(e?.message || e).slice(0, 800)));
907
+ } finally {
// Release shared state, but only the parts that still belong to this turn;
// each slot is compared against this turn's controller/signal first.
908
+ if (activeProgressAbortController && activeProgressAbortController.signal === progressController.signal && !activeProgressAbortController.signal.aborted) {
909
+ try { activeProgressAbortController.abort(); } catch (e) { warn('abort text progress speech failed', e?.stack || e); }
910
+ }
911
+ if (activeProgressSignal === progressController.signal) activeProgressSignal = null;
912
+ if (activeProgressAbortController?.signal === progressController.signal) activeProgressAbortController = null;
913
+ clearProgressSpeechBatch(progressController.signal);
914
+ if (currentAbortController === controller) currentAbortController = null;
915
+ activeTranscriptChannelId = previousTranscriptChannelId;
916
+ processing = false;
917
+ }
918
+ }
919
+
920
/**
 * If a voice-clone capture is armed for `userId`, store this utterance as
 * the reference sample and report the outcome by text and speech.
 *
 * @param {string} userId - Speaker of the utterance.
 * @param {string} wavPath - Recorded utterance to use as the sample.
 * @param {number} pcmBytes - Utterance size, used for logging only.
 * @param {number} segments - Segment count, used for logging only.
 * @param {AbortSignal|null} [signal=null] - Passed to the spoken confirmation.
 * @returns {Promise<boolean>} `false` when no capture was armed (nothing
 *   done); `true` when the utterance was consumed, whether or not saving
 *   succeeded.
 */
async function saveCapturedVoiceCloneSample(userId, wavPath, pcmBytes, segments, signal = null) {
  const capture = voiceCloneCapture.consume(userId);
  if (!capture) return false;

  try {
    const savedPath = await saveVoiceCloneReference({
      sourceWav: wavPath,
      targetPath: capture.targetPath,
      execFileAsync,
    });
    log('voice clone reference saved', 'user', userId, 'pcmBytes', pcmBytes, 'segments', segments, 'path', savedPath);
    await sendText(`🎙️ 보이스 클로닝 참조 샘플 저장 완료: ${path.relative(ROOT, savedPath)}`);
    await speakText('목소리 샘플 저장했어. 이제 OpenVoice 백엔드로 테스트할 수 있어.', signal);
  } catch (err) {
    warn('voice clone reference save failed', err?.stack || err);
    const detail = String(err?.message || err).slice(0, 700);
    await sendText(`⚠️ 목소리 샘플 저장 실패: ${detail}`);
    await speakText('목소리 샘플 저장에 실패했어. 로그를 확인해볼게.', signal);
  }
  return true;
}
939
+
940
/**
 * Handle a spoken voice-clone command (cancel / status / arm capture).
 *
 * Every handled command is acknowledged both in the text channel and by
 * speech. Any action other than 'cancel' or 'status' arms a new capture.
 *
 * @param {string} userId - User who issued the command.
 * @param {string} prompt - Transcript to interpret.
 * @param {AbortSignal|null} [signal=null] - Passed to the spoken reply.
 * @returns {Promise<boolean>} `true` when the prompt was a voice-clone
 *   command, `false` otherwise.
 */
async function handleVoiceCloneCommand(userId, prompt, signal = null) {
  const command = voiceCloneCommandFromText(prompt);
  if (!command) return false;

  switch (command.action) {
    case 'cancel': {
      const cancelled = voiceCloneCapture.cancel(userId);
      const notice = cancelled
        ? '🎙️ 보이스 클로닝 샘플 캡처를 취소했어.'
        : '🎙️ 대기 중인 보이스 클로닝 샘플 캡처가 없어.';
      const speech = cancelled
        ? '목소리 샘플 녹음 대기를 취소했어.'
        : '대기 중인 목소리 샘플 녹음은 없어.';
      await sendText(notice);
      await speakText(speech, signal);
      return true;
    }
    case 'status': {
      const current = voiceCloneCapture.current();
      const armedForUser = current?.userId === String(userId);
      const notice = armedForUser
        ? `🎙️ 다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`
        : '🎙️ 지금 대기 중인 보이스 클로닝 샘플 캡처는 없어.';
      const speech = armedForUser
        ? '다음에 말하는 목소리를 샘플로 저장할게.'
        : '대기 중인 목소리 샘플 녹음은 없어.';
      await sendText(notice);
      await speakText(speech, signal);
      return true;
    }
    default: {
      const armed = voiceCloneCapture.arm({ userId, source: 'voice-command' });
      await sendText(`🎙️ 보이스 클로닝 샘플 캡처 대기 중. 다음 10초에서 30초 정도 말하면 ${path.relative(ROOT, armed.targetPath)}에 저장할게.`);
      await speakText('좋아. 다음에 10초에서 30초 정도 말하면 그 음성을 목소리 샘플로 저장할게.', signal);
      return true;
    }
  }
}
963
+
964
/**
 * Stop audio playback because a user started talking over it.
 *
 * Bumps `speechPlaybackGeneration` so an in-flight `speakText` notices the
 * stop, then halts the player. Does not abort the current turn.
 *
 * @param {string} userId - User who interrupted.
 * @param {string} [reason='playback-barge-in'] - Label used in the log line.
 * @returns {boolean} `false` when nothing was playing, `true` otherwise.
 */
function stopPlaybackForBargeIn(userId, reason = 'playback-barge-in') {
  if (!speaking) return false;

  log('stop playback for barge-in', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', activeTurnId);
  speechPlaybackGeneration += 1;
  try {
    player.stop(true);
  } catch (err) {
    warn('stop playback failed', err?.stack || err);
  }
  speaking = false;
  return true;
}
972
+
973
/**
 * Abort whatever the bridge is currently doing for the active turn.
 *
 * Marks the active turn as interrupted, aborts the current abort
 * controller, stops the player, and clears both the `speaking` and
 * `processing` flags.
 *
 * @param {string} userId - User who interrupted.
 * @param {string} [reason='barge-in'] - Label used in the log line.
 * @returns {boolean} `false` when the bridge was idle, `true` otherwise.
 */
function interruptCurrentResponse(userId, reason = 'barge-in') {
  const busy = speaking || processing;
  if (!busy) return false;

  const turnId = activeTurnId;
  if (turnId) interruptedTurns.add(turnId);
  log('interrupt current response', 'byUser', userId, 'reason', reason, 'speaking', speaking, 'processing', processing, 'turn', turnId);

  const controller = currentAbortController;
  if (controller && !controller.signal.aborted) {
    try {
      controller.abort();
    } catch (err) {
      warn('abort current response failed', err?.stack || err);
    }
  }

  try {
    player.stop(true);
  } catch (err) {
    warn('stop playback failed', err?.stack || err);
  }
  speaking = false;
  processing = false;
  return true;
}
986
+
987
/**
 * Tell whether a transcript may be processed under the wake-word policy.
 *
 * Always true when `settings.requireWakeWord` is off. Otherwise the
 * transcript must contain at least one configured wake word. The comparison
 * is case-insensitive on both sides, so wake words configured with capital
 * letters still match.
 *
 * @param {string} text - Transcript text.
 * @returns {boolean}
 */
function acceptsWake(text) {
  if (!settings.requireWakeWord) return true;
  const low = String(text || '').toLowerCase();
  return settings.wakeWords.some((w) => low.includes(String(w).toLowerCase()));
}
992
/**
 * Remove every configured wake word from a transcript.
 *
 * Matching is case-insensitive, mirroring how the wake word is detected, so
 * a capitalized wake word from the speech recognizer ("Hermes") does not
 * leak into the prompt. Wake words are matched literally (regex
 * metacharacters are escaped); empty entries are ignored.
 *
 * @param {string} text - Transcript text.
 * @returns {string} The trimmed remainder, or the original `text` when
 *   nothing but wake words was said.
 */
function stripWake(text) {
  let out = String(text || '');
  for (const w of settings.wakeWords) {
    const word = String(w || '');
    if (!word) continue;
    const literal = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    out = out.replace(new RegExp(literal, 'giu'), '');
  }
  return out.trim() || text;
}
997
+
998
/**
 * Measure mean and peak volume of a WAV file with ffmpeg's `volumedetect`
 * filter.
 *
 * The report is parsed from ffmpeg's combined stdout/stderr, even when the
 * process exits with an error. An "inf" reading is reported as `-Infinity`.
 *
 * @param {string} wavPath - File to analyze.
 * @returns {Promise<{meanDb: number, maxDb: number}>}
 * @throws {Error} `volumedetect failed: …` when the report lacks either value.
 */
async function analyzeAudio(wavPath) {
  const ffmpegArgs = ['-hide_banner', '-nostats', '-i', wavPath, '-af', 'volumedetect', '-f', 'null', '-'];
  const toDecibels = (value) => (value.toLowerCase().includes('inf') ? -Infinity : Number(value));

  let report = '';
  try {
    const { stdout, stderr } = await execFileAsync('ffmpeg', ffmpegArgs, { timeout: 15000, maxBuffer: 2 * 1024 * 1024 });
    report = `${stdout || ''}\n${stderr || ''}`;
  } catch (err) {
    // ffmpeg may fail and still have printed the statistics.
    report = `${err.stdout || ''}\n${err.stderr || ''}`;
  }

  const meanMatch = /mean_volume:\s*(-?(?:[0-9.]+|inf)) dB/i.exec(report);
  const maxMatch = /max_volume:\s*(-?(?:[0-9.]+|inf)) dB/i.exec(report);
  const mean = meanMatch?.[1];
  const max = maxMatch?.[1];
  if (!mean || !max) {
    throw new Error(`volumedetect failed: ${report.slice(-500)}`);
  }
  return { meanDb: toDecibels(mean), maxDb: toDecibels(max) };
}
1015
+
1016
/**
 * Join several WAV segments into one file at `output`.
 *
 * A single input is copied as-is. Multiple inputs are stream-copied with
 * ffmpeg's concat demuxer, driven by a temporary list file that is removed
 * afterwards.
 *
 * @param {string[]} files - Segment paths, in playback order.
 * @param {string} output - Destination path.
 * @returns {Promise<void>}
 * @throws {Error} When `files` is empty, or when the copy / ffmpeg fails.
 */
async function concatWavs(files, output) {
  if (!files?.length) {
    // Fail clearly instead of handing ffmpeg an empty list file.
    throw new Error('concatWavs: no input files');
  }
  if (files.length === 1) {
    await fs.promises.copyFile(files[0], output);
    return;
  }

  // The timestamp alone can collide when two users' utterances are flushed
  // in the same millisecond; add pid and a random part.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
  const listPath = path.join(os.tmpdir(), `hermes-node-concat-${uniqueSuffix}.txt`);
  // Concat list syntax: a single quote inside a quoted path is written as '\''.
  const body = files.map((f) => `file '${String(f).replaceAll("'", "'\\''")}'`).join('\n');
  await fs.promises.writeFile(listPath, body);
  try {
    await execFileAsync('ffmpeg', ['-y', '-hide_banner', '-loglevel', 'error', '-f', 'concat', '-safe', '0', '-i', listPath, '-c', 'copy', output], {
      timeout: 20000,
      maxBuffer: 1024 * 1024,
    });
  } finally {
    await fs.promises.rm(listPath, { force: true }).catch(() => {});
  }
}
1033
+
1034
/**
 * Record a captured audio segment for `userId` in the bridge state.
 *
 * The bridge state is handed a timer factory that flushes the user's
 * utterance after `UTTERANCE_IDLE_MS`; flush failures are logged.
 *
 * @param {string} userId - Speaker.
 * @param {string} file - Path of the segment file.
 * @param {number} pcmBytes - Size of the segment's PCM data.
 * @param {number} [startedAtMs=Date.now()] - Segment start time (ms).
 * @param {number} [endedAtMs=Date.now()] - Segment end time (ms).
 * @returns {void}
 */
function queueSegment(userId, file, pcmBytes, startedAtMs = Date.now(), endedAtMs = Date.now()) {
  const scheduleIdleFlush = () => setTimeout(() => {
    flushUtterance(userId).catch((err) => warn('flushUtterance failed', userId, err?.stack || err));
  }, UTTERANCE_IDLE_MS);

  const pending = bridgeState.appendSegment(userId, {
    file,
    pcmBytes,
    startedAtMs,
    endedAtMs,
    timerFactory: scheduleIdleFlush,
  });
  log('queued segment', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes, 'idleMs', UTTERANCE_IDLE_MS, 'epoch', pending.epoch);
}
1044
+
1045
/**
 * Check an utterance against the current barge-in thresholds.
 *
 * @param {number} pcmBytes - Size of the utterance's PCM data.
 * @param {{meanDb: number, maxDb: number}} levels - Measured volume levels.
 * @returns {boolean} Result of `isValidatedBargeInCandidate`.
 */
function isBargeInCandidate(pcmBytes, levels) {
  return isValidatedBargeInCandidate(pcmBytes, levels, currentBargeInThresholds());
}
1049
+
1050
/**
 * Park an utterance that arrived while another turn is being processed.
 *
 * @param {object} utterance
 * @param {string} utterance.userId - Speaker.
 * @param {string} utterance.wavPath - Merged recording.
 * @param {number} utterance.pcmBytes - Size of the PCM data.
 * @param {number} utterance.segments - Number of merged segments.
 * @param {number} [utterance.startedAtMs=Date.now()] - First-packet time (ms).
 * @returns {boolean} `true` when the utterance was queued, `false` when
 *   the bridge state refused to queue it.
 */
function enqueueDeferredProcessingUtterance({ userId, wavPath, pcmBytes, segments, startedAtMs = Date.now() }) {
  const outcome = bridgeState.enqueueDeferred(
    { userId, wavPath, pcmBytes, segments, startedAtMs },
    enqueueDeferredUtterance,
    MAX_DEFERRED_PROCESSING_UTTERANCES,
  );

  if (!outcome.queued) {
    log('drop deferred utterance because queue disabled', userId, wavPath, 'max', MAX_DEFERRED_PROCESSING_UTTERANCES);
    return false;
  }

  const evicted = outcome.dropped;
  if (evicted) {
    log('drop oldest deferred utterance because queue is full', evicted?.userId, evicted?.wavPath);
  }
  log('queued deferred utterance while processing', userId, wavPath, 'queueSize', bridgeState.deferredSize(), 'epoch', bridgeState.currentEpoch());
  return true;
}
1063
+
1064
/**
 * Process the oldest deferred utterance, if the bridge is idle.
 *
 * Takes one item per call. A latency turn is created for it, marked as
 * deferred, and the item is passed to `handleRecording`.
 *
 * @returns {Promise<void>}
 */
async function drainDeferredProcessingUtterances() {
  if (processing) return;
  if (bridgeState.deferredSize() === 0) return;

  const next = bridgeState.shiftDeferred();
  if (!next) return;

  const { userId, wavPath, pcmBytes, segments } = next;
  log('drain deferred utterance', userId, wavPath, 'remaining', bridgeState.deferredSize());

  const metricsTurn = newLatencyTurn(userId, next.startedAtMs || Date.now());
  metricsTurn.mark('voice_first_packet', next.startedAtMs || Date.now());
  metricsTurn.mark('utterance_flush');
  metricsTurn.addMeta({ segments, pcmBytes, deferred: true });

  await handleRecording(userId, wavPath, pcmBytes, segments, metricsTurn);
}
1075
+
1076
/**
 * Decide what to do with speech captured while the bridge is busy.
 *
 * The recording is transcribed. An empty transcript is ignored; an explicit
 * stop phrase (per `isExplicitBargeInTranscript`) interrupts the current
 * response; anything else is reported as deferrable.
 *
 * @param {string} userId - Speaker.
 * @param {string} wavPath - Merged recording.
 * @param {number} pcmBytes - Size of the PCM data (logging only).
 * @param {number} segments - Number of merged segments (logging only).
 * @returns {Promise<{action: 'ignore'|'defer'|'interrupt', text: string}>}
 */
async function validateProcessingBargeIn(userId, wavPath, pcmBytes, segments) {
  log('validating processing barge-in transcript', userId, wavPath, 'pcmBytes', pcmBytes, 'segments', segments);
  const text = await transcribe(wavPath);

  if (!text) {
    log('ignore processing barge-in: empty transcript', userId, wavPath);
    return { action: 'ignore', text: '' };
  }

  if (isExplicitBargeInTranscript(text)) {
    log('confirmed processing barge-in by explicit transcript', userId, JSON.stringify(text));
    interruptCurrentResponse(userId, 'confirmed-processing-barge-in');
    return { action: 'interrupt', text };
  }

  log('defer processing barge-in: not explicit stop phrase', userId, JSON.stringify(text));
  return { action: 'defer', text };
}
1091
+
1092
// Finish the pending utterance of `userId` and route it.
//
// Steps, in order:
//   1. Remove the user's pending utterance from the bridge state (no-op when none).
//   2. Drop it when its epoch is stale or when it is shorter than MIN_UTTERANCE_BYTES.
//   3. Merge the segment files into one WAV and measure its volume levels.
//   4. If a voice-clone capture consumes the utterance, stop there.
//   5. While the bridge is speaking or processing, treat the utterance as a
//      possible barge-in: interrupt, defer, or ignore it.
//   6. Otherwise drop it when both mean and peak volume are below their
//      thresholds, or hand it to handleRecording.
// Every exit path finishes the latency turn with a status, except the last
// one, where the turn is passed on to handleRecording.
//
// Parameter: userId - speaker whose pending segments should be flushed.
// Resolves to undefined.
+ async function flushUtterance(userId) {
1093
+ const pending = bridgeState.deletePending(userId);
1094
+ if (!pending) return;
1095
+ if (pending.timer) clearTimeout(pending.timer);
1096
+ const files = pending.files;
1097
+ const pcmBytes = pending.pcmBytes;
1098
+ const metricsTurn = newLatencyTurn(userId, pending.firstPacketAt || Date.now());
1099
+ metricsTurn.mark('voice_first_packet', pending.firstPacketAt || Date.now());
1100
+ metricsTurn.mark('voice_segment_end', pending.lastSegmentEndAt || Date.now());
1101
+ metricsTurn.mark('utterance_flush');
1102
+ metricsTurn.addMeta({ segments: files.length, pcmBytes, epoch: pending.epoch });
// An epoch mismatch means the utterance was captured before the bridge
// state's epoch changed; its segment files are deleted and it is dropped.
1103
+ if (pending.epoch !== bridgeState.currentEpoch()) {
1104
+ log('drop stale utterance after voice input queue reset', userId, 'utteranceEpoch', pending.epoch, 'currentEpoch', bridgeState.currentEpoch());
1105
+ for (const file of files) fs.rm(file, { force: true }, () => {});
1106
+ metricsTurn.finish({ status: 'stale_after_config_change' });
1107
+ return;
1108
+ }
// NOTE(review): unlike the stale-epoch branch above, this branch returns
// without removing the segment files — confirm whether that is intended.
1109
+ if (pcmBytes < MIN_UTTERANCE_BYTES) {
1110
+ log('skip short utterance', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'minBytes', MIN_UTTERANCE_BYTES);
1111
+ metricsTurn.finish({ status: 'skip_short' });
1112
+ return;
1113
+ }
// The merged file is written under settings.debugDir.
1114
+ const merged = path.join(settings.debugDir, `utterance-merged-${stamp()}-${userId}.wav`);
1115
+ await concatWavs(files, merged);
1116
+ const levels = await analyzeAudio(merged);
1117
+ log('utterance levels', userId, 'segments', files.length, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
// Returns true when a voice-clone capture was armed for this user and
// consumed the utterance; no signal is passed, so its default applies.
1118
+ if (await saveCapturedVoiceCloneSample(userId, merged, pcmBytes, files.length)) {
1119
+ metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1120
+ metricsTurn.finish({ status: 'voice_clone_sample_saved' });
1121
+ return;
1122
+ }
1123
+ const candidate = isBargeInCandidate(pcmBytes, levels);
1124
+ if (speaking || processing) {
1125
+ const thresholds = currentBargeInThresholds();
// A weak (non-candidate) utterance is only logged here; it is still
// transcribed below so that an explicit stop phrase can be detected.
1126
+ if (!candidate) {
1127
+ log('check weak barge-in for explicit stop transcript', userId, 'pcmBytes', pcmBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholdBytes', thresholds.minBytes, 'thresholds', thresholds.minMeanDb, thresholds.minMaxDb, 'mode', thresholds.mode);
1128
+ }
1129
+ const validation = await validateProcessingBargeIn(userId, merged, pcmBytes, files.length);
// `speaking` and `processing` are read again after the await above, so the
// statuses below reflect their values at that point.
1130
+ if (validation?.action === 'interrupt') {
1131
+ metricsTurn.finish({ status: processing ? 'barge_in_processing_interrupt' : 'barge_in_playback_interrupt' });
1132
+ return;
1133
+ }
// A non-stop utterance spoken during processing is queued for later.
1134
+ if (processing && validation?.action === 'defer') {
1135
+ const queued = enqueueDeferredProcessingUtterance({
1136
+ userId,
1137
+ wavPath: merged,
1138
+ pcmBytes,
1139
+ segments: files.length,
1140
+ startedAtMs: pending.firstPacketAt || Date.now(),
1141
+ });
1142
+ metricsTurn.finish({ status: queued ? 'deferred_during_processing' : 'drop_deferred_during_processing' });
1143
+ return;
1144
+ }
1145
+ metricsTurn.finish({ status: speaking ? 'barge_in_playback_ignored' : 'barge_in_processing_ignored' });
1146
+ return;
1147
+ }
1148
+ // Drop only when BOTH overall energy and peak are low. Real Discord speech from this
1149
+ // mic can have low mean volume while still carrying intelligible peaks; using OR here
1150
+ // caused valid Korean utterances to be discarded as "low-energy".
1151
+ if (levels.meanDb < MIN_MEAN_VOLUME_DB && levels.maxDb < MIN_MAX_VOLUME_DB) {
1152
+ log('skip low-energy utterance', userId, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholds', MIN_MEAN_VOLUME_DB, MIN_MAX_VOLUME_DB, 'mode', 'both-below');
1153
+ metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1154
+ metricsTurn.finish({ status: 'skip_low_energy' });
1155
+ return;
1156
+ }
1157
+ metricsTurn.addMeta({ meanDb: levels.meanDb, maxDb: levels.maxDb });
1158
+ await handleRecording(userId, merged, pcmBytes, files.length, metricsTurn);
1159
+ }
1160
+
1161
+ async function handleRecording(userId, wavPath, pcmBytes, segments = 1, metricsTurn = null) {
1162
+ if (processing) { log('drop while processing', userId); metricsTurn?.finish({ status: 'drop_processing' }); return; }
1163
+ if (!isAllowed(userId)) { warn('ignore unauthorized', userId); metricsTurn?.finish({ status: 'unauthorized' }); return; }
1164
+ processing = true;
1165
+ const turnId = ++activeTurnId;
1166
+ const controller = new AbortController();
1167
+ currentAbortController = controller;
1168
+ const signal = controller.signal;
1169
+ const sessionForVoice = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1170
+ const previousTranscriptChannelId = activeTranscriptChannelId;
1171
+ activeTranscriptChannelId = sessionForVoice?.transcriptChannelId || settings.transcriptChannelId;
1172
+ try {
1173
+ const runtimeLanguage = reloadRuntimeLanguageFromEnv();
1174
+ if (runtimeLanguage.changed) {
1175
+ log('drop current utterance because language changed before STT', userId, 'turn', turnId, 'language', runtimeLanguage.voiceLanguage);
1176
+ fs.rm(wavPath, { force: true }, () => {});
1177
+ metricsTurn?.finish({ status: 'drop_stale_language_change' });
1178
+ return;
1179
+ }
1180
+ const session = resolveProjectSessionForChannel(activeVoiceChannelId || settings.transcriptChannelId);
1181
+ activeTranscriptChannelId = session?.transcriptChannelId || settings.transcriptChannelId;
1182
+ log('voice turn text target', session ? `project=${session.slug}` : 'project=default', 'channel', activeTranscriptChannelId ? 'project-or-default' : 'none');
1183
+ log('transcribing', userId, wavPath, 'pcmBytes', pcmBytes, 'segments', segments, 'turn', turnId);
1184
+ const sttNotice = formatSttStartMessage(settings.voiceLanguage);
1185
+ await sendText(sttNotice);
1186
+ const sttNoticeSpeech = STT_START_VOICE_NOTICE
1187
+ ? speakImmediateNotice(sttNotice.replace(/^🎧\s*/u, ''), signal, 'stt-start')
1188
+ : Promise.resolve();
1189
+ const sttStart = Date.now();
1190
+ const text = await transcribe(wavPath);
1191
+ await sttNoticeSpeech;
1192
+ metricsTurn?.stage('stt', Date.now() - sttStart, { transcriptChars: String(text || '').length });
1193
+ if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_stt' }); return; }
1194
+ if (!text) { log('empty transcript', userId, wavPath); metricsTurn?.finish({ status: 'empty_transcript' }); return; }
1195
+ log(`user ${userId} said: ${text}`);
1196
+ await sendText(formatSttResultMessage(settings.voiceLanguage, userId, text));
1197
+ if (!acceptsWake(text)) { await sendText(formatWakeRejectedMessage(settings.voiceLanguage)); metricsTurn?.finish({ status: 'wake_rejected' }); return; }
1198
+
1199
+ const prompt = stripWake(text);
1200
+ if (await handleLanguageCommand(prompt, signal)) {
1201
+ metricsTurn?.finish({ status: 'language_command' });
1202
+ return;
1203
+ }
1204
+ if (await handleTtsVoiceCommand(prompt, signal)) {
1205
+ metricsTurn?.finish({ status: 'voice_command' });
1206
+ return;
1207
+ }
1208
+ if (await handleVoiceCloneCommand(userId, prompt, signal)) {
1209
+ metricsTurn?.finish({ status: 'voice_clone_command' });
1210
+ return;
1211
+ }
1212
+ const sensitivityRequest = sensitivityModeFromTranscript(prompt);
1213
+ if (sensitivityRequest) {
1214
+ const thresholds = setSensitivityMode(sensitivityRequest.mode, sensitivityRequest.reason);
1215
+ await sendText(`🎚️ ${sensitivityStatusText()}`);
1216
+ if (isSensitivityOnlyRequest(prompt)) {
1217
+ await speakText(sensitivityChangedSpeech(thresholds.mode, settings.voiceLanguage), signal, metricsTurn);
1218
+ metricsTurn?.finish({ status: 'sensitivity_only' });
1219
+ return;
1220
+ }
1221
+ }
1222
+ const verboseRequest = verboseModeFromTranscript(prompt);
1223
+ if (verboseRequest !== null) {
1224
+ setVerboseProgress(verboseRequest, 'voice-command');
1225
+ await sendText(`🔎 ${verboseStatusText()}`);
1226
+ if (isVerboseOnlyRequest(prompt)) {
1227
+ await speakText(verboseChangedSpeech(verboseRequest, settings.voiceLanguage), signal, metricsTurn);
1228
+ metricsTurn?.finish({ status: 'verbose_only' });
1229
+ return;
1230
+ }
1231
+ }
1232
+ const selectedAgentAdapter = adapterForProjectSession(session);
1233
+ const projectContext = projectSessionContextText(session);
1234
+ const plan = {
1235
+ task: true,
1236
+ label: selectedAgentAdapter.label,
1237
+ verboseProgress,
1238
+ language: settings.voiceLanguage,
1239
+ cwd: session?.workdir,
1240
+ projectContext,
1241
+ };
1242
+ log('Agent plan', plan.label, 'backend', selectedAgentAdapter.backend, 'task', plan.task, 'language', plan.language, session ? `project=${session.slug}` : 'project=default');
1243
+ const agentStart = Date.now();
1244
+ const progressController = new AbortController();
1245
+ activeProgressAbortController = progressController;
1246
+ activeProgressSignal = progressController.signal;
1247
+ activeProgressLastEventAt = Date.now();
1248
+ const agentPromise = selectedAgentAdapter.ask(prompt, signal, plan);
1249
+ let done = false;
1250
+ // Status announcements share one queue with verbose progress so they never
1251
+ // talk over each other. In verbose mode, skip the generic initial prompt;
1252
+ // the detailed tool/file/test events are the initial progress voice.
1253
+ const progressLoop = (async () => {
1254
+ if (!verboseProgress) {
1255
+ await sleep(2500);
1256
+ if (!done && !signal.aborted && !interruptedTurns.has(turnId)) {
1257
+ const initial = /^en/i.test(String(settings.voiceLanguage || ''))
1258
+ ? 'calling the agent.'
1259
+ : '에이전트 호출했어. 응답 기다리는 중.';
1260
+ queueProgressSpeechText(initial, progressController.signal, 'generic-initial');
1261
+ }
1262
+ }
1263
+ let idleNotices = 0;
1264
+ let nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1265
+ let lastObservedProgressAt = activeProgressLastEventAt;
1266
+ while (!done && !signal.aborted && !interruptedTurns.has(turnId) && idleNotices < PROGRESS_IDLE_NOTICE_LIMIT) {
1267
+ await sleep(Math.min(PROGRESS_IDLE_CHECK_MS, nextIdleNoticeMs));
1268
+ if (done || signal.aborted || interruptedTurns.has(turnId)) break;
1269
+ if (activeProgressLastEventAt !== lastObservedProgressAt) {
1270
+ lastObservedProgressAt = activeProgressLastEventAt;
1271
+ nextIdleNoticeMs = PROGRESS_IDLE_NOTICE_INITIAL_MS;
1272
+ continue;
1273
+ }
1274
+ const idleMs = Date.now() - activeProgressLastEventAt;
1275
+ if (idleMs < nextIdleNoticeMs) continue;
1276
+ idleNotices += 1;
1277
+ activeProgressLastEventAt = Date.now();
1278
+ lastObservedProgressAt = activeProgressLastEventAt;
1279
+ const idle = /^en/i.test(String(settings.voiceLanguage || ''))
1280
+ ? 'still working on that.'
1281
+ : '아직 작업 중이야.';
1282
+ queueProgressSpeechText(idle, progressController.signal, `idle-${idleNotices}-${Math.round(nextIdleNoticeMs / 1000)}s`);
1283
+ nextIdleNoticeMs = Math.min(
1284
+ PROGRESS_IDLE_NOTICE_MAX_MS,
1285
+ Math.max(nextIdleNoticeMs + 1000, Math.round(nextIdleNoticeMs * PROGRESS_IDLE_NOTICE_MULTIPLIER)),
1286
+ );
1287
+ }
1288
+ })().catch(e => {
1289
+ if (!isAbortError(e)) warn('progress loop failed', e?.stack || e);
1290
+ });
1291
+ const answer = await agentPromise.finally(() => { done = true; });
1292
+ metricsTurn?.stage('agent', Date.now() - agentStart, { answerChars: String(answer || '').length, backend: selectedAgentAdapter.backend });
1293
+ void progressLoop;
1294
+ if (interruptedTurns.has(turnId) || signal.aborted) { metricsTurn?.finish({ status: 'aborted_after_agent' }); return; }
1295
+
1296
+ log('Agent answer', selectedAgentAdapter.label, answer.slice(0, 200));
1297
+ const spokenAnswer = spokenResultOnly(prompt, answer, settings.voiceLanguage);
1298
+ const fullAnswerText = `${agentAnswerHeader(settings.voiceLanguage, selectedAgentAdapter.label)}\n${answer || emptyAgentAnswer(settings.voiceLanguage)}`;
1299
+ log('send agent answer text', 'chars', fullAnswerText.length);
1300
+ const answerTextDelivered = await sendText(fullAnswerText);
1301
+ if (!answerTextDelivered) {
1302
+ warn('agent answer text delivery failed; still speaking answer');
1303
+ }
1304
+ log('spoken answer', spokenAnswer.slice(0, 200));
1305
+ stopProgressSpeech(progressController.signal, 'agent-answer-ready');
1306
+ await speakText(spokenAnswer, signal, metricsTurn, { mirrorText: !answerTextDelivered });
1307
+ metricsTurn?.finish({ status: 'ok' });
1308
+ } catch (e) {
1309
+ if (isAbortError(e) || interruptedTurns.has(turnId)) {
1310
+ log('turn aborted', userId, 'turn', turnId);
1311
+ metricsTurn?.finish({ status: 'aborted' });
1312
+ return;
1313
+ }
1314
+ warn('handleRecording failed', e?.stack || e);
1315
+ const shortMsg = String(e?.message || e).slice(0, 800);
1316
+ metricsTurn?.finish({ status: 'error', error: shortMsg });
1317
+ await sendText(formatVoiceErrorMessage(settings.voiceLanguage, shortMsg));
1318
+ } finally {
1319
+ if (activeProgressAbortController && !activeProgressAbortController.signal.aborted) {
1320
+ try { activeProgressAbortController.abort(); } catch (e) { warn('abort progress speech in cleanup failed', e?.stack || e); }
1321
+ }
1322
+ if (activeProgressSignal === activeProgressAbortController?.signal) activeProgressSignal = null;
1323
+ activeProgressAbortController = null;
1324
+ if (currentAbortController === controller) currentAbortController = null;
1325
+ activeTranscriptChannelId = previousTranscriptChannelId;
1326
+ interruptedTurns.delete(turnId);
1327
+ if (activeTurnId === turnId) activeTurnId = 0;
1328
+ processing = false;
1329
+ if (bridgeState.deferredSize() > 0) {
1330
+ setImmediate(() => drainDeferredProcessingUtterances().catch(e => warn('drain deferred utterance failed', e?.stack || e)));
1331
+ }
1332
+ }
1333
+ }
1334
+
1335
/**
 * Starts capturing one speaker's audio from the voice receiver into a WAV
 * segment file and hands the finished segment to `queueSegment`.
 *
 * Does nothing for users rejected by `isAllowed`, for the bot's own user id,
 * or when a capture for `userId` is already registered in `activeStreams`.
 *
 * @param {object} receiver - Voice receiver exposing `subscribe(userId, options)`.
 * @param {string} userId - Id of the user who started speaking.
 * @returns {void}
 */
function subscribeUser(receiver, userId) {
  if (!isAllowed(userId)) return;
  if (String(userId) === client.user?.id) return;
  const wasSpeaking = speaking;
  const wasProcessing = processing;
  if ((wasSpeaking || wasProcessing) && !activeStreams.has(userId)) {
    // Speaking-start alone is too noisy in Discord voice. Record and validate a
    // real segment first; only confirmed playback barge-in stops the current
    // audio chunk, and only explicit stop transcripts abort active agent work.
    log('possible barge-in start; waiting for segment validation', userId, 'speaking', wasSpeaking, 'processing', wasProcessing);
  }
  if (activeStreams.has(userId)) return;
  const pending = bridgeState.getPending(userId);
  if (pending?.timer) {
    bridgeState.clearPendingTimer(userId);
    log('extend pending utterance because new segment started', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes);
  }

  const file = path.join(settings.debugDir, `segment-${stamp()}-${userId}.wav`);
  log('subscribe user', userId, file);
  const opusStream = receiver.subscribe(userId, { end: { behavior: EndBehaviorType.AfterSilence, duration: SUBSCRIBE_AFTER_SILENCE_MS } });
  const decoder = new prism.opus.Decoder({ rate: 48000, channels: 2, frameSize: 960 });
  const writer = new wav.FileWriter(file, { sampleRate: 48000, channels: 2, bitDepth: 16 });
  const streamState = { opusStream, decoder, writer, file, startedAtMs: Date.now() };
  activeStreams.set(userId, streamState);
  let pcmBytes = 0;
  const liveThresholds = wasSpeaking && !wasProcessing ? currentPlaybackBargeInThresholds() : currentBargeInThresholds();
  const liveBargeIn = shouldUseLivePlaybackBargeIn({ speaking: wasSpeaking, processing: wasProcessing }) ? createLiveBargeInMonitor({
    minBytes: liveThresholds.minBytes,
    minMeanDb: liveThresholds.minMeanDb,
    minMaxDb: liveThresholds.minMaxDb,
    requireBoth: liveThresholds.requireBoth,
    log,
    onConfirm: ({ pcmBytes: confirmedBytes, levels }) => {
      log('confirmed live playback barge-in before segment end', userId, 'pcmBytes', confirmedBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
      stopPlaybackForBargeIn(userId, 'confirmed-live-playback-barge-in');
    },
  }) : null;

  // Drop the registry entry only while it still belongs to this capture, so a
  // late event can never remove a newer capture registered for the same user.
  const releaseStream = () => {
    if (activeStreams.get(userId) === streamState) activeStreams.delete(userId);
  };
  let writerEndRequested = false;
  // pipe() does not end its destination when the source fails, so the writer
  // has to be finished by hand; otherwise 'finish' never fires and the
  // activeStreams entry would block every later capture for this user.
  const endWriterAfterFailure = () => {
    if (writerEndRequested) return;
    writerEndRequested = true;
    try {
      writer.end();
    } catch (endError) {
      warn('end wav writer after stream failure failed', userId, endError?.stack || endError);
      releaseStream();
    }
  };

  decoder.on('data', chunk => {
    pcmBytes += chunk.length;
    liveBargeIn?.push(chunk);
  });
  opusStream.on('error', e => {
    warn('opus stream error', userId, e?.stack || e);
    try {
      // Flush what was received so far; the decoder's 'end' then ends the writer.
      decoder.end();
    } catch (endError) {
      warn('end opus decoder after stream error failed', userId, endError?.stack || endError);
      endWriterAfterFailure();
    }
  });
  decoder.on('error', e => {
    warn('opus decoder error', userId, e?.stack || e);
    // Nothing consumes the receive stream any more; stop it instead of leaving it open.
    try { opusStream.destroy(); } catch (destroyError) { warn('destroy opus stream after decoder error failed', userId, destroyError?.stack || destroyError); }
    endWriterAfterFailure();
  });
  writer.on('error', e => {
    warn('wav writer error', userId, e?.stack || e);
    // The segment cannot be completed; free the slot and stop the upstream source.
    releaseStream();
    try { opusStream.destroy(); } catch (destroyError) { warn('destroy opus stream after writer error failed', userId, destroyError?.stack || destroyError); }
  });
  opusStream.on('end', () => log('opus end', userId, 'pcmBytes', pcmBytes));
  writer.on('finish', () => {
    releaseStream();
    const endedAtMs = Date.now();
    log('saved segment', userId, 'pcmBytes', pcmBytes, file);
    queueSegment(userId, file, pcmBytes, streamState.startedAtMs || endedAtMs, endedAtMs);
  });
  opusStream.pipe(decoder).pipe(writer);
}
1389
+
1390
/**
 * Joins `channel`, replacing any existing voice connection, and starts
 * per-user capture once the connection reports Ready.
 *
 * All handlers are bound to the connection created by this call rather than
 * to the mutable module-level `connection`, so events from a superseded
 * connection cannot tear down or replace a newer one.
 *
 * @param {object} channel - Voice channel to join (uses `id`, `name`, `guild`).
 * @returns {Promise<void>} Settles after the Ready wait; rejects if that wait rejects.
 */
async function connectTo(channel) {
  if (connection) {
    // Best effort: the previous connection may already be destroyed.
    try { connection.destroy(); } catch {}
  }
  activeVoiceChannelId = channel.id;
  const voiceConnection = joinVoiceChannel({
    channelId: channel.id,
    guildId: channel.guild.id,
    adapterCreator: channel.guild.voiceAdapterCreator,
    selfDeaf: false,
    selfMute: false,
  });
  connection = voiceConnection;
  voiceConnection.subscribe(player);
  voiceConnection.on('error', e => warn('voice connection error', e?.stack || e));
  voiceConnection.on('stateChange', async (oldState, newState) => {
    log('voice connection state', oldState.status, '->', newState.status);
    if (newState.status !== VoiceConnectionStatus.Disconnected) return;
    try {
      // Give the connection a short window to start reconnecting by itself.
      await Promise.race([
        entersState(voiceConnection, VoiceConnectionStatus.Signalling, 5000),
        entersState(voiceConnection, VoiceConnectionStatus.Connecting, 5000),
      ]);
    } catch (e) {
      // A newer connectTo()/leave already replaced this connection; leave it alone.
      if (connection !== voiceConnection) return;
      warn('voice connection disconnected; reconnecting to channel', channel.guild.name, channel.name, e?.message || e);
      try { voiceConnection.destroy(); } catch {}
      connection = null;
      setTimeout(() => {
        // Skip the retry if something else connected or the bot left meanwhile.
        if (connection || activeVoiceChannelId !== channel.id) return;
        connectTo(channel).catch(err => warn('voice reconnect failed', err?.stack || err));
      }, 1500);
    }
  });
  await entersState(voiceConnection, VoiceConnectionStatus.Ready, 30000);
  voiceConnection.receiver.speaking.on('start', userId => subscribeUser(voiceConnection.receiver, userId));
  log(`Listening in voice channel ${channel.guild.name} / ${channel.name}`);
}
1424
+
1425
/**
 * Joins the first reachable voice channel whose name matches one of
 * `settings.autoJoinVoiceChannels`, honouring the configured priority order.
 *
 * Each guild's channel list is fetched at most once per call. A guild whose
 * fetch fails is logged and skipped instead of aborting the whole auto-join.
 * Names are compared case-insensitively on both sides.
 *
 * @returns {Promise<void>} Resolves after a successful join, or after logging
 *   that no configured channel could be joined.
 */
async function autoJoin() {
  const attempted = [];
  const voiceChannelsByGuild = new Map();
  // Lazily fetch and cache the voice channels of one guild.
  const voiceChannelsOf = async guild => {
    if (!voiceChannelsByGuild.has(guild)) {
      let voiceChannels = [];
      try {
        const fetched = await guild.channels.fetch();
        voiceChannels = [...fetched.values()].filter(ch => ch?.isVoiceBased?.());
      } catch (e) {
        warn('auto-join channel fetch failed; skipping guild', guild.name, e?.stack || e);
      }
      voiceChannelsByGuild.set(guild, voiceChannels);
    }
    return voiceChannelsByGuild.get(guild);
  };

  for (const preferredName of settings.autoJoinVoiceChannels ?? []) {
    const wantedName = String(preferredName ?? '').trim().toLowerCase();
    if (!wantedName) continue;
    for (const guild of client.guilds.cache.values()) {
      for (const ch of await voiceChannelsOf(guild)) {
        if (String(ch.name || '').toLowerCase() !== wantedName) continue;
        attempted.push(`${guild.name}/${ch.name}`);
        try {
          await connectTo(ch);
          return;
        } catch (e) {
          warn('auto-join failed; trying next configured voice channel', guild.name, ch.name, e?.stack || e);
          // Best effort: the half-open connection may already be destroyed.
          try { connection?.destroy(); } catch {}
          connection = null;
          activeVoiceChannelId = '';
        }
      }
    }
  }
  warn('No auto-join channel found or reachable', settings.autoJoinVoiceChannels, 'attempted', attempted);
}
1447
+
1448
/**
 * Reads and deletes the pending restart notice stored under
 * `<ROOT>/.cache/restart-notice.txt`.
 *
 * @returns {string} The notice with whitespace runs collapsed to single
 *   spaces, trimmed and capped at 120 characters; an empty string when the
 *   file is absent or any filesystem step fails (failures are logged).
 */
function consumeRestartNotice() {
  const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
  let detail = '';
  try {
    if (fs.existsSync(noticePath)) {
      const raw = fs.readFileSync(noticePath, 'utf8');
      const singleLine = raw.replace(/\s+/g, ' ').trim();
      detail = singleLine.slice(0, 120);
      fs.rmSync(noticePath, { force: true });
    }
  } catch (e) {
    warn('consume restart notice failed', e?.stack || e);
    return '';
  }
  return detail;
}
1460
+
1461
/**
 * Announces that the bridge is back: sends the formatted restart notice as
 * text, then speaks its speech form without mirroring it to text again.
 *
 * @returns {Promise<void>}
 */
async function announceRestartComplete() {
  const notice = formatRestartCompleteNotice(consumeRestartNotice(), settings.tts.edge.voice);
  const noticeText = notice.text;
  const noticeSpeech = notice.speech;
  const textDelivered = await sendText(noticeText);
  if (!textDelivered) {
    warn('restart-complete text delivery failed');
  }
  await speakText(noticeSpeech, undefined, null, { mirrorText: false });
}
1468
+
1469
/**
 * Resolves a voice channel of `guild` from a user-supplied selector.
 *
 * The selector may be a channel id, a `<#id>` mention, or a channel name
 * (compared case-insensitively). An id match takes precedence over names.
 *
 * @param {object} guild - Guild exposing `channels.fetch()`.
 * @param {string} selector - Id, mention or name of the wanted voice channel.
 * @returns {Promise<object|null>} The matching channel, or `null` when the
 *   selector is blank or no guild is given.
 * @throws {Error} When no voice channel matches, or several share the name.
 */
async function findVoiceChannelBySelector(guild, selector) {
  const wanted = String(selector || '').trim();
  if (!wanted || !guild) return null;

  const mentionedId = wanted.replace(/^<#(\d+)>$/, '$1');
  const fetched = await guild.channels.fetch();
  const voiceChannels = [];
  for (const candidate of fetched.values()) {
    if (candidate?.isVoiceBased?.()) voiceChannels.push(candidate);
  }

  for (const candidate of voiceChannels) {
    if (candidate.id === mentionedId) return candidate;
  }

  const wantedLower = wanted.toLowerCase();
  const sameName = voiceChannels.filter(candidate => String(candidate.name || '').toLowerCase() === wantedLower);
  switch (sameName.length) {
    case 0:
      throw new Error(`음성 채널을 찾지 못했어: ${wanted}`);
    case 1:
      return sameName[0];
    default:
      throw new Error(`같은 이름의 음성 채널이 여러 개야. 채널 ID나 멘션으로 지정해줘: ${wanted}`);
  }
}
1482
+
1483
/**
 * Produces a display label for a session's voice channel.
 *
 * @param {object} guild - Guild exposing `channels.fetch(id)`.
 * @param {string} channelId - Id of the voice channel to describe.
 * @returns {Promise<string>} The channel name; '없음' when no id or guild is
 *   given; '지정됨' when the channel cannot be fetched or has no name.
 */
async function voiceChannelLabel(guild, channelId) {
  const canLookUp = Boolean(channelId) && Boolean(guild);
  if (!canLookUp) return '없음';

  let label = '지정됨';
  try {
    const fetched = await guild.channels.fetch(channelId);
    const fetchedName = fetched?.name;
    if (fetchedName) label = fetchedName;
  } catch {
    // Lookup failures fall back to the generic label.
  }
  return label;
}
1492
+
1493
/**
 * Chooses the voice channel an attach command should bind to.
 *
 * Precedence: an explicit selector, then the voice channel the message
 * author is currently in, then the bridge's active voice channel.
 *
 * @param {object} msg - Discord message (uses `guild` and `member`).
 * @param {string} [selector=''] - Optional id, mention or name.
 * @returns {Promise<object|null>} The resolved voice channel.
 * @throws {Error} When none of the three sources yields a voice channel.
 */
async function resolveVoiceChannelForAttach(msg, selector = '') {
  if (selector) {
    return findVoiceChannelBySelector(msg.guild, selector);
  }

  const authorChannel = msg.member?.voice?.channel;
  if (authorChannel) return authorChannel;

  const canUseActiveChannel = Boolean(activeVoiceChannelId) && Boolean(msg.guild);
  if (canUseActiveChannel) {
    try {
      const activeChannel = await msg.guild.channels.fetch(activeVoiceChannelId);
      if (activeChannel?.isVoiceBased?.()) return activeChannel;
    } catch {
      // A failed lookup falls through to the "not found" error below.
    }
  }

  throw new Error('붙일 음성 채널을 못 찾았어. 음성채널에 들어가서 `!session attach-voice`를 치거나 `--voice "채널명"`을 붙여줘.');
}
1504
+
1505
/**
 * Binds a project session to both the text channel the command was sent in
 * and a voice channel, then makes sure the bridge is connected to that
 * voice channel.
 *
 * The session is the one named by `command.name`, otherwise the one already
 * bound to the text or voice channel, otherwise a new ad-hoc session named
 * after the text channel.
 *
 * @param {object} msg - Discord message that carried the command.
 * @param {object} command - Parsed command (uses `name` and `voice`).
 * @returns {Promise<*>} Whatever `msg.reply` returns for the confirmation.
 */
async function attachVoiceChannelToTextSession(msg, command) {
  const voiceChannel = await resolveVoiceChannelForAttach(msg, command.voice);

  // Pick the session to attach: explicit name, existing binding, or a new one.
  const pickSession = () => {
    if (command.name) {
      return bindProjectSessionToChannel({ state: projectSessionsState, nameOrSlug: command.name, channelId: msg.channelId });
    }
    const bound = resolveProjectSessionForChannel(msg.channelId)
      || resolveProjectSessionForChannel(voiceChannel.id);
    if (bound) return bound;
    const defaultName = `channel-${msg.channelId}`;
    const fallbackName = String(msg.channel?.name || defaultName).trim() || defaultName;
    return createProjectSession({
      root: ROOT,
      state: projectSessionsState,
      name: fallbackName,
      workdir: settings.agent.cwd || ROOT,
      channelId: msg.channelId,
      voiceChannelId: voiceChannel.id,
      transcriptChannelId: msg.channelId,
      mcpContext: 'Ad-hoc Discord text channel session',
    });
  };

  const session = pickSession();
  session.transcriptChannelId = msg.channelId;
  session.voiceChannelId = voiceChannel.id;
  for (const boundChannelId of [msg.channelId, voiceChannel.id]) {
    projectSessionsState.channelSessions[boundChannelId] = session.slug;
  }
  saveProjectSessionsState();
  agentAdaptersBySession.delete(session.slug);

  const alreadyConnected = activeVoiceChannelId === voiceChannel.id;
  if (!alreadyConnected) {
    await connectTo(voiceChannel);
  }
  return msg.reply(`${session.name} 세션을 이 텍스트 채널과 음성 채널 ${voiceChannel.name}에 붙였어. 이제 그 음성채널 발화의 STT/답변 텍스트는 이 채널로 가.`);
}
1536
+
1537
/**
 * Executes one parsed `!session` command and replies in the originating
 * channel.
 *
 * Supported `command.action` values: 'attach-voice', 'status', 'list',
 * 'reset', 'use' and 'new'. Any other action is ignored without a reply.
 *
 * @param {object} msg - Discord message that carried the command.
 * @param {object} command - Parsed command (`action`, plus optional `name`,
 *   `workdir`, `voice`, `mcpContext`).
 * @returns {Promise<void>}
 */
async function handleProjectSessionCommand(msg, command) {
  const activeSession = resolveProjectSessionForChannel(msg.channelId) || resolveProjectSessionForChannel(activeVoiceChannelId);
  switch (command.action) {
    case 'attach-voice': {
      await attachVoiceChannelToTextSession(msg, command);
      return;
    }
    case 'status': {
      if (!activeSession) return void msg.reply(`${agentAdapter.label} 기본 세션: ${agentAdapter.readSessionId?.() || '아직 없음'}`);
      const adapter = adapterForProjectSession(activeSession);
      const voiceName = await voiceChannelLabel(msg.guild, activeSession.voiceChannelId);
      return void msg.reply([
        `프로젝트 세션: ${activeSession.name}`,
        `작업실: ${activeSession.workdir}`,
        `음성 채널: ${voiceName}`,
        `Hermes 세션: ${adapter.readSessionId?.() || '아직 없음'}`,
        `텍스트 채널: 현재 채널`,
      ].join('\n'));
    }
    case 'list': {
      const sessions = listProjectSessions(projectSessionsState);
      if (!sessions.length) return void msg.reply('등록된 프로젝트 세션이 없어. `!session new 이름 /프로젝트/경로 --voice 음성채널명`으로 만들 수 있어.');
      const lines = [];
      for (const session of sessions) {
        const voiceName = await voiceChannelLabel(msg.guild, session.voiceChannelId);
        lines.push(`- ${session.name}: ${session.workdir} / voice: ${voiceName}`);
      }
      // Cap the reply at 1900 characters, as the other long replies in this file do.
      return void msg.reply(lines.join('\n').slice(0, 1900));
    }
    case 'reset': {
      const targetFile = activeSession?.sessionFile || settings.agent.sessionFile;
      const sessionLabel = activeSession?.name || agentAdapter.label;
      try {
        // `force: true` already tolerates a missing file, so a throw here is a real failure.
        fs.rmSync(targetFile, { force: true });
      } catch (e) {
        // Do not report success when the session file could not be removed.
        warn('project session reset failed', targetFile, e?.stack || e);
        return void msg.reply(`${sessionLabel} 세션 초기화 실패: ${String(e?.message || e).slice(0, 700)}`);
      }
      return void msg.reply(`${sessionLabel} 세션 초기화했어.`);
    }
    case 'use': {
      if (!command.name) return void msg.reply('사용할 세션 이름을 붙여줘. 예: `!session use llm-wiki --voice "LLM Wiki"`');
      const voiceChannel = command.voice ? await findVoiceChannelBySelector(msg.guild, command.voice) : null;
      const session = bindProjectSessionToChannel({ state: projectSessionsState, nameOrSlug: command.name, channelId: msg.channelId });
      if (voiceChannel) {
        projectSessionsState.channelSessions[voiceChannel.id] = session.slug;
        session.voiceChannelId = voiceChannel.id;
      }
      saveProjectSessionsState();
      return void msg.reply(`${session.name} 프로젝트 세션을 이 텍스트 채널${voiceChannel ? `과 음성 채널 ${voiceChannel.name}` : ''}에 연결했어. 작업실은 ${session.workdir}이야.`);
    }
    case 'new': {
      if (!command.name || !command.workdir) {
        return void msg.reply('형식: `!session new <이름> <작업실경로> [MCP/프로젝트 설명] --voice <음성채널명>`');
      }
      if (!fs.existsSync(command.workdir)) return void msg.reply(`작업실 경로가 없어: ${command.workdir}`);
      const voiceChannel = command.voice ? await findVoiceChannelBySelector(msg.guild, command.voice) : null;
      const session = createProjectSession({
        root: ROOT,
        state: projectSessionsState,
        name: command.name,
        workdir: command.workdir,
        channelId: msg.channelId,
        voiceChannelId: voiceChannel?.id || '',
        transcriptChannelId: msg.channelId,
        mcpContext: command.mcpContext,
      });
      saveProjectSessionsState();
      agentAdaptersBySession.delete(session.slug);
      return void msg.reply(`${session.name} 프로젝트 세션 만들었어. 작업실은 ${session.workdir}이고, 이 텍스트 채널${voiceChannel ? `과 음성 채널 ${voiceChannel.name}` : ''} 입력은 별도 Hermes 세션 파일로 이어져.`);
    }
    default:
      return;
  }
}
1600
+
1601
// Startup sequence once the Discord client is ready: join the configured
// voice channel, then announce that the restart finished. Each step is
// guarded on its own so a failed auto-join cannot suppress the announcement
// and neither failure ends up as an anonymous unhandled rejection.
client.once('ready', async () => {
  log(`Logged in as ${client.user.tag} (${client.user.id})`);
  try {
    await autoJoin();
  } catch (e) {
    warn('auto-join failed', e?.stack || e);
  }
  try {
    await announceRestartComplete();
  } catch (e) {
    warn('restart-complete announcement failed', e?.stack || e);
  }
});
1606
+
1607
// Text-command router for Discord messages. Messages from bots and from
// users rejected by isAllowed() are ignored. Commands are matched in order;
// anything left over is forwarded to the agent when the channel is the
// transcript channel or is bound to a project session.
client.on('messageCreate', async msg => {
  if (msg.author.bot) return;
  if (!isAllowed(msg.author.id)) return;
  const content = msg.content.trim();

  // Single failure path for every project-session command, so the `!session`
  // and `!reset-session` shortcuts report errors exactly like parsed commands
  // instead of leaving a floating promise.
  const runProjectSessionCommand = async command => {
    try {
      await handleProjectSessionCommand(msg, command);
    } catch (e) {
      warn('project session command failed', e?.stack || e);
      await msg.reply(String(e?.message || e).slice(0, 700));
    }
  };

  const projectSessionCommand = parseProjectSessionCommand(content);
  if (projectSessionCommand) {
    await runProjectSessionCommand(projectSessionCommand);
    return;
  }
  if (content === '!ping') return void msg.reply('pong');
  if (content === '!verbose') return void msg.reply(verboseStatusText());
  if (['!verbose on', '!verbose true', '!verbose 1', '!verbose 켜', '!verbose 켜줘'].includes(content.toLowerCase())) {
    setVerboseProgress(true, 'discord-command');
    return void msg.reply(verboseStatusText());
  }
  if (['!verbose off', '!verbose false', '!verbose 0', '!verbose 꺼', '!verbose 꺼줘'].includes(content.toLowerCase())) {
    setVerboseProgress(false, 'discord-command');
    return void msg.reply(verboseStatusText());
  }
  if (content === '!sensitivity') return void msg.reply(sensitivityStatusText());
  if (content === '!latency' || content === '!metrics') {
    const summary = summarizeLatencyRecords(readJsonlRecords(settings.latencyLogPath, { limit: 200 }));
    return void msg.reply(`최근 latency 요약 (${settings.latencyLogPath}):\n${formatLatencySummary(summary)}`.slice(0, 1900));
  }
  if (content === '!sensitivity conservative') {
    setSensitivityMode('conservative', 'discord-command');
    return void msg.reply(sensitivityStatusText());
  }
  if (content === '!sensitivity normal') {
    setSensitivityMode('normal', 'discord-command');
    return void msg.reply(sensitivityStatusText());
  }
  if (content === '!session') {
    await runProjectSessionCommand({ action: 'status' });
    return;
  }
  if (content === '!reset-session') {
    await runProjectSessionCommand({ action: 'reset' });
    return;
  }
  if (content === '!join') {
    const ch = msg.member?.voice?.channel;
    if (!ch) return void msg.reply('먼저 음성 채널에 들어가줘.');
    try {
      await connectTo(ch);
    } catch (e) {
      // Tell the user the join failed instead of leaving the command unanswered.
      warn('join failed', e?.stack || e);
      return void msg.reply(`음성 채널 연결 실패: ${String(e?.message || e).slice(0, 700)}`);
    }
    return void msg.reply('들어왔어. Node receiver로 듣는 중.');
  }
  if (content === '!leave') {
    // Best effort: the connection may already be destroyed.
    try { connection?.destroy(); } catch {}
    connection = null;
    activeVoiceChannelId = '';
    return void msg.reply('나갈게.');
  }
  if (content.startsWith('!say ')) {
    const text = content.slice(5).trim();
    try {
      const mp3 = await synthTTS(text);
      await playAudio(mp3);
    } catch (e) {
      warn('say failed', e?.stack || e);
      await msg.reply(`음성 출력 실패: ${String(e?.message || e).slice(0, 700)}`);
    }
    return;
  }
  if (content.startsWith('!voice-test ')) {
    const text = content.slice('!voice-test '.length).trim();
    if (!text) return void msg.reply('테스트할 문장을 붙여줘.');
    const started = Date.now();
    try {
      await msg.reply(`TTS 백엔드 ${ttsBackend.name}로 음성 테스트할게.`);
      await speakText(text);
      await msg.channel.send(`음성 테스트 완료: ${ttsBackend.name}, ${Date.now() - started}ms`);
    } catch (e) {
      warn('voice-test failed', e?.stack || e);
      await msg.channel.send(`음성 테스트 실패: ${String(e?.message || e).slice(0, 700)}`);
    }
    return;
  }
  if (content === '!voice-clone' || content === '!voice-clone status') {
    const current = voiceCloneCapture.current();
    if (current?.userId === String(msg.author.id)) {
      return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, current.targetPath)}에 저장할게.`);
    }
    return void msg.reply('대기 중인 보이스 클로닝 샘플 캡처가 없어. `!voice-clone capture`로 시작해.');
  }
  if (content === '!voice-clone cancel') {
    const cancelled = voiceCloneCapture.cancel(msg.author.id);
    return void msg.reply(cancelled ? '보이스 클로닝 샘플 캡처를 취소했어.' : '대기 중인 캡처가 없어.');
  }
  if (content === '!voice-clone capture') {
    const armed = voiceCloneCapture.arm({ userId: msg.author.id, source: 'discord-command' });
    return void msg.reply(`다음 유효한 음성을 ${path.relative(ROOT, armed.targetPath)}에 저장할게. 음성 채널에서 10~30초 정도 말해줘.`);
  }
  if (content.startsWith('!ask ')) {
    const text = content.slice(5).trim();
    if (!text) return void msg.reply('물어볼 내용을 붙여줘.');
    await handleTextAgentMessage(msg, text, { speakResponse: true });
    return;
  }
  if (shouldRouteDiscordTextToAgent({
    content,
    channelId: msg.channelId,
    transcriptChannelId: settings.transcriptChannelId,
  }) || resolveProjectSessionForChannel(msg.channelId)) {
    await handleTextAgentMessage(msg, content, { speakResponse: false });
    return;
  }
});
1708
+
1709
// Process-level safety nets.
// stdout/stderr share one handler shape: an error classified as a benign
// transient network error marks stdio as broken and is reported as
// transient; any other error is logged under the stream's label.
for (const [label, stream] of [['stdout error', process.stdout], ['stderr error', process.stderr]]) {
  stream?.on?.('error', error => {
    if (!isBenignTransientNetworkError(error)) {
      warn(label, error?.stack || error);
      return;
    }
    bridgeLogger.markStdioBroken();
    reportTransientProcessError(label, error);
  });
}

// Unhandled rejections are logged unless already reported as transient.
process.on('unhandledRejection', error => {
  const reportedAsTransient = reportTransientProcessError('unhandled rejection', error);
  if (!reportedAsTransient) {
    warn('unhandled rejection', error?.stack || error);
  }
});

// Uncaught exceptions end the process unless reported as transient.
process.on('uncaughtException', error => {
  const reportedAsTransient = reportTransientProcessError('uncaught exception', error);
  if (reportedAsTransient) return;
  warn('uncaught exception; exiting', error?.stack || error);
  process.exit(1);
});

// Discord client and shard errors are logged only.
client.on('error', e => {
  warn('discord client error', e?.stack || e);
});
client.on('shardError', e => {
  warn('discord shard error', e?.stack || e);
});
1736
+
1737
// Set on the first shutdown request so repeated signals are ignored.
let shutdownStarted = false;

/**
 * Shuts the bridge down on a termination signal: aborts the active turn,
 * speaks a shutdown notice when a voice connection exists, destroys the
 * voice connection and the Discord client, then exits with code 0.
 *
 * @param {string} signalName - Name of the signal that triggered the shutdown.
 * @returns {Promise<void>} Does not resolve in practice; the process exits.
 */
async function gracefulShutdown(signalName) {
  if (shutdownStarted) return;
  shutdownStarted = true;
  log('graceful shutdown requested', signalName, 'connection', Boolean(connection));

  // Reads the pending restart notice without deleting it.
  const peekRestartNotice = () => {
    const noticePath = path.join(ROOT, '.cache', 'restart-notice.txt');
    try {
      if (!fs.existsSync(noticePath)) return '';
      return fs.readFileSync(noticePath, 'utf8').replace(/\s+/g, ' ').trim().slice(0, 120);
    } catch (e) {
      warn('read restart notice failed', e?.stack || e);
      return '';
    }
  };

  try {
    const turnController = currentAbortController;
    if (turnController && !turnController.signal.aborted) turnController.abort();
  } catch (e) {
    warn('abort before shutdown failed', e?.stack || e);
  }

  if (connection) {
    try {
      const detail = peekRestartNotice();
      await speakText(formatRestartShutdownNotice(detail, settings.tts.edge.voice));
      // A failed or timed-out wait for the Idle state must not block the exit.
      await waitEvent(player, AudioPlayerStatus.Idle, 30000).catch(() => {});
    } catch (e) {
      warn('shutdown voice notice failed', e?.stack || e);
    }
  }

  // Best effort: either object may already be destroyed.
  try { connection?.destroy(); } catch {}
  try { client.destroy(); } catch {}
  process.exit(0);
}

for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, () => { void gracefulShutdown(signalName); });
}

client.login(settings.token);