verbalcoding 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/.env.example +74 -4
  2. package/README.es.md +3 -1
  3. package/README.fr.md +3 -1
  4. package/README.ja.md +3 -1
  5. package/README.ko.md +4 -2
  6. package/README.md +4 -2
  7. package/README.ru.md +3 -1
  8. package/README.zh.md +3 -1
  9. package/app-node/agent_adapters.test.mjs +14 -0
  10. package/app-node/agent_routing.mjs +148 -0
  11. package/app-node/agent_routing.test.mjs +138 -0
  12. package/app-node/agent_turn.mjs +86 -0
  13. package/app-node/agent_turn.test.mjs +109 -0
  14. package/app-node/bridge_context.mjs +73 -0
  15. package/app-node/bridge_context.test.mjs +54 -0
  16. package/app-node/bridge_state.mjs +4 -0
  17. package/app-node/bridge_wireup.test.mjs +462 -0
  18. package/app-node/cli_install.test.mjs +31 -0
  19. package/app-node/cross_agent_routing.test.mjs +78 -0
  20. package/app-node/discord_command_router.mjs +204 -0
  21. package/app-node/discord_command_router.test.mjs +311 -0
  22. package/app-node/discord_voice_setup.mjs +251 -0
  23. package/app-node/discord_voice_setup.test.mjs +86 -0
  24. package/app-node/hermes_profiles.test.mjs +12 -1
  25. package/app-node/install_config.mjs +110 -3
  26. package/app-node/install_config.test.mjs +8 -0
  27. package/app-node/instance_doctor.test.mjs +9 -0
  28. package/app-node/instances.test.mjs +8 -1
  29. package/app-node/main.mjs +488 -1368
  30. package/app-node/mcp_tools.test.mjs +7 -0
  31. package/app-node/notification_handler.mjs +89 -0
  32. package/app-node/notification_handler.test.mjs +187 -0
  33. package/app-node/plan_dispatcher.mjs +215 -0
  34. package/app-node/plan_dispatcher.test.mjs +101 -0
  35. package/app-node/plan_mode.mjs +36 -7
  36. package/app-node/plan_mode.test.mjs +78 -0
  37. package/app-node/progress_handler.mjs +220 -0
  38. package/app-node/progress_handler.test.mjs +193 -0
  39. package/app-node/progress_speech.mjs +54 -32
  40. package/app-node/progress_speech.test.mjs +12 -3
  41. package/app-node/project_sessions.mjs +5 -2
  42. package/app-node/project_sessions.test.mjs +7 -0
  43. package/app-node/research_mode.mjs +282 -0
  44. package/app-node/research_mode.test.mjs +264 -0
  45. package/app-node/restart_notice.mjs +3 -0
  46. package/app-node/restart_notice.test.mjs +11 -0
  47. package/app-node/session_ontology.mjs +271 -0
  48. package/app-node/session_ontology.test.mjs +130 -0
  49. package/app-node/smart_progress.mjs +1 -1
  50. package/app-node/stream_sentencer.mjs +32 -2
  51. package/app-node/stream_sentencer.test.mjs +65 -0
  52. package/app-node/streaming_tts_queue.mjs +5 -1
  53. package/app-node/streaming_tts_queue.test.mjs +7 -1
  54. package/app-node/stt_whisper.mjs +24 -0
  55. package/app-node/stt_whisper.test.mjs +32 -0
  56. package/app-node/text_routing.mjs +4 -2
  57. package/app-node/tts_backends.mjs +537 -3
  58. package/app-node/tts_backends.test.mjs +454 -0
  59. package/app-node/tts_player.mjs +164 -0
  60. package/app-node/tts_player.test.mjs +202 -0
  61. package/app-node/tts_runtime.mjs +134 -0
  62. package/app-node/tts_runtime.test.mjs +89 -0
  63. package/app-node/tts_settings.mjs +150 -3
  64. package/app-node/tts_settings.test.mjs +204 -0
  65. package/app-node/tts_voice_config.mjs +136 -2
  66. package/app-node/tts_voice_config.test.mjs +94 -0
  67. package/app-node/utterance_router.mjs +216 -0
  68. package/app-node/utterance_router.test.mjs +236 -0
  69. package/app-node/voice_autojoin.mjs +37 -0
  70. package/app-node/voice_autojoin.test.mjs +59 -0
  71. package/app-node/voice_io.mjs +272 -0
  72. package/app-node/voice_io.test.mjs +102 -0
  73. package/app-node/voice_turn_runner.mjs +449 -0
  74. package/app-node/voice_turn_runner.test.mjs +289 -0
  75. package/docs/CONFIGURATION.md +12 -2
  76. package/docs/HARNESSES.md +58 -0
  77. package/docs/HARNESS_AIDER.md +50 -0
  78. package/docs/HARNESS_CLAUDE.md +56 -0
  79. package/docs/HARNESS_CODEX.md +56 -0
  80. package/docs/HARNESS_CURSOR.md +45 -0
  81. package/docs/HARNESS_GEMINI.md +45 -0
  82. package/docs/HARNESS_HERMES.md +57 -0
  83. package/docs/HARNESS_OPENCLAW.md +44 -0
  84. package/docs/HARNESS_OPENCODE.md +44 -0
  85. package/docs/README.md +1 -0
  86. package/docs/ROADMAP.md +20 -5
  87. package/docs/TTS_BACKENDS.md +227 -0
  88. package/docs/USAGE.md +22 -0
  89. package/docs/i18n/AGENTS.es.md +34 -0
  90. package/docs/i18n/AGENTS.fr.md +34 -0
  91. package/docs/i18n/AGENTS.ja.md +34 -0
  92. package/docs/i18n/AGENTS.ko.md +34 -0
  93. package/docs/i18n/AGENTS.ru.md +34 -0
  94. package/docs/i18n/AGENTS.zh.md +34 -0
  95. package/docs/i18n/HARNESSES.es.md +58 -0
  96. package/docs/i18n/HARNESSES.fr.md +58 -0
  97. package/docs/i18n/HARNESSES.ja.md +58 -0
  98. package/docs/i18n/HARNESSES.ko.md +58 -0
  99. package/docs/i18n/HARNESSES.ru.md +58 -0
  100. package/docs/i18n/HARNESSES.zh.md +58 -0
  101. package/docs/i18n/HARNESS_AIDER.es.md +48 -0
  102. package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
  103. package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
  104. package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
  105. package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
  106. package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
  107. package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
  108. package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
  109. package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
  110. package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
  111. package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
  112. package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
  113. package/docs/i18n/HARNESS_CODEX.es.md +55 -0
  114. package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
  115. package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
  116. package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
  117. package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
  118. package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
  119. package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
  120. package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
  121. package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
  122. package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
  123. package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
  124. package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
  125. package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
  126. package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
  127. package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
  128. package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
  129. package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
  130. package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
  131. package/docs/i18n/HARNESS_HERMES.es.md +54 -0
  132. package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
  133. package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
  134. package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
  135. package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
  136. package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
  137. package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
  138. package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
  139. package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
  140. package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
  141. package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
  142. package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
  143. package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
  144. package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
  145. package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
  146. package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
  147. package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
  148. package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
  149. package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
  150. package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
  151. package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
  152. package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
  153. package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
  154. package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
  155. package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
  156. package/integrations/fireredtts2/mlx_llm.py +183 -0
  157. package/integrations/fireredtts2/synth.py +156 -0
  158. package/integrations/fireredtts2/synth_mlx.py +196 -0
  159. package/integrations/mlxaudio/synth.py +74 -0
  160. package/integrations/neuttsair/synth.py +104 -0
  161. package/integrations/omnivoice/synth.py +110 -0
  162. package/package.json +6 -1
  163. package/scripts/cli.mjs +84 -0
  164. package/scripts/doctor.mjs +104 -4
  165. package/scripts/install.mjs +5 -1
  166. package/scripts/install_fireredtts2.sh +109 -0
  167. package/scripts/install_mlxaudio.sh +34 -0
  168. package/scripts/install_mossttsnano.sh +46 -0
  169. package/scripts/postinstall.mjs +34 -0
@@ -0,0 +1,272 @@
1
+ // Voice I/O pipeline: Discord opus receive -> per-user WAV segments ->
2
+ // idle-merged utterance -> whisper transcription -> cleaned text.
3
+ //
4
+ // Phase 2 extraction from main.mjs. The functions read/write shared bridge
5
+ // state (activeStreams, speaking, processing, bridgeState) and call back
6
+ // into helpers that still live in main.mjs (currentBargeInThresholds,
7
+ // stopPlaybackForBargeIn, handleRecording, etc.), so the factory takes
8
+ // them all explicitly.
9
+
10
+ import fs from 'node:fs';
11
+ import os from 'node:os';
12
+ import path from 'node:path';
13
+ import { EndBehaviorType } from '@discordjs/voice';
14
+ import prism from 'prism-media';
15
+ import wav from 'wav';
16
+ import { shouldPassWhisperLanguage } from './language_config.mjs';
17
+ import { whisperFailureMessage } from './stt_whisper.mjs';
18
+ import { isRepeatedNoiseTranscript } from './barge_in.mjs';
19
+
20
+ export function createVoiceIO(deps) {
21
+ const {
22
+ bridge,
23
+ settings,
24
+ client,
25
+ execFileAsync,
26
+ log,
27
+ warn,
28
+ stamp,
29
+ sleep,
30
+ isAllowed,
31
+ UTTERANCE_IDLE_MS,
32
+ SUBSCRIBE_AFTER_SILENCE_MS,
33
+ MIN_UTTERANCE_BYTES,
34
+ MIN_MEAN_VOLUME_DB,
35
+ MIN_MAX_VOLUME_DB,
36
+ currentBargeInThresholds,
37
+ currentPlaybackBargeInThresholds,
38
+ createLiveBargeInMonitor,
39
+ shouldUseLivePlaybackBargeIn,
40
+ stopPlaybackForBargeIn,
41
+ analyzeAudio,
42
+ concatWavs,
43
+ saveCapturedVoiceCloneSample,
44
+ isBargeInCandidate,
45
+ validateProcessingBargeIn,
46
+ enqueueDeferredProcessingUtterance,
47
+ newLatencyTurn,
48
+ handleRecording,
49
+ } = deps;
50
+
51
/**
 * Run the configured whisper binary once over an already-converted WAV and
 * read back the text file it writes.
 *
 * @param {string} wavPath - Original capture path. Not used by this function;
 *   kept so the signature stays compatible with existing callers.
 * @param {string} input16k - Path of the WAV handed to whisper via `-f`.
 * @param {string} outBase - Output base passed via `-of`; the transcript is
 *   read from `${outBase}.txt`.
 * @returns {Promise<{raw: string, txtPath: string}>} The transcript text
 *   (empty string when no text file exists) and the text file path, which the
 *   caller is expected to clean up.
 * @throws {Error} `whisper failed: …` with the underlying error attached as
 *   `cause`; read errors other than a missing file are rethrown unchanged.
 */
async function transcribeOnce(wavPath, input16k, outBase) {
  const args = ['-m', settings.whisperModel, '-f', input16k];
  if (shouldPassWhisperLanguage(settings.whisperLanguage)) {
    args.push('-l', settings.whisperLanguage);
  }
  args.push('-nt', '-otxt', '-of', outBase, '-sns', '-nf', '-nth', '0.35', '-et', '2.2', '-lpt', '-0.8');
  try {
    await execFileAsync(settings.whisperBin, args, { timeout: settings.whisperTimeoutMs, maxBuffer: 4 * 1024 * 1024 });
  } catch (e) {
    // Keep the original error reachable for debugging instead of flattening it to a string.
    throw new Error(`whisper failed: ${whisperFailureMessage(e)}`, { cause: e });
  }
  const txtPath = `${outBase}.txt`;
  let raw = '';
  try {
    // Single non-blocking read; a missing file simply means "no transcript".
    raw = await fs.promises.readFile(txtPath, 'utf8');
  } catch (e) {
    if (e?.code !== 'ENOENT') throw e;
  }
  return { raw, txtPath };
}
64
+
65
/**
 * Transcribe a captured WAV: convert it with ffmpeg, run whisper, clean the
 * output, and retry once if the cleaned transcript is empty.
 *
 * Temporary files (the converted WAV and every whisper text output, including
 * the first pass when a retry happens) are removed on both success and
 * failure. When `settings.debugDir` is set, the converted WAV and the last raw
 * transcript are copied there first.
 *
 * @param {string} wavPath - Path of the captured WAV to transcribe.
 * @returns {Promise<string>} The cleaned transcript; may be an empty string.
 * @throws Propagates ffmpeg and whisper failures after cleanup has been scheduled.
 */
async function transcribe(wavPath) {
  // Date.now() alone collides when two transcriptions start in the same
  // millisecond; add the pid and a random suffix so temp files never overlap.
  const unique = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
  const tmpBase = path.join(os.tmpdir(), `hermes-node-stt-${unique}`);
  const input16k = `${tmpBase}.16k.wav`;
  const outBase = `${tmpBase}.out`;
  const txtPaths = [];
  let raw = '';
  try {
    // whisper.cpp can read WAV, but Discord receiver output is 48 kHz stereo.
    // Convert explicitly to the 16 kHz mono PCM shape Whisper expects.
    // Runs inside the try so a partially written file is cleaned up on failure.
    await execFileAsync('ffmpeg', ['-y', '-hide_banner', '-loglevel', 'error', '-i', wavPath, '-ac', '1', '-ar', '16000', '-sample_fmt', 's16', input16k], {
      timeout: 20000,
      maxBuffer: 1024 * 1024,
    });

    const first = await transcribeOnce(wavPath, input16k, outBase);
    raw = first.raw;
    if (first.txtPath) txtPaths.push(first.txtPath);
    let cleaned = cleanTranscript(raw);
    log('stt raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
    if (!cleaned) {
      await sleep(300);
      const retry = await transcribeOnce(wavPath, input16k, `${tmpBase}.retry`);
      raw = retry.raw;
      if (retry.txtPath) txtPaths.push(retry.txtPath);
      cleaned = cleanTranscript(raw);
      log('stt retry raw', JSON.stringify(raw.trim()).slice(0, 500), 'cleaned', JSON.stringify(cleaned).slice(0, 500));
    }
    return cleaned;
  } finally {
    // Cleanup is best-effort and fire-and-forget so it never delays the result.
    const removeInput = () => fs.rm(input16k, { force: true }, () => {});
    if (settings.debugDir) {
      const debug16k = path.join(settings.debugDir, `stt-input-${stamp()}.wav`);
      // Remove the source only after the copy settles; issuing both at once
      // lets the delete race the copy.
      fs.copyFile(input16k, debug16k, removeInput);
      if (raw) fs.writeFile(path.join(settings.debugDir, `stt-raw-${stamp()}.txt`), raw, () => {});
    } else {
      removeInput();
    }
    // Remove every text output, not just the one from the last pass.
    for (const txtPath of txtPaths) fs.rm(txtPath, { force: true }, () => {});
  }
}
102
+
103
/**
 * Reduce raw whisper text output to the lines worth keeping.
 *
 * Per line: trims, strips one leading `[...]` prefix (e.g. a timestamp), then
 * drops the line when it is empty once whitespace/punctuation/symbols are
 * removed, is wholly wrapped in brackets, is a standalone noise word, contains
 * a banned fragment (case-insensitive), or is flagged by
 * `isRepeatedNoiseTranscript`. Surviving lines are joined with single spaces.
 *
 * @param {string|null|undefined} raw - Raw transcript text; null/undefined is
 *   treated as empty input instead of throwing.
 * @returns {string} The cleaned transcript, or '' when nothing survives.
 */
function cleanTranscript(raw) {
  // Fragments matched against the compacted, lower-cased line.
  const bannedFragments = [
    '구독', '좋아요', '알림설정', '시청해주셔서', '시청해주신', '다음영상', '영상에서만나요',
    '부탁드려요', '큰힘이됩니다',
    'mbc뉴스', '이준범기자입니다', '뉴스입니다', '기자입니다', '앵커', '속보', '보도입니다', '전해드립니다',
  ];
  // Lines that consist solely of one of these words are dropped.
  const standaloneNoise = new Set(['끄덕', '끄덕끄덕', '박수', '웃음', '음악', '자막', '침묵', '무음']);
  // A whole line wrapped in ( ), [ ], full-width parentheses (U+FF08/U+FF09)
  // or lenticular brackets (U+3010/U+3011).
  const wrappedInBrackets = /^[(\[\uFF08\u3010].*[)\]\uFF09\u3011]$/;

  const text = typeof raw === 'string' ? raw : String(raw ?? '');
  const kept = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const trimmed = rawLine.trim();
    if (!trimmed) continue;
    const line = trimmed.replace(/^\[[^\]]+\]\s*/, '').trim();
    const squeezed = line.replace(/\s+/g, '');
    const compact = squeezed.replace(/[\p{P}\p{S}_]+/gu, '');
    if (!compact) continue;
    if (wrappedInBrackets.test(squeezed)) continue;
    if (standaloneNoise.has(compact)) continue;
    // Lower-case once per line rather than once per banned fragment.
    const lowered = compact.toLowerCase();
    if (bannedFragments.some(fragment => lowered.includes(fragment))) continue;
    if (isRepeatedNoiseTranscript(compact)) continue;
    kept.push(line);
  }
  return kept.join(' ').trim();
}
128
+
129
/**
 * Append one finished segment to the user's pending utterance in bridge state.
 *
 * The timer factory handed to `appendSegment` creates a timeout that flushes
 * the utterance after UTTERANCE_IDLE_MS; a flush failure is reported through
 * `warn` rather than thrown.
 *
 * @param {string} userId - User the segment belongs to.
 * @param {string} file - Path of the segment WAV.
 * @param {number} pcmBytes - PCM byte count of the segment.
 * @param {number} [startedAtMs=Date.now()] - Segment start time in ms.
 * @param {number} [endedAtMs=Date.now()] - Segment end time in ms.
 */
function queueSegment(userId, file, pcmBytes, startedAtMs = Date.now(), endedAtMs = Date.now()) {
  const reportFlushFailure = e => warn('flushUtterance failed', userId, e?.stack || e);
  const flushWhenIdle = () => {
    flushUtterance(userId).catch(reportFlushFailure);
  };
  const segment = {
    file,
    pcmBytes,
    startedAtMs,
    endedAtMs,
    timerFactory: () => setTimeout(flushWhenIdle, UTTERANCE_IDLE_MS),
  };
  const pending = bridge.bridgeState.appendSegment(userId, segment);
  log('queued segment', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes, 'idleMs', UTTERANCE_IDLE_MS, 'epoch', pending.epoch);
}
139
+
140
/**
 * Take the user's pending utterance out of bridge state and decide its fate.
 *
 * In order: drop it when its epoch is stale (segment files are removed), skip
 * it when below MIN_UTTERANCE_BYTES, merge and analyse the segments, let the
 * voice-clone sampler claim it, run barge-in validation while the bridge is
 * speaking or processing (interrupt / defer / ignore), skip it when both mean
 * and peak level are below their thresholds, and otherwise pass it to
 * `handleRecording`. Every early exit finishes the latency turn with a status.
 *
 * @param {string} userId - User whose pending utterance should be flushed.
 * @returns {Promise<void>}
 */
async function flushUtterance(userId) {
  const state = bridge.bridgeState;
  const pending = state.deletePending(userId);
  if (!pending) return;
  if (pending.timer) clearTimeout(pending.timer);

  const { files: segmentFiles, pcmBytes: totalBytes } = pending;
  const segmentCount = () => segmentFiles.length;
  // Evaluated on each use so a missing timestamp falls back to "now" at that moment.
  const firstPacketOrNow = () => pending.firstPacketAt || Date.now();

  const turn = newLatencyTurn(userId, firstPacketOrNow());
  turn.mark('voice_first_packet', firstPacketOrNow());
  turn.mark('voice_segment_end', pending.lastSegmentEndAt || Date.now());
  turn.mark('utterance_flush');
  turn.addMeta({ segments: segmentCount(), pcmBytes: totalBytes, epoch: pending.epoch });

  if (pending.epoch !== state.currentEpoch()) {
    log('drop stale utterance after voice input queue reset', userId, 'utteranceEpoch', pending.epoch, 'currentEpoch', state.currentEpoch());
    for (const stale of segmentFiles) {
      fs.rm(stale, { force: true }, () => {});
    }
    turn.finish({ status: 'stale_after_config_change' });
    return;
  }

  if (totalBytes < MIN_UTTERANCE_BYTES) {
    log('skip short utterance', userId, 'segments', segmentCount(), 'pcmBytes', totalBytes, 'minBytes', MIN_UTTERANCE_BYTES);
    turn.finish({ status: 'skip_short' });
    return;
  }

  const mergedPath = path.join(settings.debugDir, `utterance-merged-${stamp()}-${userId}.wav`);
  await concatWavs(segmentFiles, mergedPath);
  const levels = await analyzeAudio(mergedPath);
  const levelMeta = () => ({ meanDb: levels.meanDb, maxDb: levels.maxDb });
  log('utterance levels', userId, 'segments', segmentCount(), 'pcmBytes', totalBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);

  const claimedAsCloneSample = await saveCapturedVoiceCloneSample(userId, mergedPath, totalBytes, segmentCount());
  if (claimedAsCloneSample) {
    turn.addMeta(levelMeta());
    turn.finish({ status: 'voice_clone_sample_saved' });
    return;
  }

  const candidate = isBargeInCandidate(totalBytes, levels);
  if (bridge.speaking || bridge.processing) {
    const thresholds = currentBargeInThresholds();
    if (!candidate) {
      log('check weak barge-in for explicit stop transcript', userId, 'pcmBytes', totalBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholdBytes', thresholds.minBytes, 'thresholds', thresholds.minMeanDb, thresholds.minMaxDb, 'mode', thresholds.mode);
    }
    const validation = await validateProcessingBargeIn(userId, mergedPath, totalBytes, segmentCount());

    if (validation?.action === 'interrupt') {
      let status = 'barge_in_playback_interrupt';
      if (bridge.processing) status = 'barge_in_processing_interrupt';
      turn.finish({ status });
      return;
    }

    if (bridge.processing && validation?.action === 'defer') {
      const queued = enqueueDeferredProcessingUtterance({
        userId,
        wavPath: mergedPath,
        pcmBytes: totalBytes,
        segments: segmentCount(),
        startedAtMs: firstPacketOrNow(),
      });
      let status = 'drop_deferred_during_processing';
      if (queued) status = 'deferred_during_processing';
      turn.finish({ status });
      return;
    }

    let status = 'barge_in_processing_ignored';
    if (bridge.speaking) status = 'barge_in_playback_ignored';
    turn.finish({ status });
    return;
  }

  // Drop only when BOTH overall energy and peak are low. Real Discord speech from this
  // mic can have low mean volume while still carrying intelligible peaks; using OR here
  // caused valid Korean utterances to be discarded as "low-energy".
  const meanTooLow = levels.meanDb < MIN_MEAN_VOLUME_DB;
  if (meanTooLow && levels.maxDb < MIN_MAX_VOLUME_DB) {
    log('skip low-energy utterance', userId, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb, 'thresholds', MIN_MEAN_VOLUME_DB, MIN_MAX_VOLUME_DB, 'mode', 'both-below');
    turn.addMeta(levelMeta());
    turn.finish({ status: 'skip_low_energy' });
    return;
  }

  turn.addMeta(levelMeta());
  await handleRecording(userId, mergedPath, totalBytes, segmentCount(), turn);
}
208
+
209
/**
 * Start recording one user's voice into a segment WAV.
 *
 * Does nothing for disallowed users, for the bot's own id, or when a stream
 * for the user is already active. Otherwise it clears any pending idle timer,
 * subscribes to the user's opus stream, pipes it through a decoder into a WAV
 * writer, optionally feeds a live barge-in monitor, and queues the segment
 * once the writer finishes.
 *
 * @param {object} receiver - Voice receiver exposing `subscribe(userId, options)`.
 * @param {string} userId - User to subscribe to.
 */
function subscribeUser(receiver, userId) {
  if (!isAllowed(userId) || String(userId) === client.user?.id) return;

  const playbackActive = bridge.speaking;
  const agentBusy = bridge.processing;
  if (bridge.activeStreams.has(userId)) return;
  if (playbackActive || agentBusy) {
    // Speaking-start alone is too noisy in Discord voice. Record and validate a
    // real segment first; only confirmed playback barge-in stops the current
    // audio chunk, and only explicit stop transcripts abort active agent work.
    log('possible barge-in start; waiting for segment validation', userId, 'speaking', playbackActive, 'processing', agentBusy);
  }

  const pending = bridge.bridgeState.getPending(userId);
  if (pending?.timer) {
    bridge.bridgeState.clearPendingTimer(userId);
    log('extend pending utterance because new segment started', userId, 'segments', pending.files.length, 'totalPcmBytes', pending.pcmBytes);
  }

  const file = path.join(settings.debugDir, `segment-${stamp()}-${userId}.wav`);
  log('subscribe user', userId, file);

  const subscription = { end: { behavior: EndBehaviorType.AfterSilence, duration: SUBSCRIBE_AFTER_SILENCE_MS } };
  const opusStream = receiver.subscribe(userId, subscription);
  const decoder = new prism.opus.Decoder({ rate: 48000, channels: 2, frameSize: 960 });
  const writer = new wav.FileWriter(file, { sampleRate: 48000, channels: 2, bitDepth: 16 });
  bridge.activeStreams.set(userId, { opusStream, decoder, writer, file, startedAtMs: Date.now() });

  let pcmBytes = 0;

  // Playback-only barge-in uses its own thresholds; every other case uses the general ones.
  let liveThresholds;
  if (playbackActive && !agentBusy) {
    liveThresholds = currentPlaybackBargeInThresholds();
  } else {
    liveThresholds = currentBargeInThresholds();
  }

  let liveBargeIn = null;
  if (shouldUseLivePlaybackBargeIn({ speaking: playbackActive, processing: agentBusy })) {
    liveBargeIn = createLiveBargeInMonitor({
      minBytes: liveThresholds.minBytes,
      minMeanDb: liveThresholds.minMeanDb,
      minMaxDb: liveThresholds.minMaxDb,
      requireBoth: liveThresholds.requireBoth,
      log,
      onConfirm: (confirmation) => {
        const { pcmBytes: confirmedBytes, levels } = confirmation;
        log('confirmed live playback barge-in before segment end', userId, 'pcmBytes', confirmedBytes, 'meanDb', levels.meanDb, 'maxDb', levels.maxDb);
        stopPlaybackForBargeIn(userId, 'confirmed-live-playback-barge-in');
      },
    });
  }

  const warnWith = (label) => (e) => warn(label, userId, e?.stack || e);

  decoder.on('data', (chunk) => {
    pcmBytes += chunk.length;
    liveBargeIn?.push(chunk);
  });
  opusStream.on('error', warnWith('opus stream error'));
  decoder.on('error', warnWith('opus decoder error'));
  writer.on('error', warnWith('wav writer error'));
  opusStream.on('end', () => {
    log('opus end', userId, 'pcmBytes', pcmBytes);
  });
  writer.on('finish', () => {
    // Release the active-stream slot before queueing the finished segment.
    const streamState = bridge.activeStreams.get(userId);
    bridge.activeStreams.delete(userId);
    const endedAtMs = Date.now();
    log('saved segment', userId, 'pcmBytes', pcmBytes, file);
    queueSegment(userId, file, pcmBytes, streamState?.startedAtMs || endedAtMs, endedAtMs);
  });

  opusStream.pipe(decoder).pipe(writer);
}
263
+
264
+ return {
265
+ transcribeOnce,
266
+ transcribe,
267
+ cleanTranscript,
268
+ queueSegment,
269
+ flushUtterance,
270
+ subscribeUser,
271
+ };
272
+ }
@@ -0,0 +1,102 @@
1
+ import test from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import { createVoiceIO } from './voice_io.mjs';
4
+ import { createBridge } from './bridge_context.mjs';
5
+
6
/**
 * Build a complete dependency bag for `createVoiceIO` out of inert stubs.
 *
 * @param {object} [overrides={}] - Entries that replace the defaults; applied last.
 * @returns {object} Dependencies object accepted by `createVoiceIO`.
 */
function makeDeps(overrides = {}) {
  const noop = () => {};
  const asyncNoop = async () => {};

  const bridge = createBridge();
  // Seed bridgeState with the minimum shape the voice_io helpers use.
  bridge.bridgeState = {
    appendSegment: () => ({ files: ['a'], pcmBytes: 1024, epoch: 1 }),
    deletePending: () => null,
    getPending: () => null,
    clearPendingTimer: noop,
    currentEpoch: () => 1,
  };

  const settings = {
    whisperBin: 'whisper-cli',
    whisperModel: '/dev/null',
    whisperLanguage: 'ko',
    whisperTimeoutMs: 1000,
    debugDir: '',
    allowedUsers: new Set(),
  };

  const thresholdConstants = {
    UTTERANCE_IDLE_MS: 4500,
    SUBSCRIBE_AFTER_SILENCE_MS: 2200,
    MIN_UTTERANCE_BYTES: 1024,
    MIN_MEAN_VOLUME_DB: -35,
    MIN_MAX_VOLUME_DB: -12,
  };

  const bargeInStubs = {
    currentBargeInThresholds: () => ({ minBytes: 0, minMeanDb: -40, minMaxDb: -20, mode: 'normal' }),
    currentPlaybackBargeInThresholds: () => ({ minBytes: 0, minMeanDb: -40, minMaxDb: -20, requireBoth: true }),
    createLiveBargeInMonitor: () => ({ push: noop }),
    shouldUseLivePlaybackBargeIn: () => false,
    stopPlaybackForBargeIn: () => false,
    isBargeInCandidate: () => false,
    validateProcessingBargeIn: async () => ({ action: 'ignore', text: '' }),
    enqueueDeferredProcessingUtterance: () => true,
  };

  const defaults = {
    bridge,
    settings,
    client: { user: { id: 'bot-id' } },
    execFileAsync: async () => ({ stdout: '', stderr: '' }),
    log: noop,
    warn: noop,
    stamp: () => 'stamp',
    sleep: asyncNoop,
    isAllowed: () => true,
    ...thresholdConstants,
    ...bargeInStubs,
    analyzeAudio: async () => ({ meanDb: -20, maxDb: -10 }),
    concatWavs: asyncNoop,
    saveCapturedVoiceCloneSample: async () => false,
    newLatencyTurn: () => ({ mark: noop, addMeta: noop, finish: noop }),
    handleRecording: asyncNoop,
  };

  return { ...defaults, ...overrides };
}
54
+
55
// The factory must return every pipeline function as a callable.
test('createVoiceIO exposes the expected functions', () => {
  const expectedNames = ['transcribeOnce', 'transcribe', 'cleanTranscript', 'queueSegment', 'flushUtterance', 'subscribeUser'];
  const voiceIO = createVoiceIO(makeDeps());
  expectedNames.forEach((name) => {
    assert.equal(typeof voiceIO[name], 'function', `${name} is exposed as a function`);
  });
});
61
+
62
// Mixed input: real speech must survive, junk lines and noise tokens must not.
test('cleanTranscript strips junk lines and noise tokens', () => {
  const { cleanTranscript } = createVoiceIO(makeDeps());
  // Real-world whisper output: timestamp prefix, brackets, junk tokens
  const whisperLines = [
    '[00:00:00.000 --> 00:00:01.000] 안녕하세요',
    '구독 좋아요 부탁드려요',
    '(박수)',
    '오늘은 회의 안건을 정리해 봅시다',
    '시청해주셔서 감사합니다',
  ];
  const cleaned = cleanTranscript(whisperLines.join('\n'));

  const mustKeep = [/안녕하세요/, /오늘은 회의 안건을 정리해 봅시다/];
  const mustDrop = [/구독/, /시청해주셔서/, /박수/];
  for (const pattern of mustKeep) assert.match(cleaned, pattern);
  for (const pattern of mustDrop) assert.doesNotMatch(cleaned, pattern);
});
79
+
80
// Inputs with nothing but noise markers (or nothing at all) clean to ''.
test('cleanTranscript returns empty string for noise-only input', () => {
  const voiceIO = createVoiceIO(makeDeps());
  const noiseOnlyInputs = ['(웃음)\n(박수)\n끄덕끄덕\n', ''];
  for (const input of noiseOnlyInputs) {
    assert.equal(voiceIO.cleanTranscript(input), '');
  }
});
85
+
86
// Fake receiver that records whether subscribe() was ever called.
function makeSpyReceiver() {
  const spy = { subscribed: false };
  spy.receiver = {
    subscribe: () => {
      spy.subscribed = true;
      return { on: () => {}, pipe: () => ({ pipe: () => ({}) }) };
    },
  };
  return spy;
}

test('subscribeUser ignores the bot itself', () => {
  const spy = makeSpyReceiver();
  const voiceIO = createVoiceIO(makeDeps({ isAllowed: () => true }));
  voiceIO.subscribeUser(spy.receiver, 'bot-id');
  assert.equal(spy.subscribed, false, 'never subscribes to its own user id');
});

test('subscribeUser ignores disallowed users', () => {
  const spy = makeSpyReceiver();
  const voiceIO = createVoiceIO(makeDeps({ isAllowed: () => false }));
  voiceIO.subscribeUser(spy.receiver, 'someone-else');
  assert.equal(spy.subscribed, false);
});