verbalcoding 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. package/.env.example +74 -4
  2. package/README.es.md +3 -1
  3. package/README.fr.md +3 -1
  4. package/README.ja.md +3 -1
  5. package/README.ko.md +4 -2
  6. package/README.md +4 -2
  7. package/README.ru.md +3 -1
  8. package/README.zh.md +3 -1
  9. package/app-node/agent_adapters.test.mjs +14 -0
  10. package/app-node/agent_routing.mjs +148 -0
  11. package/app-node/agent_routing.test.mjs +138 -0
  12. package/app-node/agent_turn.mjs +86 -0
  13. package/app-node/agent_turn.test.mjs +109 -0
  14. package/app-node/bridge_context.mjs +73 -0
  15. package/app-node/bridge_context.test.mjs +54 -0
  16. package/app-node/bridge_state.mjs +4 -0
  17. package/app-node/bridge_wireup.test.mjs +462 -0
  18. package/app-node/cli_install.test.mjs +31 -0
  19. package/app-node/cross_agent_routing.test.mjs +78 -0
  20. package/app-node/discord_command_router.mjs +204 -0
  21. package/app-node/discord_command_router.test.mjs +311 -0
  22. package/app-node/discord_voice_setup.mjs +251 -0
  23. package/app-node/discord_voice_setup.test.mjs +86 -0
  24. package/app-node/hermes_profiles.test.mjs +12 -1
  25. package/app-node/install_config.mjs +110 -3
  26. package/app-node/install_config.test.mjs +8 -0
  27. package/app-node/instance_doctor.test.mjs +9 -0
  28. package/app-node/instances.test.mjs +8 -1
  29. package/app-node/main.mjs +488 -1368
  30. package/app-node/mcp_tools.test.mjs +7 -0
  31. package/app-node/notification_handler.mjs +89 -0
  32. package/app-node/notification_handler.test.mjs +187 -0
  33. package/app-node/plan_dispatcher.mjs +215 -0
  34. package/app-node/plan_dispatcher.test.mjs +101 -0
  35. package/app-node/plan_mode.mjs +36 -7
  36. package/app-node/plan_mode.test.mjs +78 -0
  37. package/app-node/progress_handler.mjs +220 -0
  38. package/app-node/progress_handler.test.mjs +193 -0
  39. package/app-node/progress_speech.mjs +54 -32
  40. package/app-node/progress_speech.test.mjs +12 -3
  41. package/app-node/project_sessions.mjs +5 -2
  42. package/app-node/project_sessions.test.mjs +7 -0
  43. package/app-node/research_mode.mjs +282 -0
  44. package/app-node/research_mode.test.mjs +264 -0
  45. package/app-node/restart_notice.mjs +3 -0
  46. package/app-node/restart_notice.test.mjs +11 -0
  47. package/app-node/session_ontology.mjs +271 -0
  48. package/app-node/session_ontology.test.mjs +130 -0
  49. package/app-node/smart_progress.mjs +1 -1
  50. package/app-node/stream_sentencer.mjs +32 -2
  51. package/app-node/stream_sentencer.test.mjs +65 -0
  52. package/app-node/streaming_tts_queue.mjs +5 -1
  53. package/app-node/streaming_tts_queue.test.mjs +7 -1
  54. package/app-node/stt_whisper.mjs +24 -0
  55. package/app-node/stt_whisper.test.mjs +32 -0
  56. package/app-node/text_routing.mjs +4 -2
  57. package/app-node/tts_backends.mjs +537 -3
  58. package/app-node/tts_backends.test.mjs +454 -0
  59. package/app-node/tts_player.mjs +164 -0
  60. package/app-node/tts_player.test.mjs +202 -0
  61. package/app-node/tts_runtime.mjs +134 -0
  62. package/app-node/tts_runtime.test.mjs +89 -0
  63. package/app-node/tts_settings.mjs +150 -3
  64. package/app-node/tts_settings.test.mjs +204 -0
  65. package/app-node/tts_voice_config.mjs +136 -2
  66. package/app-node/tts_voice_config.test.mjs +94 -0
  67. package/app-node/utterance_router.mjs +216 -0
  68. package/app-node/utterance_router.test.mjs +236 -0
  69. package/app-node/voice_autojoin.mjs +37 -0
  70. package/app-node/voice_autojoin.test.mjs +59 -0
  71. package/app-node/voice_io.mjs +272 -0
  72. package/app-node/voice_io.test.mjs +102 -0
  73. package/app-node/voice_turn_runner.mjs +449 -0
  74. package/app-node/voice_turn_runner.test.mjs +289 -0
  75. package/docs/CONFIGURATION.md +12 -2
  76. package/docs/HARNESSES.md +58 -0
  77. package/docs/HARNESS_AIDER.md +50 -0
  78. package/docs/HARNESS_CLAUDE.md +56 -0
  79. package/docs/HARNESS_CODEX.md +56 -0
  80. package/docs/HARNESS_CURSOR.md +45 -0
  81. package/docs/HARNESS_GEMINI.md +45 -0
  82. package/docs/HARNESS_HERMES.md +57 -0
  83. package/docs/HARNESS_OPENCLAW.md +44 -0
  84. package/docs/HARNESS_OPENCODE.md +44 -0
  85. package/docs/README.md +1 -0
  86. package/docs/ROADMAP.md +20 -5
  87. package/docs/TTS_BACKENDS.md +227 -0
  88. package/docs/USAGE.md +22 -0
  89. package/docs/i18n/AGENTS.es.md +34 -0
  90. package/docs/i18n/AGENTS.fr.md +34 -0
  91. package/docs/i18n/AGENTS.ja.md +34 -0
  92. package/docs/i18n/AGENTS.ko.md +34 -0
  93. package/docs/i18n/AGENTS.ru.md +34 -0
  94. package/docs/i18n/AGENTS.zh.md +34 -0
  95. package/docs/i18n/HARNESSES.es.md +58 -0
  96. package/docs/i18n/HARNESSES.fr.md +58 -0
  97. package/docs/i18n/HARNESSES.ja.md +58 -0
  98. package/docs/i18n/HARNESSES.ko.md +58 -0
  99. package/docs/i18n/HARNESSES.ru.md +58 -0
  100. package/docs/i18n/HARNESSES.zh.md +58 -0
  101. package/docs/i18n/HARNESS_AIDER.es.md +48 -0
  102. package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
  103. package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
  104. package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
  105. package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
  106. package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
  107. package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
  108. package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
  109. package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
  110. package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
  111. package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
  112. package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
  113. package/docs/i18n/HARNESS_CODEX.es.md +55 -0
  114. package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
  115. package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
  116. package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
  117. package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
  118. package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
  119. package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
  120. package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
  121. package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
  122. package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
  123. package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
  124. package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
  125. package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
  126. package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
  127. package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
  128. package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
  129. package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
  130. package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
  131. package/docs/i18n/HARNESS_HERMES.es.md +54 -0
  132. package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
  133. package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
  134. package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
  135. package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
  136. package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
  137. package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
  138. package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
  139. package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
  140. package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
  141. package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
  142. package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
  143. package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
  144. package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
  145. package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
  146. package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
  147. package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
  148. package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
  149. package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
  150. package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
  151. package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
  152. package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
  153. package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
  154. package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
  155. package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
  156. package/integrations/fireredtts2/mlx_llm.py +183 -0
  157. package/integrations/fireredtts2/synth.py +156 -0
  158. package/integrations/fireredtts2/synth_mlx.py +196 -0
  159. package/integrations/mlxaudio/synth.py +74 -0
  160. package/integrations/neuttsair/synth.py +104 -0
  161. package/integrations/omnivoice/synth.py +110 -0
  162. package/package.json +6 -1
  163. package/scripts/cli.mjs +84 -0
  164. package/scripts/doctor.mjs +104 -4
  165. package/scripts/install.mjs +5 -1
  166. package/scripts/install_fireredtts2.sh +109 -0
  167. package/scripts/install_mlxaudio.sh +34 -0
  168. package/scripts/install_mossttsnano.sh +46 -0
  169. package/scripts/postinstall.mjs +34 -0
@@ -47,6 +47,93 @@ function baseSettings() {
47
47
  intraOpThreads: '',
48
48
  interOpThreads: '',
49
49
  },
50
+ omnivoice: {
51
+ python: '/project/.venv-omnivoice/bin/python',
52
+ model: 'k2-fsa/OmniVoice',
53
+ device: 'mps',
54
+ dtype: 'float16',
55
+ refAudio: '/project/voice-samples/me.wav',
56
+ refText: 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.',
57
+ language: 'ko',
58
+ speaker: 'warm korean male voice',
59
+ timeoutMs: 180000,
60
+ useForProgress: false,
61
+ },
62
+ qwen3tts: {
63
+ command: 'audio',
64
+ mode: 'custom',
65
+ model: '',
66
+ language: 'korean',
67
+ speaker: 'sohee',
68
+ instruct: 'calm conversational Korean',
69
+ refAudio: '/project/voice-samples/me.wav',
70
+ refText: 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.',
71
+ stream: true,
72
+ timeoutMs: 120000,
73
+ useForProgress: false,
74
+ },
75
+ fireredtts2: {
76
+ command: 'fireredtts2',
77
+ pretrainedDir: '/project/models/FireRedTTS2',
78
+ device: 'mps',
79
+ genType: 'monologue',
80
+ speaker: 'S1',
81
+ promptAudio: '/project/voice-samples/me.wav',
82
+ promptText: 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.',
83
+ useBf16: true,
84
+ timeoutMs: 180000,
85
+ useForProgress: false,
86
+ },
87
+ mossttsnano: {
88
+ command: 'python3',
89
+ script: '/project/vendor/MOSS-TTS-Nano/infer.py',
90
+ checkpoint: 'OpenMOSS-Team/MOSS-TTS-Nano',
91
+ audioTokenizer: 'OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano',
92
+ mode: 'voice_clone',
93
+ language: 'ko',
94
+ device: 'cpu',
95
+ dtype: 'float32',
96
+ promptAudio: '/project/voice-samples/me.wav',
97
+ promptText: 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.',
98
+ maxNewFrames: 256,
99
+ seed: '7',
100
+ timeoutMs: 120000,
101
+ useForProgress: false,
102
+ },
103
+ mossttsnano_mlx: {
104
+ python: 'python3',
105
+ script: '/project/integrations/mossttsnano_mlx/synth.py',
106
+ workerScript: '/project/integrations/mossttsnano_mlx/worker.py',
107
+ workerEnabled: false,
108
+ workerStartupTimeoutMs: 120000,
109
+ torchInferScript: '/project/vendor/MOSS-TTS-Nano/infer.py',
110
+ checkpoint: 'OpenMOSS-Team/MOSS-TTS-Nano',
111
+ audioTokenizer: 'OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano',
112
+ mode: 'voice_clone',
113
+ language: 'ko',
114
+ torchDevice: 'cpu',
115
+ torchDtype: 'float32',
116
+ promptAudio: '/project/voice-samples/me.wav',
117
+ promptText: 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.',
118
+ maxNewFrames: 120,
119
+ seed: '7',
120
+ timeoutMs: 180000,
121
+ useForProgress: false,
122
+ },
123
+ neuttsair: {
124
+ python: '/project/.venv-neuttsair/bin/python',
125
+ script: '/project/integrations/neuttsair/synth.py',
126
+ backboneRepo: 'neuphonic/neutts-air-q4-gguf',
127
+ backboneDevice: 'mps',
128
+ codecRepo: 'neuphonic/neucodec',
129
+ codecDevice: 'mps',
130
+ refAudio: '/project/voice-samples/me.wav',
131
+ refText: 'Reference voice text.',
132
+ language: 'en',
133
+ sampleRate: 24000,
134
+ timeoutMs: 120000,
135
+ useForProgress: false,
136
+ },
50
137
  };
51
138
  }
52
139
 
@@ -146,6 +233,29 @@ test('OpenVoice progress uses Edge fallback unless explicitly enabled', async ()
146
233
  assert.equal(calls[0].cmd, 'edge-tts');
147
234
  });
148
235
 
236
+ test('createTtsBackend forwards backend label to onFallback when non-edge backend errors', async () => {
237
+ const settings = { ...baseSettings(), backend: 'openvoice' };
238
+ const events = [];
239
+ const backend = createTtsBackend(settings, {
240
+ tmpdir: '/tmp',
241
+ existsSync: () => true,
242
+ statSync: () => ({ size: 123 }),
243
+ warn: () => {},
244
+ onFallback: payload => events.push(payload),
245
+ execFileAsync: async cmd => {
246
+ if (cmd.includes('.venv-openvoice')) throw new Error('openvoice missing');
247
+ },
248
+ });
249
+
250
+ await backend.synthesize('first', { kind: 'final' });
251
+ await backend.synthesize('second', { kind: 'final' });
252
+
253
+ assert.equal(events.length, 2);
254
+ assert.equal(events[0].backend, 'openvoice');
255
+ assert.equal(events[0].kind, 'final');
256
+ assert.ok(events[0].error instanceof Error);
257
+ });
258
+
149
259
  test('OpenVoice final synthesis falls back to Edge when wrapper fails', async () => {
150
260
  const calls = [];
151
261
  const settings = { ...baseSettings(), backend: 'openvoice' };
@@ -384,6 +494,350 @@ test('Supertonic falls back to Edge when local CLI fails', async () => {
384
494
  assert.ok(calls.some(call => /supertonic failed; falling back to edge/i.test(call.warn || '')));
385
495
  });
386
496
 
497
+ test('OmniVoice backend calls Python wrapper with model, reference sample, and output path', async () => {
498
+ const calls = [];
499
+ const settings = { ...baseSettings(), backend: 'omnivoice' };
500
+ const backend = createTtsBackend(settings, {
501
+ tmpdir: '/tmp',
502
+ existsSync: () => true,
503
+ statSync: () => ({ size: 999 }),
504
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
505
+ });
506
+
507
+ const out = await backend.synthesize('μ˜΄λ‹ˆλ³΄μ΄μŠ€ ν…ŒμŠ€νŠΈ', { kind: 'final' });
508
+
509
+ assert.equal(calls[0].cmd, '/project/.venv-omnivoice/bin/python');
510
+ assert.ok(calls[0].args.some(arg => String(arg).endsWith('integrations/omnivoice/synth.py')));
511
+ assert.ok(calls[0].args.includes('--model'));
512
+ assert.ok(calls[0].args.includes('k2-fsa/OmniVoice'));
513
+ assert.ok(calls[0].args.includes('--ref-audio'));
514
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
515
+ assert.ok(calls[0].args.includes('--ref-text'));
516
+ assert.ok(calls[0].args.includes('ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.'));
517
+ assert.ok(calls[0].args.includes('--speaker'));
518
+ assert.ok(calls[0].args.includes('warm korean male voice'));
519
+ assert.ok(calls[0].args.includes('--text'));
520
+ assert.ok(calls[0].args.includes('μ˜΄λ‹ˆλ³΄μ΄μŠ€ ν…ŒμŠ€νŠΈ'));
521
+ assert.equal(calls[0].options.timeout, 180000);
522
+ assert.match(out, /^\/tmp\/verbalcoding-omnivoice-/);
523
+ assert.deepEqual(backend.cacheKeyParts(), ['omnivoice', 'k2-fsa/OmniVoice', 'mps', 'float16', '/project/voice-samples/me.wav', 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.', 'ko', 'warm korean male voice']);
524
+ });
525
+
526
+ test('OmniVoice progress uses Edge fallback unless explicitly enabled', async () => {
527
+ const calls = [];
528
+ const settings = { ...baseSettings(), backend: 'omnivoice' };
529
+ const backend = createTtsBackend(settings, {
530
+ tmpdir: '/tmp',
531
+ existsSync: () => true,
532
+ statSync: () => ({ size: 123 }),
533
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
534
+ });
535
+
536
+ await backend.synthesize('μ§„ν–‰ μ•ˆλ‚΄', { kind: 'progress' });
537
+
538
+ assert.equal(calls[0].cmd, 'edge-tts');
539
+ });
540
+
541
+ test('OmniVoice falls back to Edge when Python wrapper fails', async () => {
542
+ const calls = [];
543
+ const settings = { ...baseSettings(), backend: 'omnivoice' };
544
+ const backend = createTtsBackend(settings, {
545
+ tmpdir: '/tmp',
546
+ existsSync: () => true,
547
+ statSync: () => ({ size: 123 }),
548
+ warn: (...args) => calls.push({ warn: args.join(' ') }),
549
+ execFileAsync: async (cmd, args) => {
550
+ calls.push({ cmd, args });
551
+ if (cmd.includes('.venv-omnivoice')) throw new Error('omnivoice missing');
552
+ },
553
+ });
554
+
555
+ await backend.synthesize('fallback', { kind: 'final' });
556
+
557
+ assert.ok(calls.some(call => call.cmd?.includes('.venv-omnivoice')));
558
+ assert.ok(calls.some(call => call.cmd === 'edge-tts'));
559
+ assert.ok(calls.some(call => /omnivoice failed; falling back to edge/i.test(call.warn || '')));
560
+ });
561
+
562
+ test('Qwen3 TTS backend calls audio CLI with qwen3 engine, speaker, language, and output path', async () => {
563
+ const calls = [];
564
+ const settings = { ...baseSettings(), backend: 'qwen3tts' };
565
+ const backend = createTtsBackend(settings, {
566
+ tmpdir: '/tmp',
567
+ existsSync: () => true,
568
+ statSync: () => ({ size: 999 }),
569
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
570
+ });
571
+
572
+ const out = await backend.synthesize('큐웬 ν‹°ν‹°μ—μŠ€ ν…ŒμŠ€νŠΈ', { kind: 'final' });
573
+
574
+ assert.equal(calls[0].cmd, 'audio');
575
+ assert.deepEqual(calls[0].args.slice(0, 5), ['speak', '큐웬 ν‹°ν‹°μ—μŠ€ ν…ŒμŠ€νŠΈ', '--engine', 'qwen3', '--output']);
576
+ assert.ok(calls[0].args.includes('--language'));
577
+ assert.ok(calls[0].args.includes('korean'));
578
+ assert.ok(calls[0].args.includes('--stream'));
579
+ assert.ok(calls[0].args.includes('--model'));
580
+ assert.ok(calls[0].args.includes('customVoice'));
581
+ assert.ok(calls[0].args.includes('--speaker'));
582
+ assert.ok(calls[0].args.includes('sohee'));
583
+ assert.ok(calls[0].args.includes('--instruct'));
584
+ assert.ok(calls[0].args.includes('calm conversational Korean'));
585
+ assert.equal(calls[0].options.timeout, 120000);
586
+ assert.match(out, /^\/tmp\/verbalcoding-qwen3tts-/);
587
+ assert.deepEqual(backend.cacheKeyParts(), ['qwen3tts', 'audio', 'custom', 'korean', 'sohee', 'calm conversational Korean', '/project/voice-samples/me.wav', 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.']);
588
+ });
589
+
590
+ test('Qwen3 TTS clone mode passes reference audio', async () => {
591
+ const calls = [];
592
+ const settings = { ...baseSettings(), backend: 'qwen3tts', qwen3tts: { ...baseSettings().qwen3tts, mode: 'clone' } };
593
+ const backend = createTtsBackend(settings, {
594
+ tmpdir: '/tmp',
595
+ existsSync: () => true,
596
+ statSync: () => ({ size: 999 }),
597
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
598
+ });
599
+
600
+ await backend.synthesize('볡제 μŒμ„± ν…ŒμŠ€νŠΈ', { kind: 'final' });
601
+
602
+ assert.ok(calls[0].args.includes('--model'));
603
+ assert.ok(calls[0].args.includes('base'));
604
+ assert.ok(calls[0].args.includes('--voice-sample'));
605
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
606
+ assert.equal(calls[0].args.includes('--speaker'), false);
607
+ });
608
+
609
+ test('Qwen3 TTS progress uses Edge fallback unless explicitly enabled', async () => {
610
+ const calls = [];
611
+ const settings = { ...baseSettings(), backend: 'qwen3tts' };
612
+ const backend = createTtsBackend(settings, {
613
+ tmpdir: '/tmp',
614
+ existsSync: () => true,
615
+ statSync: () => ({ size: 123 }),
616
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
617
+ });
618
+
619
+ await backend.synthesize('μ§„ν–‰ μ•ˆλ‚΄', { kind: 'progress' });
620
+
621
+ assert.equal(calls[0].cmd, 'edge-tts');
622
+ });
623
+
624
+ test('Qwen3 TTS falls back to Edge when local CLI fails', async () => {
625
+ const calls = [];
626
+ const settings = { ...baseSettings(), backend: 'qwen3tts' };
627
+ const backend = createTtsBackend(settings, {
628
+ tmpdir: '/tmp',
629
+ existsSync: () => true,
630
+ statSync: () => ({ size: 123 }),
631
+ warn: (...args) => calls.push({ warn: args.join(' ') }),
632
+ execFileAsync: async (cmd, args) => {
633
+ calls.push({ cmd, args });
634
+ if (cmd === 'audio') throw new Error('qwen3 tts missing');
635
+ },
636
+ });
637
+
638
+ await backend.synthesize('fallback', { kind: 'final' });
639
+
640
+ assert.ok(calls.some(call => call.cmd === 'audio'));
641
+ assert.ok(calls.some(call => call.cmd === 'edge-tts'));
642
+ assert.ok(calls.some(call => /qwen3tts failed; falling back to edge/i.test(call.warn || '')));
643
+ });
644
+
645
+ test('FireRedTTS-2 backend calls configured CLI with model, prompt, and output path', async () => {
646
+ const calls = [];
647
+ const settings = { ...baseSettings(), backend: 'fireredtts2' };
648
+ const backend = createTtsBackend(settings, {
649
+ tmpdir: '/tmp',
650
+ existsSync: () => true,
651
+ statSync: () => ({ size: 999 }),
652
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
653
+ });
654
+
655
+ const out = await backend.synthesize('νŒŒμ΄μ–΄λ ˆλ“œ ν…ŒμŠ€νŠΈ', { kind: 'final' });
656
+
657
+ assert.equal(calls[0].cmd, 'fireredtts2');
658
+ assert.deepEqual(calls[0].args.slice(0, 4), ['--text', 'νŒŒμ΄μ–΄λ ˆλ“œ ν…ŒμŠ€νŠΈ', '--output', calls[0].args[3]]);
659
+ assert.ok(calls[0].args.includes('--pretrained-dir'));
660
+ assert.ok(calls[0].args.includes('/project/models/FireRedTTS2'));
661
+ assert.ok(calls[0].args.includes('--prompt-audio'));
662
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
663
+ assert.ok(calls[0].args.includes('--bf16'));
664
+ assert.equal(calls[0].options.timeout, 180000);
665
+ assert.match(out, /^\/tmp\/verbalcoding-fireredtts2-/);
666
+ assert.deepEqual(backend.cacheKeyParts(), ['fireredtts2', 'fireredtts2', '/project/models/FireRedTTS2', 'mps', 'monologue', 'S1', '/project/voice-samples/me.wav', 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.', true]);
667
+ });
668
+
669
+ test('FireRedTTS-2 progress uses Edge fallback unless explicitly enabled', async () => {
670
+ const calls = [];
671
+ const settings = { ...baseSettings(), backend: 'fireredtts2' };
672
+ const backend = createTtsBackend(settings, {
673
+ tmpdir: '/tmp',
674
+ existsSync: () => true,
675
+ statSync: () => ({ size: 123 }),
676
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
677
+ });
678
+
679
+ await backend.synthesize('μ§„ν–‰ μ•ˆλ‚΄', { kind: 'progress' });
680
+
681
+ assert.equal(calls[0].cmd, 'edge-tts');
682
+ });
683
+
684
+ test('MOSS-TTS-Nano backend calls infer.py with checkpoint, prompt, and output path', async () => {
685
+ const calls = [];
686
+ const settings = { ...baseSettings(), backend: 'mossttsnano' };
687
+ const backend = createTtsBackend(settings, {
688
+ tmpdir: '/tmp',
689
+ existsSync: () => true,
690
+ statSync: () => ({ size: 999 }),
691
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
692
+ });
693
+
694
+ const out = await backend.synthesize('λͺ¨μŠ€ λ‚˜λ…Έ ν…ŒμŠ€νŠΈ', { kind: 'final' });
695
+
696
+ assert.equal(calls[0].cmd, 'python3');
697
+ assert.deepEqual(calls[0].args.slice(0, 5), ['/project/vendor/MOSS-TTS-Nano/infer.py', '--text', 'λͺ¨μŠ€ λ‚˜λ…Έ ν…ŒμŠ€νŠΈ', '--output-audio-path', calls[0].args[4]]);
698
+ assert.ok(calls[0].args.includes('--checkpoint'));
699
+ assert.ok(calls[0].args.includes('OpenMOSS-Team/MOSS-TTS-Nano'));
700
+ assert.ok(calls[0].args.includes('--audio-tokenizer-pretrained-name-or-path'));
701
+ assert.ok(calls[0].args.includes('--prompt-audio-path'));
702
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
703
+ assert.ok(calls[0].args.includes('--max-new-frames'));
704
+ assert.ok(calls[0].args.includes('256'));
705
+ assert.equal(calls[0].options.timeout, 120000);
706
+ assert.match(out, /^\/tmp\/verbalcoding-mossttsnano-/);
707
+ assert.deepEqual(backend.cacheKeyParts(), ['mossttsnano', 'python3', '/project/vendor/MOSS-TTS-Nano/infer.py', 'OpenMOSS-Team/MOSS-TTS-Nano', 'OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano', 'voice_clone', 'ko', 'cpu', 'float32', '/project/voice-samples/me.wav', 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.', 256, '7']);
708
+ });
709
+
710
+ test('MOSS-TTS-Nano falls back to Edge when local CLI fails', async () => {
711
+ const calls = [];
712
+ const settings = { ...baseSettings(), backend: 'mossttsnano' };
713
+ const backend = createTtsBackend(settings, {
714
+ tmpdir: '/tmp',
715
+ existsSync: () => true,
716
+ statSync: () => ({ size: 123 }),
717
+ warn: (...args) => calls.push({ warn: args.join(' ') }),
718
+ execFileAsync: async (cmd, args) => {
719
+ calls.push({ cmd, args });
720
+ if (cmd === 'python3') throw new Error('moss missing');
721
+ },
722
+ });
723
+
724
+ await backend.synthesize('fallback', { kind: 'final' });
725
+
726
+ assert.ok(calls.some(call => call.cmd === 'python3'));
727
+ assert.ok(calls.some(call => call.cmd === 'edge-tts'));
728
+ assert.ok(calls.some(call => /mossttsnano failed; falling back to edge/i.test(call.warn || '')));
729
+ });
730
+
731
+ test('MOSS-TTS-Nano MLX hybrid backend calls experimental synth wrapper', async () => {
732
+ const calls = [];
733
+ const settings = { ...baseSettings(), backend: 'mossttsnano_mlx' };
734
+ const backend = createTtsBackend(settings, {
735
+ tmpdir: '/tmp',
736
+ existsSync: () => true,
737
+ statSync: () => ({ size: 999 }),
738
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
739
+ });
740
+
741
+ const out = await backend.synthesize('λͺ¨μŠ€ μ— μ—˜μ—‘μŠ€ ν…ŒμŠ€νŠΈ', { kind: 'final' });
742
+
743
+ assert.equal(calls[0].cmd, 'python3');
744
+ assert.deepEqual(calls[0].args.slice(0, 5), ['/project/integrations/mossttsnano_mlx/synth.py', '--text', 'λͺ¨μŠ€ μ— μ—˜μ—‘μŠ€ ν…ŒμŠ€νŠΈ', '--output-audio-path', calls[0].args[4]]);
745
+ assert.ok(calls[0].args.includes('--torch-infer-script'));
746
+ assert.ok(calls[0].args.includes('/project/vendor/MOSS-TTS-Nano/infer.py'));
747
+ assert.ok(calls[0].args.includes('--torch-device'));
748
+ assert.ok(calls[0].args.includes('cpu'));
749
+ assert.ok(calls[0].args.includes('--torch-dtype'));
750
+ assert.ok(calls[0].args.includes('float32'));
751
+ assert.ok(calls[0].args.includes('--prompt-audio-path'));
752
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
753
+ assert.equal(calls[0].options.timeout, 180000);
754
+ assert.match(out, /^\/tmp\/verbalcoding-mossttsnano-mlx-/);
755
+ assert.deepEqual(backend.cacheKeyParts(), ['mossttsnano_mlx', 'subprocess', 'python3', '/project/integrations/mossttsnano_mlx/synth.py', '/project/integrations/mossttsnano_mlx/worker.py', '/project/vendor/MOSS-TTS-Nano/infer.py', 'OpenMOSS-Team/MOSS-TTS-Nano', 'OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano', 'voice_clone', 'ko', 'cpu', 'float32', '/project/voice-samples/me.wav', 'ν…ŒμŠ€νŠΈ κΈ°μ€€ μŒμ„±μž…λ‹ˆλ‹€.', 120, '7']);
756
+ });
757
+
758
+ test('MOSS-TTS-Nano MLX progress uses Edge fallback unless explicitly enabled', async () => {
759
+ const calls = [];
760
+ const settings = { ...baseSettings(), backend: 'mossttsnano_mlx' };
761
+ const backend = createTtsBackend(settings, {
762
+ tmpdir: '/tmp',
763
+ existsSync: () => true,
764
+ statSync: () => ({ size: 123 }),
765
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
766
+ });
767
+
768
+ await backend.synthesize('μ§„ν–‰ μ•ˆλ‚΄', { kind: 'progress' });
769
+
770
+ assert.equal(calls[0].cmd, 'edge-tts');
771
+ });
772
+
773
+ test('NeuTTS Air backend calls Python wrapper with GGUF backbone, reference sample, and output path', async () => {
774
+ const calls = [];
775
+ const settings = { ...baseSettings(), backend: 'neuttsair' };
776
+ const backend = createTtsBackend(settings, {
777
+ tmpdir: '/tmp',
778
+ existsSync: () => true,
779
+ statSync: () => ({ size: 999 }),
780
+ execFileAsync: async (cmd, args, options) => calls.push({ cmd, args, options }),
781
+ });
782
+
783
+ const out = await backend.synthesize('NeuTTS Air test', { kind: 'final' });
784
+
785
+ assert.equal(calls[0].cmd, '/project/.venv-neuttsair/bin/python');
786
+ assert.deepEqual(calls[0].args.slice(0, 5), ['/project/integrations/neuttsair/synth.py', '--text', 'NeuTTS Air test', '--output', calls[0].args[4]]);
787
+ assert.ok(calls[0].args.includes('--backbone-repo'));
788
+ assert.ok(calls[0].args.includes('neuphonic/neutts-air-q4-gguf'));
789
+ assert.ok(calls[0].args.includes('--backbone-device'));
790
+ assert.ok(calls[0].args.includes('mps'));
791
+ assert.ok(calls[0].args.includes('--codec-repo'));
792
+ assert.ok(calls[0].args.includes('neuphonic/neucodec'));
793
+ assert.ok(calls[0].args.includes('--codec-device'));
794
+ assert.ok(calls[0].args.includes('--ref-audio'));
795
+ assert.ok(calls[0].args.includes('/project/voice-samples/me.wav'));
796
+ assert.ok(calls[0].args.includes('--ref-text'));
797
+ assert.ok(calls[0].args.includes('Reference voice text.'));
798
+ assert.ok(calls[0].args.includes('--language'));
799
+ assert.ok(calls[0].args.includes('en'));
800
+ assert.equal(calls[0].options.timeout, 120000);
801
+ assert.match(out, /^\/tmp\/verbalcoding-neuttsair-/);
802
+ assert.deepEqual(backend.cacheKeyParts(), ['neuttsair', '/project/.venv-neuttsair/bin/python', '/project/integrations/neuttsair/synth.py', 'neuphonic/neutts-air-q4-gguf', 'mps', 'neuphonic/neucodec', 'mps', '/project/voice-samples/me.wav', 'Reference voice text.', 'en', 24000]);
803
+ });
804
+
805
+ test('NeuTTS Air progress uses Edge fallback unless explicitly enabled', async () => {
806
+ const calls = [];
807
+ const settings = { ...baseSettings(), backend: 'neuttsair' };
808
+ const backend = createTtsBackend(settings, {
809
+ tmpdir: '/tmp',
810
+ existsSync: () => true,
811
+ statSync: () => ({ size: 123 }),
812
+ execFileAsync: async (cmd, args) => calls.push({ cmd, args }),
813
+ });
814
+
815
+ await backend.synthesize('μ§„ν–‰ μ•ˆλ‚΄', { kind: 'progress' });
816
+
817
+ assert.equal(calls[0].cmd, 'edge-tts');
818
+ });
819
+
820
+ test('NeuTTS Air falls back to Edge when Python wrapper fails', async () => {
821
+ const calls = [];
822
+ const settings = { ...baseSettings(), backend: 'neuttsair' };
823
+ const backend = createTtsBackend(settings, {
824
+ tmpdir: '/tmp',
825
+ existsSync: () => true,
826
+ statSync: () => ({ size: 123 }),
827
+ warn: (...args) => calls.push({ warn: args.join(' ') }),
828
+ execFileAsync: async (cmd, args) => {
829
+ calls.push({ cmd, args });
830
+ if (cmd.includes('.venv-neuttsair')) throw new Error('neutts missing');
831
+ },
832
+ });
833
+
834
+ await backend.synthesize('fallback', { kind: 'final' });
835
+
836
+ assert.ok(calls.some(call => call.cmd?.includes('.venv-neuttsair')));
837
+ assert.ok(calls.some(call => call.cmd === 'edge-tts'));
838
+ assert.ok(calls.some(call => /neuttsair failed; falling back to edge/i.test(call.warn || '')));
839
+ });
840
+
387
841
  test('TTS backends omit signal option when no AbortSignal is provided', async () => {
388
842
  const calls = [];
389
843
  const backend = createTtsBackend(baseSettings(), {
@@ -0,0 +1,164 @@
1
+ // Text-to-speech playback pipeline: chunk text -> synth -> play through the
2
+ // shared @discordjs/voice AudioPlayer, with optional streaming (sentence-by-
3
+ // sentence) playback and barge-in cancellation.
4
+ //
5
+ // Phase 3 extraction from main.mjs. Reads/writes shared bridge state
6
+ // (connection, player, speaking, speechPlaybackGeneration, activeSentencer,
7
+ // activeStreamingQueue, streamingSpeechDelivered, ttsBackend) and calls back
8
+ // into helpers that still live in main.mjs (refreshTtsRuntimeConfig,
9
+ // waitEvent, sendText). Module-level imports keep the heavy dependencies
10
+ // (@discordjs/voice helpers, streaming utilities) out of main.mjs.
11
+
12
+ import fs from 'node:fs';
13
+ import { AudioPlayerStatus, StreamType, createAudioResource } from '@discordjs/voice';
14
+ import { splitForTTS } from './tts_chunks.mjs';
15
+ import { playChunkedTTSWithPrefetch } from './tts_prefetch.mjs';
16
+ import { createSentencer } from './stream_sentencer.mjs';
17
+ import { createStreamingTTSQueue } from './streaming_tts_queue.mjs';
18
+
19
/**
 * Build the text-to-speech playback helpers that operate on the shared bridge
 * state.
 *
 * @param {object} deps - Injected collaborators.
 * @param {object} deps.bridge - Shared mutable state. Read here: `connection`,
 *   `player`, `ttsBackend`, `processing`, `activeTurnId`. Written here:
 *   `speaking`, `speechPlaybackGeneration`, `activeSentencer`,
 *   `activeStreamingQueue`, `streamingSpeechDelivered`.
 * @param {object} deps.settings - Reads `tts.maxChars`, `tts.volume` and
 *   `voiceLanguage`.
 * @param {Function} deps.log - Variadic info logger.
 * @param {Function} deps.warn - Variadic warning logger.
 * @param {Function} deps.sleep - `sleep(ms)`; awaited between synth retries.
 * @param {Function} deps.sendText - Posts a text message; awaited / `.catch`ed,
 *   so it must return a promise.
 * @param {Function} deps.refreshTtsRuntimeConfig - Awaited before every synth.
 * @param {Function} deps.waitEvent - Called as `waitEvent(emitter, event,
 *   timeoutMs)`; must return a promise.
 * @param {Function} deps.isAbortError - Predicate identifying abort errors.
 * @param {boolean} deps.STREAMING_TTS_ENABLED - Gate for streaming playback.
 * @returns {{synthTTS: Function, playAudio: Function, speakText: Function,
 *   beginStreamingTurn: Function, endStreamingTurn: Function,
 *   stopPlaybackForBargeIn: Function}}
 */
export function createTtsPlayer(deps) {
  const {
    bridge,
    settings,
    log,
    warn,
    sleep,
    sendText,
    refreshTtsRuntimeConfig,
    waitEvent,
    isAbortError,
    STREAMING_TTS_ENABLED,
  } = deps;

  // Total synth attempts (initial try included) before the error is rethrown.
  const MAX_SYNTH_ATTEMPTS = 3;

  /**
   * Synthesize `text` with the current backend, retrying on failure.
   *
   * @param {string} text - Text to synthesize.
   * @param {AbortSignal} [signal] - Forwarded to the backend; an abort is
   *   rethrown immediately without retrying.
   * @returns {Promise<string>} Path returned by the backend's `synthesize`.
   * @throws The abort error, or the last backend error once all attempts fail.
   */
  async function synthTTS(text, signal) {
    await refreshTtsRuntimeConfig();
    let lastError = null;
    for (let attempt = 1; attempt <= MAX_SYNTH_ATTEMPTS; attempt += 1) {
      try {
        log('final tts synth start', 'backend', bridge.ttsBackend.name, 'attempt', attempt, 'chars', String(text || '').length);
        const out = await bridge.ttsBackend.synthesize(text, { signal, kind: 'final' });
        // statSync runs inside the try, so a missing output file counts as a
        // failed attempt and is retried like any other synth error.
        log('final tts synth done', 'backend', bridge.ttsBackend.name, 'attempt', attempt, out, fs.statSync(out).size);
        return out;
      } catch (e) {
        lastError = e;
        if (isAbortError(e) || signal?.aborted) throw e;
        warn('final tts synth failed', 'attempt', attempt, e?.stderr?.toString?.().slice(-500) || e?.message || e);
        // Linear back-off, but only when another attempt follows: sleeping
        // after the last failure would just delay surfacing the error.
        if (attempt < MAX_SYNTH_ATTEMPTS) await sleep(1000 * attempt);
      }
    }
    throw lastError;
  }

  /**
   * Play an audio file through the shared player and wait for it to go idle.
   *
   * No-op when there is no voice connection. `bridge.speaking` is true for the
   * duration of the call and always reset afterwards.
   *
   * @param {string} file - Path of the audio file to play.
   * @param {object} [options]
   * @param {boolean} [options.deleteAfter=true] - Remove the file when done.
   * @returns {Promise<void>}
   */
  async function playAudio(file, { deleteAfter = true } = {}) {
    if (!bridge.connection) return;
    bridge.speaking = true;
    try {
      const resource = createAudioResource(file, { inputType: StreamType.Arbitrary, inlineVolume: true });
      resource.volume?.setVolume(settings.tts.volume);
      bridge.player.play(resource);
      bridge.connection.subscribe(bridge.player);
      // A rejected wait (e.g. the 120 s limit elapsing) is deliberately ignored
      // so playback problems never fail the caller.
      await waitEvent(bridge.player, AudioPlayerStatus.Idle, 120000).catch(() => {});
    } finally {
      bridge.speaking = false;
      // Best-effort, fire-and-forget removal of the synthesized file.
      if (deleteAfter) fs.rm(file, { force: true }, () => {});
    }
  }

  /**
   * Speak `text` in chunks, optionally mirroring it to the text channel first.
   *
   * @param {string} text - Full text to speak.
   * @param {AbortSignal} [signal] - Forwarded to synthesis and the chunk player.
   * @param {object|null} [metricsTurn=null] - If given, its `stage(name, ms,
   *   extra)` receives `tts_synth`, `tts_play` and `tts_total` timings.
   * @param {object} [options]
   * @param {boolean} [options.mirrorText] - Pass `false` to skip the mirror.
   * @param {string} [options.mirrorPrefix] - Overrides the default mirror header.
   * @returns {Promise<void>}
   */
  async function speakText(text, signal, metricsTurn = null, options = {}) {
    const chunks = splitForTTS(text, settings.tts.maxChars);
    if (!chunks.length) return;
    if (options.mirrorText !== false) {
      await sendText(`${options.mirrorPrefix || 'πŸ”Š μŒμ„±μœΌλ‘œ μ½λŠ” λ‚΄μš©'}:\n${String(text || '')}`);
    }
    log('TTS chunks', chunks.length, 'maxChars', settings.tts.maxChars, 'backend', bridge.ttsBackend.name);
    // Snapshot the generation counter; stopPlaybackForBargeIn bumps it, which
    // makes the remaining chunks of this call get skipped.
    const playbackGeneration = bridge.speechPlaybackGeneration;
    const playbackStopped = () => playbackGeneration !== bridge.speechPlaybackGeneration;
    let synthMs = 0;
    let playMs = 0;
    const ttsStart = Date.now();
    await playChunkedTTSWithPrefetch(chunks, {
      signal,
      log,
      synth: async chunk => {
        if (playbackStopped()) return null;
        const start = Date.now();
        try { return await synthTTS(chunk, signal); }
        finally { synthMs += Date.now() - start; }
      },
      play: async file => {
        if (playbackStopped()) {
          // Already synthesized but no longer wanted: drop the file unplayed.
          await fs.promises.rm(file, { force: true }).catch(() => {});
          return;
        }
        const start = Date.now();
        try { return await playAudio(file); }
        finally { playMs += Date.now() - start; }
      },
      cleanup: file => fs.promises.rm(file, { force: true }),
    });
    metricsTurn?.stage('tts_synth', synthMs, { ttsChunks: chunks.length, spokenChars: String(text || '').length });
    metricsTurn?.stage('tts_play', playMs);
    metricsTurn?.stage('tts_total', Date.now() - ttsStart);
  }

  /**
   * Install a sentencer and streaming queue on the bridge for the new turn.
   *
   * @param {AbortSignal} [signal] - Once aborted, new sentences are not
   *   enqueued and the synth-failure notice is suppressed.
   * @returns {boolean} `false` when streaming is disabled or there is no voice
   *   connection (nothing is installed); `true` otherwise.
   */
  function beginStreamingTurn(signal) {
    if (!STREAMING_TTS_ENABLED || !bridge.connection) return false;
    bridge.streamingSpeechDelivered = false;
    const sentencer = createSentencer({ minChars: 40, maxLatencyMs: 800 });
    // Ensures the failure notice is posted at most once per turn.
    let streamingDropAnnounced = false;
    const queue = createStreamingTTSQueue({
      synth: async text => synthTTS(text, signal),
      // The queue's own cleanup hook removes files, so playAudio must not.
      play: async file => playAudio(file, { deleteAfter: false }),
      cleanup: async file => { try { await fs.promises.rm(file, { force: true }); } catch {} },
      signal,
      log,
      onSynthError: () => {
        if (streamingDropAnnounced || signal?.aborted) return;
        streamingDropAnnounced = true;
        const en = /^en/i.test(String(settings.voiceLanguage || ''));
        const msg = en
          ? 'Some sentences could not be spoken; check the text channel for the full answer.'
          : '일뢀 λ¬Έμž₯ μŒμ„± 합성에 μ‹€νŒ¨ν–ˆμ–΄. 전체 닡변은 ν…μŠ€νŠΈ 채널을 ν™•μΈν•΄μ€˜.';
        void sendText(`⚠️ ${msg}`).catch(e => warn('streaming synth notice send failed', e?.message || e));
      },
    });
    sentencer.on('sentence', text => {
      if (signal?.aborted) return;
      queue.enqueue(text);
    });
    bridge.activeSentencer = sentencer;
    bridge.activeStreamingQueue = queue;
    log('streaming turn begin');
    return true;
  }

  /**
   * Detach the active sentencer and queue, flush the sentencer and drain the
   * queue, then record on the bridge whether the queue ended up empty.
   * Flush and drain failures are logged as warnings, not thrown.
   *
   * @returns {Promise<void>}
   */
  async function endStreamingTurn() {
    const sentencer = bridge.activeSentencer;
    const queue = bridge.activeStreamingQueue;
    // Clear the bridge references before awaiting anything.
    bridge.activeSentencer = null;
    bridge.activeStreamingQueue = null;
    if (!sentencer || !queue) return;
    try { sentencer.flush(); } catch (e) { warn('streaming sentencer flush failed', e?.stack || e); }
    try { await queue.drain(); } catch (e) { warn('streaming queue drain failed', e?.stack || e); }
    bridge.streamingSpeechDelivered = queue.size === 0;
    log('streaming turn end');
  }

  /**
   * Interrupt current playback because a user started talking.
   *
   * @param {string} userId - Interrupting user; only used for logging.
   * @param {string} [reason='playback-barge-in'] - Log label.
   * @returns {boolean} `false` if nothing was playing, `true` once stopped.
   */
  function stopPlaybackForBargeIn(userId, reason = 'playback-barge-in') {
    if (!bridge.speaking) return false;
    log('stop playback for barge-in', 'byUser', userId, 'reason', reason, 'speaking', bridge.speaking, 'processing', bridge.processing, 'turn', bridge.activeTurnId);
    // Bumping the generation makes an in-flight speakText skip its remaining chunks.
    bridge.speechPlaybackGeneration += 1;
    try { bridge.player.stop(true); } catch (e) { warn('stop playback failed', e?.stack || e); }
    bridge.speaking = false;
    return true;
  }

  return {
    synthTTS,
    playAudio,
    speakText,
    beginStreamingTurn,
    endStreamingTurn,
    stopPlaybackForBargeIn,
  };
}