verbalcoding 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/.env.example +74 -4
  2. package/README.es.md +3 -1
  3. package/README.fr.md +3 -1
  4. package/README.ja.md +3 -1
  5. package/README.ko.md +4 -2
  6. package/README.md +4 -2
  7. package/README.ru.md +3 -1
  8. package/README.zh.md +3 -1
  9. package/app-node/agent_adapters.test.mjs +14 -0
  10. package/app-node/agent_routing.mjs +148 -0
  11. package/app-node/agent_routing.test.mjs +138 -0
  12. package/app-node/agent_turn.mjs +86 -0
  13. package/app-node/agent_turn.test.mjs +109 -0
  14. package/app-node/bridge_context.mjs +73 -0
  15. package/app-node/bridge_context.test.mjs +54 -0
  16. package/app-node/bridge_state.mjs +4 -0
  17. package/app-node/bridge_wireup.test.mjs +462 -0
  18. package/app-node/cli_install.test.mjs +31 -0
  19. package/app-node/cross_agent_routing.test.mjs +78 -0
  20. package/app-node/discord_command_router.mjs +204 -0
  21. package/app-node/discord_command_router.test.mjs +311 -0
  22. package/app-node/discord_voice_setup.mjs +251 -0
  23. package/app-node/discord_voice_setup.test.mjs +86 -0
  24. package/app-node/hermes_profiles.test.mjs +12 -1
  25. package/app-node/install_config.mjs +110 -3
  26. package/app-node/install_config.test.mjs +8 -0
  27. package/app-node/instance_doctor.test.mjs +9 -0
  28. package/app-node/instances.test.mjs +8 -1
  29. package/app-node/main.mjs +488 -1368
  30. package/app-node/mcp_tools.test.mjs +7 -0
  31. package/app-node/notification_handler.mjs +89 -0
  32. package/app-node/notification_handler.test.mjs +187 -0
  33. package/app-node/plan_dispatcher.mjs +215 -0
  34. package/app-node/plan_dispatcher.test.mjs +101 -0
  35. package/app-node/plan_mode.mjs +36 -7
  36. package/app-node/plan_mode.test.mjs +78 -0
  37. package/app-node/progress_handler.mjs +220 -0
  38. package/app-node/progress_handler.test.mjs +193 -0
  39. package/app-node/progress_speech.mjs +54 -32
  40. package/app-node/progress_speech.test.mjs +12 -3
  41. package/app-node/project_sessions.mjs +5 -2
  42. package/app-node/project_sessions.test.mjs +7 -0
  43. package/app-node/research_mode.mjs +282 -0
  44. package/app-node/research_mode.test.mjs +264 -0
  45. package/app-node/restart_notice.mjs +3 -0
  46. package/app-node/restart_notice.test.mjs +11 -0
  47. package/app-node/session_ontology.mjs +271 -0
  48. package/app-node/session_ontology.test.mjs +130 -0
  49. package/app-node/smart_progress.mjs +1 -1
  50. package/app-node/stream_sentencer.mjs +32 -2
  51. package/app-node/stream_sentencer.test.mjs +65 -0
  52. package/app-node/streaming_tts_queue.mjs +5 -1
  53. package/app-node/streaming_tts_queue.test.mjs +7 -1
  54. package/app-node/stt_whisper.mjs +24 -0
  55. package/app-node/stt_whisper.test.mjs +32 -0
  56. package/app-node/text_routing.mjs +4 -2
  57. package/app-node/tts_backends.mjs +537 -3
  58. package/app-node/tts_backends.test.mjs +454 -0
  59. package/app-node/tts_player.mjs +164 -0
  60. package/app-node/tts_player.test.mjs +202 -0
  61. package/app-node/tts_runtime.mjs +134 -0
  62. package/app-node/tts_runtime.test.mjs +89 -0
  63. package/app-node/tts_settings.mjs +150 -3
  64. package/app-node/tts_settings.test.mjs +204 -0
  65. package/app-node/tts_voice_config.mjs +136 -2
  66. package/app-node/tts_voice_config.test.mjs +94 -0
  67. package/app-node/utterance_router.mjs +216 -0
  68. package/app-node/utterance_router.test.mjs +236 -0
  69. package/app-node/voice_autojoin.mjs +37 -0
  70. package/app-node/voice_autojoin.test.mjs +59 -0
  71. package/app-node/voice_io.mjs +272 -0
  72. package/app-node/voice_io.test.mjs +102 -0
  73. package/app-node/voice_turn_runner.mjs +449 -0
  74. package/app-node/voice_turn_runner.test.mjs +289 -0
  75. package/docs/CONFIGURATION.md +12 -2
  76. package/docs/HARNESSES.md +58 -0
  77. package/docs/HARNESS_AIDER.md +50 -0
  78. package/docs/HARNESS_CLAUDE.md +56 -0
  79. package/docs/HARNESS_CODEX.md +56 -0
  80. package/docs/HARNESS_CURSOR.md +45 -0
  81. package/docs/HARNESS_GEMINI.md +45 -0
  82. package/docs/HARNESS_HERMES.md +57 -0
  83. package/docs/HARNESS_OPENCLAW.md +44 -0
  84. package/docs/HARNESS_OPENCODE.md +44 -0
  85. package/docs/README.md +1 -0
  86. package/docs/ROADMAP.md +20 -5
  87. package/docs/TTS_BACKENDS.md +227 -0
  88. package/docs/USAGE.md +22 -0
  89. package/docs/i18n/AGENTS.es.md +34 -0
  90. package/docs/i18n/AGENTS.fr.md +34 -0
  91. package/docs/i18n/AGENTS.ja.md +34 -0
  92. package/docs/i18n/AGENTS.ko.md +34 -0
  93. package/docs/i18n/AGENTS.ru.md +34 -0
  94. package/docs/i18n/AGENTS.zh.md +34 -0
  95. package/docs/i18n/HARNESSES.es.md +58 -0
  96. package/docs/i18n/HARNESSES.fr.md +58 -0
  97. package/docs/i18n/HARNESSES.ja.md +58 -0
  98. package/docs/i18n/HARNESSES.ko.md +58 -0
  99. package/docs/i18n/HARNESSES.ru.md +58 -0
  100. package/docs/i18n/HARNESSES.zh.md +58 -0
  101. package/docs/i18n/HARNESS_AIDER.es.md +48 -0
  102. package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
  103. package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
  104. package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
  105. package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
  106. package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
  107. package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
  108. package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
  109. package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
  110. package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
  111. package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
  112. package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
  113. package/docs/i18n/HARNESS_CODEX.es.md +55 -0
  114. package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
  115. package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
  116. package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
  117. package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
  118. package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
  119. package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
  120. package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
  121. package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
  122. package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
  123. package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
  124. package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
  125. package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
  126. package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
  127. package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
  128. package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
  129. package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
  130. package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
  131. package/docs/i18n/HARNESS_HERMES.es.md +54 -0
  132. package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
  133. package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
  134. package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
  135. package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
  136. package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
  137. package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
  138. package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
  139. package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
  140. package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
  141. package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
  142. package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
  143. package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
  144. package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
  145. package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
  146. package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
  147. package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
  148. package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
  149. package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
  150. package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
  151. package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
  152. package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
  153. package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
  154. package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
  155. package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
  156. package/integrations/fireredtts2/mlx_llm.py +183 -0
  157. package/integrations/fireredtts2/synth.py +156 -0
  158. package/integrations/fireredtts2/synth_mlx.py +196 -0
  159. package/integrations/mlxaudio/synth.py +74 -0
  160. package/integrations/neuttsair/synth.py +104 -0
  161. package/integrations/omnivoice/synth.py +110 -0
  162. package/package.json +6 -1
  163. package/scripts/cli.mjs +84 -0
  164. package/scripts/doctor.mjs +104 -4
  165. package/scripts/install.mjs +5 -1
  166. package/scripts/install_fireredtts2.sh +109 -0
  167. package/scripts/install_mlxaudio.sh +34 -0
  168. package/scripts/install_mossttsnano.sh +46 -0
  169. package/scripts/postinstall.mjs +34 -0
@@ -0,0 +1,183 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional
5
+
6
+ import numpy as np
7
+ import mlx.core as mx
8
+ import mlx.nn as nn
9
+ from mlx.utils import tree_unflatten
10
+ from mlx_lm.models.cache import KVCache
11
+ from mlx_lm.models.qwen2 import ModelArgs as Qwen2Args, TransformerBlock, create_attention_mask
12
+
13
+
14
@dataclass
class FireRedMLXArgs:
    """Hyperparameters for the MLX port of the FireRedTTS-2 token generator.

    Every field has a default, so ``FireRedMLXArgs()`` yields a complete
    configuration. The defaults presumably match the released FireRedTTS-2
    checkpoint -- verify against upstream before changing any of them.
    """

    # Vocabulary sizes: the text tokenizer, and a single audio codebook.
    text_vocab_size: int = 151_936
    audio_vocab_size: int = 2_051
    # Number of audio codebooks that make up one generated frame.
    audio_num_codebooks: int = 16

    # Transformer width, and depth of the backbone and decoder stacks.
    hidden_size: int = 1_536
    backbone_layers: int = 28
    decoder_layers: int = 4

    # Attention / MLP geometry shared by both stacks.
    num_attention_heads: int = 12
    num_key_value_heads: int = 2
    intermediate_size: int = 8_960

    # Normalisation and rotary-position settings forwarded to the Qwen2 blocks.
    rms_norm_eps: float = 1e-6
    max_position_embeddings: int = 4_096
    rope_theta: float = 1_000_000.0
28
+
29
+
30
def _qwen_args(num_layers: int, args: FireRedMLXArgs) -> Qwen2Args:
    """Translate FireRed hyperparameters into an ``mlx_lm`` Qwen2 config.

    ``num_layers`` is a separate argument so the same ``args`` object can be
    used to configure stacks of different depth.
    """
    config = {
        "model_type": "qwen2",
        "hidden_size": args.hidden_size,
        "num_hidden_layers": num_layers,
        "intermediate_size": args.intermediate_size,
        "num_attention_heads": args.num_attention_heads,
        "num_key_value_heads": args.num_key_value_heads,
        "rms_norm_eps": args.rms_norm_eps,
        "vocab_size": args.text_vocab_size,
        "max_position_embeddings": args.max_position_embeddings,
        "rope_theta": args.rope_theta,
        "tie_word_embeddings": True,
    }
    return Qwen2Args(**config)
44
+
45
+
46
class QwenStack(nn.Module):
    """A stack of Qwen2 transformer blocks followed by a final RMSNorm.

    The stack has no embedding table and no output head: it takes hidden
    states in and returns normalised hidden states.
    """

    def __init__(self, num_layers: int, args: FireRedMLXArgs):
        """Create ``num_layers`` transformer blocks sharing one Qwen2 config."""
        super().__init__()
        block_config = _qwen_args(num_layers, args)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(block_config))
        self.layers = blocks
        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)

    def __call__(self, h: mx.array, cache: Optional[list[Any]] = None) -> mx.array:
        """Run ``h`` through every block and the final norm.

        Args:
            h: Hidden states to transform.
            cache: Optional per-layer cache objects; when omitted each layer
                receives ``None`` as its cache.
        """
        layer_caches = [None] * len(self.layers) if cache is None else cache
        # The mask is derived once from the first layer's cache and reused.
        attention_mask = create_attention_mask(h, layer_caches[0])
        hidden = h
        for block, block_cache in zip(self.layers, layer_caches):
            hidden = block(hidden, attention_mask, block_cache)
        return self.norm(hidden)
60
+
61
+
62
class FireRedMLXLLM(nn.Module):
    """MLX implementation of the FireRedTTS-2 audio-token language model.

    A "backbone" stack consumes the summed text/audio embeddings of each frame
    and predicts codebook 0 of the next frame; a smaller "decoder" stack then
    predicts the remaining codebooks of that frame one at a time.
    """

    def __init__(self, args: FireRedMLXArgs | None = None):
        """Build all sub-modules from ``args`` (defaults to ``FireRedMLXArgs()``)."""
        super().__init__()
        self.args = args or FireRedMLXArgs()
        a = self.args
        self.backbone = QwenStack(a.backbone_layers, a)
        self.decoder = QwenStack(a.decoder_layers, a)
        self.text_embeddings = nn.Embedding(a.text_vocab_size, a.hidden_size)
        # All codebooks share one embedding table; codebook ``i`` owns the rows
        # starting at ``i * audio_vocab_size`` (see ``_embed_audio``).
        self.audio_embeddings = nn.Embedding(a.audio_vocab_size * a.audio_num_codebooks, a.hidden_size)
        self.projection = nn.Linear(a.hidden_size, a.hidden_size, bias=False)
        self.text_head = nn.Linear(a.hidden_size, a.text_vocab_size, bias=False)
        self.codebook0_head = nn.Linear(a.hidden_size, a.audio_vocab_size, bias=False)
        # One (hidden_size, audio_vocab_size) output matrix per codebook 1..N-1.
        self.audio_head = mx.zeros((a.audio_num_codebooks - 1, a.hidden_size, a.audio_vocab_size))
        self.backbone_cache: list[KVCache] | None = None
        self.decoder_cache: list[KVCache] | None = None

    def reset_caches(self) -> None:
        """Replace both KV caches with fresh, empty ones (one per layer)."""
        self.backbone_cache = [KVCache() for _ in range(self.args.backbone_layers)]
        self.decoder_cache = [KVCache() for _ in range(self.args.decoder_layers)]

    def _embed_audio(self, codebook: int, tokens: mx.array) -> mx.array:
        """Embed ``tokens`` of one codebook by offsetting into the shared table."""
        return self.audio_embeddings(tokens + codebook * self.args.audio_vocab_size)

    def _embed_tokens(self, tokens: mx.array) -> mx.array:
        """Embed a batch of frames.

        ``tokens`` is indexed as ``[batch, frame, slot]`` where the last slot
        holds the text token and the preceding ``audio_num_codebooks`` slots
        hold one token per audio codebook. The result keeps the slot axis, with
        audio embeddings first and the text embedding last.
        """
        text_embeds = mx.expand_dims(self.text_embeddings(tokens[:, :, -1]), -2)
        # Shift each codebook's tokens into its own row range of the shared table.
        offsets = self.args.audio_vocab_size * mx.arange(self.args.audio_num_codebooks)
        audio_tokens = tokens[:, :, :-1] + offsets
        flat = audio_tokens.reshape((-1,))
        audio_embeds = self.audio_embeddings(flat).reshape(
            (tokens.shape[0], tokens.shape[1], self.args.audio_num_codebooks, -1)
        )
        return mx.concatenate([audio_embeds, text_embeds], axis=-2)

    @staticmethod
    def _sample_topk(logits: mx.array, topk: int, temperature: float) -> mx.array:
        """Sample one token id per row of ``logits`` with top-k + temperature.

        Args:
            logits: Unnormalised scores, one row per batch element.
            topk: Number of highest-scoring candidates to sample from. Values
                ``<= 0`` disable truncation (the whole vocabulary is eligible).
            temperature: Softmax temperature. Values ``<= 0`` select the
                arg-max deterministically instead of sampling.

        Returns:
            An ``int32`` array of shape ``(rows, 1)`` with the chosen ids.
        """
        if temperature <= 0:
            # Dividing by a non-positive temperature would produce inf/NaN (or an
            # inverted distribution) and make np.random.choice fail; the
            # conventional meaning of temperature 0 is greedy decoding.
            greedy = np.argmax(np.array(logits, dtype=np.float64), axis=-1)
            return mx.array(np.asarray(greedy, dtype=np.int32).reshape((-1, 1)))

        # Logits are tiny here (audio vocab ~2k). Move to CPU for robust sampling while
        # the rest of the heavy transformer math stays on MLX/Metal.
        arr = np.array(logits / temperature, dtype=np.float64)
        out = []
        for row in arr:
            vocab = row.shape[-1]
            k = vocab if topk <= 0 else min(topk, vocab)
            idx = np.argpartition(row, -k)[-k:]
            vals = row[idx]
            # Subtract the max before exponentiating for numerical stability.
            vals = vals - vals.max()
            probs = np.exp(vals)
            probs = probs / probs.sum()
            out.append(np.random.choice(idx, p=probs))
        return mx.array(np.array(out, dtype=np.int32).reshape((-1, 1)))

    def generate_frame(
        self,
        tokens: mx.array,
        tokens_mask: mx.array,
        temperature: float,
        topk: int,
    ) -> mx.array:
        """Generate the audio tokens of the next frame.

        Args:
            tokens: Frames to feed the backbone, indexed ``[batch, frame, slot]``
                (see ``_embed_tokens``).
            tokens_mask: Same layout as ``tokens``; slots whose mask is false
                contribute nothing to the frame embedding.
            temperature: Sampling temperature for codebook 0.
            topk: Top-k cutoff for codebook 0.

        Returns:
            An ``int32`` array with one column per codebook, i.e. shape
            ``(batch, audio_num_codebooks)``.

        The backbone uses ``self.backbone_cache``; call ``reset_caches()``
        before the first frame of an utterance, otherwise the backbone runs
        without a KV cache.
        """
        embeds = self._embed_tokens(tokens)
        # Collapse the slot axis: each frame becomes the sum of its unmasked embeddings.
        h = mx.sum(embeds * mx.expand_dims(tokens_mask, -1), axis=2)
        h = self.backbone(h, self.backbone_cache)
        last_h = h[:, -1, :]
        c0_logits = self.codebook0_head(last_h)
        c0_sample = self._sample_topk(c0_logits, topk, temperature)
        c0_embed = self._embed_audio(0, c0_sample)
        # The decoder is primed with the backbone state followed by codebook 0.
        curr_h = mx.concatenate([mx.expand_dims(last_h, 1), c0_embed], axis=1)
        curr_sample = c0_sample

        # Decoder cache is per generated frame, matching the Torch implementation.
        self.decoder_cache = [KVCache() for _ in range(self.args.decoder_layers)]
        for i in range(1, self.args.audio_num_codebooks):
            decoder_h = self.decoder(self.projection(curr_h), self.decoder_cache)
            ci_logits = decoder_h[:, -1, :] @ self.audio_head[i - 1]
            # NOTE(review): codebooks 1..N-1 use fixed sampling settings (top-k 10,
            # temperature 0.75) rather than the caller's values -- presumably to
            # mirror the upstream Torch code; confirm before changing.
            ci_sample = self._sample_topk(ci_logits, 10, 0.75)
            ci_embed = self._embed_audio(i, ci_sample)
            # Earlier positions live in the decoder cache, so only the newest
            # embedding is fed on the next step.
            curr_h = ci_embed
            curr_sample = mx.concatenate([curr_sample, ci_sample], axis=1)
        mx.eval(curr_sample)
        return curr_sample
139
+
140
+
141
+ def _map_qwen_key(prefix: str, key: str) -> str | None:
142
+ if not key.startswith(prefix + "."):
143
+ return None
144
+ rest = key[len(prefix) + 1 :]
145
+ rest = rest.replace("attn.output_proj.", "self_attn.o_proj.")
146
+ if ".attn." in rest:
147
+ rest = rest.replace(".attn.", ".self_attn.")
148
+ rest = rest.replace("mlp.w1.", "mlp.gate_proj.")
149
+ rest = rest.replace("mlp.w2.", "mlp.down_proj.")
150
+ rest = rest.replace("mlp.w3.", "mlp.up_proj.")
151
+ rest = rest.replace("sa_norm.scale", "input_layernorm.weight")
152
+ rest = rest.replace("mlp_norm.scale", "post_attention_layernorm.weight")
153
+ rest = rest.replace("norm.scale", "norm.weight")
154
+ return rest
155
+
156
+
157
def load_firered_mlx_from_state_dict(state_dict: dict[str, Any]) -> FireRedMLXLLM:
    """Build a ``FireRedMLXLLM`` and populate it from a checkpoint state dict.

    Keys under ``backbone.`` / ``decoder.`` are renamed with ``_map_qwen_key``;
    a fixed set of top-level parameters is copied under its own name; every
    other key is ignored. Values exposing ``detach`` (Torch tensors) are first
    converted to NumPy on the CPU.
    """
    top_level_keys = {
        "text_embeddings.weight",
        "audio_embeddings.weight",
        "projection.weight",
        "text_head.weight",
        "codebook0_head.weight",
        "audio_head",
    }
    model = FireRedMLXLLM()
    weights: list[tuple[str, mx.array]] = []
    for name, tensor in state_dict.items():
        raw = tensor.detach().cpu().numpy() if hasattr(tensor, "detach") else tensor
        converted = mx.array(raw)
        for stack in ("backbone", "decoder"):
            if name.startswith(stack + "."):
                renamed = _map_qwen_key(stack, name)
                if renamed:
                    weights.append((stack + "." + renamed, converted))
                break
        else:
            # Not part of either transformer stack: keep only the known heads/tables.
            if name in top_level_keys:
                weights.append((name, converted))
    model.update(tree_unflatten(weights))
    # Force evaluation so the parameters are materialised before returning.
    mx.eval(model.parameters())
    return model
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python3
2
+ """VerbalCoding FireRedTTS-2 synthesis wrapper.
3
+
4
+ This wrapper gives the Node bridge a stable CLI even though upstream FireRedTTS-2
5
+ is primarily documented as a Python API.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+
14
+
15
def _repo_root() -> Path:
    """Return the directory two levels above the folder containing this script."""
    script_path = Path(__file__).resolve()
    return script_path.parents[2]
17
+
18
+
19
+ def _resolve(root: Path, value: str | None) -> str | None:
20
+ if not value:
21
+ return None
22
+ p = Path(value).expanduser()
23
+ if not p.is_absolute():
24
+ p = root / p
25
+ return str(p)
26
+
27
+
28
def _auto_device(requested: str) -> str:
    """Pick the Torch device name to use.

    Any explicit request is returned lower-cased and otherwise untouched. For
    ``"auto"`` (or an empty value) CUDA is preferred, then MPS, then CPU; if
    Torch cannot be imported or probed the answer is ``"cpu"``.
    """
    choice = (requested or "auto").lower()
    if choice != "auto":
        return choice

    detected = "cpu"
    try:
        import torch

        if torch.cuda.is_available():
            detected = "cuda"
        else:
            # Older Torch builds have no ``torch.backends.mps`` attribute at all.
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend and mps_backend.is_available():
                detected = "mps"
    except Exception:
        # Best-effort probe: any failure simply means we stay on the CPU.
        pass
    return detected
41
+
42
+
43
def parse_args() -> argparse.Namespace:
    """Parse the command line of the FireRedTTS-2 (Torch) synthesis wrapper."""
    option_table = (
        ("--text", {"required": True, "help": "Text to synthesize"}),
        ("--output", {"required": True, "help": "Output WAV path"}),
        ("--pretrained-dir", {"default": "pretrained_models/FireRedTTS2"}),
        ("--device", {"default": "auto", "help": "auto | cuda | mps | cpu"}),
        ("--gen-type", {"default": "monologue", "choices": ["monologue", "dialogue"]}),
        ("--speaker", {"default": "S1", "help": "Speaker tag for monologue text; empty for raw text"}),
        ("--prompt-audio", {"default": "", "help": "Optional zero-shot prompt audio"}),
        ("--prompt-text", {"default": "", "help": "Transcript for prompt audio"}),
        ("--temperature", {"type": float, "default": 0.9}),
        ("--topk", {"type": int, "default": 30}),
        ("--bf16", {"action": "store_true", "help": "Use bfloat16 when upstream supports it"}),
    )
    cli = argparse.ArgumentParser(description="Synthesize speech with FireRedTTS-2")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
57
+
58
+
59
def main() -> int:
    """Synthesize one utterance with upstream FireRedTTS-2 and write it as WAV.

    Returns:
        A process exit code: ``0`` on success, ``127`` when the Python
        dependencies cannot be imported, ``66`` when the pretrained model
        directory is missing, and ``1`` when model loading or synthesis fails.
    """
    args = parse_args()
    root = _repo_root()
    # Prefer a vendored checkout of the upstream repository when one exists.
    vendor = root / "vendor" / "FireRedTTS2"
    if vendor.exists():
        sys.path.insert(0, str(vendor))

    # Heavy imports are deferred so that a missing install produces a readable
    # hint instead of a traceback at module import time.
    try:
        import torch
        import soundfile as sf
        from fireredtts2.fireredtts2 import FireRedTTS2
    except Exception as exc:
        print(
            "FireRedTTS-2 Python dependencies are missing. Run `vc doctor` or "
            "`scripts/install_fireredtts2.sh --yes` first.\n"
            f"Import error: {exc}",
            file=sys.stderr,
        )
        return 127

    pretrained_dir = _resolve(root, args.pretrained_dir)
    if not pretrained_dir or not Path(pretrained_dir).exists():
        print(
            f"FireRedTTS-2 pretrained model not found: {pretrained_dir}. "
            "Run `vc doctor` to download it.",
            file=sys.stderr,
        )
        return 66

    prompt_audio = _resolve(root, args.prompt_audio)
    prompt_text = args.prompt_text or None
    if prompt_audio and not Path(prompt_audio).exists():
        # Do not fail hard for a missing reference sample; random speaker mode is useful
        # for first-run smoke tests and package installs.
        prompt_audio = None
        prompt_text = None

    device = _auto_device(args.device)
    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)

    text = args.text.strip()
    if args.gen_type == "monologue" and args.speaker and not text.startswith("["):
        text = f"[{args.speaker}]{text}"

    # Remember the real torch.load so the monkeypatch below is always undone,
    # even when synthesis fails.
    original_torch_load = torch.load
    try:
        if not torch.cuda.is_available():

            def torch_load_with_map_location(*load_args, **load_kwargs):
                # Without CUDA, checkpoints must be remapped onto the chosen
                # device unless the caller already specified a map_location.
                load_kwargs.setdefault("map_location", torch.device(device))
                return original_torch_load(*load_args, **load_kwargs)

            torch.load = torch_load_with_map_location

        model_kwargs = {
            "pretrained_dir": pretrained_dir,
            "gen_type": args.gen_type,
            "device": device,
        }
        # Upstream added bf16 later; pass only if accepted.
        if args.bf16:
            model_kwargs["use_bf16"] = True
        try:
            tts = FireRedTTS2(**model_kwargs)
        except TypeError:
            # Retrying only makes sense when we actually passed the optional
            # keyword; otherwise the identical call would just load and fail again.
            if "use_bf16" not in model_kwargs:
                raise
            model_kwargs.pop("use_bf16")
            tts = FireRedTTS2(**model_kwargs)

        generate_kwargs = {"text": text, "temperature": args.temperature, "topk": args.topk}
        if prompt_audio:
            generate_kwargs["prompt_wav"] = prompt_audio
        if prompt_text:
            generate_kwargs["prompt_text"] = prompt_text
        # NOTE(review): generate_monologue is called even for --gen-type dialogue;
        # confirm whether upstream's dialogue entry point should be used instead.
        try:
            audio = tts.generate_monologue(**generate_kwargs)
        except TypeError:
            # Fall back for upstream versions that do not accept sampling keywords.
            generate_kwargs.pop("temperature", None)
            generate_kwargs.pop("topk", None)
            audio = tts.generate_monologue(**generate_kwargs)

        if hasattr(audio, "detach"):
            audio = audio.detach().cpu()
        if hasattr(audio, "numpy"):
            audio_np = audio.numpy()
        else:
            audio_np = audio
        # A 2-D array with at most 8 rows is presumably (channels, samples);
        # soundfile expects (samples, channels), so transpose it.
        if getattr(audio_np, "ndim", 1) == 2 and audio_np.shape[0] <= 8:
            audio_np = audio_np.T
        sf.write(str(output), audio_np, 24000)
    except Exception as exc:
        print(f"FireRedTTS-2 synthesis failed: {exc}", file=sys.stderr)
        return 1
    finally:
        torch.load = original_torch_load
    return 0
153
+
154
+
155
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env python3
2
+ """Experimental MLX FireRedTTS-2 synthesis wrapper.
3
+
4
+ This ports the FireRedTTS-2 LLM token generator to MLX/Metal while keeping the
5
+ RedCodec encode/decode path in Torch. It is intended for Apple Silicon where the
6
+ upstream Torch Qwen generation path can hang or be unusably slow.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+
16
+
17
def _repo_root() -> Path:
    """Return the package root, i.e. the third ancestor of this file's path."""
    ancestors = Path(__file__).resolve().parents
    return ancestors[2]
19
+
20
+
21
+ def _resolve(root: Path, value: str | None) -> str | None:
22
+ if not value:
23
+ return None
24
+ p = Path(value).expanduser()
25
+ if not p.is_absolute():
26
+ p = root / p
27
+ return str(p)
28
+
29
+
30
def parse_args() -> argparse.Namespace:
    """Parse the command line of the experimental MLX FireRedTTS-2 wrapper.

    ``--device`` and ``--bf16`` are accepted only so this script can be invoked
    with the same flags as the Torch wrapper.
    """
    option_table = (
        ("--text", {"required": True}),
        ("--output", {"required": True}),
        ("--pretrained-dir", {"default": "pretrained_models/FireRedTTS2"}),
        ("--device", {"default": "mlx", "help": "accepted for compatibility; MLX chooses Metal automatically"}),
        ("--gen-type", {"default": "monologue", "choices": ["monologue", "dialogue"]}),
        ("--speaker", {"default": "S1"}),
        ("--prompt-audio", {"default": ""}),
        ("--prompt-text", {"default": ""}),
        ("--temperature", {"type": float, "default": 0.9}),
        ("--topk", {"type": int, "default": 30}),
        ("--max-audio-ms", {"type": float, "default": 12_000}),
        ("--bf16", {"action": "store_true", "help": "ignored; compatibility with torch wrapper"}),
    )
    cli = argparse.ArgumentParser(description="Synthesize speech with FireRedTTS-2 MLX LLM")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
45
+
46
+
47
def log(msg: str) -> None:
    """Write a ``[firered-mlx]``-prefixed progress line to stderr and flush it."""
    stream = sys.stderr
    stream.write(f"[firered-mlx] {msg}\n")
    stream.flush()
49
+
50
+
51
def main() -> int:
    """Synthesize one utterance with the MLX LLM and the Torch RedCodec.

    The MLX model generates audio tokens frame by frame; the Torch codec
    encodes the optional prompt audio and decodes the generated tokens.

    Returns:
        A process exit code: ``0`` on success, ``127`` when a dependency cannot
        be imported, ``66`` when the pretrained model directory is missing, and
        ``1`` on any failure while loading or synthesizing.
    """
    args = parse_args()
    root = _repo_root()
    vendor = root / "vendor" / "FireRedTTS2"
    if vendor.exists():
        sys.path.insert(0, str(vendor))
    # The package root must be importable for ``integrations.fireredtts2.mlx_llm``.
    sys.path.insert(0, str(root))

    try:
        import numpy as np
        import mlx.core as mx
        import torch
        import torchaudio
        import soundfile as sf
        from transformers import AutoTokenizer
        from fireredtts2.codec import RedCodecInfer
        from integrations.fireredtts2.mlx_llm import load_firered_mlx_from_state_dict
    except Exception as exc:
        print(f"FireRedTTS-2 MLX dependencies missing: {exc}", file=sys.stderr, flush=True)
        return 127

    # An empty --pretrained-dir must not silently resolve to the current
    # directory (``Path("")`` is ``.`` and always exists).
    resolved_dir = _resolve(root, args.pretrained_dir)
    if not resolved_dir or not Path(resolved_dir).exists():
        print(f"FireRedTTS-2 pretrained model not found: {resolved_dir}", file=sys.stderr, flush=True)
        return 66
    pretrained_dir = Path(resolved_dir)

    llm_ckpt = pretrained_dir / ("llm_pretrain.pt" if args.gen_type == "monologue" else "llm_posttrain.pt")
    codec_config = pretrained_dir / "config_codec.json"
    codec_ckpt = pretrained_dir / "codec.pt"
    qwen_path = pretrained_dir / "Qwen2.5-1.5B"
    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)

    text = args.text.strip()
    if args.gen_type == "monologue" and args.speaker and not text.startswith("["):
        text = f"[{args.speaker}]{text}"

    try:
        log("loading MLX LLM checkpoint")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects; only
        # load checkpoints obtained from a trusted source.
        ckpt = torch.load(str(llm_ckpt), map_location="cpu", weights_only=False)["model"]
        model = load_firered_mlx_from_state_dict(ckpt)
        model.reset_caches()
        del ckpt
        log("MLX LLM loaded")
        # Each frame holds one slot per audio codebook plus a final text slot.
        frame_width = model.args.audio_num_codebooks + 1

        log("loading tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(str(qwen_path))

        log("loading Torch codec")
        original_torch_load = torch.load

        def torch_load_with_map_location(*load_args, **load_kwargs):
            load_kwargs.setdefault("map_location", torch.device("cpu"))
            return original_torch_load(*load_args, **load_kwargs)

        # Temporarily force CPU map_location while the codec loads its weights;
        # the original loader is restored even if loading fails.
        torch.load = torch_load_with_map_location
        try:
            codec = RedCodecInfer.from_pretrained(str(codec_config), str(codec_ckpt)).eval()
        finally:
            torch.load = original_torch_load
        # Keep codec on MPS if possible for decode/optional prompt encode.
        mps_backend = getattr(torch.backends, "mps", None)
        codec_device = "mps" if mps_backend is not None and mps_backend.is_available() else "cpu"
        codec = codec.to(codec_device)
        log(f"codec loaded on {codec_device}")

        frame_tokens = []
        frame_masks = []
        prompt_audio = _resolve(root, args.prompt_audio)
        prompt_text = args.prompt_text or ""
        if prompt_audio and Path(prompt_audio).exists():
            log("encoding prompt audio with Torch codec")
            audio, sr = torchaudio.load(prompt_audio)
            if audio.shape[0] > 1:
                # Keep only the first channel of multi-channel prompts.
                audio = audio[0, :].unsqueeze(0)
            audio16k = torchaudio.functional.resample(audio, sr, 16000)
            audio_len = torch.tensor([audio16k.shape[1]], dtype=torch.long, device=codec_device)
            audio_tokens, _ = codec.encode(audio16k.to(codec_device), audio_len, batch_size=24)
            audio_tokens = audio_tokens.squeeze(0).detach().cpu().numpy()

            speaker = f"[{args.speaker}]" if args.speaker and not args.speaker.startswith("[") else args.speaker
            # An empty transcript simply leaves nothing between the two markers.
            ptext = speaker + "<|text_start|>" + prompt_text + "<|text_end|>"
            ids = tokenizer.encode(ptext)
            tframe = mx.zeros((len(ids), frame_width), dtype=mx.int32)
            tmask = mx.zeros((len(ids), frame_width), dtype=mx.bool_)
            tframe[:, -1] = mx.array(ids, dtype=mx.int32)
            tmask[:, -1] = True
            frame_tokens.append(tframe)
            frame_masks.append(tmask)

            # add EOS frame after prompt audio
            eos = np.zeros((audio_tokens.shape[0], 1), dtype=audio_tokens.dtype)
            audio_tokens = np.concatenate([audio_tokens, eos], axis=1)
            aframe = mx.zeros((audio_tokens.shape[1], frame_width), dtype=mx.int32)
            amask = mx.zeros((audio_tokens.shape[1], frame_width), dtype=mx.bool_)
            aframe[:, :-1] = mx.array(audio_tokens.T, dtype=mx.int32)
            amask[:, :-1] = True
            frame_tokens.append(aframe)
            frame_masks.append(amask)

        log("tokenizing target text")
        speaker = ""
        target = text
        if text.startswith("[") and "]" in text:
            # Split a leading "[tag]" off the text so the markers wrap only the words.
            speaker = text[: text.index("]") + 1]
            target = text[text.index("]") + 1 :]
        ids = tokenizer.encode(speaker + "<|text_start|>" + target + "<|text_end|>")
        tframe = mx.zeros((len(ids), frame_width), dtype=mx.int32)
        tmask = mx.zeros((len(ids), frame_width), dtype=mx.bool_)
        tframe[:, -1] = mx.array(ids, dtype=mx.int32)
        tmask[:, -1] = True
        frame_tokens.append(tframe)
        frame_masks.append(tmask)

        curr_tokens = mx.expand_dims(mx.concatenate(frame_tokens, axis=0), 0)
        curr_mask = mx.expand_dims(mx.concatenate(frame_masks, axis=0), 0)
        mx.eval(curr_tokens, curr_mask)
        log(f"prompt frames={curr_tokens.shape[1]}")

        # The frame budget assumes 80 ms of audio per generated frame.
        max_generation_len = max(1, int(args.max_audio_ms / 80))
        samples = []
        for i in range(max_generation_len):
            sample = model.generate_frame(curr_tokens, curr_mask, args.temperature, args.topk)
            mx.eval(sample)
            # A frame whose codebooks are all zero marks end of speech.
            if bool(mx.all(sample == 0).item()):
                log(f"eos at frame {i}")
                break
            samples.append(sample)
            # Feed back only the new frame (audio slots set, text slot empty);
            # earlier context lives in the model's backbone cache.
            zero = mx.zeros((1, 1), dtype=mx.int32)
            curr_tokens = mx.expand_dims(mx.concatenate([sample, zero], axis=1), 1)
            curr_mask = mx.expand_dims(
                mx.concatenate([mx.ones(sample.shape, dtype=mx.bool_), mx.zeros((1, 1), dtype=mx.bool_)], axis=1),
                1,
            )
            if i == 0 or (i + 1) % 20 == 0:
                log(f"generated frames={i+1}")
        if not samples:
            raise RuntimeError("MLX LLM produced no audio frames")

        log("decoding audio tokens with Torch codec")
        # Stack frames first, then reorder so the frame axis comes last.
        toks_np = mx.concatenate([mx.expand_dims(s, 0) for s in samples], axis=0)
        toks_np = np.array(toks_np).transpose(1, 2, 0)
        toks = torch.from_numpy(toks_np).long().to(codec_device)
        audio = codec.decode(toks).squeeze(0).squeeze(0).detach().cpu().numpy()
        sf.write(str(output), audio, 24000)
        log(f"wrote {output}")
        return 0
    except Exception as exc:
        print(f"FireRedTTS-2 MLX synthesis failed: {exc}", file=sys.stderr, flush=True)
        return 1
193
+
194
+
195
if __name__ == "__main__":
    # Use main()'s return value as the exit status of the script.
    sys.exit(main())
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env python3
2
+ """VerbalCoding mlx-audio synthesis wrapper."""
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from pathlib import Path
11
+
12
+
13
def parse_args() -> argparse.Namespace:
    """Parse the command line of the mlx-audio synthesis wrapper."""
    option_table = (
        ("--text", {"required": True}),
        ("--output", {"required": True}),
        ("--model", {"default": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-8bit"}),
        ("--voice", {"default": "Chelsie"}),
        ("--lang-code", {"default": "ko"}),
        ("--stream", {"action": "store_true"}),
    )
    cli = argparse.ArgumentParser(description="Synthesize speech with mlx-audio")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
22
+
23
+
24
+ def newest_audio_file(directory: Path) -> Path | None:
25
+ candidates = []
26
+ for pattern in ("*.wav", "*.mp3", "*.flac", "*.m4a"):
27
+ candidates.extend(directory.glob(pattern))
28
+ if not candidates:
29
+ return None
30
+ return max(candidates, key=lambda p: p.stat().st_mtime)
31
+
32
+
33
def main() -> int:
    """Run ``mlx_audio.tts.generate`` in a subprocess and copy its audio out.

    The generator writes into a temporary directory; the newest audio file
    found there is copied to ``--output``.

    Returns:
        A process exit code: ``0`` on success, ``127`` when mlx-audio is not
        installed for this interpreter, the child's exit status when the
        generator fails, and ``66`` when it produced no audio file.
    """
    import importlib.util

    args = parse_args()

    # A module missing in the child interpreter never raises
    # ModuleNotFoundError here (it only makes the child exit non-zero), so the
    # install check has to happen up front in this process.
    try:
        installed = importlib.util.find_spec("mlx_audio") is not None
    except (ImportError, ValueError):
        installed = False
    if not installed:
        print("mlx-audio is not installed. Run scripts/install_mlxaudio.sh --yes first.", file=sys.stderr)
        return 127

    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="verbalcoding-mlxaudio-") as tmp:
        out_dir = Path(tmp)
        cmd = [
            sys.executable,
            "-m",
            "mlx_audio.tts.generate",
            "--model",
            args.model,
            "--text",
            args.text,
            "--voice",
            args.voice,
            "--lang_code",
            args.lang_code,
            "--output_path",
            str(out_dir),
            "--join_audio",
        ]
        if args.stream:
            cmd.extend(["--stream", "--save"])
        try:
            subprocess.run(cmd, check=True, text=True, timeout=None)
        except subprocess.CalledProcessError as exc:
            print(f"mlx-audio synthesis failed with exit {exc.returncode}", file=sys.stderr)
            return exc.returncode or 1
        # The generator chooses its own file name, so pick up whatever it wrote last.
        audio = newest_audio_file(out_dir)
        if not audio:
            print("mlx-audio did not produce an audio file", file=sys.stderr)
            return 66
        shutil.copyfile(audio, output)
    return 0
71
+
72
+
73
if __name__ == "__main__":
    # Exit with whatever status main() reports.
    sys.exit(main())