verbalcoding 0.2.12 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +74 -4
- package/README.es.md +3 -1
- package/README.fr.md +3 -1
- package/README.ja.md +3 -1
- package/README.ko.md +4 -2
- package/README.md +4 -2
- package/README.ru.md +3 -1
- package/README.zh.md +3 -1
- package/app-node/agent_adapters.test.mjs +14 -0
- package/app-node/agent_routing.mjs +148 -0
- package/app-node/agent_routing.test.mjs +138 -0
- package/app-node/agent_turn.mjs +86 -0
- package/app-node/agent_turn.test.mjs +109 -0
- package/app-node/bridge_context.mjs +73 -0
- package/app-node/bridge_context.test.mjs +54 -0
- package/app-node/bridge_state.mjs +4 -0
- package/app-node/bridge_wireup.test.mjs +462 -0
- package/app-node/cli_install.test.mjs +31 -0
- package/app-node/cross_agent_routing.test.mjs +78 -0
- package/app-node/discord_command_router.mjs +204 -0
- package/app-node/discord_command_router.test.mjs +311 -0
- package/app-node/discord_voice_setup.mjs +251 -0
- package/app-node/discord_voice_setup.test.mjs +86 -0
- package/app-node/hermes_profiles.test.mjs +12 -1
- package/app-node/install_config.mjs +110 -3
- package/app-node/install_config.test.mjs +8 -0
- package/app-node/instance_doctor.test.mjs +9 -0
- package/app-node/instances.test.mjs +8 -1
- package/app-node/main.mjs +488 -1368
- package/app-node/mcp_tools.test.mjs +7 -0
- package/app-node/notification_handler.mjs +89 -0
- package/app-node/notification_handler.test.mjs +187 -0
- package/app-node/plan_dispatcher.mjs +215 -0
- package/app-node/plan_dispatcher.test.mjs +101 -0
- package/app-node/plan_mode.mjs +36 -7
- package/app-node/plan_mode.test.mjs +78 -0
- package/app-node/progress_handler.mjs +220 -0
- package/app-node/progress_handler.test.mjs +193 -0
- package/app-node/progress_speech.mjs +54 -32
- package/app-node/progress_speech.test.mjs +12 -3
- package/app-node/project_sessions.mjs +5 -2
- package/app-node/project_sessions.test.mjs +7 -0
- package/app-node/research_mode.mjs +282 -0
- package/app-node/research_mode.test.mjs +264 -0
- package/app-node/restart_notice.mjs +3 -0
- package/app-node/restart_notice.test.mjs +11 -0
- package/app-node/session_ontology.mjs +271 -0
- package/app-node/session_ontology.test.mjs +130 -0
- package/app-node/smart_progress.mjs +1 -1
- package/app-node/stream_sentencer.mjs +32 -2
- package/app-node/stream_sentencer.test.mjs +65 -0
- package/app-node/streaming_tts_queue.mjs +5 -1
- package/app-node/streaming_tts_queue.test.mjs +7 -1
- package/app-node/stt_whisper.mjs +24 -0
- package/app-node/stt_whisper.test.mjs +32 -0
- package/app-node/text_routing.mjs +4 -2
- package/app-node/tts_backends.mjs +537 -3
- package/app-node/tts_backends.test.mjs +454 -0
- package/app-node/tts_player.mjs +164 -0
- package/app-node/tts_player.test.mjs +202 -0
- package/app-node/tts_runtime.mjs +134 -0
- package/app-node/tts_runtime.test.mjs +89 -0
- package/app-node/tts_settings.mjs +150 -3
- package/app-node/tts_settings.test.mjs +204 -0
- package/app-node/tts_voice_config.mjs +136 -2
- package/app-node/tts_voice_config.test.mjs +94 -0
- package/app-node/utterance_router.mjs +216 -0
- package/app-node/utterance_router.test.mjs +236 -0
- package/app-node/voice_autojoin.mjs +37 -0
- package/app-node/voice_autojoin.test.mjs +59 -0
- package/app-node/voice_io.mjs +272 -0
- package/app-node/voice_io.test.mjs +102 -0
- package/app-node/voice_turn_runner.mjs +449 -0
- package/app-node/voice_turn_runner.test.mjs +289 -0
- package/docs/CONFIGURATION.md +12 -2
- package/docs/HARNESSES.md +58 -0
- package/docs/HARNESS_AIDER.md +50 -0
- package/docs/HARNESS_CLAUDE.md +56 -0
- package/docs/HARNESS_CODEX.md +56 -0
- package/docs/HARNESS_CURSOR.md +45 -0
- package/docs/HARNESS_GEMINI.md +45 -0
- package/docs/HARNESS_HERMES.md +57 -0
- package/docs/HARNESS_OPENCLAW.md +44 -0
- package/docs/HARNESS_OPENCODE.md +44 -0
- package/docs/README.md +1 -0
- package/docs/ROADMAP.md +20 -5
- package/docs/TTS_BACKENDS.md +227 -0
- package/docs/USAGE.md +22 -0
- package/docs/i18n/AGENTS.es.md +34 -0
- package/docs/i18n/AGENTS.fr.md +34 -0
- package/docs/i18n/AGENTS.ja.md +34 -0
- package/docs/i18n/AGENTS.ko.md +34 -0
- package/docs/i18n/AGENTS.ru.md +34 -0
- package/docs/i18n/AGENTS.zh.md +34 -0
- package/docs/i18n/HARNESSES.es.md +58 -0
- package/docs/i18n/HARNESSES.fr.md +58 -0
- package/docs/i18n/HARNESSES.ja.md +58 -0
- package/docs/i18n/HARNESSES.ko.md +58 -0
- package/docs/i18n/HARNESSES.ru.md +58 -0
- package/docs/i18n/HARNESSES.zh.md +58 -0
- package/docs/i18n/HARNESS_AIDER.es.md +48 -0
- package/docs/i18n/HARNESS_AIDER.fr.md +48 -0
- package/docs/i18n/HARNESS_AIDER.ja.md +50 -0
- package/docs/i18n/HARNESS_AIDER.ko.md +50 -0
- package/docs/i18n/HARNESS_AIDER.ru.md +48 -0
- package/docs/i18n/HARNESS_AIDER.zh.md +48 -0
- package/docs/i18n/HARNESS_CLAUDE.es.md +55 -0
- package/docs/i18n/HARNESS_CLAUDE.fr.md +55 -0
- package/docs/i18n/HARNESS_CLAUDE.ja.md +56 -0
- package/docs/i18n/HARNESS_CLAUDE.ko.md +56 -0
- package/docs/i18n/HARNESS_CLAUDE.ru.md +55 -0
- package/docs/i18n/HARNESS_CLAUDE.zh.md +56 -0
- package/docs/i18n/HARNESS_CODEX.es.md +55 -0
- package/docs/i18n/HARNESS_CODEX.fr.md +55 -0
- package/docs/i18n/HARNESS_CODEX.ja.md +56 -0
- package/docs/i18n/HARNESS_CODEX.ko.md +56 -0
- package/docs/i18n/HARNESS_CODEX.ru.md +55 -0
- package/docs/i18n/HARNESS_CODEX.zh.md +56 -0
- package/docs/i18n/HARNESS_CURSOR.es.md +42 -0
- package/docs/i18n/HARNESS_CURSOR.fr.md +42 -0
- package/docs/i18n/HARNESS_CURSOR.ja.md +45 -0
- package/docs/i18n/HARNESS_CURSOR.ko.md +45 -0
- package/docs/i18n/HARNESS_CURSOR.ru.md +42 -0
- package/docs/i18n/HARNESS_CURSOR.zh.md +42 -0
- package/docs/i18n/HARNESS_GEMINI.es.md +44 -0
- package/docs/i18n/HARNESS_GEMINI.fr.md +44 -0
- package/docs/i18n/HARNESS_GEMINI.ja.md +45 -0
- package/docs/i18n/HARNESS_GEMINI.ko.md +45 -0
- package/docs/i18n/HARNESS_GEMINI.ru.md +44 -0
- package/docs/i18n/HARNESS_GEMINI.zh.md +45 -0
- package/docs/i18n/HARNESS_HERMES.es.md +54 -0
- package/docs/i18n/HARNESS_HERMES.fr.md +54 -0
- package/docs/i18n/HARNESS_HERMES.ja.md +57 -0
- package/docs/i18n/HARNESS_HERMES.ko.md +57 -0
- package/docs/i18n/HARNESS_HERMES.ru.md +54 -0
- package/docs/i18n/HARNESS_HERMES.zh.md +57 -0
- package/docs/i18n/HARNESS_OPENCLAW.es.md +41 -0
- package/docs/i18n/HARNESS_OPENCLAW.fr.md +41 -0
- package/docs/i18n/HARNESS_OPENCLAW.ja.md +44 -0
- package/docs/i18n/HARNESS_OPENCLAW.ko.md +44 -0
- package/docs/i18n/HARNESS_OPENCLAW.ru.md +41 -0
- package/docs/i18n/HARNESS_OPENCLAW.zh.md +42 -0
- package/docs/i18n/HARNESS_OPENCODE.es.md +41 -0
- package/docs/i18n/HARNESS_OPENCODE.fr.md +41 -0
- package/docs/i18n/HARNESS_OPENCODE.ja.md +44 -0
- package/docs/i18n/HARNESS_OPENCODE.ko.md +44 -0
- package/docs/i18n/HARNESS_OPENCODE.ru.md +41 -0
- package/docs/i18n/HARNESS_OPENCODE.zh.md +44 -0
- package/docs/superpowers/plans/2026-05-14-cross-agent-voice-transfer.md +625 -0
- package/docs/superpowers/plans/2026-05-21-audio-overview-narrated-diffs.md +95 -0
- package/docs/superpowers/plans/2026-05-21-autoresearch-ontology.md +83 -0
- package/docs/superpowers/plans/2026-05-21-phase11-push-to-talk-wakeword-v2.md +77 -0
- package/docs/superpowers/plans/2026-05-21-phase12-multi-user-voice.md +147 -0
- package/docs/superpowers/plans/2026-05-21-phase14-verbalbench.md +136 -0
- package/docs/superpowers/plans/2026-05-21-phase15-phone-companion.md +72 -0
- package/integrations/fireredtts2/mlx_llm.py +183 -0
- package/integrations/fireredtts2/synth.py +156 -0
- package/integrations/fireredtts2/synth_mlx.py +196 -0
- package/integrations/mlxaudio/synth.py +74 -0
- package/integrations/neuttsair/synth.py +104 -0
- package/integrations/omnivoice/synth.py +110 -0
- package/package.json +6 -1
- package/scripts/cli.mjs +84 -0
- package/scripts/doctor.mjs +104 -4
- package/scripts/install.mjs +5 -1
- package/scripts/install_fireredtts2.sh +109 -0
- package/scripts/install_mlxaudio.sh +34 -0
- package/scripts/install_mossttsnano.sh +46 -0
- package/scripts/postinstall.mjs +34 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import mlx.core as mx
|
|
8
|
+
import mlx.nn as nn
|
|
9
|
+
from mlx.utils import tree_unflatten
|
|
10
|
+
from mlx_lm.models.cache import KVCache
|
|
11
|
+
from mlx_lm.models.qwen2 import ModelArgs as Qwen2Args, TransformerBlock, create_attention_mask
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class FireRedMLXArgs:
    """Hyper-parameters for the MLX FireRedTTS-2 token generator.

    The transformer-related fields are forwarded to the mlx_lm Qwen2
    ``ModelArgs`` by ``_qwen_args``; the vocabulary/codebook fields size the
    embedding tables and output heads built in ``FireRedMLXLLM``.
    """

    # Rows of the text embedding table and width of the text head; also passed
    # to Qwen2 as ``vocab_size``.
    text_vocab_size: int = 151936
    # Vocabulary size of a single audio codebook.
    audio_vocab_size: int = 2051
    # Number of audio codebooks per frame (the audio embedding table holds
    # ``audio_vocab_size * audio_num_codebooks`` rows).
    audio_num_codebooks: int = 16
    # Model width shared by the backbone, the decoder and all embeddings.
    hidden_size: int = 1536
    # Number of transformer blocks in the backbone stack.
    backbone_layers: int = 28
    # Number of transformer blocks in the decoder stack.
    decoder_layers: int = 4
    # Attention settings forwarded to the Qwen2 blocks.
    num_attention_heads: int = 12
    num_key_value_heads: int = 2
    # MLP inner width forwarded to the Qwen2 blocks.
    intermediate_size: int = 8960
    # Epsilon used by the Qwen2 blocks and by each stack's final RMSNorm.
    rms_norm_eps: float = 1e-6
    # Positional settings forwarded to the Qwen2 blocks.
    max_position_embeddings: int = 4096
    rope_theta: float = 1000000.0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _qwen_args(num_layers: int, args: FireRedMLXArgs) -> Qwen2Args:
    """Translate ``args`` into the mlx_lm Qwen2 ``ModelArgs`` for one stack.

    Args:
        num_layers: Number of transformer blocks the stack will hold.
        args: FireRedTTS-2 hyper-parameters to copy the Qwen2 settings from.

    Returns:
        A ``Qwen2Args`` instance with ``model_type="qwen2"`` and tied word
        embeddings enabled.
    """
    settings = {
        "model_type": "qwen2",
        "hidden_size": args.hidden_size,
        "num_hidden_layers": num_layers,
        "intermediate_size": args.intermediate_size,
        "num_attention_heads": args.num_attention_heads,
        "num_key_value_heads": args.num_key_value_heads,
        "rms_norm_eps": args.rms_norm_eps,
        "vocab_size": args.text_vocab_size,
        "max_position_embeddings": args.max_position_embeddings,
        "rope_theta": args.rope_theta,
        "tie_word_embeddings": True,
    }
    return Qwen2Args(**settings)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class QwenStack(nn.Module):
    """A stack of Qwen2 transformer blocks followed by a final RMSNorm.

    The attribute names ``layers`` and ``norm`` are the parameter names that
    checkpoint keys are mapped onto, so they must not be renamed.
    """

    def __init__(self, num_layers: int, args: FireRedMLXArgs):
        """Create ``num_layers`` blocks configured from ``args``."""
        super().__init__()
        block_args = _qwen_args(num_layers, args)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(block_args))
        self.layers = blocks
        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)

    def __call__(self, h: mx.array, cache: Optional[list[Any]] = None) -> mx.array:
        """Run ``h`` through every block and return the normalized result.

        Args:
            h: Input hidden states.
            cache: Optional per-layer caches, one entry per block; when
                omitted every block runs without a cache.
        """
        layer_caches = [None] * len(self.layers) if cache is None else cache
        # The attention mask is derived once from the first layer's cache and
        # shared by all blocks.
        mask = create_attention_mask(h, layer_caches[0])
        hidden = h
        for block, block_cache in zip(self.layers, layer_caches):
            hidden = block(hidden, mask, block_cache)
        return self.norm(hidden)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class FireRedMLXLLM(nn.Module):
    """MLX implementation of the FireRedTTS-2 audio-token generator.

    A backbone stack predicts codebook 0 of the next audio frame; a smaller
    decoder stack then predicts the remaining codebooks one at a time.
    """

    def __init__(self, args: FireRedMLXArgs | None = None):
        """Build all sub-modules; ``args`` defaults to ``FireRedMLXArgs()``."""
        super().__init__()
        self.args = args or FireRedMLXArgs()
        a = self.args
        self.backbone = QwenStack(a.backbone_layers, a)
        self.decoder = QwenStack(a.decoder_layers, a)
        self.text_embeddings = nn.Embedding(a.text_vocab_size, a.hidden_size)
        # One shared table for all codebooks; codebook ``i`` uses the row range
        # starting at ``i * audio_vocab_size`` (see ``_embed_audio``).
        self.audio_embeddings = nn.Embedding(a.audio_vocab_size * a.audio_num_codebooks, a.hidden_size)
        self.projection = nn.Linear(a.hidden_size, a.hidden_size, bias=False)
        self.text_head = nn.Linear(a.hidden_size, a.text_vocab_size, bias=False)
        self.codebook0_head = nn.Linear(a.hidden_size, a.audio_vocab_size, bias=False)
        # One (hidden_size, audio_vocab_size) projection per codebook 1..N-1.
        self.audio_head = mx.zeros((a.audio_num_codebooks - 1, a.hidden_size, a.audio_vocab_size))
        self.backbone_cache: list[KVCache] | None = None
        self.decoder_cache: list[KVCache] | None = None

    def reset_caches(self) -> None:
        """Replace both KV caches with fresh, empty ones (one per layer)."""
        self.backbone_cache = [KVCache() for _ in range(self.args.backbone_layers)]
        self.decoder_cache = [KVCache() for _ in range(self.args.decoder_layers)]

    def _embed_audio(self, codebook: int, tokens: mx.array) -> mx.array:
        """Embed ``tokens`` of one codebook by offsetting into the shared table."""
        return self.audio_embeddings(tokens + codebook * self.args.audio_vocab_size)

    def _embed_tokens(self, tokens: mx.array) -> mx.array:
        """Embed a frame tensor whose last axis is ``[audio codebooks..., text]``.

        ``tokens[:, :, :-1]`` are treated as audio ids (one column per
        codebook) and ``tokens[:, :, -1]`` as text ids. The result stacks the
        audio embeddings and the text embedding along axis ``-2``.
        """
        text_embeds = mx.expand_dims(self.text_embeddings(tokens[:, :, -1]), -2)
        # Shift each codebook column into its own row range of the shared table.
        offsets = self.args.audio_vocab_size * mx.arange(self.args.audio_num_codebooks)
        audio_tokens = tokens[:, :, :-1] + offsets
        flat = audio_tokens.reshape((-1,))
        audio_embeds = self.audio_embeddings(flat).reshape(
            (tokens.shape[0], tokens.shape[1], self.args.audio_num_codebooks, -1)
        )
        return mx.concatenate([audio_embeds, text_embeds], axis=-2)

    @staticmethod
    def _sample_topk(logits: mx.array, topk: int, temperature: float) -> mx.array:
        """Sample one token id per row of ``logits`` from its top-k entries.

        Args:
            logits: 2-D array with one row of logits per sample.
            topk: Number of highest logits to sample from; clamped to
                ``[1, vocab]`` so zero or negative values cannot silently
                widen the candidate set to the whole vocabulary.
            temperature: Softmax temperature. Values ``<= 0`` select the
                arg-max deterministically instead of dividing by zero.

        Returns:
            An int32 array of shape ``(rows, 1)`` with the sampled ids.
        """
        # Logits are tiny here (audio vocab ~2k). Move to CPU for robust sampling while
        # the rest of the heavy transformer math stays on MLX/Metal.
        greedy = temperature <= 0
        scaled = logits if greedy else logits / temperature
        arr = np.array(scaled, dtype=np.float64)
        out = []
        for row in arr:
            if greedy:
                out.append(int(np.argmax(row)))
                continue
            k = max(1, min(topk, row.shape[-1]))
            idx = np.argpartition(row, -k)[-k:]
            vals = row[idx]
            # Subtract the max before exponentiating for numerical stability.
            vals = vals - vals.max()
            probs = np.exp(vals)
            probs = probs / probs.sum()
            out.append(np.random.choice(idx, p=probs))
        return mx.array(np.array(out, dtype=np.int32).reshape((-1, 1)))

    def generate_frame(
        self,
        tokens: mx.array,
        tokens_mask: mx.array,
        temperature: float,
        topk: int,
    ) -> mx.array:
        """Generate one audio frame (one id per codebook).

        Args:
            tokens: Frame tensor accepted by ``_embed_tokens``.
            tokens_mask: Mask with the same leading shape as ``tokens``; masked
                embeddings are zeroed before the per-position sum.
            temperature: Sampling temperature for codebook 0.
            topk: Top-k cutoff for codebook 0.

        Returns:
            The sampled ids for codebooks 0..N-1 concatenated along axis 1.
            The backbone KV cache is advanced as a side effect.
        """
        embeds = self._embed_tokens(tokens)
        h = mx.sum(embeds * mx.expand_dims(tokens_mask, -1), axis=2)
        h = self.backbone(h, self.backbone_cache)
        last_h = h[:, -1, :]
        c0_logits = self.codebook0_head(last_h)
        c0_sample = self._sample_topk(c0_logits, topk, temperature)
        c0_embed = self._embed_audio(0, c0_sample)
        curr_h = mx.concatenate([mx.expand_dims(last_h, 1), c0_embed], axis=1)
        curr_sample = c0_sample

        # Decoder cache is per generated frame, matching the Torch implementation.
        self.decoder_cache = [KVCache() for _ in range(self.args.decoder_layers)]
        for i in range(1, self.args.audio_num_codebooks):
            decoder_h = self.decoder(self.projection(curr_h), self.decoder_cache)
            ci_logits = decoder_h[:, -1, :] @ self.audio_head[i - 1]
            # NOTE(review): codebooks 1..N-1 use a fixed top-k of 10 and
            # temperature 0.75 regardless of the caller's values; presumably
            # this mirrors the upstream Torch code - confirm before changing.
            ci_sample = self._sample_topk(ci_logits, 10, 0.75)
            ci_embed = self._embed_audio(i, ci_sample)
            curr_h = ci_embed
            curr_sample = mx.concatenate([curr_sample, ci_sample], axis=1)
        mx.eval(curr_sample)
        return curr_sample
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _map_qwen_key(prefix: str, key: str) -> str | None:
|
|
142
|
+
if not key.startswith(prefix + "."):
|
|
143
|
+
return None
|
|
144
|
+
rest = key[len(prefix) + 1 :]
|
|
145
|
+
rest = rest.replace("attn.output_proj.", "self_attn.o_proj.")
|
|
146
|
+
if ".attn." in rest:
|
|
147
|
+
rest = rest.replace(".attn.", ".self_attn.")
|
|
148
|
+
rest = rest.replace("mlp.w1.", "mlp.gate_proj.")
|
|
149
|
+
rest = rest.replace("mlp.w2.", "mlp.down_proj.")
|
|
150
|
+
rest = rest.replace("mlp.w3.", "mlp.up_proj.")
|
|
151
|
+
rest = rest.replace("sa_norm.scale", "input_layernorm.weight")
|
|
152
|
+
rest = rest.replace("mlp_norm.scale", "post_attention_layernorm.weight")
|
|
153
|
+
rest = rest.replace("norm.scale", "norm.weight")
|
|
154
|
+
return rest
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def load_firered_mlx_from_state_dict(state_dict: dict[str, Any]) -> FireRedMLXLLM:
    """Build a ``FireRedMLXLLM`` and load weights from a checkpoint mapping.

    Keys under ``backbone.`` / ``decoder.`` are renamed with ``_map_qwen_key``;
    a fixed set of top-level keys is copied as-is; every other key is ignored.
    Only retained entries are converted to MLX arrays, so unused entries cost
    no copy and cannot fail the load.

    Args:
        state_dict: Mapping of parameter name to array-like value. Values with
            a ``detach`` attribute are converted via
            ``.detach().cpu().numpy()`` first.

    Returns:
        The model with parameters updated and evaluated.
    """
    model = FireRedMLXLLM()
    top_level_keys = {
        "text_embeddings.weight",
        "audio_embeddings.weight",
        "projection.weight",
        "text_head.weight",
        "codebook0_head.weight",
        "audio_head",
    }
    flat: list[tuple[str, mx.array]] = []
    for key, value in state_dict.items():
        target = None
        if key.startswith("backbone."):
            mapped = _map_qwen_key("backbone", key)
            if mapped:
                target = "backbone." + mapped
        elif key.startswith("decoder."):
            mapped = _map_qwen_key("decoder", key)
            if mapped:
                target = "decoder." + mapped
        elif key in top_level_keys:
            target = key
        if target is None:
            continue
        if hasattr(value, "detach"):
            value = value.detach().cpu().numpy()
        flat.append((target, mx.array(value)))
    model.update(tree_unflatten(flat))
    # Force materialization of the loaded parameters before returning.
    mx.eval(model.parameters())
    return model
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""VerbalCoding FireRedTTS-2 synthesis wrapper.
|
|
3
|
+
|
|
4
|
+
This wrapper gives the Node bridge a stable CLI even though upstream FireRedTTS-2
|
|
5
|
+
is primarily documented as a Python API.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _repo_root() -> Path:
|
|
16
|
+
return Path(__file__).resolve().parents[2]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _resolve(root: Path, value: str | None) -> str | None:
|
|
20
|
+
if not value:
|
|
21
|
+
return None
|
|
22
|
+
p = Path(value).expanduser()
|
|
23
|
+
if not p.is_absolute():
|
|
24
|
+
p = root / p
|
|
25
|
+
return str(p)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _auto_device(requested: str) -> str:
|
|
29
|
+
requested = (requested or "auto").lower()
|
|
30
|
+
if requested != "auto":
|
|
31
|
+
return requested
|
|
32
|
+
try:
|
|
33
|
+
import torch
|
|
34
|
+
if torch.cuda.is_available():
|
|
35
|
+
return "cuda"
|
|
36
|
+
if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
|
|
37
|
+
return "mps"
|
|
38
|
+
except Exception:
|
|
39
|
+
pass
|
|
40
|
+
return "cpu"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse_args() -> argparse.Namespace:
    """Parse the FireRedTTS-2 wrapper's command-line options from ``sys.argv``."""
    cli = argparse.ArgumentParser(description="Synthesize speech with FireRedTTS-2")
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = (
        ("--text", {"required": True, "help": "Text to synthesize"}),
        ("--output", {"required": True, "help": "Output WAV path"}),
        ("--pretrained-dir", {"default": "pretrained_models/FireRedTTS2"}),
        ("--device", {"default": "auto", "help": "auto | cuda | mps | cpu"}),
        ("--gen-type", {"default": "monologue", "choices": ["monologue", "dialogue"]}),
        ("--speaker", {"default": "S1", "help": "Speaker tag for monologue text; empty for raw text"}),
        ("--prompt-audio", {"default": "", "help": "Optional zero-shot prompt audio"}),
        ("--prompt-text", {"default": "", "help": "Transcript for prompt audio"}),
        ("--temperature", {"type": float, "default": 0.9}),
        ("--topk", {"type": int, "default": 30}),
        ("--bf16", {"action": "store_true", "help": "Use bfloat16 when upstream supports it"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def main() -> int:
    """Synthesize ``--text`` with FireRedTTS-2 and write a 24 kHz WAV file.

    Returns:
        A process exit code: 0 on success, 127 when the Python dependencies
        cannot be imported, 66 when the pretrained model directory is missing,
        and 1 when model loading, generation or writing fails.
    """
    args = parse_args()
    root = _repo_root()
    # Prefer a vendored upstream checkout when one is present.
    vendor = root / "vendor" / "FireRedTTS2"
    if vendor.exists():
        sys.path.insert(0, str(vendor))

    try:
        import torch
        import soundfile as sf
        from fireredtts2.fireredtts2 import FireRedTTS2
    except Exception as exc:
        print(
            "FireRedTTS-2 Python dependencies are missing. Run `vc doctor` or "
            "`scripts/install_fireredtts2.sh --yes` first.\n"
            f"Import error: {exc}",
            file=sys.stderr,
        )
        return 127

    pretrained_dir = _resolve(root, args.pretrained_dir)
    if not pretrained_dir or not Path(pretrained_dir).exists():
        print(
            f"FireRedTTS-2 pretrained model not found: {pretrained_dir}. "
            "Run `vc doctor` to download it.",
            file=sys.stderr,
        )
        return 66

    prompt_audio = _resolve(root, args.prompt_audio)
    prompt_text = args.prompt_text or None
    if prompt_audio and not Path(prompt_audio).exists():
        # Do not fail hard for a missing reference sample; random speaker mode is useful
        # for first-run smoke tests and package installs.
        prompt_audio = None
        prompt_text = None

    device = _auto_device(args.device)
    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)

    text = args.text.strip()
    if args.gen_type == "monologue" and args.speaker and not text.startswith("["):
        text = f"[{args.speaker}]{text}"

    # Remember the real loader so the temporary patch below is always undone,
    # even when synthesis fails.
    original_torch_load = torch.load
    try:
        if not torch.cuda.is_available():
            # Without CUDA, default every checkpoint load to the chosen device
            # unless the caller already passed ``map_location``.
            def torch_load_with_map_location(*load_args, **load_kwargs):
                load_kwargs.setdefault("map_location", torch.device(device))
                return original_torch_load(*load_args, **load_kwargs)

            torch.load = torch_load_with_map_location

        model_kwargs = {
            "pretrained_dir": pretrained_dir,
            "gen_type": args.gen_type,
            "device": device,
        }
        # Upstream added bf16 later; pass only if accepted.
        if args.bf16:
            model_kwargs["use_bf16"] = True
        try:
            tts = FireRedTTS2(**model_kwargs)
        except TypeError:
            # Retrying only helps when the optional keyword was actually sent;
            # otherwise the same (expensive) load would just fail again.
            if "use_bf16" not in model_kwargs:
                raise
            model_kwargs.pop("use_bf16")
            tts = FireRedTTS2(**model_kwargs)

        generate_kwargs = {"text": text, "temperature": args.temperature, "topk": args.topk}
        if prompt_audio:
            generate_kwargs["prompt_wav"] = prompt_audio
        if prompt_text:
            generate_kwargs["prompt_text"] = prompt_text
        # NOTE(review): ``generate_monologue`` is called for both --gen-type
        # values; confirm whether dialogue mode needs a different upstream call.
        try:
            audio = tts.generate_monologue(**generate_kwargs)
        except TypeError:
            # Fall back to the call without sampling keywords.
            generate_kwargs.pop("temperature", None)
            generate_kwargs.pop("topk", None)
            audio = tts.generate_monologue(**generate_kwargs)

        if hasattr(audio, "detach"):
            audio = audio.detach().cpu()
        if hasattr(audio, "numpy"):
            audio_np = audio.numpy()
        else:
            audio_np = audio
        # A 2-D array with at most 8 rows is treated as (channels, samples)
        # and transposed to the (samples, channels) layout before writing.
        if getattr(audio_np, "ndim", 1) == 2 and audio_np.shape[0] <= 8:
            audio_np = audio_np.T
        sf.write(str(output), audio_np, 24000)
    except Exception as exc:
        print(f"FireRedTTS-2 synthesis failed: {exc}", file=sys.stderr)
        return 1
    finally:
        torch.load = original_torch_load
    return 0
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Experimental MLX FireRedTTS-2 synthesis wrapper.
|
|
3
|
+
|
|
4
|
+
This ports the FireRedTTS-2 LLM token generator to MLX/Metal while keeping the
|
|
5
|
+
RedCodec encode/decode path in Torch. It is intended for Apple Silicon where the
|
|
6
|
+
upstream Torch Qwen generation path can hang or be unusably slow.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _repo_root() -> Path:
|
|
18
|
+
return Path(__file__).resolve().parents[2]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _resolve(root: Path, value: str | None) -> str | None:
|
|
22
|
+
if not value:
|
|
23
|
+
return None
|
|
24
|
+
p = Path(value).expanduser()
|
|
25
|
+
if not p.is_absolute():
|
|
26
|
+
p = root / p
|
|
27
|
+
return str(p)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_args() -> argparse.Namespace:
    """Parse the MLX FireRedTTS-2 wrapper's command-line options.

    ``--device`` and ``--bf16`` are accepted only so the command line stays
    compatible with the Torch wrapper.
    """
    cli = argparse.ArgumentParser(description="Synthesize speech with FireRedTTS-2 MLX LLM")
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = (
        ("--text", {"required": True}),
        ("--output", {"required": True}),
        ("--pretrained-dir", {"default": "pretrained_models/FireRedTTS2"}),
        ("--device", {"default": "mlx", "help": "accepted for compatibility; MLX chooses Metal automatically"}),
        ("--gen-type", {"default": "monologue", "choices": ["monologue", "dialogue"]}),
        ("--speaker", {"default": "S1"}),
        ("--prompt-audio", {"default": ""}),
        ("--prompt-text", {"default": ""}),
        ("--temperature", {"type": float, "default": 0.9}),
        ("--topk", {"type": int, "default": 30}),
        ("--max-audio-ms", {"type": float, "default": 12_000}),
        ("--bf16", {"action": "store_true", "help": "ignored; compatibility with torch wrapper"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def log(msg: str) -> None:
    """Write a ``[firered-mlx]``-prefixed progress line to stderr and flush."""
    line = f"[firered-mlx] {msg}"
    print(line, file=sys.stderr, flush=True)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main() -> int:
    """Synthesize ``--text`` with the MLX token generator and the Torch codec.

    The LLM checkpoint is loaded into ``FireRedMLXLLM``; audio tokens are
    generated frame by frame and decoded to a 24 kHz WAV by ``RedCodecInfer``.

    Returns:
        A process exit code: 0 on success, 127 when a dependency cannot be
        imported, 66 when the pretrained directory is missing or empty, and 1
        for any failure while loading, generating or decoding.
    """
    args = parse_args()
    root = _repo_root()
    vendor = root / "vendor" / "FireRedTTS2"
    if vendor.exists():
        sys.path.insert(0, str(vendor))
    # The root must be importable for ``integrations.fireredtts2.mlx_llm``.
    sys.path.insert(0, str(root))

    try:
        import numpy as np
        import mlx.core as mx
        import torch
        import torchaudio
        import soundfile as sf
        from transformers import AutoTokenizer
        from fireredtts2.codec import RedCodecInfer
        from integrations.fireredtts2.mlx_llm import load_firered_mlx_from_state_dict
    except Exception as exc:
        print(f"FireRedTTS-2 MLX dependencies missing: {exc}", file=sys.stderr, flush=True)
        return 127

    # An empty value must count as "missing": Path("") is the current
    # directory, which always exists.
    resolved_dir = _resolve(root, args.pretrained_dir)
    if not resolved_dir or not Path(resolved_dir).exists():
        print(f"FireRedTTS-2 pretrained model not found: {resolved_dir}", file=sys.stderr, flush=True)
        return 66
    pretrained_dir = Path(resolved_dir)

    llm_ckpt = pretrained_dir / ("llm_pretrain.pt" if args.gen_type == "monologue" else "llm_posttrain.pt")
    codec_config = pretrained_dir / "config_codec.json"
    codec_ckpt = pretrained_dir / "codec.pt"
    qwen_path = pretrained_dir / "Qwen2.5-1.5B"
    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)

    text = args.text.strip()
    if args.gen_type == "monologue" and args.speaker and not text.startswith("["):
        text = f"[{args.speaker}]{text}"

    try:
        log("loading MLX LLM checkpoint")
        # SECURITY: weights_only=False unpickles arbitrary objects; only point
        # --pretrained-dir at checkpoints from a trusted source.
        ckpt = torch.load(str(llm_ckpt), map_location="cpu", weights_only=False)["model"]
        model = load_firered_mlx_from_state_dict(ckpt)
        model.reset_caches()
        # Drop the Torch copy of the weights before loading the codec.
        del ckpt
        log("MLX LLM loaded")

        log("loading tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(str(qwen_path))

        log("loading Torch codec")
        original_torch_load = torch.load

        def torch_load_with_map_location(*load_args, **load_kwargs):
            load_kwargs.setdefault("map_location", torch.device("cpu"))
            return original_torch_load(*load_args, **load_kwargs)

        # Patch only for the duration of the codec load, and always undo it.
        torch.load = torch_load_with_map_location
        try:
            codec = RedCodecInfer.from_pretrained(str(codec_config), str(codec_ckpt)).eval()
        finally:
            torch.load = original_torch_load
        # Keep codec on MPS if possible for decode/optional prompt encode.
        mps_backend = getattr(torch.backends, "mps", None)
        codec_device = "mps" if mps_backend is not None and mps_backend.is_available() else "cpu"
        codec = codec.to(codec_device)
        log(f"codec loaded on {codec_device}")

        # Each frame row has 17 columns: 16 audio codebook ids followed by one
        # text id; the mask marks which columns are populated.
        frame_tokens = []
        frame_masks = []
        prompt_audio = _resolve(root, args.prompt_audio)
        prompt_text = args.prompt_text or ""
        if prompt_audio and Path(prompt_audio).exists():
            log("encoding prompt audio with Torch codec")
            audio, sr = torchaudio.load(prompt_audio)
            if audio.shape[0] > 1:
                # Multi-channel prompt: keep the first channel only.
                audio = audio[0, :].unsqueeze(0)
            audio16k = torchaudio.functional.resample(audio, sr, 16000)
            audio_len = torch.tensor([audio16k.shape[1]], dtype=torch.long, device=codec_device)
            audio_tokens, _ = codec.encode(audio16k.to(codec_device), audio_len, batch_size=24)
            audio_tokens = audio_tokens.squeeze(0).detach().cpu().numpy()

            speaker = f"[{args.speaker}]" if args.speaker and not args.speaker.startswith("[") else args.speaker
            # An empty transcript yields "<|text_start|><|text_end|>".
            ptext = speaker + "<|text_start|>" + prompt_text + "<|text_end|>"
            ids = tokenizer.encode(ptext)
            tframe = mx.zeros((len(ids), 17), dtype=mx.int32)
            tmask = mx.zeros((len(ids), 17), dtype=mx.bool_)
            tframe[:, -1] = mx.array(ids, dtype=mx.int32)
            tmask[:, -1] = True
            frame_tokens.append(tframe)
            frame_masks.append(tmask)

            # add EOS frame after prompt audio
            eos = np.zeros((audio_tokens.shape[0], 1), dtype=audio_tokens.dtype)
            audio_tokens = np.concatenate([audio_tokens, eos], axis=1)
            aframe = mx.zeros((audio_tokens.shape[1], 17), dtype=mx.int32)
            amask = mx.zeros((audio_tokens.shape[1], 17), dtype=mx.bool_)
            aframe[:, :-1] = mx.array(audio_tokens.T, dtype=mx.int32)
            amask[:, :-1] = True
            frame_tokens.append(aframe)
            frame_masks.append(amask)

        log("tokenizing target text")
        # Split a leading "[tag]" off the text so it precedes <|text_start|>.
        speaker = ""
        target = text
        if text.startswith("[") and "]" in text:
            speaker = text[: text.index("]") + 1]
            target = text[text.index("]") + 1 :]
        ids = tokenizer.encode(speaker + "<|text_start|>" + target + "<|text_end|>")
        tframe = mx.zeros((len(ids), 17), dtype=mx.int32)
        tmask = mx.zeros((len(ids), 17), dtype=mx.bool_)
        tframe[:, -1] = mx.array(ids, dtype=mx.int32)
        tmask[:, -1] = True
        frame_tokens.append(tframe)
        frame_masks.append(tmask)

        curr_tokens = mx.expand_dims(mx.concatenate(frame_tokens, axis=0), 0)
        curr_mask = mx.expand_dims(mx.concatenate(frame_masks, axis=0), 0)
        mx.eval(curr_tokens, curr_mask)
        log(f"prompt frames={curr_tokens.shape[1]}")

        # The frame budget is --max-audio-ms divided by 80, at least 1.
        max_generation_len = max(1, int(args.max_audio_ms / 80))
        samples = []
        for i in range(max_generation_len):
            sample = model.generate_frame(curr_tokens, curr_mask, args.temperature, args.topk)
            mx.eval(sample)
            # An all-zero frame is the end-of-speech marker.
            if bool(mx.all(sample == 0).item()):
                log(f"eos at frame {i}")
                break
            samples.append(sample)
            # Feed only the new frame back in (audio columns set, text column
            # empty); earlier context lives in the model's backbone cache.
            zero = mx.zeros((1, 1), dtype=mx.int32)
            curr_tokens = mx.expand_dims(mx.concatenate([sample, zero], axis=1), 1)
            curr_mask = mx.expand_dims(mx.concatenate([mx.ones(sample.shape, dtype=mx.bool_), mx.zeros((1, 1), dtype=mx.bool_)], axis=1), 1)
            if i == 0 or (i + 1) % 20 == 0:
                log(f"generated frames={i+1}")
        if not samples:
            raise RuntimeError("MLX LLM produced no audio frames")

        log("decoding audio tokens with Torch codec")
        toks_np = mx.concatenate([mx.expand_dims(s, 0) for s in samples], axis=0)
        # Reorder (frames, batch, codebooks) -> (batch, codebooks, frames).
        toks_np = np.array(toks_np).transpose(1, 2, 0)
        toks = torch.from_numpy(toks_np).long().to(codec_device)
        audio = codec.decode(toks).squeeze(0).squeeze(0).detach().cpu().numpy()
        sf.write(str(output), audio, 24000)
        log(f"wrote {output}")
        return 0
    except Exception as exc:
        print(f"FireRedTTS-2 MLX synthesis failed: {exc}", file=sys.stderr, flush=True)
        return 1
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""VerbalCoding mlx-audio synthesis wrapper."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_args() -> argparse.Namespace:
    """Parse the mlx-audio wrapper's command-line options from ``sys.argv``."""
    cli = argparse.ArgumentParser(description="Synthesize speech with mlx-audio")
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = (
        ("--text", {"required": True}),
        ("--output", {"required": True}),
        ("--model", {"default": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-8bit"}),
        ("--voice", {"default": "Chelsie"}),
        ("--lang-code", {"default": "ko"}),
        ("--stream", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def newest_audio_file(directory: Path) -> Path | None:
    """Return the most recently modified audio file directly in ``directory``.

    Only ``*.wav``, ``*.mp3``, ``*.flac`` and ``*.m4a`` entries are
    considered (no recursion). Returns ``None`` when there is no match.
    """
    patterns = ("*.wav", "*.mp3", "*.flac", "*.m4a")
    found = [path for pattern in patterns for path in directory.glob(pattern)]
    if found:
        return max(found, key=lambda p: p.stat().st_mtime)
    return None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main() -> int:
    """Run ``mlx_audio.tts.generate`` in a subprocess and copy its output.

    The generator writes into a temporary directory; the newest audio file
    found there is copied to ``--output``.

    Returns:
        A process exit code: 0 on success, 127 when ``mlx_audio`` is not
        importable by this interpreter, the child's exit code (or 1) when
        generation fails, and 66 when no audio file was produced.
    """
    import importlib.util

    args = parse_args()

    # A missing module inside the child process only shows up as a generic
    # non-zero exit, so probe for it here to give the install hint.
    try:
        mlx_audio_missing = importlib.util.find_spec("mlx_audio") is None
    except (ImportError, ValueError):
        mlx_audio_missing = True
    if mlx_audio_missing:
        print("mlx-audio is not installed. Run scripts/install_mlxaudio.sh --yes first.", file=sys.stderr)
        return 127

    output = Path(args.output).expanduser()
    output.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="verbalcoding-mlxaudio-") as tmp:
        out_dir = Path(tmp)
        cmd = [
            sys.executable,
            "-m",
            "mlx_audio.tts.generate",
            "--model",
            args.model,
            "--text",
            args.text,
            "--voice",
            args.voice,
            "--lang_code",
            args.lang_code,
            "--output_path",
            str(out_dir),
            "--join_audio",
        ]
        if args.stream:
            cmd.extend(["--stream", "--save"])
        try:
            subprocess.run(cmd, check=True, text=True)
        except subprocess.CalledProcessError as exc:
            print(f"mlx-audio synthesis failed with exit {exc.returncode}", file=sys.stderr)
            return exc.returncode or 1
        # Copy before the temporary directory is removed on leaving the block.
        audio = newest_audio_file(out_dir)
        if not audio:
            print("mlx-audio did not produce an audio file", file=sys.stderr)
            return 66
        shutil.copyfile(audio, output)
    return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|