abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
5
|
+
|
|
6
|
+
from .engine_f5 import F5TTSVoiceCloningEngine
|
|
7
|
+
from .store import VoiceCloneStore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class VoiceCloner:
    """High-level voice cloning manager (optional).

    Stores reference bundles locally (through ``self.store``) and uses a
    lazily instantiated cloning engine to synthesize speech.
    """

    # Engine identifiers accepted by clone_voice() and _get_engine().
    _SUPPORTED_ENGINES = ("f5_tts", "chroma")
    # File extensions accepted as reference audio.
    _REFERENCE_SUFFIXES = frozenset({".wav", ".flac", ".ogg"})
    # Auto-transcription of reference audio: clip length, sample rate, passes.
    _REF_STT_MAX_SECONDS = 30.0
    _REF_STT_SAMPLE_RATE = 16000
    _REF_STT_PASSES = 3
    # Raised when no usable transcript could be produced from the reference.
    _REF_TEXT_FAILURE_MESSAGE = (
        "Failed to auto-generate reference_text from the reference audio.\n"
        "Fix options:\n"
        "  - Provide a clearer 6–10s reference sample\n"
        "  - Or set it manually: /clone_set_ref_text <id> \"...\""
    )

    def __init__(
        self,
        *,
        store: Optional[VoiceCloneStore] = None,
        debug: bool = False,
        whisper_model: str = "tiny",
        reference_text_whisper_model: str = "small",
        allow_downloads: bool = True,
        default_engine: str = "f5_tts",
    ):
        """Create a cloner.

        Args:
            store: Voice store to use; a default ``VoiceCloneStore()`` is
                created when omitted.
            debug: Forwarded to the engines when they are instantiated.
            whisper_model: Forwarded to the F5 engine.
            reference_text_whisper_model: STT model size used for the one-time
                automatic transcription of reference audio.
            allow_downloads: Forwarded to the STT adapter used for that
                transcription.
            default_engine: Engine used by ``clone_voice`` when none is given.
        """
        # `is None` (not truthiness) so a valid-but-falsy store is respected.
        self.store = store if store is not None else VoiceCloneStore()
        self.debug = debug
        self._whisper_model = whisper_model
        self._reference_text_whisper_model = reference_text_whisper_model
        self._allow_downloads = bool(allow_downloads)
        self._default_engine = str(default_engine or "f5_tts").strip().lower()
        # Engine instances keyed by normalized engine name (filled lazily).
        self._engines: Dict[str, Any] = {}

    @staticmethod
    def _normalize_engine_name(engine: Any) -> str:
        """Return `engine` as a stripped, lower-cased string ('' for None/empty)."""
        return str(engine or "").strip().lower()

    def _get_engine(self, engine: str) -> Any:
        """Return the (cached) engine instance for `engine`, creating it on first use.

        Raises:
            ValueError: If `engine` is empty or not a known engine name.
        """
        name = self._normalize_engine_name(engine)
        if not name:
            raise ValueError("engine must be a non-empty string")
        cached = self._engines.get(name)
        if cached is not None:
            return cached

        # Lazy-load engines to avoid surprise model downloads during list/store operations.
        if name == "f5_tts":
            inst = F5TTSVoiceCloningEngine(whisper_model=self._whisper_model, debug=self.debug)
        elif name == "chroma":
            from .engine_chroma import ChromaVoiceCloningEngine

            inst = ChromaVoiceCloningEngine(debug=self.debug, device="auto")
        else:
            raise ValueError(f"Unknown cloning engine: {name}")

        self._engines[name] = inst
        return inst

    def set_quality_preset(self, preset: str) -> None:
        """Forward `preset` to every engine that is currently loaded.

        Best-effort: an engine that rejects the preset (or lacks the method) is
        skipped. Engines instantiated later are not affected by this call.
        """
        for eng in list(self._engines.values()):
            try:
                eng.set_quality_preset(preset)
            except Exception:
                # Deliberate best-effort: one engine must not block the others.
                pass

    def unload_engine(self, engine: str) -> bool:
        """Best-effort unload a loaded engine to free memory.

        This does NOT delete any cloned voices on disk; it only releases runtime
        model weights/processors kept in memory.

        Returns:
            True if an engine with that name was loaded and has been dropped,
            False otherwise (including for an empty name).
        """
        name = self._normalize_engine_name(engine)
        if not name:
            return False
        inst = self._engines.pop(name, None)
        if inst is None:
            return False
        try:
            if hasattr(inst, "unload"):
                inst.unload()
        except Exception:
            # The instance is already removed from the cache; a failing
            # unload() hook must not make the call look unsuccessful.
            pass
        return True

    def unload_engines_except(self, keep_engine: str | None = None) -> int:
        """Unload all loaded engines except `keep_engine` (if provided).

        Returns:
            The number of engines that were unloaded.
        """
        keep = self._normalize_engine_name(keep_engine) or None
        removed = 0
        # Iterate over a snapshot: unload_engine() mutates self._engines.
        for name in list(self._engines):
            if keep and name == keep:
                continue
            if self.unload_engine(name):
                removed += 1
        return removed

    def unload_all_engines(self) -> int:
        """Unload all loaded engines and return how many were unloaded."""
        return self.unload_engines_except(None)

    def get_runtime_info(self) -> Dict[str, Any]:
        """Return runtime info of one loaded engine as a single flat dict.

        Kept flat for backward compatibility. F5 is preferred when loaded (an
        error from it yields ``{}``); otherwise the first loaded engine whose
        ``runtime_info()`` succeeds is used. Returns ``{}`` if none is loaded.
        """
        if "f5_tts" in self._engines:
            try:
                return dict(self._engines["f5_tts"].runtime_info())
            except Exception:
                return {}
        for eng in self._engines.values():
            try:
                return dict(eng.runtime_info())
            except Exception:
                continue
        return {}

    def clone_voice(
        self,
        reference_audio_path: str,
        name: str | None = None,
        *,
        reference_text: str | None = None,
        engine: str | None = None,
    ) -> str:
        """Create a new cloned voice from a file or directory.

        If a directory is provided, all WAV/FLAC/OGG files directly inside it
        are used (sorted by path; sub-directories are ignored).

        Args:
            reference_audio_path: Audio file, or directory of audio files.
            name: Optional name stored with the voice.
            reference_text: Optional transcript of the reference audio.
            engine: "f5_tts" or "chroma"; defaults to the configured engine.

        Returns:
            The voice id returned by the store.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: For an unknown engine, an unsupported file format, a
                directory without supported audio, or more than one reference
                file with the chroma engine.
        """
        p = Path(reference_audio_path)
        if not p.exists():
            raise FileNotFoundError(str(p))

        supported = self._REFERENCE_SUFFIXES

        engine_name = str(engine or self._default_engine).strip().lower()
        if engine_name not in self._SUPPORTED_ENGINES:
            raise ValueError("engine must be one of: f5_tts|chroma")

        if p.is_dir():
            # is_file() keeps directories that merely look like audio files
            # (e.g. "takes.wav/") out of the reference list.
            refs = sorted(x for x in p.glob("*") if x.is_file() and x.suffix.lower() in supported)
            if not refs:
                raise ValueError(f"No supported reference audio files found in: {p}")
        else:
            if p.suffix.lower() not in supported:
                raise ValueError(
                    f"Unsupported reference audio format: {p.suffix}. "
                    f"Provide WAV/FLAC/OGG (got: {p})."
                )
            refs = [p]

        if engine_name == "chroma" and len(refs) != 1:
            raise ValueError(
                "Chroma cloning currently supports exactly one reference audio file.\n"
                "Provide a single WAV/FLAC/OGG file (not a directory with multiple files)."
            )

        return self.store.create_voice(
            refs,
            name=name,
            reference_text=reference_text,
            engine=engine_name,
            meta={"source": str(p)},
        )

    def list_cloned_voices(self) -> List[Dict[str, Any]]:
        """Return the store's list of voices."""
        return self.store.list_voices()

    def get_cloned_voice(self, voice_id: str) -> Dict[str, Any]:
        """Return the store's dict representation of `voice_id`."""
        return self.store.get_voice_dict(voice_id)

    def export_voice(self, voice_id: str, path: str) -> str:
        """Export `voice_id` to `path` via the store and return its result."""
        return self.store.export_voice(voice_id, path)

    def import_voice(self, path: str) -> str:
        """Import a voice from `path` via the store and return its result."""
        return self.store.import_voice(path)

    def rename_cloned_voice(self, voice_id: str, new_name: str) -> None:
        """Rename `voice_id` in the store."""
        self.store.rename_voice(voice_id, new_name)

    def delete_cloned_voice(self, voice_id: str) -> None:
        """Delete `voice_id` from the store."""
        self.store.delete_voice(voice_id)

    def set_reference_text(self, voice_id: str, reference_text: str) -> None:
        """Store a manually provided reference transcript for `voice_id`."""
        self.store.set_reference_text(voice_id, reference_text, source="manual")

    @staticmethod
    def _normalize_ref_text(s: str) -> str:
        """Collapse whitespace and make sure non-empty text ends with punctuation."""
        s = " ".join(str(s or "").strip().split())
        if s and not s.endswith((".", "!", "?", "。")):
            s = s + "."
        return s

    @staticmethod
    def _edit_distance(a: str, b: str) -> int:
        """Return the Levenshtein distance between `a` and `b`.

        Iterative DP, O(len(a) * len(b)) time, O(min(len(a), len(b))) memory.
        """
        a = str(a or "")
        b = str(b or "")
        if a == b:
            return 0
        if not a:
            return len(b)
        if not b:
            return len(a)
        # Make `b` the shorter string: the DP rows are sized by len(b).
        if len(b) > len(a):
            a, b = b, a
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, start=1):
            cur = [i]
            for j, cb in enumerate(b, start=1):
                ins = cur[j - 1] + 1
                dele = prev[j] + 1
                sub = prev[j - 1] + (0 if ca == cb else 1)
                cur.append(min(ins, dele, sub))
            prev = cur
        return int(prev[-1])

    @staticmethod
    def _pick_consensus(candidates: List[str]) -> str:
        """Pick one transcript out of several ASR passes.

        A candidate that occurs more than once wins (first one seen on ties).
        If every candidate is unique, the one with the smallest summed edit
        distance to the others is chosen. Returns '' for an empty list.
        """
        if not candidates:
            return ""
        counts: Dict[str, int] = {}
        for c in candidates:
            counts[c] = counts.get(c, 0) + 1
        # max() keeps the first maximal entry, i.e. the earliest candidate.
        top, top_n = max(counts.items(), key=lambda kv: kv[1])
        if top_n > 1:
            return top

        def total_distance(i: int) -> int:
            return sum(
                VoiceCloner._edit_distance(candidates[i], other)
                for j, other in enumerate(candidates)
                if j != i
            )

        return candidates[min(range(len(candidates)), key=total_distance)]

    def _load_reference_clip(self, ref_paths: Iterable[Any]) -> Any:
        """Build a mono float32 clip (<= 30 s at 16 kHz) from the reference files.

        Files are read in order, down-mixed by averaging channels, resampled
        and concatenated; reading stops as soon as the clip is long enough.
        """
        import numpy as np
        import soundfile as sf

        # simple linear resample (avoid extra deps)
        from ..audio.resample import linear_resample_mono

        target_sr = self._REF_STT_SAMPLE_RATE
        max_samples = int(target_sr * self._REF_STT_MAX_SECONDS)
        pieces = []
        collected = 0
        for path in ref_paths:
            if collected >= max_samples:
                # Anything further would be cut off by the truncation below.
                break
            audio, sr = sf.read(str(path), always_2d=True, dtype="float32")
            mono = np.mean(audio, axis=1).astype(np.float32)
            mono = linear_resample_mono(mono, int(sr), target_sr)
            pieces.append(mono)
            collected += len(mono)
        if not pieces:
            return np.zeros((0,), dtype=np.float32)
        return np.concatenate(pieces)[:max_samples]

    def _ensure_reference_text(self, voice_id: str) -> str:
        """Return the reference transcript of `voice_id`, generating it if missing.

        A stored transcript is returned as-is (stripped). Otherwise the
        reference audio is transcribed once with the STT adapter (several
        passes, consensus pick) and the result is persisted in the store.

        Raises:
            RuntimeError: If the STT adapter reports it is unavailable, or if
                no non-empty transcript could be produced.
        """
        voice = self.store.get_voice(voice_id)
        if (voice.reference_text or "").strip():
            return str(voice.reference_text).strip()

        # One-time fallback: transcribe reference audio and persist.
        ref_paths = self.store.resolve_reference_paths(voice_id)

        # Use a slightly larger model by default for better transcript quality.
        from ..adapters.stt_faster_whisper import FasterWhisperAdapter
        import os
        import tempfile

        import soundfile as sf

        # Chroma-style prompting can benefit from longer reference transcripts;
        # this is a one-time cost per cloned voice.
        clip = self._load_reference_clip(ref_paths)

        stt = FasterWhisperAdapter(
            model_size=self._reference_text_whisper_model,
            device="cpu",
            compute_type="int8",
            allow_downloads=bool(self._allow_downloads),
        )
        if not stt.is_available():
            raise RuntimeError(
                "This cloned voice has no stored reference_text.\n"
                "Auto-fallback requires a cached STT model, but downloads are disabled.\n"
                "Fix options:\n"
                # Name the model that is actually configured, not a fixed one.
                f"  - Prefetch outside the REPL: abstractvoice-prefetch --stt {self._reference_text_whisper_model}\n"
                "  - Or set it manually: /clone_set_ref_text <id> \"...\""
            )
        if len(clip) == 0:
            # Nothing to transcribe: fail now instead of running ASR on an empty file.
            raise RuntimeError(self._REF_TEXT_FAILURE_MESSAGE)

        # Use file-based STT (stronger decoding settings in the adapter) for better transcript quality.
        # delete=False + explicit close so the path can be reopened by name.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp.name
        tmp.close()
        try:
            sf.write(str(tmp_path), clip, int(self._REF_STT_SAMPLE_RATE), format="WAV", subtype="PCM_16")

            # Multi-pass ASR consensus: reduces occasional non-determinism / decoding instability.
            candidates: List[str] = [
                self._normalize_ref_text((stt.transcribe(str(tmp_path)) or "").strip())
                for _ in range(self._REF_STT_PASSES)
            ]
            best = self._pick_consensus(candidates)
            if not best.strip():
                raise RuntimeError(self._REF_TEXT_FAILURE_MESSAGE)
        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Cleanup is best-effort; a leftover temp file is harmless.
                pass

        # Persist so we never re-transcribe for this voice.
        self.store.set_reference_text(voice_id, best, source="asr")
        return best

    def _prepare_synthesis(self, voice_id: str) -> tuple[Any, Any, str]:
        """Resolve what synthesis needs: (engine, reference paths, reference text)."""
        voice = self.store.get_voice(voice_id)
        # Best-effort: normalize stored references (e.g. MP3-in-WAV) to avoid noisy
        # native decoder stderr output during synthesis.
        try:
            self.store.normalize_reference_audio(voice_id)
        except Exception:
            pass
        ref_paths = self.store.resolve_reference_paths(voice_id)
        ref_text = self._ensure_reference_text(voice_id)
        # Voices without a recorded engine fall back to F5.
        eng = self._get_engine(getattr(voice, "engine", None) or "f5_tts")
        return eng, ref_paths, ref_text

    def speak_to_bytes(self, text: str, *, voice_id: str, format: str = "wav", speed: Optional[float] = None) -> bytes:
        """Synthesize `text` with the cloned voice and return WAV bytes.

        Raises:
            ValueError: If `format` is anything other than "wav" (case-insensitive).
        """
        if format.lower() != "wav":
            raise ValueError("Voice cloning currently supports WAV output only.")

        eng, ref_paths, ref_text = self._prepare_synthesis(voice_id)
        return eng.infer_to_wav_bytes(text=text, reference_paths=ref_paths, reference_text=ref_text, speed=speed)

    def speak_to_audio_chunks(
        self,
        text: str,
        *,
        voice_id: str,
        speed: Optional[float] = None,
        max_chars: int = 120,
    ):
        """Synthesize `text` with the cloned voice and return the engine's audio chunks.

        `max_chars` is forwarded to the engine as an int.
        """
        eng, ref_paths, ref_text = self._prepare_synthesis(voice_id)
        return eng.infer_to_audio_chunks(
            text=text, reference_paths=ref_paths, reference_text=ref_text, speed=speed, max_chars=int(max_chars)
        )
|