abstractvoice-0.5.1-py3-none-any.whl → abstractvoice-0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
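The headline change in 0.6.1 is a restructuring: the monolithic voice_manager.py shrinks by roughly 1,000 lines and its logic moves into the new vm/ mixin package, STT/TTS backends gain dedicated adapters/ modules, and voice cloning gets its own cloning/ subpackage (engines, store, manager). Below is a minimal import sketch based only on the module paths listed above; the public re-exports in the various __init__.py files are not part of this diff, so treat the exact import paths as assumptions.

# Hypothetical sketch: import the new cloning layer by its module paths.
# VoiceCloner and VoiceCloneStore appear in the manager.py hunk shown below;
# whether they are also re-exported from abstractvoice.cloning is not shown here.
from abstractvoice.cloning.manager import VoiceCloner
from abstractvoice.cloning.store import VoiceCloneStore

cloner = VoiceCloner(store=VoiceCloneStore(), default_engine="f5_tts")
print(cloner.list_cloned_voices())  # lists locally stored cloned-voice bundles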
abstractvoice/cloning/manager.py (new file)
@@ -0,0 +1,349 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+from .engine_f5 import F5TTSVoiceCloningEngine
+from .store import VoiceCloneStore
+
+
+class VoiceCloner:
+    """High-level voice cloning manager (optional).
+
+    Stores reference bundles locally and uses an engine to synthesize speech.
+    """
+
+    def __init__(
+        self,
+        *,
+        store: Optional[VoiceCloneStore] = None,
+        debug: bool = False,
+        whisper_model: str = "tiny",
+        reference_text_whisper_model: str = "small",
+        allow_downloads: bool = True,
+        default_engine: str = "f5_tts",
+    ):
+        self.store = store or VoiceCloneStore()
+        self.debug = debug
+        self._whisper_model = whisper_model
+        self._reference_text_whisper_model = reference_text_whisper_model
+        self._allow_downloads = bool(allow_downloads)
+        self._default_engine = str(default_engine or "f5_tts").strip().lower()
+        self._engines: Dict[str, Any] = {}
+
+    def _get_engine(self, engine: str) -> Any:
+        name = str(engine or "").strip().lower()
+        if not name:
+            raise ValueError("engine must be a non-empty string")
+        if name in self._engines:
+            return self._engines[name]
+
+        # Lazy-load engines to avoid surprise model downloads during list/store operations.
+        if name == "f5_tts":
+            inst = F5TTSVoiceCloningEngine(whisper_model=self._whisper_model, debug=self.debug)
+        elif name == "chroma":
+            from .engine_chroma import ChromaVoiceCloningEngine
+
+            inst = ChromaVoiceCloningEngine(debug=self.debug, device="auto")
+        else:
+            raise ValueError(f"Unknown cloning engine: {name}")
+
+        self._engines[name] = inst
+        return inst
+
+    def set_quality_preset(self, preset: str) -> None:
+        # Best-effort across loaded engines (new engines are lazy-instantiated).
+        for eng in list(self._engines.values()):
+            try:
+                eng.set_quality_preset(preset)
+            except Exception:
+                pass
+
+    def unload_engine(self, engine: str) -> bool:
+        """Best-effort unload a loaded engine to free memory.
+
+        This does NOT delete any cloned voices on disk; it only releases runtime
+        model weights/processors kept in memory.
+        """
+        name = str(engine or "").strip().lower()
+        if not name:
+            return False
+        inst = self._engines.pop(name, None)
+        if inst is None:
+            return False
+        try:
+            if hasattr(inst, "unload"):
+                inst.unload()
+        except Exception:
+            pass
+        return True
+
+    def unload_engines_except(self, keep_engine: str | None = None) -> int:
+        """Unload all loaded engines except `keep_engine` (if provided)."""
+        keep = str(keep_engine or "").strip().lower() or None
+        removed = 0
+        for name in list(self._engines.keys()):
+            if keep and name == keep:
+                continue
+            if self.unload_engine(name):
+                removed += 1
+        return int(removed)
+
+    def unload_all_engines(self) -> int:
+        """Unload all loaded engines."""
+        return self.unload_engines_except(None)
+
+    def get_runtime_info(self) -> Dict[str, Any]:
+        # Keep backward compatibility: return a single flat dict.
+        # Prefer F5 when available, otherwise return any loaded engine info.
+        if "f5_tts" in self._engines:
+            try:
+                return dict(self._engines["f5_tts"].runtime_info())
+            except Exception:
+                return {}
+        for eng in self._engines.values():
+            try:
+                return dict(eng.runtime_info())
+            except Exception:
+                continue
+        return {}
+
+    def clone_voice(
+        self,
+        reference_audio_path: str,
+        name: str | None = None,
+        *,
+        reference_text: str | None = None,
+        engine: str | None = None,
+    ) -> str:
+        """Create a new cloned voice from a file or directory.
+
+        If a directory is provided, all WAV/FLAC/OGG files inside are used.
+        """
+        p = Path(reference_audio_path)
+        if not p.exists():
+            raise FileNotFoundError(str(p))
+
+        supported = {".wav", ".flac", ".ogg"}
+
+        engine_name = str(engine or self._default_engine).strip().lower()
+        if engine_name not in ("f5_tts", "chroma"):
+            raise ValueError("engine must be one of: f5_tts|chroma")
+
+        if p.is_dir():
+            refs = sorted([x for x in p.glob("*") if x.suffix.lower() in supported])
+            if not refs:
+                raise ValueError(f"No supported reference audio files found in: {p}")
+        else:
+            if p.suffix.lower() not in supported:
+                raise ValueError(
+                    f"Unsupported reference audio format: {p.suffix}. "
+                    f"Provide WAV/FLAC/OGG (got: {p})."
+                )
+            refs = [p]
+
+        if engine_name == "chroma" and len(refs) != 1:
+            raise ValueError(
+                "Chroma cloning currently supports exactly one reference audio file.\n"
+                "Provide a single WAV/FLAC/OGG file (not a directory with multiple files)."
+            )
+
+        voice_id = self.store.create_voice(
+            refs,
+            name=name,
+            reference_text=reference_text,
+            engine=engine_name,
+            meta={"source": str(p)},
+        )
+        return voice_id
+
+    def list_cloned_voices(self) -> List[Dict[str, Any]]:
+        return self.store.list_voices()
+
+    def get_cloned_voice(self, voice_id: str) -> Dict[str, Any]:
+        return self.store.get_voice_dict(voice_id)
+
+    def export_voice(self, voice_id: str, path: str) -> str:
+        return self.store.export_voice(voice_id, path)
+
+    def import_voice(self, path: str) -> str:
+        return self.store.import_voice(path)
+
+    def rename_cloned_voice(self, voice_id: str, new_name: str) -> None:
+        self.store.rename_voice(voice_id, new_name)
+
+    def delete_cloned_voice(self, voice_id: str) -> None:
+        self.store.delete_voice(voice_id)
+
+    def set_reference_text(self, voice_id: str, reference_text: str) -> None:
+        self.store.set_reference_text(voice_id, reference_text, source="manual")
+
+    def _ensure_reference_text(self, voice_id: str) -> str:
+        voice = self.store.get_voice(voice_id)
+        if (voice.reference_text or "").strip():
+            return str(voice.reference_text).strip()
+
+        # One-time fallback: transcribe reference audio and persist.
+        ref_paths = self.store.resolve_reference_paths(voice_id)
+
+        # Use a slightly larger model by default for better transcript quality.
+        from ..adapters.stt_faster_whisper import FasterWhisperAdapter
+        import numpy as np
+        import soundfile as sf
+
+        # Build a short mono float32 clip (<= 30s) at 16k for STT.
+        # Chroma-style prompting can benefit from longer reference transcripts; this
+        # is a one-time cost per cloned voice.
+        max_seconds = 30.0
+        target_sr = 16000
+        merged = []
+        for p in ref_paths:
+            audio, sr = sf.read(str(p), always_2d=True, dtype="float32")
+            mono = np.mean(audio, axis=1).astype(np.float32)
+            # simple linear resample (avoid extra deps)
+            from ..audio.resample import linear_resample_mono
+
+            mono = linear_resample_mono(mono, int(sr), target_sr)
+            merged.append(mono)
+        clip = np.concatenate(merged) if merged else np.zeros((0,), dtype=np.float32)
+        clip = clip[: int(target_sr * max_seconds)]
+
+        stt = FasterWhisperAdapter(
+            model_size=self._reference_text_whisper_model,
+            device="cpu",
+            compute_type="int8",
+            allow_downloads=bool(self._allow_downloads),
+        )
+        if not stt.is_available():
+            raise RuntimeError(
+                "This cloned voice has no stored reference_text.\n"
+                "Auto-fallback requires a cached STT model, but downloads are disabled.\n"
+                "Fix options:\n"
+                " - Prefetch outside the REPL: abstractvoice-prefetch --stt small\n"
+                " - Or set it manually: /clone_set_ref_text <id> \"...\""
+            )
+        import os
+        import tempfile
+
+        # Use file-based STT (stronger decoding settings in the adapter) for better transcript quality.
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        tmp_path = tmp.name
+        tmp.close()
+        try:
+            sf.write(str(tmp_path), clip, int(target_sr), format="WAV", subtype="PCM_16")
+
+            # 3-pass ASR consensus: reduces occasional non-determinism / decoding instability.
+            def _normalize_ref_text(s: str) -> str:
+                s = " ".join(str(s or "").strip().split())
+                if s and not (s.endswith(".") or s.endswith("!") or s.endswith("?") or s.endswith("。")):
+                    s = s + "."
+                return s
+
+            def _edit_distance(a: str, b: str) -> int:
+                # Levenshtein distance (iterative DP, O(len(a)*len(b))).
+                a = str(a or "")
+                b = str(b or "")
+                if a == b:
+                    return 0
+                if not a:
+                    return len(b)
+                if not b:
+                    return len(a)
+                # Ensure `b` is the longer string to keep the inner list small.
+                if len(a) > len(b):
+                    a, b = b, a
+                prev = list(range(len(b) + 1))
+                for i, ca in enumerate(a, start=1):
+                    cur = [i]
+                    for j, cb in enumerate(b, start=1):
+                        ins = cur[j - 1] + 1
+                        dele = prev[j] + 1
+                        sub = prev[j - 1] + (0 if ca == cb else 1)
+                        cur.append(min(ins, dele, sub))
+                    prev = cur
+                return int(prev[-1])
+
+            candidates: List[str] = []
+            for _ in range(3):
+                t = (stt.transcribe(str(tmp_path)) or "").strip()
+                candidates.append(_normalize_ref_text(t))
+
+            # Majority vote on normalized candidates.
+            counts: Dict[str, int] = {}
+            for c in candidates:
+                counts[c] = counts.get(c, 0) + 1
+            best = ""
+            best_n = -1
+            for c, n in counts.items():
+                if n > best_n:
+                    best = c
+                    best_n = int(n)
+
+            # No majority: choose the closest candidate (consensus by edit distance).
+            if best_n <= 1 and candidates:
+                best_sum = None
+                best_c = ""
+                for i, c in enumerate(candidates):
+                    s = 0
+                    for j, other in enumerate(candidates):
+                        if j == i:
+                            continue
+                        s += _edit_distance(c, other)
+                    if best_sum is None or s < best_sum:
+                        best_sum = int(s)
+                        best_c = c
+                best = best_c
+
+            best = _normalize_ref_text(best)
+            if not best.strip():
+                raise RuntimeError(
+                    "Failed to auto-generate reference_text from the reference audio.\n"
+                    "Fix options:\n"
+                    " - Provide a clearer 6–10s reference sample\n"
+                    " - Or set it manually: /clone_set_ref_text <id> \"...\""
+                )
+        finally:
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
+
+        # Persist so we never re-transcribe for this voice.
+        self.store.set_reference_text(voice_id, best, source="asr")
+        return best
+
+    def speak_to_bytes(self, text: str, *, voice_id: str, format: str = "wav", speed: Optional[float] = None) -> bytes:
+        if format.lower() != "wav":
+            raise ValueError("Voice cloning currently supports WAV output only.")
+
+        voice = self.store.get_voice(voice_id)
+        # Best-effort: normalize stored references (e.g. MP3-in-WAV) to avoid noisy
+        # native decoder stderr output during synthesis.
+        try:
+            self.store.normalize_reference_audio(voice_id)
+        except Exception:
+            pass
+        ref_paths = self.store.resolve_reference_paths(voice_id)
+        ref_text = self._ensure_reference_text(voice_id)
+        eng = self._get_engine(getattr(voice, "engine", None) or "f5_tts")
+        return eng.infer_to_wav_bytes(text=text, reference_paths=ref_paths, reference_text=ref_text, speed=speed)
+
+    def speak_to_audio_chunks(
+        self,
+        text: str,
+        *,
+        voice_id: str,
+        speed: Optional[float] = None,
+        max_chars: int = 120,
+    ):
+        voice = self.store.get_voice(voice_id)
+        try:
+            self.store.normalize_reference_audio(voice_id)
+        except Exception:
+            pass
+        ref_paths = self.store.resolve_reference_paths(voice_id)
+        ref_text = self._ensure_reference_text(voice_id)
+        eng = self._get_engine(getattr(voice, "engine", None) or "f5_tts")
+        return eng.infer_to_audio_chunks(
+            text=text, reference_paths=ref_paths, reference_text=ref_text, speed=speed, max_chars=int(max_chars)
+        )