abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,362 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import shutil
6
+ import tempfile
7
+ import time
8
+ import threading
9
+ import uuid
10
+ from dataclasses import dataclass, asdict
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Iterable, List, Optional
13
+
14
+ import appdirs
15
+
16
# Redirecting fd=2 is process-wide, so only one redirect may be active at a time.
_STDERR_FD_LOCK = threading.Lock()


class _SilenceStderrFD:
    """Temporarily redirect OS-level stderr (fd=2) to /dev/null.

    Some native decoders (e.g. mpg123 via libsndfile) write directly to fd=2,
    bypassing Python's sys.stderr. We use this to keep interactive CLI output
    clean when decoding odd inputs like MP3-in-WAV.

    Entries are serialized through ``_STDERR_FD_LOCK``. Setup is best-effort:
    if the redirect cannot be installed, the ``with`` body still runs, just
    without stderr being silenced. Exceptions from the body are never
    suppressed.
    """

    def __enter__(self):
        self._lock = _STDERR_FD_LOCK
        self._lock.acquire()
        self._lock_held = True
        self._devnull_fd = None
        self._saved_stderr_fd = None
        try:
            self._devnull_fd = os.open(os.devnull, os.O_WRONLY)
            self._saved_stderr_fd = os.dup(2)
            os.dup2(self._devnull_fd, 2)
        except Exception:
            # Undo whatever was set up. The `with` statement will call
            # __exit__ again afterwards; that second call must be a no-op.
            self.__exit__(None, None, None)
        return self

    def __exit__(self, exc_type, exc, tb):
        # Take ownership of the state and clear it *before* cleaning up, so a
        # repeated call never touches fd numbers that were already closed
        # (and possibly reused) and never releases the lock twice.
        saved_fd = getattr(self, "_saved_stderr_fd", None)
        devnull_fd = getattr(self, "_devnull_fd", None)
        lock_held = getattr(self, "_lock_held", False)
        self._saved_stderr_fd = None
        self._devnull_fd = None
        self._lock_held = False

        try:
            if saved_fd is not None:
                try:
                    os.dup2(saved_fd, 2)  # restore the real stderr
                except Exception:
                    pass
        finally:
            for fd in (saved_fd, devnull_fd):
                if fd is None:
                    continue
                try:
                    os.close(fd)
                except Exception:
                    pass
            if lock_held:
                try:
                    self._lock.release()
                except Exception:
                    pass
        return False
63
+
64
+
65
@dataclass(frozen=True)
class ClonedVoice:
    """Immutable metadata record describing one cloned voice.

    Instances are built from (and serialized back to) the plain dicts kept in
    the store's JSON index, so every field must stay JSON-serializable.
    """

    # Identifier of the voice; also the name of its directory in the store.
    voice_id: str
    # Human-readable label.
    name: str
    # Creation timestamp as a float (the store uses it to sort newest first).
    created_at: float
    reference_files: List[str]  # relative to voice directory
    # Text associated with the reference audio, when known.
    reference_text: Optional[str] = None
    # Name of the cloning engine this voice targets.
    engine: str = "f5_tts"
    # Free-form extra metadata. Defaults to None (not an empty dict), so
    # readers should use `meta or {}`.
    meta: Optional[Dict[str, Any]] = None
74
+
75
+
76
+ class VoiceCloneStore:
77
+ """Stores cloned-voice metadata + reference audio bundles locally.
78
+
79
+ Design principles:
80
+ - Keep the storage format portable and engine-agnostic.
81
+ - Avoid embedding binary blobs in JSON; store files on disk.
82
+ """
83
+
84
+ def __init__(self, base_dir: Optional[str | Path] = None):
85
+ if base_dir is None:
86
+ root = Path(appdirs.user_data_dir("abstractvoice"))
87
+ self._base_dir = root / "cloned_voices"
88
+ else:
89
+ self._base_dir = Path(base_dir)
90
+ self._base_dir.mkdir(parents=True, exist_ok=True)
91
+
92
+ self._index_path = self._base_dir / "index.json"
93
+ if not self._index_path.exists():
94
+ self._write_index({})
95
+
96
+ def _read_index(self) -> Dict[str, Any]:
97
+ try:
98
+ return json.loads(self._index_path.read_text(encoding="utf-8"))
99
+ except Exception:
100
+ return {}
101
+
102
+ def _write_index(self, data: Dict[str, Any]) -> None:
103
+ self._index_path.write_text(json.dumps(data, indent=2, sort_keys=True), encoding="utf-8")
104
+
105
+ def _voice_dir(self, voice_id: str) -> Path:
106
+ return self._base_dir / voice_id
107
+
108
+ def resolve_reference_paths(self, voice_id: str) -> List[Path]:
109
+ voice = self.get_voice(voice_id)
110
+ vdir = self._voice_dir(voice.voice_id)
111
+ return [vdir / rel for rel in voice.reference_files]
112
+
113
+ def normalize_reference_audio(self, voice_id: str) -> int:
114
+ """Best-effort normalize stored references for a voice.
115
+
116
+ Currently converts WAV files that are actually MPEG-compressed (MP3-in-WAV)
117
+ into standard PCM16 WAVs. This prevents native decoder warnings like:
118
+ "Illegal Audio-MPEG-Header ... Trying to resync ..."
119
+ from polluting interactive CLI output during cloning synthesis.
120
+ """
121
+ try:
122
+ voice = self.get_voice(voice_id)
123
+ except Exception:
124
+ return 0
125
+
126
+ vdir = self._voice_dir(voice.voice_id)
127
+ converted = 0
128
+ for rel in (voice.reference_files or []):
129
+ p = vdir / str(rel)
130
+ try:
131
+ if self._normalize_wav_mpeg_to_pcm_inplace(p):
132
+ converted += 1
133
+ except Exception:
134
+ # Normalization is best-effort; inference can still attempt decode.
135
+ continue
136
+ return converted
137
+
138
+ def _normalize_wav_mpeg_to_pcm_inplace(self, path: Path) -> bool:
139
+ if path.suffix.lower() != ".wav":
140
+ return False
141
+ if not path.exists():
142
+ return False
143
+ try:
144
+ import soundfile as sf
145
+ except Exception:
146
+ return False
147
+
148
+ try:
149
+ info = sf.info(str(path))
150
+ except Exception:
151
+ return False
152
+
153
+ # Example: format=WAV subtype=MPEG_LAYER_III
154
+ fmt = str(getattr(info, "format", "") or "").strip().upper()
155
+ subtype = str(getattr(info, "subtype", "") or "").strip().upper()
156
+ if fmt != "WAV" or not subtype.startswith("MPEG"):
157
+ return False
158
+
159
+ # Decode once (silencing native decoder stderr), then rewrite as PCM16 WAV.
160
+ with _SilenceStderrFD():
161
+ audio, sr = sf.read(str(path), always_2d=True, dtype="float32")
162
+
163
+ tmp = tempfile.NamedTemporaryFile(dir=str(path.parent), suffix=".wav", delete=False)
164
+ tmp_path = Path(tmp.name)
165
+ tmp.close()
166
+ try:
167
+ sf.write(str(tmp_path), audio, int(sr), format="WAV", subtype="PCM_16")
168
+ tmp_path.replace(path)
169
+ finally:
170
+ try:
171
+ tmp_path.unlink(missing_ok=True) # type: ignore[arg-type]
172
+ except Exception:
173
+ pass
174
+ return True
175
+
176
+ def create_voice(
177
+ self,
178
+ reference_paths: Iterable[str | Path],
179
+ *,
180
+ name: Optional[str] = None,
181
+ reference_text: Optional[str] = None,
182
+ engine: str = "f5_tts",
183
+ meta: Optional[Dict[str, Any]] = None,
184
+ ) -> str:
185
+ paths = [Path(p) for p in reference_paths]
186
+ if not paths:
187
+ raise ValueError("reference_paths must contain at least one file")
188
+ for p in paths:
189
+ if not p.exists():
190
+ raise FileNotFoundError(str(p))
191
+ if p.is_dir():
192
+ raise ValueError(f"Reference path must be a file, got directory: {p}")
193
+
194
+ voice_id = uuid.uuid4().hex
195
+ vdir = self._voice_dir(voice_id)
196
+ vdir.mkdir(parents=True, exist_ok=True)
197
+
198
+ copied: List[str] = []
199
+ for i, p in enumerate(paths):
200
+ dest = vdir / f"ref_{i}{p.suffix.lower()}"
201
+ if p.suffix.lower() == ".wav":
202
+ # If the WAV container is actually MPEG-compressed, normalize to PCM16 WAV
203
+ # to avoid noisy mpg123 "resync" messages later during synthesis.
204
+ try:
205
+ import soundfile as sf
206
+
207
+ info = sf.info(str(p))
208
+ fmt = str(getattr(info, "format", "") or "").strip().upper()
209
+ subtype = str(getattr(info, "subtype", "") or "").strip().upper()
210
+ if fmt == "WAV" and subtype.startswith("MPEG"):
211
+ with _SilenceStderrFD():
212
+ audio, sr = sf.read(str(p), always_2d=True, dtype="float32")
213
+ sf.write(str(dest), audio, int(sr), format="WAV", subtype="PCM_16")
214
+ copied.append(dest.name)
215
+ continue
216
+ except Exception:
217
+ # Fall back to raw copy; synthesis may still attempt decode.
218
+ pass
219
+
220
+ shutil.copy2(p, dest)
221
+ copied.append(dest.name)
222
+
223
+ meta_out = dict(meta or {})
224
+ if (reference_text or "").strip() and not meta_out.get("reference_text_source"):
225
+ # Keep metadata consistent with `set_reference_text(..., source="manual")`.
226
+ meta_out["reference_text_source"] = "manual"
227
+
228
+ record = ClonedVoice(
229
+ voice_id=voice_id,
230
+ name=name or f"voice_{voice_id[:8]}",
231
+ created_at=time.time(),
232
+ reference_files=copied,
233
+ reference_text=reference_text,
234
+ engine=engine,
235
+ meta=meta_out,
236
+ )
237
+
238
+ index = self._read_index()
239
+ index[voice_id] = asdict(record)
240
+ self._write_index(index)
241
+ return voice_id
242
+
243
+ def get_voice(self, voice_id: str) -> ClonedVoice:
244
+ index = self._read_index()
245
+ if voice_id not in index:
246
+ raise KeyError(f"Unknown voice_id: {voice_id}")
247
+ data = index[voice_id]
248
+ return ClonedVoice(**data)
249
+
250
+ def get_voice_dict(self, voice_id: str) -> Dict[str, Any]:
251
+ """Return the stored voice record as a JSON-serializable dict."""
252
+ v = self.get_voice(voice_id)
253
+ return {"voice_id": voice_id, **asdict(v)}
254
+
255
+ def list_voices(self) -> List[Dict[str, Any]]:
256
+ index = self._read_index()
257
+ out: List[Dict[str, Any]] = []
258
+ for voice_id, data in index.items():
259
+ out.append({"voice_id": voice_id, **data})
260
+ # newest first
261
+ out.sort(key=lambda d: float(d.get("created_at", 0)), reverse=True)
262
+ return out
263
+
264
+ def set_reference_text(self, voice_id: str, reference_text: str, *, source: str | None = None) -> None:
265
+ """Set (or replace) the stored reference text for a cloned voice.
266
+
267
+ This matters a lot for cloning quality: if reference_text is garbled,
268
+ the model often produces artifacts (wrong words bleeding into output).
269
+ """
270
+ index = self._read_index()
271
+ if voice_id not in index:
272
+ raise KeyError(f"Unknown voice_id: {voice_id}")
273
+ data = dict(index[voice_id])
274
+ data["reference_text"] = str(reference_text or "")
275
+ if source:
276
+ meta = dict(data.get("meta") or {})
277
+ meta["reference_text_source"] = str(source)
278
+ data["meta"] = meta
279
+ index[voice_id] = data
280
+ self._write_index(index)
281
+
282
+ def export_voice(self, voice_id: str, path: str | Path) -> str:
283
+ """Export a voice bundle as a zip archive."""
284
+ import zipfile
285
+
286
+ voice = self.get_voice(voice_id)
287
+ vdir = self._voice_dir(voice_id)
288
+ if not vdir.exists():
289
+ raise FileNotFoundError(str(vdir))
290
+
291
+ out_path = Path(path)
292
+ if out_path.suffix.lower() != ".zip":
293
+ out_path = out_path.with_suffix(".zip")
294
+
295
+ with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
296
+ z.writestr("voice.json", json.dumps(asdict(voice), indent=2, sort_keys=True))
297
+ for rel in voice.reference_files:
298
+ fp = vdir / rel
299
+ z.write(fp, arcname=f"refs/{rel}")
300
+
301
+ return str(out_path)
302
+
303
+ def import_voice(self, path: str | Path) -> str:
304
+ """Import a voice bundle zip archive into the local store."""
305
+ import zipfile
306
+
307
+ src = Path(path)
308
+ if not src.exists():
309
+ raise FileNotFoundError(str(src))
310
+
311
+ with zipfile.ZipFile(src, "r") as z:
312
+ voice_data = json.loads(z.read("voice.json").decode("utf-8"))
313
+
314
+ # New id on import to avoid collisions.
315
+ new_id = uuid.uuid4().hex
316
+ vdir = self._voice_dir(new_id)
317
+ vdir.mkdir(parents=True, exist_ok=True)
318
+
319
+ refs = []
320
+ for name in z.namelist():
321
+ if not name.startswith("refs/"):
322
+ continue
323
+ rel = Path(name).name
324
+ dest = vdir / rel
325
+ with z.open(name) as src_fp, open(dest, "wb") as out_fp:
326
+ shutil.copyfileobj(src_fp, out_fp)
327
+ refs.append(rel)
328
+
329
+ voice_data["voice_id"] = new_id
330
+ voice_data["reference_files"] = refs
331
+
332
+ index = self._read_index()
333
+ index[new_id] = voice_data
334
+ self._write_index(index)
335
+
336
+ return new_id
337
+
338
+ def rename_voice(self, voice_id: str, new_name: str) -> None:
339
+ index = self._read_index()
340
+ if voice_id not in index:
341
+ raise KeyError(f"Unknown voice_id: {voice_id}")
342
+ data = dict(index[voice_id])
343
+ data["name"] = str(new_name or "").strip() or data.get("name") or f"voice_{voice_id[:8]}"
344
+ index[voice_id] = data
345
+ self._write_index(index)
346
+
347
+ def delete_voice(self, voice_id: str) -> None:
348
+ """Delete a voice entry and its reference files from disk."""
349
+ index = self._read_index()
350
+ if voice_id not in index:
351
+ raise KeyError(f"Unknown voice_id: {voice_id}")
352
+
353
+ vdir = self._voice_dir(voice_id)
354
+ try:
355
+ if vdir.exists():
356
+ shutil.rmtree(vdir)
357
+ except Exception:
358
+ # If deletion fails, do not leave index in an inconsistent state.
359
+ raise
360
+
361
+ del index[voice_id]
362
+ self._write_index(index)
@@ -0,0 +1,6 @@
1
+ """Compute helpers (device selection, acceleration notes)."""
2
+
3
+ from .device import best_torch_device, best_faster_whisper_device
4
+
5
+ __all__ = ["best_torch_device", "best_faster_whisper_device"]
6
+
@@ -0,0 +1,73 @@
1
+ """Device selection helpers.
2
+
3
+ We have multiple compute backends in this project:
4
+ - torch models (cloning): can use CUDA/MPS/XPU/CPU depending on local setup.
5
+ - faster-whisper / CTranslate2 (STT): CUDA or CPU (no MPS backend today).
6
+
7
+ Design goal: choose the best available device by default, while still allowing
8
+ explicit overrides in higher-level APIs.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import sys
15
+
16
+
17
def best_torch_device() -> str:
    """Pick the torch device to use: cuda, mps, xpu or cpu.

    A non-blank ``ABSTRACTVOICE_TORCH_DEVICE`` environment variable wins and is
    returned stripped and lower-cased, without validation (e.g. "cpu", "mps",
    "cuda"). Otherwise the backends are probed in order of preference; any
    failure to import or probe torch falls back to "cpu".
    """
    override = os.environ.get("ABSTRACTVOICE_TORCH_DEVICE") or ""
    override = override.strip().lower()
    if override:
        return override

    try:
        import torch

        # CUDA (NVIDIA, and often ROCm via the CUDA API surface in PyTorch builds)
        cuda_ok = torch.cuda.is_available()
        if cuda_ok:
            return "cuda"

        # Apple Silicon (preferred on macOS when available)
        on_macos = sys.platform == "darwin"
        if on_macos:
            try:
                mps_ok = torch.backends.mps.is_available()
                if mps_ok:
                    return "mps"
            except Exception:
                pass

        # Intel XPU
        try:
            if hasattr(torch, "xpu") and torch.xpu.is_available():
                return "xpu"
        except Exception:
            pass
    except Exception:
        pass

    return "cpu"
52
+
53
+
54
def best_faster_whisper_device() -> str:
    """Pick the device for faster-whisper: cuda or cpu.

    A non-blank ``ABSTRACTVOICE_WHISPER_DEVICE`` environment variable wins and
    is returned stripped and lower-cased, without validation. Otherwise "cuda"
    is chosen when torch reports CUDA as available, and "cpu" in every other
    case (including torch being missing or failing to probe).

    Note: faster-whisper doesn't support MPS as a backend today.
    """
    override = os.environ.get("ABSTRACTVOICE_WHISPER_DEVICE") or ""
    override = override.strip().lower()
    if override:
        return override

    device = "cpu"
    try:
        import torch

        if torch.cuda.is_available():
            device = "cuda"
    except Exception:
        pass
    return device
73
+
@@ -0,0 +1,2 @@
1
+ """Configuration and constants for AbstractVoice."""
2
+
@@ -0,0 +1,19 @@
1
+ """Language metadata (Piper-first).
2
+
3
+ AbstractVoice core uses Piper as the default (and only) TTS engine. We keep a
4
+ small language list here for validation / UX messaging. Voice selection is
5
+ handled by the Piper adapter itself.
6
+ """
7
+
8
# Language code -> metadata. Each entry currently carries only a display name.
LANGUAGES = {
    code: {"name": label}
    for code, label in (
        ("en", "English"),
        ("fr", "French"),
        ("de", "German"),
        ("es", "Spanish"),
        ("ru", "Russian"),
        ("zh", "Chinese"),
    )
}

# Universal safe fallback language code.
SAFE_FALLBACK = "en"
19
+
@@ -26,7 +26,6 @@ class DependencyChecker:
26
26
  }
27
27
 
28
28
  OPTIONAL_DEPS = {
29
- "coqui-tts": ("0.27.0", "0.30.0"),
30
29
  "openai-whisper": ("20230314", None),
31
30
  "sounddevice": ("0.4.6", None),
32
31
  "librosa": ("0.10.0", "0.11.0"),