abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2408 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/text_sanitize.py +33 -0
  32. abstractvoice/tts/__init__.py +3 -3
  33. abstractvoice/tts/adapter_tts_engine.py +210 -0
  34. abstractvoice/tts/tts_engine.py +257 -1208
  35. abstractvoice/vm/__init__.py +2 -0
  36. abstractvoice/vm/common.py +21 -0
  37. abstractvoice/vm/core.py +139 -0
  38. abstractvoice/vm/manager.py +108 -0
  39. abstractvoice/vm/stt_mixin.py +158 -0
  40. abstractvoice/vm/tts_mixin.py +550 -0
  41. abstractvoice/voice_manager.py +6 -1061
  42. abstractvoice-0.6.2.dist-info/METADATA +213 -0
  43. abstractvoice-0.6.2.dist-info/RECORD +53 -0
  44. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/WHEEL +1 -1
  45. abstractvoice-0.6.2.dist-info/entry_points.txt +6 -0
  46. abstractvoice/instant_setup.py +0 -83
  47. abstractvoice/simple_model_manager.py +0 -539
  48. abstractvoice-0.5.2.dist-info/METADATA +0 -1458
  49. abstractvoice-0.5.2.dist-info/RECORD +0 -23
  50. abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
  51. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/licenses/LICENSE +0 -0
  52. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,401 @@
1
+ """Faster-Whisper STT Adapter - High-performance speech recognition.
2
+
3
+ Faster-Whisper is a reimplementation of OpenAI's Whisper using CTranslate2:
4
+ - 4x faster inference than openai-whisper
5
+ - 60% lower memory usage with INT8 quantization
6
+ - Same accuracy as openai-whisper
7
+ - Better CPU performance
8
+ - Supports GPU acceleration (CUDA) if available
9
+ """
10
+
11
+ import os
12
+ import io
13
+ import logging
14
+ import numpy as np
15
+ import tempfile
16
+ from pathlib import Path
17
+ from typing import Optional, Dict, Any
18
+ import wave
19
+
20
+ from .base import STTAdapter
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class FasterWhisperAdapter(STTAdapter):
    """Faster-Whisper STT adapter using the faster-whisper package.

    This adapter provides high-performance speech-to-text with same accuracy
    as openai-whisper but 4x faster and 60% lower memory usage.
    """

    # Supported models (size -> (parameters, speed, accuracy))
    MODELS = {
        'tiny': {'params': '39M', 'speed': 'very_fast', 'accuracy': 'low'},
        'base': {'params': '74M', 'speed': 'fast', 'accuracy': 'good'},  # Default
        'small': {'params': '244M', 'speed': 'medium', 'accuracy': 'better'},
        'medium': {'params': '769M', 'speed': 'slow', 'accuracy': 'high'},
        'large-v2': {'params': '1550M', 'speed': 'very_slow', 'accuracy': 'best'},
        'large-v3': {'params': '1550M', 'speed': 'very_slow', 'accuracy': 'best'},
    }

    # Supported languages (ISO 639-1 codes)
    LANGUAGES = [
        'en', 'fr', 'de', 'es', 'ru', 'zh',  # Required 6
        'it', 'pt', 'ja', 'ko', 'ar', 'hi',  # Additional common languages
    ]

    def __init__(
        self,
        model_size: str = "base",
        device: str = "auto",
        compute_type: str = "int8",
        *,
        allow_downloads: bool = True,
    ):
        """Initialize Faster-Whisper STT adapter.

        Args:
            model_size: Model size ('tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3')
            device: Device to run on ('cpu', 'cuda', 'auto')
            compute_type: Computation type ('int8', 'float16', 'float32')
                int8 provides 60% memory reduction with minimal accuracy loss
            allow_downloads: When False, HF offline mode is forced during model
                load so no network download is ever triggered implicitly.
        """
        self._faster_whisper_available = False
        self._model = None
        self._model_size = model_size
        self._device = device
        self._compute_type = compute_type
        self._current_language = None
        self._allow_downloads = bool(allow_downloads)

        # Try to import faster-whisper; the adapter degrades gracefully when
        # the optional dependency is missing (is_available() returns False).
        try:
            from faster_whisper import WhisperModel
            self._WhisperModel = WhisperModel
            self._faster_whisper_available = True
            logger.info("✅ Faster-Whisper initialized successfully")

            # Load model (best-effort). When allow_downloads=False we force offline mode
            # so we never trigger downloads in interactive contexts (e.g. REPL).
            self._load_model(model_size, device, compute_type)

        except ImportError as e:
            logger.warning(f"⚠️ Faster-Whisper not available: {e}")
            logger.info(
                "To install Faster-Whisper:\n"
                "  pip install faster-whisper>=0.10.0\n"
                "This will enable 4x faster STT with same accuracy."
            )

    def _load_model(self, model_size: str, device: str = "auto", compute_type: str = "int8") -> bool:
        """Load Faster-Whisper model.

        Args:
            model_size: Model size
            device: Device ('cpu', 'cuda', 'auto')
            compute_type: Computation type ('int8', 'float16', 'float32')

        Returns:
            True if successful, False otherwise
        """
        if not self._faster_whisper_available:
            return False

        if model_size not in self.MODELS:
            logger.warning(f"⚠️ Unknown model size '{model_size}', using 'base'")
            model_size = 'base'

        try:
            from ..compute import best_faster_whisper_device

            if device == "auto":
                device = best_faster_whisper_device()

            logger.info(f"⬇️ Loading Faster-Whisper model: {model_size} ({self.MODELS[model_size]['params']}) on {device}")

            # Load model (may auto-download if not cached).
            # When downloads are not allowed, force HF offline mode so we never pull
            # bytes from the network implicitly. The previous env values are saved
            # and restored in the finally block so we don't leak state to callers.
            old_offline = os.environ.get("HF_HUB_OFFLINE")
            old_disable_pb = os.environ.get("HF_HUB_DISABLE_PROGRESS_BARS")
            if not self._allow_downloads:
                os.environ["HF_HUB_OFFLINE"] = "1"
                os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
            try:
                self._model = self._WhisperModel(
                    model_size,
                    device=device,
                    compute_type=compute_type,
                    download_root=None,  # Use default cache (~/.cache/huggingface)
                )
            finally:
                if not self._allow_downloads:
                    if old_offline is None:
                        os.environ.pop("HF_HUB_OFFLINE", None)
                    else:
                        os.environ["HF_HUB_OFFLINE"] = old_offline
                    if old_disable_pb is None:
                        os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
                    else:
                        os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = old_disable_pb

            self._model_size = model_size
            self._device = device
            self._compute_type = compute_type

            logger.info(f"✅ Loaded Faster-Whisper model: {model_size}")
            return True

        except Exception as e:
            if self._allow_downloads:
                logger.error(f"❌ Failed to load Faster-Whisper model: {e}")
            else:
                # Offline mode: model might simply not be cached locally.
                logger.info(f"ℹ️ Faster-Whisper model '{model_size}' not available locally (offline mode).")
            return False

    def transcribe(
        self,
        audio_path: str,
        language: Optional[str] = None,
        *,
        hotwords: Optional[str] = None,
        initial_prompt: Optional[str] = None,
        condition_on_previous_text: bool = True,
    ) -> str:
        """Transcribe audio file to text.

        Args:
            audio_path: Path to audio file
            language: Target language (optional, auto-detect if not provided)
            hotwords: Optional hotwords hint passed through to faster-whisper
            initial_prompt: Optional decoding prompt passed through to faster-whisper
            condition_on_previous_text: Whether decoding conditions on prior text

        Returns:
            Transcribed text

        Raises:
            RuntimeError: If the engine is unavailable or transcription fails.
        """
        if not self.is_available():
            raise RuntimeError(
                "Faster-Whisper is not available. Install with: pip install faster-whisper>=0.10.0"
            )

        try:
            # Transcribe with faster-whisper
            segments, info = self._model.transcribe(
                audio_path,
                language=language,
                beam_size=5,
                best_of=5,
                temperature=0.0,
                vad_filter=True,  # Use Voice Activity Detection
                vad_parameters=dict(min_silence_duration_ms=500),
                hotwords=hotwords,
                initial_prompt=initial_prompt,
                condition_on_previous_text=bool(condition_on_previous_text),
            )

            # Combine all segments (segments is a lazy generator; joining consumes it)
            text = " ".join(segment.text.strip() for segment in segments)

            if language is None:
                logger.debug(f"Detected language: {info.language} (confidence: {info.language_probability:.2f})")

            return text.strip()

        except Exception as e:
            logger.error(f"❌ Faster-Whisper transcription failed: {e}")
            raise RuntimeError(f"Transcription failed: {e}") from e

    def transcribe_from_bytes(
        self,
        audio_bytes: bytes,
        language: Optional[str] = None,
        *,
        hotwords: Optional[str] = None,
        initial_prompt: Optional[str] = None,
        condition_on_previous_text: bool = True,
    ) -> str:
        """Transcribe audio from bytes (network use case).

        Args:
            audio_bytes: Audio data as bytes (WAV format)
            language: Target language (optional, auto-detect if not provided)
            hotwords: Optional hotwords hint passed through to faster-whisper
            initial_prompt: Optional decoding prompt passed through to faster-whisper
            condition_on_previous_text: Whether decoding conditions on prior text

        Returns:
            Transcribed text
        """
        # Save bytes to a temporary file so the file-based path can be reused.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            tmp_file.write(audio_bytes)
            tmp_path = tmp_file.name

        try:
            return self.transcribe(
                tmp_path,
                language=language,
                hotwords=hotwords,
                initial_prompt=initial_prompt,
                condition_on_previous_text=bool(condition_on_previous_text),
            )
        finally:
            # Clean up temp file; only swallow filesystem errors (the original
            # bare `except:` also hid KeyboardInterrupt/SystemExit).
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    def transcribe_from_array(
        self,
        audio_array: np.ndarray,
        sample_rate: int,
        language: Optional[str] = None,
        *,
        hotwords: Optional[str] = None,
        initial_prompt: Optional[str] = None,
        condition_on_previous_text: bool = True,
    ) -> str:
        """Transcribe audio from numpy array.

        Args:
            audio_array: Audio data as numpy array (float32, range -1.0 to 1.0)
            sample_rate: Sample rate of the audio in Hz
            language: Target language (optional, auto-detect if not provided)
            hotwords: Optional hotwords hint passed through to faster-whisper
            initial_prompt: Optional decoding prompt passed through to faster-whisper
            condition_on_previous_text: Whether decoding conditions on prior text

        Returns:
            Transcribed text
        """
        # Fast path: pass float32 mono directly to faster-whisper (no temp files).
        # NOTE: faster-whisper expects 16kHz audio when passing an array. We resample
        # lightweightly if needed.
        if not self.is_available():
            raise RuntimeError(
                "Faster-Whisper is not available. Install with: pip install faster-whisper>=0.10.0"
            )
        try:
            x = np.asarray(audio_array, dtype=np.float32).reshape(-1)
            if int(sample_rate) != 16000:
                from ..audio.resample import linear_resample_mono

                x = linear_resample_mono(x, int(sample_rate), 16000)

            segments, info = self._model.transcribe(
                x,
                language=language,
                beam_size=2,
                best_of=2,
                temperature=0.0,
                vad_filter=False,
                hotwords=hotwords,
                initial_prompt=initial_prompt,
                condition_on_previous_text=bool(condition_on_previous_text),
                without_timestamps=True,
            )
            text = " ".join(segment.text.strip() for segment in segments)
            if language is None:
                logger.debug(f"Detected language: {info.language} (confidence: {info.language_probability:.2f})")
            return text.strip()
        except Exception:
            # Fallback to file-based path for maximum compatibility.
            audio_bytes = self._array_to_wav_bytes(audio_array, sample_rate)
            return self.transcribe_from_bytes(
                audio_bytes,
                language=language,
                hotwords=hotwords,
                initial_prompt=initial_prompt,
                condition_on_previous_text=bool(condition_on_previous_text),
            )

    def _array_to_wav_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
        """Convert numpy array to WAV bytes.

        Args:
            audio_array: Audio as float32 array [-1.0, 1.0]
            sample_rate: Sample rate in Hz

        Returns:
            WAV file as bytes
        """
        # Convert to 16-bit PCM. Clip first: without clipping, samples outside
        # [-1.0, 1.0] wrap around in the int16 cast and corrupt the audio.
        samples = np.clip(np.asarray(audio_array, dtype=np.float32), -1.0, 1.0)
        audio_int16 = (samples * 32767).astype(np.int16)

        # Create WAV file in memory
        buffer = io.BytesIO()

        with wave.open(buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_int16.tobytes())

        return buffer.getvalue()

    def set_language(self, language: str) -> bool:
        """Set the default language for transcription.

        Args:
            language: ISO 639-1 language code

        Returns:
            True if successful, False otherwise
        """
        if language not in self.LANGUAGES:
            logger.warning(f"⚠️ Language {language} may not be well-supported")
            return False

        self._current_language = language
        return True

    def get_supported_languages(self) -> list[str]:
        """Get list of supported language codes.

        Returns:
            List of ISO 639-1 language codes
        """
        return self.LANGUAGES.copy()

    def is_available(self) -> bool:
        """Check if Faster-Whisper is available and functional.

        Returns:
            True if the engine can be used, False otherwise
        """
        return self._faster_whisper_available and self._model is not None

    def change_model(self, model_size: str) -> bool:
        """Change the Whisper model size.

        Args:
            model_size: New model size ('tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3')

        Returns:
            True if successful, False otherwise
        """
        if model_size == self._model_size:
            logger.debug(f"Model {model_size} already loaded")
            return True

        return self._load_model(model_size, self._device, self._compute_type)

    def get_info(self) -> Dict[str, Any]:
        """Get metadata about Faster-Whisper engine.

        Returns:
            Dictionary with engine information
        """
        info = super().get_info()
        info.update({
            'engine': 'Faster-Whisper',
            'version': '1.2.0+',
            'model_size': self._model_size,
            'model_params': self.MODELS.get(self._model_size, {}).get('params', 'unknown'),
            'device': self._device,
            'compute_type': self._compute_type,
            'current_language': self._current_language,
            'performance': f"{self.MODELS.get(self._model_size, {}).get('speed', 'unknown')} speed, "
                           f"{self.MODELS.get(self._model_size, {}).get('accuracy', 'unknown')} accuracy",
            'memory_optimization': 'INT8 quantization' if self._compute_type == 'int8' else None
        })
        return info