abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
@@ -1,1297 +1,346 @@
1
- """TTS Engine for high-quality speech synthesis with interrupt handling.
1
+ """Core audio playback utilities (Piper-first).
2
2
 
3
- This module implements best practices for TTS synthesis including:
4
- - Sentence segmentation for long text (prevents attention degradation)
5
- - Text chunking for extremely long content
6
- - Text preprocessing and normalization
7
- - Robust error handling
3
+ AbstractVoice core intentionally avoids shipping legacy Coqui-based TTSEngine
4
+ logic. This module contains only reusable audio utilities:
5
+ - `NonBlockingAudioPlayer` for low-latency pause/resume/stop
6
+ - `apply_speed_without_pitch_change` (optional librosa)
8
7
  """
9
8
 
10
- import threading
11
- import time
12
- import numpy as np
13
- import os
14
- import sys
9
+ from __future__ import annotations
10
+
15
11
  import logging
16
- import warnings
17
- import re
12
+ import os
18
13
  import queue
14
+ import threading
15
+ from typing import Callable, Optional
19
16
 
20
- # Lazy imports for heavy dependencies
21
- def _import_tts():
22
- """Import TTS with helpful error message if dependencies missing."""
23
- try:
24
- from TTS.api import TTS
25
- return TTS
26
- except ImportError as e:
27
- error_msg = str(e).lower()
17
+ import numpy as np
28
18
 
29
- # Check for specific PyTorch/TorchVision conflicts
30
- if "torchvision::nms does not exist" in error_msg or "gpt2pretrainedmodel" in error_msg:
31
- raise ImportError(
32
- "❌ PyTorch/TorchVision version conflict detected!\n\n"
33
- "This is a known compatibility issue. To fix:\n\n"
34
- "1. Uninstall conflicting packages:\n"
35
- " pip uninstall torch torchvision torchaudio transformers\n\n"
36
- "2. Reinstall with compatible versions:\n"
37
- " pip install abstractvoice[all] # Installs tested compatible versions\n\n"
38
- "3. Or use specific PyTorch version:\n"
39
- " pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1\n"
40
- " pip install abstractvoice[voice-full]\n\n"
41
- "For conda environments, consider:\n"
42
- " conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n\n"
43
- f"Original error: {e}"
44
- ) from e
45
- elif "no module named 'tts'" in error_msg or "coqui" in error_msg:
46
- raise ImportError(
47
- "TTS functionality requires coqui-tts. Install with:\n"
48
- " pip install abstractvoice[tts] # For TTS only\n"
49
- " pip install abstractvoice[voice-full] # For complete voice functionality\n"
50
- " pip install abstractvoice[all] # For all features\n"
51
- f"Original error: {e}"
52
- ) from e
53
- else:
54
- # Generic import error
55
- raise ImportError(
56
- "TTS functionality requires optional dependencies. Install with:\n"
57
- " pip install abstractvoice[tts] # For TTS only\n"
58
- " pip install abstractvoice[voice-full] # For complete voice functionality\n"
59
- " pip install abstractvoice[all] # For all features\n\n"
60
- "If you're getting PyTorch-related errors, try:\n"
61
- " pip install abstractvoice[core-tts] # Lightweight TTS without extras\n\n"
62
- f"Original error: {e}"
63
- ) from e
19
+ from ..audio.resample import linear_resample_mono
64
20
 
65
- def _import_audio_deps():
66
- """Import audio dependencies with helpful error message if missing."""
67
- try:
68
- import sounddevice as sd
69
- import librosa
70
- return sd, librosa
71
- except ImportError as e:
72
- error_msg = str(e).lower()
73
21
 
74
- if "sounddevice" in error_msg:
75
- raise ImportError(
76
- "Audio playback requires sounddevice. Install with:\n"
77
- " pip install abstractvoice[audio-only] # For audio processing only\n"
78
- " pip install abstractvoice[voice-full] # For complete voice functionality\n"
79
- " pip install abstractvoice[all] # For all features\n\n"
80
- "On some systems, you may need system audio libraries:\n"
81
- " Ubuntu/Debian: sudo apt-get install portaudio19-dev\n"
82
- " macOS: brew install portaudio\n"
83
- " Windows: Usually works out of the box\n\n"
84
- f"Original error: {e}"
85
- ) from e
86
- elif "librosa" in error_msg:
87
- raise ImportError(
88
- "Audio processing requires librosa. Install with:\n"
89
- " pip install abstractvoice[tts] # For TTS functionality\n"
90
- " pip install abstractvoice[voice-full] # For complete voice functionality\n"
91
- " pip install abstractvoice[all] # For all features\n\n"
92
- f"Original error: {e}"
93
- ) from e
94
- else:
95
- # Generic audio import error
96
- raise ImportError(
97
- "Audio functionality requires optional dependencies. Install with:\n"
98
- " pip install abstractvoice[audio-only] # For audio processing only\n"
99
- " pip install abstractvoice[voice-full] # For complete voice functionality\n"
100
- " pip install abstractvoice[all] # For all features\n\n"
101
- f"Original error: {e}"
102
- ) from e
22
+ _STDERR_FD_LOCK = threading.Lock()
103
23
 
104
- # Suppress the PyTorch FutureWarning about torch.load
105
- warnings.filterwarnings(
106
- "ignore",
107
- message="You are using `torch.load` with `weights_only=False`",
108
- category=FutureWarning
109
- )
110
24
 
111
- # Suppress pkg_resources deprecation warning from jieba
112
- warnings.filterwarnings(
113
- "ignore",
114
- message=".*pkg_resources is deprecated.*",
115
- category=DeprecationWarning
116
- )
25
+ class _SilenceStderrFD:
26
+ """Temporarily redirect OS-level stderr (fd=2) to /dev/null.
117
27
 
118
- # Suppress coqpit deserialization warnings from TTS models
119
- warnings.filterwarnings(
120
- "ignore",
121
- message=".*Type mismatch.*",
122
- category=UserWarning
123
- )
124
- warnings.filterwarnings(
125
- "ignore",
126
- message=".*Failed to deserialize field.*",
127
- category=UserWarning
128
- )
28
+ PortAudio (and some underlying CoreAudio/AUHAL code paths) can emit warnings
29
+ directly to stderr, bypassing Python's `sys.stderr`. In interactive REPL
30
+ contexts this can corrupt the prompt/spinner UI.
31
+ """
129
32
 
130
- # Suppress macOS audio warnings (harmless but annoying)
131
- import os
132
- os.environ['PYTHONWARNINGS'] = 'ignore'
33
+ def __init__(self, enabled: bool = True):
34
+ self.enabled = bool(enabled)
35
+ self._devnull_fd = None
36
+ self._saved_stderr_fd = None
37
+
38
+ def __enter__(self):
39
+ if not self.enabled:
40
+ return self
41
+ _STDERR_FD_LOCK.acquire()
42
+ try:
43
+ self._devnull_fd = os.open(os.devnull, os.O_WRONLY)
44
+ self._saved_stderr_fd = os.dup(2)
45
+ os.dup2(self._devnull_fd, 2)
46
+ except Exception:
47
+ self.__exit__(None, None, None)
48
+ return self
49
+
50
+ def __exit__(self, exc_type, exc, tb):
51
+ if not self.enabled:
52
+ return False
53
+ try:
54
+ if self._saved_stderr_fd is not None:
55
+ try:
56
+ os.dup2(self._saved_stderr_fd, 2)
57
+ except Exception:
58
+ pass
59
+ finally:
60
+ try:
61
+ if self._saved_stderr_fd is not None:
62
+ os.close(self._saved_stderr_fd)
63
+ except Exception:
64
+ pass
65
+ try:
66
+ if self._devnull_fd is not None:
67
+ os.close(self._devnull_fd)
68
+ except Exception:
69
+ pass
70
+ try:
71
+ _STDERR_FD_LOCK.release()
72
+ except Exception:
73
+ pass
74
+ return False
75
+
76
+
77
+ def _import_sounddevice():
78
+ try:
79
+ import sounddevice as sd
80
+ return sd
81
+ except ImportError as e:
82
+ raise ImportError(
83
+ "Audio playback requires sounddevice. Install with:\n"
84
+ " pip install abstractvoice\n"
85
+ f"Original error: {e}"
86
+ ) from e
87
+
88
+
89
+ def _import_librosa():
90
+ try:
91
+ import librosa
92
+ return librosa
93
+ except ImportError as e:
94
+ raise ImportError(
95
+ "Speed/pitch processing requires librosa. Install with:\n"
96
+ " pip install \"abstractvoice[audio-fx]\"\n"
97
+ f"Original error: {e}"
98
+ ) from e
133
99
 
134
- def preprocess_text(text):
135
- """Preprocess text for better TTS synthesis.
136
-
137
- This function normalizes text to prevent synthesis errors:
138
- - Removes excessive whitespace
139
- - Normalizes punctuation
140
- - Handles common abbreviations
141
- - Removes problematic characters
142
-
143
- Args:
144
- text: Input text string
145
-
146
- Returns:
147
- Cleaned and normalized text
148
- """
149
- if not text:
150
- return text
151
-
152
- # Remove excessive whitespace
153
- text = re.sub(r'\s+', ' ', text)
154
-
155
- # Normalize ellipsis
156
- text = text.replace('...', '.')
157
-
158
- # Remove or normalize problematic characters
159
- # Keep basic punctuation that helps with prosody
160
- text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
161
-
162
- # Ensure proper spacing after punctuation
163
- text = re.sub(r'([.,!?;:])([^\s])', r'\1 \2', text)
164
-
165
- return text.strip()
166
100
 
101
+ def apply_speed_without_pitch_change(audio: np.ndarray, speed: float, sr: int = 22050) -> np.ndarray:
102
+ """Apply speed change without affecting pitch (best-effort).
167
103
 
168
- def apply_speed_without_pitch_change(audio, speed, sr=22050):
169
- """Apply speed change without affecting pitch using librosa time_stretch.
170
-
171
- Args:
172
- audio: Audio samples as numpy array
173
- speed: Speed multiplier (0.5-2.0, where >1.0 is faster, <1.0 is slower)
174
- sr: Sample rate (default 22050)
175
-
176
- Returns:
177
- Time-stretched audio samples
104
+ If librosa is not installed (or fails), returns the original audio.
178
105
  """
179
- if speed == 1.0:
106
+ if not speed or speed == 1.0:
180
107
  return audio
181
-
182
- # librosa.effects.time_stretch expects rate parameter where:
183
- # rate > 1.0 makes audio faster (shorter)
184
- # rate < 1.0 makes audio slower (longer)
185
- # This matches our speed semantics
108
+
186
109
  try:
187
- _, librosa = _import_audio_deps()
188
- stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
189
- return stretched_audio
110
+ librosa = _import_librosa()
111
+ return librosa.effects.time_stretch(audio, rate=float(speed))
190
112
  except Exception as e:
191
- # If time-stretching fails, return original audio
192
113
  logging.warning(f"Time-stretching failed: {e}, using original audio")
193
114
  return audio
194
115
 
195
116
 
196
117
  class NonBlockingAudioPlayer:
197
- """Non-blocking audio player using OutputStream callbacks for immediate pause/resume."""
198
-
199
- def __init__(self, sample_rate=22050, debug_mode=False):
200
- self.sample_rate = sample_rate
118
+ """Non-blocking audio player using OutputStream callbacks for pause/resume."""
119
+
120
+ def __init__(self, sample_rate: int = 22050, debug_mode: bool = False):
121
+ self.sample_rate = int(sample_rate)
201
122
  self.debug_mode = debug_mode
202
-
203
- # Audio queue and playback state
204
- self.audio_queue = queue.Queue()
123
+
124
+ self.audio_queue: "queue.Queue[np.ndarray]" = queue.Queue()
205
125
  self.stream = None
206
126
  self.is_playing = False
207
- self.is_paused = False
208
- self.pause_lock = threading.Lock()
209
-
210
- # Current audio buffer management
211
- self.current_audio = None
127
+
128
+ self._pause_lock = threading.Lock()
129
+ self._paused = False
130
+
131
+ self.current_audio: Optional[np.ndarray] = None
212
132
  self.current_position = 0
213
- self.playback_complete_callback = None
214
-
215
- # NEW: Enhanced audio lifecycle callbacks
216
- self.on_audio_start = None # Called when first audio sample plays
217
- self.on_audio_end = None # Called when last audio sample finishes
218
- self.on_audio_pause = None # Called when audio is paused
219
- self.on_audio_resume = None # Called when audio is resumed
220
- self._audio_started = False # Track if we've fired start callback
221
-
222
- def _audio_callback(self, outdata, frames, time, status):
223
- """Callback function for OutputStream - provides immediate pause/resume."""
133
+
134
+ self.playback_complete_callback: Optional[Callable[[], None]] = None
135
+
136
+ # Audio lifecycle callbacks
137
+ self.on_audio_start: Optional[Callable[[], None]] = None
138
+ self.on_audio_end: Optional[Callable[[], None]] = None
139
+ self.on_audio_pause: Optional[Callable[[], None]] = None
140
+ self.on_audio_resume: Optional[Callable[[], None]] = None
141
+ self._audio_started = False
142
+
143
+ # Optional hook: called with chunks that are actually written to the output.
144
+ # Used for advanced features like AEC (barge-in without self-interruption).
145
+ self.on_audio_chunk = None # Callable[[np.ndarray, int], None] | None
146
+
147
+ def _audio_callback(self, outdata, frames, _time, status):
224
148
  if status and self.debug_mode:
225
149
  print(f"Audio callback status: {status}")
226
-
227
- # Check pause state (thread-safe)
228
- with self.pause_lock:
229
- if self.is_paused:
230
- # Output silence when paused - immediate response
150
+
151
+ with self._pause_lock:
152
+ if self._paused:
231
153
  outdata.fill(0)
232
154
  return
233
-
155
+
234
156
  try:
235
- # Get next audio chunk if needed
236
157
  if self.current_audio is None or self.current_position >= len(self.current_audio):
237
158
  try:
238
159
  self.current_audio = self.audio_queue.get_nowait()
239
160
  self.current_position = 0
240
- if self.debug_mode:
241
- print(f" > Playing audio chunk ({len(self.current_audio)} samples)")
242
161
  except queue.Empty:
243
- # No more audio - output silence and mark as not playing
244
162
  outdata.fill(0)
245
163
  if self.is_playing:
246
164
  self.is_playing = False
247
- self._audio_started = False # Reset for next playback
248
-
249
- # Fire audio end callback
165
+ self._audio_started = False
250
166
  if self.on_audio_end:
251
167
  threading.Thread(target=self.on_audio_end, daemon=True).start()
252
-
253
168
  if self.playback_complete_callback:
254
- # Call completion callback in a separate thread to avoid blocking
255
169
  threading.Thread(target=self.playback_complete_callback, daemon=True).start()
256
170
  return
257
-
258
- # Calculate how much audio we can output this frame
171
+
259
172
  remaining = len(self.current_audio) - self.current_position
260
173
  frames_to_output = min(frames, remaining)
261
-
262
- # Fire audio start callback on first real audio output
174
+
263
175
  if frames_to_output > 0 and not self._audio_started:
264
176
  self._audio_started = True
265
177
  if self.on_audio_start:
266
178
  threading.Thread(target=self.on_audio_start, daemon=True).start()
267
-
268
- # Output the audio data
179
+
269
180
  if frames_to_output > 0:
270
- # Handle both mono and stereo output
271
- if outdata.shape[1] == 1: # Mono output
272
- outdata[:frames_to_output, 0] = self.current_audio[self.current_position:self.current_position + frames_to_output]
273
- else: # Stereo output
274
- audio_data = self.current_audio[self.current_position:self.current_position + frames_to_output]
275
- outdata[:frames_to_output, 0] = audio_data # Left channel
276
- outdata[:frames_to_output, 1] = audio_data # Right channel
277
-
181
+ if outdata.shape[1] == 1:
182
+ outdata[:frames_to_output, 0] = self.current_audio[
183
+ self.current_position : self.current_position + frames_to_output
184
+ ]
185
+ else:
186
+ audio_data = self.current_audio[
187
+ self.current_position : self.current_position + frames_to_output
188
+ ]
189
+ outdata[:frames_to_output, 0] = audio_data
190
+ outdata[:frames_to_output, 1] = audio_data
191
+
192
+ # Emit the actual output chunk (mono float32) for optional consumers.
193
+ try:
194
+ if self.on_audio_chunk:
195
+ chunk = self.current_audio[
196
+ self.current_position : self.current_position + frames_to_output
197
+ ]
198
+ self.on_audio_chunk(chunk, int(self.sample_rate))
199
+ except Exception:
200
+ # Never let optional hooks break audio playback.
201
+ pass
278
202
  self.current_position += frames_to_output
279
-
280
- # Fill remaining with silence if needed
203
+
281
204
  if frames_to_output < frames:
282
205
  outdata[frames_to_output:].fill(0)
283
-
206
+
284
207
  except Exception as e:
285
208
  if self.debug_mode:
286
209
  print(f"Error in audio callback: {e}")
287
210
  outdata.fill(0)
288
-
211
+
289
212
  def start_stream(self):
290
- """Start the audio stream."""
291
- if self.stream is None:
292
- try:
293
- sd, _ = _import_audio_deps()
294
- self.stream = sd.OutputStream(
295
- samplerate=self.sample_rate,
296
- channels=1, # Mono output
297
- callback=self._audio_callback,
298
- blocksize=1024, # Small buffer for low latency
299
- dtype=np.float32
300
- )
301
- self.stream.start()
302
- if self.debug_mode:
303
- print(" > Audio stream started")
304
- except Exception as e:
305
- if self.debug_mode:
306
- print(f"Error starting audio stream: {e}")
307
- raise
308
-
213
+ if self.stream is not None:
214
+ return
215
+ sd = _import_sounddevice()
216
+
217
+ desired_sr = int(self.sample_rate)
218
+ candidates: list[int] = [desired_sr]
219
+ try:
220
+ dev = sd.query_devices(None, "output") # default output device
221
+ default_sr = int(round(float(dev.get("default_samplerate", 0) or 0)))
222
+ if default_sr and default_sr not in candidates:
223
+ candidates.append(default_sr)
224
+ except Exception:
225
+ default_sr = 0
226
+
227
+ # Common output rates (keep short; we already prefer desired/default).
228
+ for sr in (48000, 44100, 24000, 22050, 16000):
229
+ if sr not in candidates:
230
+ candidates.append(sr)
231
+
232
+ last_err: Exception | None = None
233
+ for sr in candidates:
234
+ for blocksize in (1024, 0): # 0 => PortAudio decides (often most compatible)
235
+ stream = None
236
+ try:
237
+ with _SilenceStderrFD(enabled=not self.debug_mode):
238
+ stream = sd.OutputStream(
239
+ samplerate=int(sr),
240
+ channels=1,
241
+ callback=self._audio_callback,
242
+ blocksize=int(blocksize),
243
+ dtype=np.float32,
244
+ )
245
+ stream.start()
246
+ self.stream = stream
247
+ self.sample_rate = int(sr)
248
+ if self.debug_mode and int(sr) != desired_sr:
249
+ print(f"⚠️ Output device rejected {desired_sr}Hz; using {sr}Hz (resampling)")
250
+ return
251
+ except Exception as e:
252
+ last_err = e
253
+ try:
254
+ if stream is not None:
255
+ stream.close()
256
+ except Exception:
257
+ pass
258
+ continue
259
+
260
+ # If we couldn't start, surface the last error.
261
+ if last_err is not None:
262
+ raise last_err
263
+ raise RuntimeError("Failed to start audio output stream")
264
+
309
265
  def stop_stream(self):
310
- """Stop the audio stream."""
311
266
  if self.stream:
312
267
  try:
313
268
  self.stream.stop()
269
+ except Exception:
270
+ pass
271
+ try:
314
272
  self.stream.close()
315
- if self.debug_mode:
316
- print(" > Audio stream stopped")
317
- except Exception as e:
318
- if self.debug_mode:
319
- print(f"Error stopping audio stream: {e}")
320
- finally:
321
- self.stream = None
273
+ except Exception:
274
+ pass
275
+ self.stream = None
322
276
 
323
277
  self.is_playing = False
324
- with self.pause_lock:
325
- self.is_paused = False
278
+ with self._pause_lock:
279
+ self._paused = False
326
280
  self.clear_queue()
327
281
 
328
282
  def cleanup(self):
329
- """Cleanup resources to prevent memory conflicts."""
283
+ self.stop_stream()
284
+ self.current_audio = None
285
+ self.playback_complete_callback = None
286
+
287
+ def play_audio(self, audio_array: np.ndarray, *, sample_rate: int | None = None):
288
+ if audio_array is None or len(audio_array) == 0:
289
+ return
290
+
291
+ # Ensure mono float32 vector.
330
292
  try:
331
- self.stop_stream()
332
- # Clear any remaining references
333
- self.current_audio = None
334
- self.playback_complete_callback = None
335
- if self.debug_mode:
336
- print(" > Audio player cleaned up")
337
- except Exception as e:
338
- if self.debug_mode:
339
- print(f"Audio cleanup warning: {e}")
340
-
341
- def play_audio(self, audio_array):
342
- """Add audio to the playback queue."""
343
- if audio_array is not None and len(audio_array) > 0:
344
- # Ensure audio is float32 and normalized
345
- if audio_array.dtype != np.float32:
346
- audio_array = audio_array.astype(np.float32)
347
-
348
- # Normalize if needed
349
- if np.max(np.abs(audio_array)) > 1.0:
350
- audio_array = audio_array / np.max(np.abs(audio_array))
351
-
352
- self.audio_queue.put(audio_array)
353
- self.is_playing = True
354
-
355
- # Start stream if not already running
356
- if self.stream is None:
357
- self.start_stream()
358
-
359
- def pause(self):
360
- """Pause audio playback immediately."""
361
- with self.pause_lock:
362
- if self.is_playing and not self.is_paused:
363
- self.is_paused = True
364
- if self.debug_mode:
365
- print(" > Audio paused immediately")
366
-
367
- # Fire audio pause callback
293
+ if hasattr(audio_array, "ndim") and int(audio_array.ndim) > 1:
294
+ audio_array = np.mean(audio_array, axis=1).astype(np.float32)
295
+ except Exception:
296
+ pass
297
+ audio_array = np.asarray(audio_array, dtype=np.float32).reshape(-1)
298
+
299
+ # If we haven't started the output stream yet, do so first. This allows
300
+ # `start_stream()` to fall back to a compatible device sample rate.
301
+ if self.stream is None:
302
+ self.start_stream()
303
+
304
+ sr_in = int(sample_rate) if sample_rate is not None else int(self.sample_rate)
305
+ sr_out = int(self.sample_rate)
306
+ if sr_in != sr_out:
307
+ audio_array = linear_resample_mono(audio_array, sr_in, sr_out)
308
+
309
+ max_abs = float(np.max(np.abs(audio_array))) if len(audio_array) else 0.0
310
+ if max_abs > 1.0:
311
+ audio_array = audio_array / max_abs
312
+
313
+ self.audio_queue.put(audio_array)
314
+ self.is_playing = True
315
+ # Stream should already be started above when needed.
316
+
317
+ def pause(self) -> bool:
318
+ with self._pause_lock:
319
+ if self.is_playing and not self._paused:
320
+ self._paused = True
368
321
  if self.on_audio_pause:
369
322
  threading.Thread(target=self.on_audio_pause, daemon=True).start()
370
-
371
323
  return True
372
324
  return False
373
-
374
- def resume(self):
375
- """Resume audio playback immediately."""
376
- with self.pause_lock:
377
- if self.is_paused:
378
- self.is_paused = False
379
- if self.debug_mode:
380
- print(" > Audio resumed immediately")
381
-
382
- # Fire audio resume callback
325
+
326
+ def resume(self) -> bool:
327
+ with self._pause_lock:
328
+ if self._paused:
329
+ self._paused = False
383
330
  if self.on_audio_resume:
384
331
  threading.Thread(target=self.on_audio_resume, daemon=True).start()
385
-
386
332
  return True
387
333
  return False
388
-
389
- def is_paused_state(self):
390
- """Check if audio is currently paused."""
391
- with self.pause_lock:
392
- return self.is_paused
393
-
334
+
335
+ def is_paused_state(self) -> bool:
336
+ with self._pause_lock:
337
+ return bool(self._paused)
338
+
394
339
  def clear_queue(self):
395
- """Clear the audio queue."""
396
340
  while not self.audio_queue.empty():
397
341
  try:
398
342
  self.audio_queue.get_nowait()
399
343
  except queue.Empty:
400
344
  break
401
-
402
- # Reset current audio buffer
403
345
  self.current_audio = None
404
346
  self.current_position = 0
405
-
406
-
407
- def chunk_long_text(text, max_chunk_size=300):
408
- """Split very long text into manageable chunks at natural boundaries.
409
-
410
- For extremely long texts, this function splits at paragraph or sentence
411
- boundaries to prevent memory issues and attention degradation.
412
-
413
- Args:
414
- text: Input text string
415
- max_chunk_size: Maximum characters per chunk (default 300)
416
-
417
- Returns:
418
- List of text chunks
419
- """
420
- if len(text) <= max_chunk_size:
421
- return [text]
422
-
423
- chunks = []
424
-
425
- # First try to split by paragraphs
426
- paragraphs = text.split('\n\n')
427
-
428
- current_chunk = ""
429
- for para in paragraphs:
430
- # If adding this paragraph would exceed limit and we have content
431
- if len(current_chunk) + len(para) > max_chunk_size and current_chunk:
432
- chunks.append(current_chunk.strip())
433
- current_chunk = para
434
- else:
435
- if current_chunk:
436
- current_chunk += "\n\n" + para
437
- else:
438
- current_chunk = para
439
-
440
- # If a single paragraph is too long, split by sentences
441
- if len(current_chunk) > max_chunk_size:
442
- # Split on sentence boundaries
443
- sentences = re.split(r'([.!?]+\s+)', current_chunk)
444
- temp_chunk = ""
445
-
446
- for i in range(0, len(sentences), 2):
447
- sentence = sentences[i]
448
- punct = sentences[i+1] if i+1 < len(sentences) else ""
449
-
450
- if len(temp_chunk) + len(sentence) + len(punct) > max_chunk_size and temp_chunk:
451
- chunks.append(temp_chunk.strip())
452
- temp_chunk = sentence + punct
453
- else:
454
- temp_chunk += sentence + punct
455
-
456
- current_chunk = temp_chunk
457
-
458
- # Add remaining text
459
- if current_chunk:
460
- chunks.append(current_chunk.strip())
461
-
462
- return chunks if chunks else [text]
463
-
464
-
465
- class TTSEngine:
466
- """Text-to-speech engine with interrupt capability."""
467
-
468
- def __init__(self, model_name="tts_models/en/ljspeech/vits", debug_mode=False, streaming=True):
469
- """Initialize the TTS engine.
470
-
471
- Args:
472
- model_name: TTS model to use (default: vits - best quality, requires espeak-ng)
473
- debug_mode: Enable debug output
474
- streaming: Enable streaming playback (start playing while synthesizing remaining chunks)
475
-
476
- Note:
477
- VITS model (default) requires espeak-ng for best quality:
478
- - macOS: brew install espeak-ng
479
- - Linux: sudo apt-get install espeak-ng
480
- - Windows: See installation guide in README
481
-
482
- If espeak-ng is not available, will auto-fallback to fast_pitch
483
- """
484
- # Set up debug mode
485
- self.debug_mode = debug_mode
486
- self.streaming = streaming
487
-
488
- # Callback to notify when TTS starts/stops (for pausing voice recognition)
489
- self.on_playback_start = None
490
- self.on_playback_end = None
491
-
492
- # Suppress TTS output unless in debug mode
493
- if not debug_mode:
494
- # Suppress all TTS logging
495
- logging.getLogger('TTS').setLevel(logging.ERROR)
496
- logging.getLogger('TTS.utils.audio').setLevel(logging.ERROR)
497
- logging.getLogger('TTS.utils.io').setLevel(logging.ERROR)
498
- logging.getLogger('numba').setLevel(logging.ERROR)
499
-
500
- # Disable stdout during TTS loading
501
- os.environ['TTS_VERBOSE'] = '0'
502
-
503
- # Temporarily redirect stdout to suppress TTS init messages
504
- orig_stdout = sys.stdout
505
- null_out = open(os.devnull, 'w')
506
- sys.stdout = null_out
507
-
508
- try:
509
- if self.debug_mode:
510
- print(f" > Loading TTS model: {model_name}")
511
-
512
- # Try simple, effective initialization strategy
513
- try:
514
- TTS = _import_tts()
515
- success, final_model = self._load_with_simple_fallback(TTS, model_name, debug_mode)
516
- if not success:
517
- # If all fails, provide actionable guidance
518
- self._handle_model_load_failure(debug_mode)
519
- elif self.debug_mode and final_model != model_name:
520
- print(f" > Loaded fallback model: {final_model}")
521
- except Exception as e:
522
- error_msg = str(e).lower()
523
- # Check if this is an espeak-related error
524
- if ("espeak" in error_msg or "phoneme" in error_msg):
525
- self._handle_espeak_fallback(debug_mode)
526
- else:
527
- # Different error, re-raise
528
- raise
529
- finally:
530
- # Restore stdout if we redirected it
531
- if not debug_mode:
532
- sys.stdout = orig_stdout
533
- null_out.close()
534
-
535
- # Initialize non-blocking audio player for immediate pause/resume
536
- self.audio_player = NonBlockingAudioPlayer(sample_rate=22050, debug_mode=debug_mode)
537
- self.audio_player.playback_complete_callback = self._on_playback_complete
538
-
539
- # Legacy playback state (for compatibility with existing code)
540
- self.is_playing = False
541
- self.stop_flag = threading.Event()
542
- self.pause_flag = threading.Event()
543
- self.pause_flag.set() # Initially not paused (set means "not paused")
544
- self.playback_thread = None
545
- self.start_time = 0
546
- self.audio_queue = [] # Queue for streaming playback
547
- self.queue_lock = threading.Lock() # Thread-safe queue access
548
-
549
- # Pause/resume state
550
- self.pause_lock = threading.Lock() # Thread-safe pause operations
551
- self.is_paused_state = False # Explicit paused state tracking
552
-
553
- def _load_with_simple_fallback(self, TTS, preferred_model: str, debug_mode: bool) -> tuple[bool, str]:
554
- """Load TTS model with bulletproof compatibility-first strategy."""
555
- from ..simple_model_manager import get_model_manager
556
-
557
- model_manager = get_model_manager(debug_mode=debug_mode)
558
-
559
- # Step 1: Check espeak availability for smart model filtering
560
- espeak_available = self._check_espeak_available()
561
- if debug_mode and not espeak_available:
562
- print(" > espeak-ng not found, will skip VITS models")
563
-
564
- # Step 2: Try the REQUESTED model first if it's cached
565
- cached_models = model_manager.get_cached_models()
566
- if cached_models and debug_mode:
567
- print(f" > Found {len(cached_models)} cached models")
568
-
569
- # FORCE USER'S CHOICE: Try the specifically requested model first
570
- if preferred_model in cached_models:
571
- try:
572
- if debug_mode:
573
- print(f" > LOADING REQUESTED MODEL: {preferred_model}")
574
-
575
- # Safety check for Italian VITS models that might crash
576
- if "it/" in preferred_model and "vits" in preferred_model:
577
- if debug_mode:
578
- print(f" > Italian VITS model detected - using safe loading...")
579
-
580
- self.tts = TTS(model_name=preferred_model, progress_bar=self.debug_mode)
581
-
582
- if debug_mode:
583
- print(f" > ✅ SUCCESS: Loaded requested model: {preferred_model}")
584
- return True, preferred_model
585
-
586
- except Exception as e:
587
- error_msg = str(e).lower()
588
- if debug_mode:
589
- print(f" > ❌ Requested model failed: {e}")
590
-
591
- # Special handling for Italian model crashes
592
- if "it/" in preferred_model and ("segmentation" in error_msg or "crash" in error_msg):
593
- if debug_mode:
594
- print(f" > Italian model caused crash - marking as incompatible")
595
- # Force fallback for crashed Italian models
596
- pass
597
-
598
- # Only fall back if the model actually failed to load, not due to dependencies
599
-
600
- # Step 3: Only fall back to compatibility order if requested model failed
601
- if debug_mode:
602
- print(" > Requested model unavailable, trying fallback models...")
603
-
604
- # Compatibility-first fallback order
605
- fallback_models = [
606
- "tts_models/en/ljspeech/tacotron2-DDC", # Most reliable (Linda)
607
- "tts_models/en/jenny/jenny", # Different female speaker (Jenny)
608
- "tts_models/en/ek1/tacotron2", # Male British accent (Edward)
609
- "tts_models/en/sam/tacotron-DDC", # Different male voice (Sam)
610
- "tts_models/en/ljspeech/fast_pitch", # Lightweight alternative
611
- "tts_models/en/ljspeech/glow-tts", # Another alternative
612
- "tts_models/en/vctk/vits", # Multi-speaker (requires espeak)
613
- "tts_models/en/ljspeech/vits", # Premium (requires espeak)
614
- ]
615
-
616
- # Remove the preferred model from fallbacks to avoid duplicate attempts
617
- fallback_models = [m for m in fallback_models if m != preferred_model]
618
-
619
- # Try fallback models
620
- for model in fallback_models:
621
- if model in cached_models:
622
- # Skip VITS models if no espeak
623
- if "vits" in model and not espeak_available:
624
- if debug_mode:
625
- print(f" > Skipping {model} (requires espeak-ng)")
626
- continue
627
-
628
- try:
629
- if debug_mode:
630
- print(f" > Trying fallback model: {model}")
631
- self.tts = TTS(model_name=model, progress_bar=self.debug_mode)
632
- if debug_mode:
633
- print(f" > ✅ Successfully loaded fallback: {model}")
634
- return True, model
635
- except Exception as e:
636
- if debug_mode:
637
- print(f" > ❌ Fallback {model} failed: {e}")
638
-
639
- # Step 4: If no cached models work, try downloading requested model first
640
- if debug_mode:
641
- print(" > No cached models worked, attempting downloads...")
642
-
643
- # Try downloading the requested model first
644
- if "vits" not in preferred_model or espeak_available:
645
- try:
646
- if debug_mode:
647
- print(f" > Downloading requested model: {preferred_model}...")
648
- success = model_manager.download_model(preferred_model)
649
- if success:
650
- self.tts = TTS(model_name=preferred_model, progress_bar=self.debug_mode)
651
- if debug_mode:
652
- print(f" > ✅ Downloaded and loaded requested: {preferred_model}")
653
- return True, preferred_model
654
- elif debug_mode:
655
- print(f" > ❌ Download failed for requested model: {preferred_model}")
656
- except Exception as e:
657
- if debug_mode:
658
- print(f" > ❌ Failed to download/load requested model: {e}")
659
-
660
- # Step 5: If requested model download failed, try fallback downloads
661
- for model in fallback_models:
662
- # Skip VITS models if no espeak
663
- if "vits" in model and not espeak_available:
664
- continue
665
-
666
- try:
667
- if debug_mode:
668
- print(f" > Downloading fallback: {model}...")
669
-
670
- # First try to download
671
- success = model_manager.download_model(model)
672
- if success:
673
- # Then try to load
674
- self.tts = TTS(model_name=model, progress_bar=self.debug_mode)
675
- if debug_mode:
676
- print(f" > ✅ Downloaded and loaded fallback: {model}")
677
- return True, model
678
- elif debug_mode:
679
- print(f" > ❌ Download failed for {model}")
680
-
681
- except Exception as e:
682
- if debug_mode:
683
- print(f" > ❌ Failed to load {model}: {e}")
684
-
685
- return False, None
686
-
687
- def _check_espeak_available(self) -> bool:
688
- """Check if espeak-ng is available on the system."""
689
- import subprocess
690
- try:
691
- subprocess.run(['espeak-ng', '--version'],
692
- capture_output=True, check=True, timeout=5)
693
- return True
694
- except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
695
- # Try alternative espeak command
696
- try:
697
- subprocess.run(['espeak', '--version'],
698
- capture_output=True, check=True, timeout=5)
699
- return True
700
- except:
701
- return False
702
-
703
- def _handle_espeak_fallback(self, debug_mode: bool):
704
- """Handle espeak-related errors with fallback to non-phoneme models."""
705
- # Restore stdout to show user-friendly message
706
- if not debug_mode:
707
- sys.stdout = sys.__stdout__
708
-
709
- print("\n" + "="*70)
710
- print("⚠️ VITS Model Requires espeak-ng (Not Found)")
711
- print("="*70)
712
- print("\nFor BEST voice quality, install espeak-ng:")
713
- print(" • macOS: brew install espeak-ng")
714
- print(" • Linux: sudo apt-get install espeak-ng")
715
- print(" • Windows: conda install espeak-ng (or see README)")
716
- print("\nFalling back to compatible models (no espeak dependency)")
717
- print("="*70 + "\n")
718
-
719
- if not debug_mode:
720
- import os
721
- null_out = open(os.devnull, 'w')
722
- sys.stdout = null_out
723
-
724
- # Try non-phoneme models that don't require espeak (compatibility-first order)
725
- from TTS.api import TTS
726
- fallback_models = [
727
- "tts_models/en/ljspeech/tacotron2-DDC", # Most reliable (Linda)
728
- "tts_models/en/jenny/jenny", # Different female speaker (Jenny)
729
- "tts_models/en/ek1/tacotron2", # Male British accent (Edward)
730
- "tts_models/en/sam/tacotron-DDC", # Different male voice (Sam)
731
- "tts_models/en/ljspeech/fast_pitch", # Lightweight alternative
732
- "tts_models/en/ljspeech/glow-tts" # Another alternative
733
- ]
734
-
735
- tts_loaded = False
736
- for fallback_model in fallback_models:
737
- try:
738
- if debug_mode:
739
- print(f"Trying fallback model: {fallback_model}")
740
- self.tts = TTS(model_name=fallback_model, progress_bar=self.debug_mode)
741
- tts_loaded = True
742
- break
743
- except Exception as fallback_error:
744
- if debug_mode:
745
- print(f"Fallback {fallback_model} failed: {fallback_error}")
746
- continue
747
-
748
- if not tts_loaded:
749
- self._handle_model_load_failure(debug_mode)
750
-
751
- def _handle_model_load_failure(self, debug_mode: bool):
752
- """Handle complete model loading failure with actionable guidance."""
753
- # Restore stdout to show user-friendly message
754
- if not debug_mode:
755
- sys.stdout = sys.__stdout__
756
-
757
- print("\n" + "="*70)
758
- print("❌ TTS Model Loading Failed")
759
- print("="*70)
760
- print("\nNo TTS models could be loaded (offline or online).")
761
- print("\nQuick fixes:")
762
- print(" 1. Download essential models:")
763
- print(" abstractvoice download-models")
764
- print(" 2. Check internet connectivity")
765
- print(" 3. Clear corrupted cache:")
766
- print(" rm -rf ~/.cache/tts ~/.local/share/tts")
767
- print(" 4. Reinstall TTS:")
768
- print(" pip install --force-reinstall coqui-tts")
769
- print(" 5. Use text-only mode:")
770
- print(" abstractvoice --no-tts")
771
- print("="*70)
772
-
773
- raise RuntimeError(
774
- "❌ Failed to load any TTS model.\n"
775
- "This typically means:\n"
776
- " • No models cached locally AND no internet connection\n"
777
- " • Corrupted model cache\n"
778
- " • Insufficient disk space\n"
779
- " • Network firewall blocking downloads\n\n"
780
- "Run 'abstractvoice download-models' when you have internet access."
781
- )
782
-
783
- def _on_playback_complete(self):
784
- """Callback when audio playback completes."""
785
- self.is_playing = False
786
- if self.on_playback_end:
787
- self.on_playback_end()
788
-
789
- def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
790
- """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
791
- # Stop any existing playback
792
- self.stop()
793
-
794
- if not text:
795
- return False
796
-
797
- try:
798
- # Preprocess text for better synthesis quality
799
- processed_text = preprocess_text(text)
800
-
801
- if self.debug_mode:
802
- print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
803
- print(f" > Text length: {len(processed_text)} chars")
804
- if language != 'en':
805
- print(f" > Language: {language}")
806
- if speed != 1.0:
807
- print(f" > Using speed multiplier: {speed}x")
808
-
809
- # For very long text, chunk it at natural boundaries
810
- text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
811
-
812
- if self.debug_mode and len(text_chunks) > 1:
813
- print(f" > Split into {len(text_chunks)} chunks for processing")
814
-
815
- # Set playing state
816
- self.is_playing = True
817
- self.is_paused_state = False
818
-
819
- # Call start callback
820
- if self.on_playback_start:
821
- self.on_playback_start()
822
-
823
- # Synthesize and queue audio chunks
824
- def synthesis_worker():
825
- try:
826
- for i, chunk in enumerate(text_chunks):
827
- if self.stop_flag.is_set():
828
- break
829
-
830
- if self.debug_mode and len(text_chunks) > 1:
831
- print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
832
-
833
- # Generate audio for this chunk with language support
834
- try:
835
- # Check if this is an XTTS model (supports language parameter)
836
- if 'xtts' in self.tts.model_name.lower():
837
- chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
838
- if self.debug_mode and language != 'en':
839
- print(f" > Using XTTS with language: {language}")
840
- else:
841
- # Monolingual model - ignore language parameter
842
- chunk_audio = self.tts.tts(chunk, split_sentences=True)
843
- if self.debug_mode and language != 'en':
844
- print(f" > Monolingual model - ignoring language parameter")
845
- except Exception as tts_error:
846
- # Fallback: try without language parameter
847
- if self.debug_mode:
848
- print(f" > TTS with language failed, trying without: {tts_error}")
849
- chunk_audio = self.tts.tts(chunk, split_sentences=True)
850
-
851
- if chunk_audio and len(chunk_audio) > 0:
852
- # Apply speed adjustment
853
- if speed != 1.0:
854
- chunk_audio = apply_speed_without_pitch_change(
855
- np.array(chunk_audio), speed
856
- )
857
-
858
- # Queue the audio for playback
859
- self.audio_player.play_audio(np.array(chunk_audio))
860
-
861
- if self.debug_mode:
862
- print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
863
-
864
- # Small delay between chunks to prevent overwhelming the queue
865
- time.sleep(0.01)
866
-
867
- except Exception as e:
868
- if self.debug_mode:
869
- print(f"Error in synthesis worker: {e}")
870
- finally:
871
- # Synthesis complete - audio player will handle completion callback
872
- pass
873
-
874
- # Start synthesis in background thread
875
- synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
876
- synthesis_thread.start()
877
-
878
- return True
879
-
880
- except Exception as e:
881
- if self.debug_mode:
882
- print(f"Error in _speak_with_nonblocking_player: {e}")
883
- self.is_playing = False
884
- return False
885
-
886
- def speak(self, text, speed=1.0, callback=None, language='en'):
887
- """Convert text to speech and play audio with language support.
888
-
889
- Implements SOTA best practices for long text synthesis:
890
- - Text preprocessing and normalization
891
- - Intelligent chunking for very long text (>500 chars)
892
- - Sentence segmentation to prevent attention degradation
893
- - Seamless audio concatenation for chunks
894
- - Multilingual support via XTTS models
895
-
896
- Args:
897
- text: Text to convert to speech
898
- speed: Speed multiplier (0.5-2.0)
899
- callback: Function to call when speech is complete
900
- language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
901
-
902
- Returns:
903
- True if speech started, False if text was empty
904
- """
905
- # Use the new non-blocking audio player for immediate pause/resume
906
- return self._speak_with_nonblocking_player(text, speed, callback, language)
907
-
908
- if not text:
909
- return False
910
-
911
- try:
912
- # Preprocess text for better synthesis quality
913
- processed_text = preprocess_text(text)
914
-
915
- if self.debug_mode:
916
- print(f" > Speaking: '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
917
- print(f" > Text length: {len(processed_text)} chars")
918
- if speed != 1.0:
919
- print(f" > Using speed multiplier: {speed}x")
920
-
921
- # For very long text, chunk it at natural boundaries
922
- # Use 300 chars to stay well within model's training distribution
923
- text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
924
-
925
- if self.debug_mode and len(text_chunks) > 1:
926
- print(f" > Split into {len(text_chunks)} chunks for processing")
927
-
928
- # Redirect stdout for non-debug mode
929
- orig_stdout = None
930
- null_out = None
931
- if not self.debug_mode:
932
- orig_stdout = sys.stdout
933
- null_out = open(os.devnull, 'w')
934
- sys.stdout = null_out
935
-
936
- try:
937
- # Choose synthesis strategy based on streaming mode
938
- if self.streaming and len(text_chunks) > 1:
939
- # STREAMING MODE: Synthesize and play progressively
940
- if self.debug_mode:
941
- sys.stdout = sys.__stdout__
942
- print(f" > Streaming mode: will start playback after first chunk")
943
- if not self.debug_mode:
944
- sys.stdout = null_out
945
-
946
- # Synthesize first chunk
947
- if self.debug_mode:
948
- sys.stdout = sys.__stdout__
949
- print(f" > Processing chunk 1/{len(text_chunks)} ({len(text_chunks[0])} chars)...")
950
- if not self.debug_mode:
951
- sys.stdout = null_out
952
-
953
- first_audio = self.tts.tts(text_chunks[0], split_sentences=True)
954
-
955
- if not first_audio:
956
- if self.debug_mode:
957
- sys.stdout = sys.__stdout__
958
- print("TTS failed to generate audio for first chunk.")
959
- return False
960
-
961
- # Apply speed adjustment using time-stretching (preserves pitch)
962
- if speed != 1.0:
963
- first_audio = apply_speed_without_pitch_change(
964
- np.array(first_audio), speed
965
- )
966
-
967
- if self.debug_mode:
968
- sys.stdout = sys.__stdout__
969
- print(f" > Chunk 1 generated {len(first_audio)} audio samples")
970
- if speed != 1.0:
971
- print(f" > Applied time-stretch: {speed}x (pitch preserved)")
972
- print(f" > Starting playback while synthesizing remaining chunks...")
973
- if not self.debug_mode:
974
- sys.stdout = null_out
975
-
976
- # Initialize queue with first chunk
977
- with self.queue_lock:
978
- self.audio_queue = [first_audio]
979
-
980
- # Start playback thread (will play from queue)
981
- audio = None # Will use queue instead
982
-
983
- else:
984
- # NON-STREAMING MODE: Synthesize all chunks then play
985
- audio_chunks = []
986
- for i, chunk in enumerate(text_chunks):
987
- if self.debug_mode and len(text_chunks) > 1:
988
- sys.stdout = sys.__stdout__
989
- print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
990
- if not self.debug_mode:
991
- sys.stdout = null_out
992
-
993
- # Use split_sentences=True (SOTA best practice)
994
- chunk_audio = self.tts.tts(chunk, split_sentences=True)
995
-
996
- if chunk_audio:
997
- # Apply speed adjustment using time-stretching (preserves pitch)
998
- if speed != 1.0:
999
- chunk_audio = apply_speed_without_pitch_change(
1000
- np.array(chunk_audio), speed
1001
- )
1002
- audio_chunks.append(chunk_audio)
1003
- if self.debug_mode and len(text_chunks) > 1:
1004
- sys.stdout = sys.__stdout__
1005
- print(f" > Chunk {i+1} generated {len(chunk_audio)} audio samples")
1006
- if not self.debug_mode:
1007
- sys.stdout = null_out
1008
- elif self.debug_mode:
1009
- sys.stdout = sys.__stdout__
1010
- print(f" > Warning: Chunk {i+1} failed to generate audio")
1011
- if not self.debug_mode:
1012
- sys.stdout = null_out
1013
-
1014
- if not audio_chunks:
1015
- if self.debug_mode:
1016
- sys.stdout = sys.__stdout__
1017
- print("TTS failed to generate audio.")
1018
- return False
1019
-
1020
- # Concatenate audio arrays
1021
- if len(audio_chunks) == 1:
1022
- audio = audio_chunks[0]
1023
- else:
1024
- audio = np.concatenate(audio_chunks)
1025
- if self.debug_mode:
1026
- sys.stdout = sys.__stdout__
1027
- print(f" > Concatenated {len(audio_chunks)} chunks into {len(audio)} total audio samples")
1028
- if not self.debug_mode:
1029
- sys.stdout = null_out
1030
-
1031
- finally:
1032
- # Restore stdout if we redirected it
1033
- if not self.debug_mode and orig_stdout:
1034
- sys.stdout = orig_stdout
1035
- if null_out:
1036
- null_out.close()
1037
-
1038
- def _audio_playback():
1039
- # Import sounddevice at runtime to avoid loading heavy dependencies
1040
- sd, _ = _import_audio_deps()
1041
-
1042
- try:
1043
- self.is_playing = True
1044
- self.start_time = time.time()
1045
-
1046
- # Notify that playback is starting (to pause voice recognition)
1047
- if self.on_playback_start:
1048
- self.on_playback_start()
1049
-
1050
- # Use standard playback rate (speed is handled via time-stretching)
1051
- playback_rate = 22050
1052
-
1053
- # STREAMING MODE: Play from queue while synthesizing remaining chunks
1054
- if audio is None: # Streaming mode indicator
1055
- # Start background thread to synthesize remaining chunks
1056
- def _synthesize_remaining():
1057
- for i in range(1, len(text_chunks)):
1058
- if self.stop_flag.is_set():
1059
- break
1060
-
1061
- if self.debug_mode:
1062
- print(f" > [Background] Processing chunk {i+1}/{len(text_chunks)} ({len(text_chunks[i])} chars)...")
1063
-
1064
- try:
1065
- chunk_audio = self.tts.tts(text_chunks[i], split_sentences=True)
1066
- if chunk_audio:
1067
- # Apply speed adjustment using time-stretching (preserves pitch)
1068
- if speed != 1.0:
1069
- chunk_audio = apply_speed_without_pitch_change(
1070
- np.array(chunk_audio), speed
1071
- )
1072
- with self.queue_lock:
1073
- self.audio_queue.append(chunk_audio)
1074
- if self.debug_mode:
1075
- print(f" > [Background] Chunk {i+1} generated {len(chunk_audio)} samples, added to queue")
1076
- except Exception as e:
1077
- if self.debug_mode:
1078
- print(f" > [Background] Chunk {i+1} synthesis error: {e}")
1079
-
1080
- synthesis_thread = threading.Thread(target=_synthesize_remaining)
1081
- synthesis_thread.daemon = True
1082
- synthesis_thread.start()
1083
-
1084
- # Play chunks from queue as they become available
1085
- chunks_played = 0
1086
- while chunks_played < len(text_chunks) and not self.stop_flag.is_set():
1087
- # Check for pause before processing next chunk
1088
- while not self.pause_flag.is_set() and not self.stop_flag.is_set():
1089
- time.sleep(0.1) # Non-blocking pause check
1090
-
1091
- if self.stop_flag.is_set():
1092
- break
1093
-
1094
- # Wait for next chunk to be available
1095
- while True:
1096
- with self.queue_lock:
1097
- if chunks_played < len(self.audio_queue):
1098
- chunk_to_play = self.audio_queue[chunks_played]
1099
- break
1100
- if self.stop_flag.is_set():
1101
- break
1102
- time.sleep(0.05) # Short wait before checking again
1103
-
1104
- if self.stop_flag.is_set():
1105
- break
1106
-
1107
- # Play this chunk
1108
- audio_array = np.array(chunk_to_play)
1109
- sd.play(audio_array, samplerate=playback_rate)
1110
-
1111
- # Wait for this chunk to finish (with frequent pause checks)
1112
- while not self.stop_flag.is_set() and sd.get_stream().active:
1113
- # Check for pause more frequently
1114
- if not self.pause_flag.is_set():
1115
- # Paused - let current audio finish naturally (avoids terminal interference)
1116
- break
1117
- time.sleep(0.05) # Check every 50ms for better responsiveness
1118
-
1119
- if self.stop_flag.is_set():
1120
- # Only use sd.stop() for explicit stop, not pause
1121
- sd.stop()
1122
- break
1123
-
1124
- chunks_played += 1
1125
-
1126
- synthesis_thread.join(timeout=1.0) # Wait for synthesis to complete
1127
-
1128
- else:
1129
- # NON-STREAMING MODE: Play concatenated audio
1130
- audio_array = np.array(audio)
1131
- sd.play(audio_array, samplerate=playback_rate)
1132
-
1133
- # Wait for playback to complete or stop flag (with pause support)
1134
- while not self.stop_flag.is_set() and sd.get_stream().active:
1135
- # Check for pause more frequently
1136
- if not self.pause_flag.is_set():
1137
- # Paused - let current audio finish naturally and wait
1138
- if self.debug_mode:
1139
- print(" > Audio paused, waiting for resume...")
1140
- # Non-blocking wait for resume
1141
- while not self.pause_flag.is_set() and not self.stop_flag.is_set():
1142
- time.sleep(0.1)
1143
- if not self.stop_flag.is_set():
1144
- # Resume - restart the audio (non-streaming limitation)
1145
- if self.debug_mode:
1146
- print(" > Resuming audio from beginning of current segment...")
1147
- sd.play(audio_array, samplerate=playback_rate)
1148
- time.sleep(0.05) # Check every 50ms for better responsiveness
1149
-
1150
- sd.stop()
1151
-
1152
- self.is_playing = False
1153
-
1154
- # Notify that playback has ended (to resume voice recognition)
1155
- if self.on_playback_end:
1156
- self.on_playback_end()
1157
-
1158
- if self.debug_mode:
1159
- duration = time.time() - self.start_time
1160
- if not self.stop_flag.is_set(): # Only if completed normally
1161
- print(f" > Speech completed in {duration:.2f} seconds")
1162
-
1163
- # Call the callback if provided and speech completed normally
1164
- if callback and not self.stop_flag.is_set():
1165
- callback()
1166
-
1167
- except Exception as e:
1168
- if self.debug_mode:
1169
- print(f"Audio playback error: {e}")
1170
- self.is_playing = False
1171
- # Ensure we notify end even on error
1172
- if self.on_playback_end:
1173
- self.on_playback_end()
1174
-
1175
- # Start playback in a separate thread
1176
- self.stop_flag.clear()
1177
- self.pause_flag.set() # Ensure we start unpaused
1178
- self.is_paused_state = False # Reset paused state
1179
- self.playback_thread = threading.Thread(target=_audio_playback)
1180
- self.playback_thread.start()
1181
- return True
1182
-
1183
- except Exception as e:
1184
- if self.debug_mode:
1185
- print(f"TTS error: {e}")
1186
- return False
1187
-
1188
- def stop(self):
1189
- """Stop current audio playback.
1190
-
1191
- Returns:
1192
- True if playback was stopped, False if no playback was active
1193
- """
1194
- stopped = False
1195
-
1196
- # Stop new non-blocking audio player
1197
- if self.audio_player.is_playing:
1198
- self.audio_player.stop_stream()
1199
- stopped = True
1200
- if self.debug_mode:
1201
- print(" > TTS playback stopped (non-blocking)")
1202
-
1203
- # Stop legacy playback system
1204
- if self.playback_thread and self.playback_thread.is_alive():
1205
- self.stop_flag.set()
1206
- self.pause_flag.set() # Ensure we're not stuck in pause
1207
- self.is_paused_state = False # Reset paused state
1208
- self.playback_thread.join()
1209
- self.playback_thread = None
1210
- stopped = True
1211
-
1212
- if self.debug_mode:
1213
- print(" > TTS playback interrupted (legacy)")
1214
-
1215
- # Reset state
1216
- self.is_playing = False
1217
- self.is_paused_state = False
1218
-
1219
- return stopped
1220
-
1221
- def pause(self):
1222
- """Pause current speech playback.
1223
-
1224
- Uses a non-interfering pause method that avoids terminal I/O issues.
1225
-
1226
- Returns:
1227
- True if paused, False if no playback was active
1228
- """
1229
- # Try new non-blocking audio player first
1230
- if self.audio_player.is_playing:
1231
- result = self.audio_player.pause()
1232
- if result:
1233
- self.is_paused_state = True
1234
- if self.debug_mode:
1235
- print(" > TTS paused immediately (non-blocking)")
1236
- return result
1237
-
1238
- # Fallback to legacy system
1239
- if self.playback_thread and self.playback_thread.is_alive() and self.is_playing:
1240
- self.pause_flag.clear() # Clear means "paused"
1241
- self.is_paused_state = True # Explicit state tracking
1242
-
1243
- if self.debug_mode:
1244
- print(" > TTS paused (legacy method)")
1245
-
1246
- return True
1247
-
1248
- return False
1249
-
1250
- def resume(self):
1251
- """Resume paused speech playback.
1252
-
1253
- Returns:
1254
- True if resumed, False if not paused or no playback active
1255
- """
1256
- if self.is_paused_state:
1257
- # Try new non-blocking audio player first
1258
- if self.audio_player.is_paused_state():
1259
- result = self.audio_player.resume()
1260
- if result:
1261
- self.is_paused_state = False
1262
- if self.debug_mode:
1263
- print(" > TTS resumed immediately (non-blocking)")
1264
- return True
1265
-
1266
- # Fallback to legacy system
1267
- if self.playback_thread and self.playback_thread.is_alive():
1268
- # Thread is still alive, can resume
1269
- self.pause_flag.set() # Set means "not paused"
1270
- self.is_paused_state = False # Clear explicit state
1271
- if self.debug_mode:
1272
- print(" > TTS resumed (legacy method)")
1273
- return True
1274
- else:
1275
- # Thread died while paused, nothing to resume
1276
- self.is_paused_state = False # Clear paused state
1277
- if self.debug_mode:
1278
- print(" > TTS was paused but playback already completed")
1279
- return False
1280
- return False
1281
-
1282
- def is_paused(self):
1283
- """Check if TTS is currently paused.
1284
-
1285
- Returns:
1286
- True if paused, False otherwise
1287
- """
1288
- return self.is_paused_state
1289
-
1290
- def is_active(self):
1291
- """Check if TTS is currently playing.
1292
-
1293
- Returns:
1294
- True if TTS is active, False otherwise
1295
- """
1296
- return self.is_playing
1297
-