abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/tts/tts_engine.py
CHANGED
|
@@ -1,1297 +1,346 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Core audio playback utilities (Piper-first).
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
- Robust error handling
|
|
3
|
+
AbstractVoice core intentionally avoids shipping legacy Coqui-based TTSEngine
|
|
4
|
+
logic. This module contains only reusable audio utilities:
|
|
5
|
+
- `NonBlockingAudioPlayer` for low-latency pause/resume/stop
|
|
6
|
+
- `apply_speed_without_pitch_change` (optional librosa)
|
|
8
7
|
"""
|
|
9
8
|
|
|
10
|
-
import
|
|
11
|
-
|
|
12
|
-
import numpy as np
|
|
13
|
-
import os
|
|
14
|
-
import sys
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
15
11
|
import logging
|
|
16
|
-
import
|
|
17
|
-
import re
|
|
12
|
+
import os
|
|
18
13
|
import queue
|
|
14
|
+
import threading
|
|
15
|
+
from typing import Callable, Optional
|
|
19
16
|
|
|
20
|
-
|
|
21
|
-
def _import_tts():
|
|
22
|
-
"""Import TTS with helpful error message if dependencies missing."""
|
|
23
|
-
try:
|
|
24
|
-
from TTS.api import TTS
|
|
25
|
-
return TTS
|
|
26
|
-
except ImportError as e:
|
|
27
|
-
error_msg = str(e).lower()
|
|
17
|
+
import numpy as np
|
|
28
18
|
|
|
29
|
-
|
|
30
|
-
if "torchvision::nms does not exist" in error_msg or "gpt2pretrainedmodel" in error_msg:
|
|
31
|
-
raise ImportError(
|
|
32
|
-
"❌ PyTorch/TorchVision version conflict detected!\n\n"
|
|
33
|
-
"This is a known compatibility issue. To fix:\n\n"
|
|
34
|
-
"1. Uninstall conflicting packages:\n"
|
|
35
|
-
" pip uninstall torch torchvision torchaudio transformers\n\n"
|
|
36
|
-
"2. Reinstall with compatible versions:\n"
|
|
37
|
-
" pip install abstractvoice[all] # Installs tested compatible versions\n\n"
|
|
38
|
-
"3. Or use specific PyTorch version:\n"
|
|
39
|
-
" pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1\n"
|
|
40
|
-
" pip install abstractvoice[voice-full]\n\n"
|
|
41
|
-
"For conda environments, consider:\n"
|
|
42
|
-
" conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n\n"
|
|
43
|
-
f"Original error: {e}"
|
|
44
|
-
) from e
|
|
45
|
-
elif "no module named 'tts'" in error_msg or "coqui" in error_msg:
|
|
46
|
-
raise ImportError(
|
|
47
|
-
"TTS functionality requires coqui-tts. Install with:\n"
|
|
48
|
-
" pip install abstractvoice[tts] # For TTS only\n"
|
|
49
|
-
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
50
|
-
" pip install abstractvoice[all] # For all features\n"
|
|
51
|
-
f"Original error: {e}"
|
|
52
|
-
) from e
|
|
53
|
-
else:
|
|
54
|
-
# Generic import error
|
|
55
|
-
raise ImportError(
|
|
56
|
-
"TTS functionality requires optional dependencies. Install with:\n"
|
|
57
|
-
" pip install abstractvoice[tts] # For TTS only\n"
|
|
58
|
-
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
59
|
-
" pip install abstractvoice[all] # For all features\n\n"
|
|
60
|
-
"If you're getting PyTorch-related errors, try:\n"
|
|
61
|
-
" pip install abstractvoice[core-tts] # Lightweight TTS without extras\n\n"
|
|
62
|
-
f"Original error: {e}"
|
|
63
|
-
) from e
|
|
19
|
+
from ..audio.resample import linear_resample_mono
|
|
64
20
|
|
|
65
|
-
def _import_audio_deps():
|
|
66
|
-
"""Import audio dependencies with helpful error message if missing."""
|
|
67
|
-
try:
|
|
68
|
-
import sounddevice as sd
|
|
69
|
-
import librosa
|
|
70
|
-
return sd, librosa
|
|
71
|
-
except ImportError as e:
|
|
72
|
-
error_msg = str(e).lower()
|
|
73
21
|
|
|
74
|
-
|
|
75
|
-
raise ImportError(
|
|
76
|
-
"Audio playback requires sounddevice. Install with:\n"
|
|
77
|
-
" pip install abstractvoice[audio-only] # For audio processing only\n"
|
|
78
|
-
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
79
|
-
" pip install abstractvoice[all] # For all features\n\n"
|
|
80
|
-
"On some systems, you may need system audio libraries:\n"
|
|
81
|
-
" Ubuntu/Debian: sudo apt-get install portaudio19-dev\n"
|
|
82
|
-
" macOS: brew install portaudio\n"
|
|
83
|
-
" Windows: Usually works out of the box\n\n"
|
|
84
|
-
f"Original error: {e}"
|
|
85
|
-
) from e
|
|
86
|
-
elif "librosa" in error_msg:
|
|
87
|
-
raise ImportError(
|
|
88
|
-
"Audio processing requires librosa. Install with:\n"
|
|
89
|
-
" pip install abstractvoice[tts] # For TTS functionality\n"
|
|
90
|
-
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
91
|
-
" pip install abstractvoice[all] # For all features\n\n"
|
|
92
|
-
f"Original error: {e}"
|
|
93
|
-
) from e
|
|
94
|
-
else:
|
|
95
|
-
# Generic audio import error
|
|
96
|
-
raise ImportError(
|
|
97
|
-
"Audio functionality requires optional dependencies. Install with:\n"
|
|
98
|
-
" pip install abstractvoice[audio-only] # For audio processing only\n"
|
|
99
|
-
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
100
|
-
" pip install abstractvoice[all] # For all features\n\n"
|
|
101
|
-
f"Original error: {e}"
|
|
102
|
-
) from e
|
|
22
|
+
_STDERR_FD_LOCK = threading.Lock()
|
|
103
23
|
|
|
104
|
-
# Suppress the PyTorch FutureWarning about torch.load
|
|
105
|
-
warnings.filterwarnings(
|
|
106
|
-
"ignore",
|
|
107
|
-
message="You are using `torch.load` with `weights_only=False`",
|
|
108
|
-
category=FutureWarning
|
|
109
|
-
)
|
|
110
24
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
"ignore",
|
|
114
|
-
message=".*pkg_resources is deprecated.*",
|
|
115
|
-
category=DeprecationWarning
|
|
116
|
-
)
|
|
25
|
+
class _SilenceStderrFD:
    """Temporarily redirect OS-level stderr (fd=2) to the null device.

    PortAudio (and some underlying CoreAudio/AUHAL code paths) can emit warnings
    directly to stderr, bypassing Python's `sys.stderr`. In interactive REPL
    contexts this can corrupt the prompt/spinner UI.

    The redirection is process-wide, so it is serialized through the
    module-level `_STDERR_FD_LOCK`. That lock is a plain (non-reentrant) lock:
    do not nest two enabled instances in the same thread.

    Args:
        enabled: When false the context manager is a no-op (used in debug mode
            so native warnings stay visible).
    """

    def __init__(self, enabled: bool = True):
        self.enabled = bool(enabled)
        self._devnull_fd = None
        self._saved_stderr_fd = None
        # True only while *this* instance owns _STDERR_FD_LOCK, so teardown can
        # never release a lock that some other thread has since acquired.
        self._lock_held = False

    def __enter__(self):
        if not self.enabled:
            return self
        _STDERR_FD_LOCK.acquire()
        self._lock_held = True
        try:
            self._devnull_fd = os.open(os.devnull, os.O_WRONLY)
            self._saved_stderr_fd = os.dup(2)
            os.dup2(self._devnull_fd, 2)
        except Exception:
            # Best effort: if redirection fails, undo whatever was done and run
            # the body with stderr untouched. The later __exit__ call made by
            # the `with` statement is then a harmless no-op.
            self._restore()
        return self

    def __exit__(self, exc_type, exc, tb):
        if self.enabled:
            self._restore()
        # Never suppress exceptions raised inside the `with` body.
        return False

    def _restore(self):
        """Put fd 2 back, close temporary descriptors and drop the lock (idempotent)."""
        # Detach the descriptors from the instance first so a repeated call
        # cannot dup2/close fd numbers that the OS may already have reused.
        saved_fd, self._saved_stderr_fd = self._saved_stderr_fd, None
        devnull_fd, self._devnull_fd = self._devnull_fd, None
        try:
            if saved_fd is not None:
                try:
                    os.dup2(saved_fd, 2)
                except OSError:
                    pass
        finally:
            for fd in (saved_fd, devnull_fd):
                if fd is None:
                    continue
                try:
                    os.close(fd)
                except OSError:
                    pass
            if self._lock_held:
                self._lock_held = False
                _STDERR_FD_LOCK.release()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _import_sounddevice():
    """Import and return the `sounddevice` module on demand.

    Raises:
        ImportError: If `sounddevice` cannot be imported; the message carries an
            install hint and the original error is chained as the cause.
    """
    try:
        import sounddevice as module
    except ImportError as exc:
        hint = (
            "Audio playback requires sounddevice. Install with:\n"
            " pip install abstractvoice\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return module
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _import_librosa():
    """Import and return the optional `librosa` module on demand.

    Raises:
        ImportError: If `librosa` cannot be imported; the message names the
            `audio-fx` extra and the original error is chained as the cause.
    """
    try:
        import librosa as module
    except ImportError as exc:
        hint = (
            "Speed/pitch processing requires librosa. Install with:\n"
            " pip install \"abstractvoice[audio-fx]\"\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return module
|
|
133
99
|
|
|
134
|
-
def preprocess_text(text):
    """Normalize text before speech synthesis.

    Steps, in order:
      1. Collapse ellipses (three or more dots, or the single-character
         Unicode ellipsis) into one full stop.
      2. Drop characters other than word characters, whitespace and the basic
         punctuation ``. , ! ? ; : - ' " ( )``.
      3. Collapse whitespace runs into single spaces. This happens after the
         filter so that removed symbols do not leave double spaces behind.
      4. Insert a missing space after ``. , ! ? ; :`` when a new token follows,
         without splitting numbers ("3.14", "10:30", "1,000"), stacked
         punctuation ("?!") or closing quotes/brackets.

    Args:
        text: Input text string.

    Returns:
        The cleaned text with surrounding whitespace stripped. Falsy input
        (``None`` or ``""``) is returned unchanged.
    """
    if not text:
        return text

    # Map the Unicode ellipsis to dots first; otherwise the character filter
    # would delete it and glue the neighbouring words together.
    text = re.sub(r'\.{3,}', '.', text.replace('\u2026', '...'))

    # Keep only basic punctuation that helps with prosody.
    text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)

    text = re.sub(r'\s+', ' ', text)

    # First lookahead: skip when whitespace, more punctuation, a closing
    # quote/bracket or end-of-text follows.
    # Second lookahead: skip separators sitting between two digits.
    text = re.sub(
        r'([.,!?;:])(?![\s.,!?;:\'")]|$)(?!(?<=\d[.,:])\d)',
        r'\1 ',
        text,
    )

    return text.strip()
|
|
166
100
|
|
|
101
|
+
def apply_speed_without_pitch_change(audio: np.ndarray, speed: float, sr: int = 22050) -> np.ndarray:
    """Time-stretch `audio` by `speed` while keeping pitch (best-effort).

    Args:
        audio: Audio samples.
        speed: Rate passed to `librosa.effects.time_stretch`. A falsy value or
            exactly 1.0 leaves the audio untouched.
        sr: Sample rate; accepted for interface compatibility and not used by
            the current implementation.

    Returns:
        The stretched samples, or the original `audio` object when no change is
        requested, librosa is unavailable, or stretching fails (a warning is
        logged in the failure cases).
    """
    if speed and speed != 1.0:
        try:
            return _import_librosa().effects.time_stretch(audio, rate=float(speed))
        except Exception as e:
            logging.warning(f"Time-stretching failed: {e}, using original audio")
    return audio
|
|
194
115
|
|
|
195
116
|
|
|
196
117
|
class NonBlockingAudioPlayer:
    """Non-blocking audio player using OutputStream callbacks for pause/resume.

    Mono float32 chunks are queued with `play_audio()` and consumed by
    `_audio_callback()`, which the output stream invokes to fill each output
    buffer. Pausing is done inside the callback by writing silence, so it takes
    effect at the next buffer.

    Public attributes:
        sample_rate: Rate the output stream runs at; `start_stream()` may change
            it when the device rejects the requested rate.
        is_playing: True from the first queued chunk until the callback finds
            nothing left to play.
        playback_complete_callback / on_audio_start / on_audio_end /
        on_audio_pause / on_audio_resume: Optional zero-argument hooks, each
            invoked on its own daemon thread.
        on_audio_chunk: Optional hook called synchronously from the audio
            callback as `on_audio_chunk(samples, sample_rate)` for every slice
            actually written to the output.
    """

    def __init__(self, sample_rate: int = 22050, debug_mode: bool = False):
        self.sample_rate = int(sample_rate)
        self.debug_mode = debug_mode

        self.audio_queue: "queue.Queue[np.ndarray]" = queue.Queue()
        self.stream = None
        self.is_playing = False

        self._pause_lock = threading.Lock()
        self._paused = False

        # Chunk currently being drained by the callback and the read offset in it.
        self.current_audio: Optional[np.ndarray] = None
        self.current_position = 0

        self.playback_complete_callback: Optional[Callable[[], None]] = None

        # Audio lifecycle callbacks
        self.on_audio_start: Optional[Callable[[], None]] = None
        self.on_audio_end: Optional[Callable[[], None]] = None
        self.on_audio_pause: Optional[Callable[[], None]] = None
        self.on_audio_resume: Optional[Callable[[], None]] = None
        self._audio_started = False

        # Optional hook: called with chunks that are actually written to the output.
        # Used for advanced features like AEC (barge-in without self-interruption).
        self.on_audio_chunk = None  # Callable[[np.ndarray, int], None] | None

    def _audio_callback(self, outdata, frames, _time, status):
        """Fill `outdata` (frames x channels) with queued audio or silence.

        Consecutive queued chunks are written back-to-back within one buffer,
        so a chunk boundary does not insert a silent gap. The buffer is padded
        with silence only when the queue has run dry.
        """
        if status and self.debug_mode:
            print(f"Audio callback status: {status}")

        with self._pause_lock:
            if self._paused:
                outdata.fill(0)
                return

        try:
            written = 0
            while written < frames:
                chunk = self.current_audio
                if chunk is None or self.current_position >= len(chunk):
                    try:
                        chunk = self.audio_queue.get_nowait()
                    except queue.Empty:
                        # Report end-of-playback only from a callback that had
                        # nothing at all to play, i.e. one buffer after the
                        # last samples were written.
                        if written == 0:
                            self._finish_playback()
                        break
                    self.current_audio = chunk
                    self.current_position = 0
                    continue  # re-check: the new chunk may itself be empty

                start = self.current_position
                count = min(frames - written, len(chunk) - start)
                segment = chunk[start : start + count]

                if not self._audio_started:
                    self._audio_started = True
                    if self.on_audio_start:
                        threading.Thread(target=self.on_audio_start, daemon=True).start()

                # Duplicate the mono signal onto every output channel.
                for channel in range(outdata.shape[1]):
                    outdata[written : written + count, channel] = segment

                # Emit the actual output chunk (mono float32) for optional consumers.
                try:
                    if self.on_audio_chunk:
                        self.on_audio_chunk(segment, int(self.sample_rate))
                except Exception:
                    # Never let optional hooks break audio playback.
                    pass

                self.current_position = start + count
                written += count

            if written < frames:
                outdata[written:].fill(0)

        except Exception as e:
            if self.debug_mode:
                print(f"Error in audio callback: {e}")
            outdata.fill(0)

    def _finish_playback(self):
        """Mark playback as finished and fire the end/complete hooks once."""
        if not self.is_playing:
            return
        self.is_playing = False
        self._audio_started = False
        if self.on_audio_end:
            threading.Thread(target=self.on_audio_end, daemon=True).start()
        if self.playback_complete_callback:
            threading.Thread(target=self.playback_complete_callback, daemon=True).start()

    def start_stream(self):
        """Open and start the output stream if it is not already running.

        The requested sample rate is tried first, then the default output
        device's default rate, then a short list of common rates. Each rate is
        tried with a fixed block size and with a PortAudio-chosen one. On
        success `self.sample_rate` is updated to the rate actually in use.

        Raises:
            Exception: The last error raised by the audio backend when no
                combination could be opened.
        """
        if self.stream is not None:
            return
        sd = _import_sounddevice()

        desired_sr = int(self.sample_rate)
        candidates = [desired_sr]
        try:
            dev = sd.query_devices(None, "output")  # default output device
            default_sr = int(round(float(dev.get("default_samplerate", 0) or 0)))
            if default_sr and default_sr not in candidates:
                candidates.append(default_sr)
        except Exception:
            # The device query is only a hint; fall through to the common rates.
            pass

        # Common output rates (keep short; we already prefer desired/default).
        for sr in (48000, 44100, 24000, 22050, 16000):
            if sr not in candidates:
                candidates.append(sr)

        last_err: Optional[Exception] = None
        for sr in candidates:
            for blocksize in (1024, 0):  # 0 => PortAudio decides (often most compatible)
                stream = None
                try:
                    with _SilenceStderrFD(enabled=not self.debug_mode):
                        stream = sd.OutputStream(
                            samplerate=int(sr),
                            channels=1,
                            callback=self._audio_callback,
                            blocksize=int(blocksize),
                            dtype=np.float32,
                        )
                        stream.start()
                    self.stream = stream
                    self.sample_rate = int(sr)
                    if self.debug_mode and int(sr) != desired_sr:
                        print(f"⚠️ Output device rejected {desired_sr}Hz; using {sr}Hz (resampling)")
                    return
                except Exception as e:
                    last_err = e
                    try:
                        if stream is not None:
                            stream.close()
                    except Exception:
                        pass
                    continue

        # If we couldn't start, surface the last error.
        if last_err is not None:
            raise last_err
        raise RuntimeError("Failed to start audio output stream")

    def stop_stream(self):
        """Stop and close the stream (ignoring backend errors) and reset state."""
        if self.stream:
            try:
                self.stream.stop()
            except Exception:
                pass
            try:
                self.stream.close()
            except Exception:
                pass
            self.stream = None

        self.is_playing = False
        with self._pause_lock:
            self._paused = False
        self.clear_queue()

    def cleanup(self):
        """Stop playback and drop the buffered audio and completion callback."""
        self.stop_stream()
        self.current_audio = None
        self.playback_complete_callback = None

    def play_audio(self, audio_array: np.ndarray, *, sample_rate: Optional[int] = None):
        """Queue audio for playback, starting the output stream if needed.

        Args:
            audio_array: Samples as an array or (nested) sequence. Input with
                more than one dimension is treated as (frames, channels) and
                averaged down to mono.
            sample_rate: Rate of `audio_array`. When omitted it is taken to be
                the player's rate; otherwise the audio is resampled to the
                stream's rate when the two differ.

        `None` or empty input is ignored. Chunks whose peak exceeds 1.0 are
        scaled down so that the peak is 1.0.
        """
        if audio_array is None:
            return

        # Ensure mono float32 vector. Converting first means nested lists are
        # downmixed the same way as ndarrays instead of being flattened.
        samples = np.asarray(audio_array, dtype=np.float32)
        if samples.size == 0:
            return
        if samples.ndim > 1:
            samples = samples.reshape(samples.shape[0], -1).mean(axis=1)
        samples = samples.astype(np.float32, copy=False).reshape(-1)

        # If we haven't started the output stream yet, do so first. This allows
        # `start_stream()` to fall back to a compatible device sample rate.
        if self.stream is None:
            self.start_stream()

        sr_in = int(sample_rate) if sample_rate is not None else int(self.sample_rate)
        sr_out = int(self.sample_rate)
        if sr_in != sr_out:
            samples = linear_resample_mono(samples, sr_in, sr_out)

        max_abs = float(np.max(np.abs(samples))) if len(samples) else 0.0
        if max_abs > 1.0:
            samples = samples / max_abs

        self.audio_queue.put(samples)
        self.is_playing = True

    def pause(self) -> bool:
        """Pause playback; return True only if playback was active and unpaused."""
        with self._pause_lock:
            if self.is_playing and not self._paused:
                self._paused = True
                if self.on_audio_pause:
                    threading.Thread(target=self.on_audio_pause, daemon=True).start()
                return True
            return False

    def resume(self) -> bool:
        """Resume playback; return True only if the player was paused."""
        with self._pause_lock:
            if self._paused:
                self._paused = False
                if self.on_audio_resume:
                    threading.Thread(target=self.on_audio_resume, daemon=True).start()
                return True
            return False

    def is_paused_state(self) -> bool:
        """Return whether the player is currently paused."""
        with self._pause_lock:
            return bool(self._paused)

    def clear_queue(self):
        """Discard all queued chunks and the chunk currently being played."""
        while not self.audio_queue.empty():
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break
        self.current_audio = None
        self.current_position = 0
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
def chunk_long_text(text, max_chunk_size=300):
    """Split very long text into chunks no longer than `max_chunk_size`.

    Text is broken at the coarsest boundary that fits: blank-line paragraph
    breaks first, then sentence ends (``. ! ?`` followed by whitespace), then
    spaces between words, and finally a hard cut inside a single word that is
    longer than the limit. Neighbouring pieces are packed greedily into the
    same chunk, re-joined with a blank line between paragraphs and a single
    space between sentences or words; the joiner counts toward the limit.

    Args:
        text: Input text string.
        max_chunk_size: Maximum characters per chunk (default 300).

    Returns:
        List of non-empty chunks, each at most `max_chunk_size` characters.
        Text that already fits is returned unchanged as a one-element list,
        as is text that contains nothing but whitespace.

    Raises:
        ValueError: If the text needs splitting and `max_chunk_size` is < 1.
    """
    if len(text) <= max_chunk_size:
        return [text]
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    def _pieces():
        """Yield (joiner, piece) pairs; every piece fits within the limit."""
        for paragraph in text.split('\n\n'):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            joiner = '\n\n'
            if len(paragraph) <= max_chunk_size:
                yield joiner, paragraph
                continue
            for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
                if not sentence:
                    continue
                if len(sentence) <= max_chunk_size:
                    yield joiner, sentence
                    joiner = ' '
                    continue
                for word in sentence.split():
                    # An over-long word is cut into limit-sized slices; the
                    # slices of one word are never separated by a space.
                    for start in range(0, len(word), max_chunk_size):
                        yield joiner, word[start:start + max_chunk_size]
                        joiner = ''
                    joiner = ' '

    chunks = []
    current = ''
    for joiner, piece in _pieces():
        if current and len(current) + len(joiner) + len(piece) <= max_chunk_size:
            current += joiner + piece
        else:
            if current:
                chunks.append(current)
            current = piece
    if current:
        chunks.append(current)

    return chunks if chunks else [text]
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
class TTSEngine:
|
|
466
|
-
"""Text-to-speech engine with interrupt capability."""
|
|
467
|
-
|
|
468
|
-
def __init__(self, model_name="tts_models/en/ljspeech/vits", debug_mode=False, streaming=True):
|
|
469
|
-
"""Initialize the TTS engine.
|
|
470
|
-
|
|
471
|
-
Args:
|
|
472
|
-
model_name: TTS model to use (default: vits - best quality, requires espeak-ng)
|
|
473
|
-
debug_mode: Enable debug output
|
|
474
|
-
streaming: Enable streaming playback (start playing while synthesizing remaining chunks)
|
|
475
|
-
|
|
476
|
-
Note:
|
|
477
|
-
VITS model (default) requires espeak-ng for best quality:
|
|
478
|
-
- macOS: brew install espeak-ng
|
|
479
|
-
- Linux: sudo apt-get install espeak-ng
|
|
480
|
-
- Windows: See installation guide in README
|
|
481
|
-
|
|
482
|
-
If espeak-ng is not available, will auto-fallback to fast_pitch
|
|
483
|
-
"""
|
|
484
|
-
# Set up debug mode
|
|
485
|
-
self.debug_mode = debug_mode
|
|
486
|
-
self.streaming = streaming
|
|
487
|
-
|
|
488
|
-
# Callback to notify when TTS starts/stops (for pausing voice recognition)
|
|
489
|
-
self.on_playback_start = None
|
|
490
|
-
self.on_playback_end = None
|
|
491
|
-
|
|
492
|
-
# Suppress TTS output unless in debug mode
|
|
493
|
-
if not debug_mode:
|
|
494
|
-
# Suppress all TTS logging
|
|
495
|
-
logging.getLogger('TTS').setLevel(logging.ERROR)
|
|
496
|
-
logging.getLogger('TTS.utils.audio').setLevel(logging.ERROR)
|
|
497
|
-
logging.getLogger('TTS.utils.io').setLevel(logging.ERROR)
|
|
498
|
-
logging.getLogger('numba').setLevel(logging.ERROR)
|
|
499
|
-
|
|
500
|
-
# Disable stdout during TTS loading
|
|
501
|
-
os.environ['TTS_VERBOSE'] = '0'
|
|
502
|
-
|
|
503
|
-
# Temporarily redirect stdout to suppress TTS init messages
|
|
504
|
-
orig_stdout = sys.stdout
|
|
505
|
-
null_out = open(os.devnull, 'w')
|
|
506
|
-
sys.stdout = null_out
|
|
507
|
-
|
|
508
|
-
try:
|
|
509
|
-
if self.debug_mode:
|
|
510
|
-
print(f" > Loading TTS model: {model_name}")
|
|
511
|
-
|
|
512
|
-
# Try simple, effective initialization strategy
|
|
513
|
-
try:
|
|
514
|
-
TTS = _import_tts()
|
|
515
|
-
success, final_model = self._load_with_simple_fallback(TTS, model_name, debug_mode)
|
|
516
|
-
if not success:
|
|
517
|
-
# If all fails, provide actionable guidance
|
|
518
|
-
self._handle_model_load_failure(debug_mode)
|
|
519
|
-
elif self.debug_mode and final_model != model_name:
|
|
520
|
-
print(f" > Loaded fallback model: {final_model}")
|
|
521
|
-
except Exception as e:
|
|
522
|
-
error_msg = str(e).lower()
|
|
523
|
-
# Check if this is an espeak-related error
|
|
524
|
-
if ("espeak" in error_msg or "phoneme" in error_msg):
|
|
525
|
-
self._handle_espeak_fallback(debug_mode)
|
|
526
|
-
else:
|
|
527
|
-
# Different error, re-raise
|
|
528
|
-
raise
|
|
529
|
-
finally:
|
|
530
|
-
# Restore stdout if we redirected it
|
|
531
|
-
if not debug_mode:
|
|
532
|
-
sys.stdout = orig_stdout
|
|
533
|
-
null_out.close()
|
|
534
|
-
|
|
535
|
-
# Initialize non-blocking audio player for immediate pause/resume
|
|
536
|
-
self.audio_player = NonBlockingAudioPlayer(sample_rate=22050, debug_mode=debug_mode)
|
|
537
|
-
self.audio_player.playback_complete_callback = self._on_playback_complete
|
|
538
|
-
|
|
539
|
-
# Legacy playback state (for compatibility with existing code)
|
|
540
|
-
self.is_playing = False
|
|
541
|
-
self.stop_flag = threading.Event()
|
|
542
|
-
self.pause_flag = threading.Event()
|
|
543
|
-
self.pause_flag.set() # Initially not paused (set means "not paused")
|
|
544
|
-
self.playback_thread = None
|
|
545
|
-
self.start_time = 0
|
|
546
|
-
self.audio_queue = [] # Queue for streaming playback
|
|
547
|
-
self.queue_lock = threading.Lock() # Thread-safe queue access
|
|
548
|
-
|
|
549
|
-
# Pause/resume state
|
|
550
|
-
self.pause_lock = threading.Lock() # Thread-safe pause operations
|
|
551
|
-
self.is_paused_state = False # Explicit paused state tracking
|
|
552
|
-
|
|
553
|
-
def _load_with_simple_fallback(self, TTS, preferred_model: str, debug_mode: bool) -> "tuple[bool, str | None]":
    """Load a TTS model into ``self.tts``, preferring the requested model.

    Attempts are made in this order and stop at the first success:

    1. the requested model, if the model manager reports it as cached;
    2. each cached model from the built-in fallback list;
    3. downloading, then loading, the requested model;
    4. downloading, then loading, each fallback model.

    Models whose name contains ``"vits"`` are skipped in steps 2-4 when
    ``_check_espeak_available()`` returns False.

    Args:
        TTS: Callable invoked as ``TTS(model_name=..., progress_bar=...)``
            to build the engine object stored on ``self.tts``.
        preferred_model: Name of the model the caller asked for.
        debug_mode: When true, progress messages are printed.

    Returns:
        ``(True, model_name)`` for the first model that loaded, or
        ``(False, None)`` when every attempt failed.
    """
    from ..simple_model_manager import get_model_manager

    def log(message):
        # Every progress message in this method is debug-only.
        if debug_mode:
            print(message)

    def load(model_name):
        # The progress bar follows the instance-level debug flag, as before.
        self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)

    model_manager = get_model_manager(debug_mode=debug_mode)

    # Step 1: check espeak availability so VITS models can be filtered out.
    espeak_available = self._check_espeak_available()
    if not espeak_available:
        log(" > espeak-ng not found, will skip VITS models")

    # Step 2: try the requested model first if it is cached.
    # A missing/empty listing is treated as "nothing cached" so the
    # membership tests below cannot fail on None.
    cached_models = model_manager.get_cached_models() or []
    if cached_models:
        log(f" > Found {len(cached_models)} cached models")

    if preferred_model in cached_models:
        log(f" > LOADING REQUESTED MODEL: {preferred_model}")
        if "it/" in preferred_model and "vits" in preferred_model:
            # Informational only: loading proceeds exactly as for any other model.
            log(" > Italian VITS model detected - using safe loading...")
        try:
            load(preferred_model)
        except Exception as e:
            log(f" > ❌ Requested model failed: {e}")
            error_msg = str(e).lower()
            if "it/" in preferred_model and ("segmentation" in error_msg or "crash" in error_msg):
                log(" > Italian model caused crash - marking as incompatible")
        else:
            log(f" > ✅ SUCCESS: Loaded requested model: {preferred_model}")
            return True, preferred_model

    # Step 3: the requested model was not cached or failed to load.
    log(" > Requested model unavailable, trying fallback models...")

    # Compatibility-first order; the requested model is dropped to avoid a
    # duplicate attempt.
    fallback_models = [
        model
        for model in (
            "tts_models/en/ljspeech/tacotron2-DDC",  # Most reliable (Linda)
            "tts_models/en/jenny/jenny",  # Different female speaker (Jenny)
            "tts_models/en/ek1/tacotron2",  # Male British accent (Edward)
            "tts_models/en/sam/tacotron-DDC",  # Different male voice (Sam)
            "tts_models/en/ljspeech/fast_pitch",  # Lightweight alternative
            "tts_models/en/ljspeech/glow-tts",  # Another alternative
            "tts_models/en/vctk/vits",  # Multi-speaker (requires espeak)
            "tts_models/en/ljspeech/vits",  # Premium (requires espeak)
        )
        if model != preferred_model
    ]

    for model in fallback_models:
        if model not in cached_models:
            continue
        if "vits" in model and not espeak_available:
            log(f" > Skipping {model} (requires espeak-ng)")
            continue
        try:
            log(f" > Trying fallback model: {model}")
            load(model)
        except Exception as e:
            log(f" > ❌ Fallback {model} failed: {e}")
        else:
            log(f" > ✅ Successfully loaded fallback: {model}")
            return True, model

    # Step 4: nothing cached worked; download the requested model first.
    log(" > No cached models worked, attempting downloads...")

    if "vits" not in preferred_model or espeak_available:
        try:
            log(f" > Downloading requested model: {preferred_model}...")
            if model_manager.download_model(preferred_model):
                load(preferred_model)
                log(f" > ✅ Downloaded and loaded requested: {preferred_model}")
                return True, preferred_model
            log(f" > ❌ Download failed for requested model: {preferred_model}")
        except Exception as e:
            log(f" > ❌ Failed to download/load requested model: {e}")

    # Step 5: download fallbacks, in the same compatibility-first order.
    for model in fallback_models:
        if "vits" in model and not espeak_available:
            continue
        try:
            log(f" > Downloading fallback: {model}...")
            if model_manager.download_model(model):
                load(model)
                log(f" > ✅ Downloaded and loaded fallback: {model}")
                return True, model
            log(f" > ❌ Download failed for {model}")
        except Exception as e:
            log(f" > ❌ Failed to load {model}: {e}")

    return False, None
|
|
686
|
-
|
|
687
|
-
def _check_espeak_available(self) -> bool:
    """Report whether an espeak binary can be executed on this system.

    Runs ``espeak-ng --version`` and, if that fails, ``espeak --version``,
    each with a 5 second timeout.

    Returns:
        True as soon as one command exits successfully; False when both
        are missing, exit non-zero, time out, or cannot be started.
    """
    import subprocess

    for command in ("espeak-ng", "espeak"):
        try:
            subprocess.run([command, "--version"],
                           capture_output=True, check=True, timeout=5)
        except (subprocess.SubprocessError, OSError):
            # CalledProcessError / TimeoutExpired are SubprocessError;
            # FileNotFoundError / PermissionError are OSError. Anything else
            # (e.g. KeyboardInterrupt) is deliberately not swallowed.
            continue
        return True
    return False
|
|
702
|
-
|
|
703
|
-
def _handle_espeak_fallback(self, debug_mode: bool):
    """Recover from an espeak-related load error by loading a model that does not need espeak.

    Prints an installation hint, then tries each model in a fixed list and
    stores the first one that loads on ``self.tts``. If none loads,
    ``_handle_model_load_failure`` is called.

    Args:
        debug_mode: When false, the hint is written to ``sys.__stdout__`` and
            stdout is silenced while models are loaded. When true, nothing is
            redirected and each attempt is reported.
    """
    import contextlib
    import os

    # Outside debug mode the caller may have redirected sys.stdout, so the
    # hint is sent to the interpreter's original stdout without rebinding
    # sys.stdout itself.
    banner_out = sys.stdout if debug_mode else sys.__stdout__

    print("\n" + "="*70, file=banner_out)
    print("⚠️ VITS Model Requires espeak-ng (Not Found)", file=banner_out)
    print("="*70, file=banner_out)
    print("\nFor BEST voice quality, install espeak-ng:", file=banner_out)
    print(" • macOS: brew install espeak-ng", file=banner_out)
    print(" • Linux: sudo apt-get install espeak-ng", file=banner_out)
    print(" • Windows: conda install espeak-ng (or see README)", file=banner_out)
    print("\nFalling back to compatible models (no espeak dependency)", file=banner_out)
    print("="*70 + "\n", file=banner_out)

    # Try non-phoneme models that don't require espeak (compatibility-first order)
    from TTS.api import TTS
    fallback_models = [
        "tts_models/en/ljspeech/tacotron2-DDC",  # Most reliable (Linda)
        "tts_models/en/jenny/jenny",  # Different female speaker (Jenny)
        "tts_models/en/ek1/tacotron2",  # Male British accent (Edward)
        "tts_models/en/sam/tacotron-DDC",  # Different male voice (Sam)
        "tts_models/en/ljspeech/fast_pitch",  # Lightweight alternative
        "tts_models/en/ljspeech/glow-tts",  # Another alternative
    ]

    with contextlib.ExitStack() as stack:
        if not debug_mode:
            # Silence loader output; the ExitStack closes the devnull handle
            # and puts back whatever sys.stdout was on entry, even on return.
            sink = stack.enter_context(open(os.devnull, 'w'))
            stack.enter_context(contextlib.redirect_stdout(sink))

        for fallback_model in fallback_models:
            try:
                if debug_mode:
                    print(f"Trying fallback model: {fallback_model}")
                self.tts = TTS(model_name=fallback_model, progress_bar=self.debug_mode)
                return
            except Exception as fallback_error:
                if debug_mode:
                    print(f"Fallback {fallback_model} failed: {fallback_error}")

    # Nothing loaded: report and raise.
    self._handle_model_load_failure(debug_mode)
|
|
750
|
-
|
|
751
|
-
def _handle_model_load_failure(self, debug_mode: bool):
    """Print troubleshooting steps for a total model-load failure, then raise.

    Args:
        debug_mode: When false, ``sys.stdout`` is first pointed back at
            ``sys.__stdout__`` so the guidance is visible.

    Raises:
        RuntimeError: Always, after the guidance has been printed.
    """
    if not debug_mode:
        # Undo any silencing so the user actually sees the guidance.
        sys.stdout = sys.__stdout__

    rule = "="*70
    guidance = (
        "\n" + rule,
        "❌ TTS Model Loading Failed",
        rule,
        "\nNo TTS models could be loaded (offline or online).",
        "\nQuick fixes:",
        " 1. Download essential models:",
        " abstractvoice download-models",
        " 2. Check internet connectivity",
        " 3. Clear corrupted cache:",
        " rm -rf ~/.cache/tts ~/.local/share/tts",
        " 4. Reinstall TTS:",
        " pip install --force-reinstall coqui-tts",
        " 5. Use text-only mode:",
        " abstractvoice --no-tts",
        rule,
    )
    for line in guidance:
        print(line)

    explanation = (
        "❌ Failed to load any TTS model.\n"
        "This typically means:\n"
        " • No models cached locally AND no internet connection\n"
        " • Corrupted model cache\n"
        " • Insufficient disk space\n"
        " • Network firewall blocking downloads\n\n"
        "Run 'abstractvoice download-models' when you have internet access."
    )
    raise RuntimeError(explanation)
|
|
782
|
-
|
|
783
|
-
def _on_playback_complete(self):
    """Mark playback as finished and fire ``on_playback_end`` if one is set."""
    self.is_playing = False
    notify_end = self.on_playback_end
    if notify_end:
        notify_end()
|
|
788
|
-
|
|
789
|
-
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
    """Synthesize *text* on a background thread and feed it to ``self.audio_player``.

    Any current playback is stopped first. The text is preprocessed, split
    into chunks of at most 300 characters, and each chunk is synthesized and
    handed to ``self.audio_player.play_audio`` as soon as it is ready.

    Args:
        text: Text to speak. Empty text returns False (after stopping any
            current playback).
        speed: Speed multiplier; values other than 1.0 are applied through
            ``apply_speed_without_pitch_change``.
        callback: Accepted for signature compatibility with ``speak``; this
            method does not invoke it.
        language: Passed to the model only when its ``model_name`` contains
            ``"xtts"``; ignored otherwise.

    Returns:
        True once the synthesis thread has been started, False when *text*
        is empty or setup raised.
    """
    # Stop any existing playback
    self.stop()

    if not text:
        return False

    try:
        # Preprocess text for better synthesis quality
        processed_text = preprocess_text(text)

        if self.debug_mode:
            print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
            print(f" > Text length: {len(processed_text)} chars")
            if language != 'en':
                print(f" > Language: {language}")
            if speed != 1.0:
                print(f" > Using speed multiplier: {speed}x")

        # For very long text, chunk it at natural boundaries
        text_chunks = chunk_long_text(processed_text, max_chunk_size=300)

        if self.debug_mode and len(text_chunks) > 1:
            print(f" > Split into {len(text_chunks)} chunks for processing")

        # Set playing state
        self.is_playing = True
        self.is_paused_state = False

        # Call start callback
        if self.on_playback_start:
            self.on_playback_start()

        def synthesis_worker():
            # Tracks whether the player received anything; if not, the
            # player will never report completion on our behalf.
            queued_any = False
            try:
                for i, chunk in enumerate(text_chunks):
                    if self.stop_flag.is_set():
                        break

                    if self.debug_mode and len(text_chunks) > 1:
                        print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")

                    # Generate audio for this chunk with language support
                    try:
                        # Check if this is an XTTS model (supports language parameter)
                        if 'xtts' in self.tts.model_name.lower():
                            chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
                            if self.debug_mode and language != 'en':
                                print(f" > Using XTTS with language: {language}")
                        else:
                            # Monolingual model - ignore language parameter
                            chunk_audio = self.tts.tts(chunk, split_sentences=True)
                            if self.debug_mode and language != 'en':
                                print(f" > Monolingual model - ignoring language parameter")
                    except Exception as tts_error:
                        # Fallback: try without language parameter
                        if self.debug_mode:
                            print(f" > TTS with language failed, trying without: {tts_error}")
                        chunk_audio = self.tts.tts(chunk, split_sentences=True)

                    # Explicit None/length test: truth-testing a multi-element
                    # numpy array raises ValueError.
                    if chunk_audio is not None and len(chunk_audio) > 0:
                        # Apply speed adjustment
                        if speed != 1.0:
                            chunk_audio = apply_speed_without_pitch_change(
                                np.array(chunk_audio), speed
                            )

                        # Queue the audio for playback
                        self.audio_player.play_audio(np.array(chunk_audio))
                        queued_any = True

                        if self.debug_mode:
                            print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")

                        # Small delay between chunks to prevent overwhelming the queue
                        time.sleep(0.01)

            except Exception as e:
                if self.debug_mode:
                    print(f"Error in synthesis worker: {e}")
            finally:
                if not queued_any:
                    # Nothing reached the player, so its completion callback
                    # will not run; reset state and notify listeners here so
                    # is_playing does not stay True indefinitely.
                    self._on_playback_complete()

        # Start synthesis in background thread
        synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
        synthesis_thread.start()

        return True

    except Exception as e:
        if self.debug_mode:
            print(f"Error in _speak_with_nonblocking_player: {e}")
        self.is_playing = False
        return False
|
|
885
|
-
|
|
886
|
-
def speak(self, text, speed=1.0, callback=None, language='en'):
    """Convert text to speech and start playing it.

    This is a thin wrapper: all arguments are forwarded unchanged to
    ``_speak_with_nonblocking_player``, which performs preprocessing,
    chunking, synthesis and playback. The earlier blocking/streaming
    implementation that followed the return statement could never execute
    and has been removed.

    Args:
        text: Text to convert to speech.
        speed: Speed multiplier (documented range 0.5-2.0; not validated here).
        callback: Function intended to be called when speech is complete;
            forwarded as-is.
        language: Language code for XTTS models, e.g. 'en', 'fr', 'es',
            'de', 'it', 'ru'.

    Returns:
        Whatever ``_speak_with_nonblocking_player`` returns: True if speech
        started, False if text was empty or setup failed.
    """
    # Use the non-blocking audio player for immediate pause/resume
    return self._speak_with_nonblocking_player(text, speed, callback, language)
|
|
1187
|
-
|
|
1188
|
-
def stop(self):
    """Halt whatever is currently playing, on either playback path.

    Returns:
        True if the non-blocking player or the legacy playback thread was
        stopped, False if neither was active.
    """
    halted_player = False
    halted_legacy = False

    # Non-blocking audio player
    if self.audio_player.is_playing:
        self.audio_player.stop_stream()
        halted_player = True
        if self.debug_mode:
            print(" > TTS playback stopped (non-blocking)")

    # Legacy thread-based playback
    legacy_thread = self.playback_thread
    if legacy_thread and legacy_thread.is_alive():
        self.stop_flag.set()
        # Release a paused thread so it can observe the stop flag.
        self.pause_flag.set()
        self.is_paused_state = False
        legacy_thread.join()
        self.playback_thread = None
        halted_legacy = True

        if self.debug_mode:
            print(" > TTS playback interrupted (legacy)")

    # State is reset unconditionally.
    self.is_paused_state = False
    self.is_playing = False

    return halted_player or halted_legacy
|
|
1220
|
-
|
|
1221
|
-
def pause(self):
    """Pause the speech that is currently playing.

    The non-blocking player is used when it is playing; otherwise the legacy
    playback thread is paused by clearing ``pause_flag``.

    Returns:
        The non-blocking player's ``pause()`` result when that player is
        playing; otherwise True if the legacy thread was paused and False if
        nothing was playing.
    """
    player = self.audio_player
    if player.is_playing:
        outcome = player.pause()
        if not outcome:
            return outcome
        self.is_paused_state = True
        if self.debug_mode:
            print(" > TTS paused immediately (non-blocking)")
        return outcome

    # Legacy path: only a live thread that is actually playing can be paused.
    legacy_thread = self.playback_thread
    if not (legacy_thread and legacy_thread.is_alive() and self.is_playing):
        return False

    self.pause_flag.clear()  # cleared flag means "paused"
    self.is_paused_state = True

    if self.debug_mode:
        print(" > TTS paused (legacy method)")

    return True
|
|
1249
|
-
|
|
1250
|
-
def resume(self):
    """Continue speech that was previously paused.

    Returns:
        True if the non-blocking player or the legacy thread resumed, False
        if nothing was paused or the paused playback no longer exists.
    """
    if not self.is_paused_state:
        return False

    # Non-blocking player first; resume() is only attempted when it reports paused.
    player = self.audio_player
    if player.is_paused_state() and player.resume():
        self.is_paused_state = False
        if self.debug_mode:
            print(" > TTS resumed immediately (non-blocking)")
        return True

    # Legacy thread-based playback
    legacy_thread = self.playback_thread
    if legacy_thread and legacy_thread.is_alive():
        self.pause_flag.set()  # set flag means "not paused"
        self.is_paused_state = False
        if self.debug_mode:
            print(" > TTS resumed (legacy method)")
        return True

    # The thread ended while paused; just clear the stale paused state.
    self.is_paused_state = False
    if self.debug_mode:
        print(" > TTS was paused but playback already completed")
    return False
|
|
1281
|
-
|
|
1282
|
-
def is_paused(self):
    """Report the explicit paused flag.

    Returns:
        The current value of ``self.is_paused_state``.
    """
    paused = self.is_paused_state
    return paused
|
|
1289
|
-
|
|
1290
|
-
def is_active(self):
    """Report the playing flag.

    Returns:
        The current value of ``self.is_playing``.
    """
    playing = self.is_playing
    return playing
|
|
1297
|
-
|