abstractvoice 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,931 @@
1
+ """TTS Engine for high-quality speech synthesis with interrupt handling.
2
+
3
+ This module implements best practices for TTS synthesis including:
4
+ - Sentence segmentation for long text (prevents attention degradation)
5
+ - Text chunking for extremely long content
6
+ - Text preprocessing and normalization
7
+ - Robust error handling
8
+ """
9
+
10
+ import threading
11
+ import time
12
+ import numpy as np
13
+ import sounddevice as sd
14
+ import os
15
+ import sys
16
+ import logging
17
+ import warnings
18
+ import re
19
+ from TTS.api import TTS
20
+ import librosa
21
+ import queue
22
+
23
# Known-noisy third-party warnings silenced at import time, expressed as
# (message regex, warning category) pairs and registered in this order:
#   1. PyTorch's FutureWarning about `torch.load` with `weights_only=False`
#   2. the pkg_resources deprecation notice (triggered via jieba)
#   3. coqpit "Type mismatch" chatter while TTS model configs are loaded
#   4. coqpit "Failed to deserialize field" chatter from the same source
for _message, _category in (
    ("You are using `torch.load` with `weights_only=False`", FutureWarning),
    (".*pkg_resources is deprecated.*", DeprecationWarning),
    (".*Type mismatch.*", UserWarning),
    (".*Failed to deserialize field.*", UserWarning),
):
    warnings.filterwarnings("ignore", message=_message, category=_category)
del _message, _category

# Intended to suppress macOS audio warnings (harmless but annoying).
# This writes PYTHONWARNINGS into the process environment at import time.
# NOTE(review): presumably this mainly affects interpreters started later
# (child processes) rather than the one already running - confirm.
import os
os.environ['PYTHONWARNINGS'] = 'ignore'
52
+
53
# Typographic punctuation mapped to the ASCII forms kept by the whitelist in
# preprocess_text(); without this mapping those characters are deleted
# outright (e.g. "don’t" would become "dont").
_TYPOGRAPHIC_TRANSLATION = str.maketrans({
    '\u2018': "'",
    '\u2019': "'",
    '\u201c': '"',
    '\u201d': '"',
    '\u2026': '...',
    '\u2013': '-',
    '\u2014': '-',
})


def preprocess_text(text):
    """Preprocess text for better TTS synthesis.

    The text is normalized in this order:
    - typographic quotes, dashes and the ellipsis character become ASCII
    - runs of two or more dots collapse to a single full stop
    - characters outside word characters, whitespace and ``.,!?;:-'"()``
      are removed
    - a space is inserted after ``.,!?;:`` when the next character starts a
      new word, but not inside numbers ("3.14", "1,000", "10:30"), not
      inside a punctuation run ("?!") and not before a closing quote or
      parenthesis
    - whitespace runs collapse to one space and the result is stripped

    Args:
        text: Input text string

    Returns:
        Cleaned and normalized text. Falsy input (``None`` or ``""``) is
        returned unchanged.
    """
    if not text:
        return text

    text = text.translate(_TYPOGRAPHIC_TRANSLATION)

    # Normalize ellipsis of any length ("..", "...", "....") to one stop
    text = re.sub(r'\.{2,}', '.', text)

    # Remove problematic characters, keeping basic punctuation that helps
    # with prosody
    text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)

    # Ensure proper spacing after punctuation. The first lookahead skips
    # end of text, existing whitespace, further punctuation and closing
    # quotes/parentheses; the second keeps digit groups such as "3.14",
    # "1,000" and "10:30" intact.
    text = re.sub(
        r'([.,!?;:])(?![\s.,!?;:\'")]|$)(?!(?<=\d[.,:])\d)',
        r'\1 ',
        text,
    )

    # Collapse whitespace last so gaps left by removed characters vanish too
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
85
+
86
+
87
def apply_speed_without_pitch_change(audio, speed, sr=22050):
    """Change playback speed while keeping pitch, via librosa time_stretch.

    Args:
        audio: Audio samples as numpy array
        speed: Speed multiplier (0.5-2.0); values above 1.0 play faster,
            values below 1.0 play slower
        sr: Sample rate (default 22050); accepted but not used here

    Returns:
        The time-stretched samples, or the original ``audio`` object when
        ``speed`` equals 1.0 or when stretching raises.
    """
    # Unity speed: nothing to do, hand the input back untouched.
    if speed == 1.0:
        return audio

    # librosa's ``rate`` uses the same convention as ``speed``:
    # rate > 1.0 shortens the audio, rate < 1.0 lengthens it.
    try:
        result = librosa.effects.time_stretch(audio, rate=speed)
    except Exception as e:
        # Best effort: unmodified audio is preferred over no audio at all.
        logging.warning(f"Time-stretching failed: {e}, using original audio")
        result = audio
    return result
112
+
113
+
114
class NonBlockingAudioPlayer:
    """Non-blocking audio player using OutputStream callbacks for immediate pause/resume.

    Audio chunks are appended to a queue with ``play_audio``; the stream
    callback copies them into the device buffer block by block. Pausing only
    flips a flag that the callback checks, so it takes effect on the next
    block.
    """

    def __init__(self, sample_rate=22050, debug_mode=False):
        """Create an idle player; the stream is opened lazily by ``play_audio``.

        Args:
            sample_rate: Sample rate passed to the output stream
            debug_mode: Print diagnostic messages when True
        """
        self.sample_rate = sample_rate
        self.debug_mode = debug_mode

        # Audio queue and playback state
        self.audio_queue = queue.Queue()
        self.stream = None
        self.is_playing = False
        self.is_paused = False
        self.pause_lock = threading.Lock()

        # Current audio buffer management
        self.current_audio = None
        self.current_position = 0
        # Optional zero-argument callable run (in its own thread) when the
        # callback finds nothing left to play
        self.playback_complete_callback = None

    def _audio_callback(self, outdata, frames, time, status):
        """Fill one output block for the stream.

        Args:
            outdata: Output buffer indexed as ``[frame, channel]``
            frames: Number of frames to produce
            time: Timing information from the stream (unused; the name
                shadows the ``time`` module only inside this method)
            status: Status flags from the stream, printed in debug mode

        Consecutive queued chunks are written back to back within one block,
        so a chunk boundary does not insert silence. Each mono sample is
        copied to every output channel. When a block starts with nothing
        left to play, ``is_playing`` is cleared and the completion callback
        is started in a daemon thread.
        """
        if status and self.debug_mode:
            print(f"Audio callback status: {status}")

        # Check pause state (thread-safe)
        with self.pause_lock:
            if self.is_paused:
                # Output silence when paused - immediate response
                outdata.fill(0)
                return

        try:
            written = 0
            while written < frames:
                # Work on local snapshots: clear_queue() may reset the
                # attributes from another thread at any time.
                audio = self.current_audio
                position = self.current_position

                if audio is None or position >= len(audio):
                    try:
                        audio = self.audio_queue.get_nowait()
                    except queue.Empty:
                        break
                    position = 0
                    self.current_audio = audio
                    self.current_position = 0
                    if self.debug_mode:
                        print(f" > Playing audio chunk ({len(audio)} samples)")

                count = min(frames - written, len(audio) - position)
                segment = audio[position:position + count]
                # (count, 1) broadcasts across however many channels exist
                outdata[written:written + count] = segment.reshape(-1, 1)

                self.current_position = position + count
                written += count

            # Fill whatever is left of the block with silence
            if written < frames:
                outdata[written:].fill(0)

            # Nothing at all to play in this block: playback has drained
            if written == 0 and self.is_playing:
                self.is_playing = False
                if self.playback_complete_callback:
                    # Separate thread so the audio callback never blocks
                    threading.Thread(
                        target=self.playback_complete_callback, daemon=True
                    ).start()

        except Exception as e:
            if self.debug_mode:
                print(f"Error in audio callback: {e}")
            outdata.fill(0)

    def start_stream(self):
        """Open and start the output stream if it is not already open.

        Raises:
            Exception: Whatever opening or starting the stream raises; it is
                printed in debug mode and re-raised.
        """
        if self.stream is not None:
            return

        try:
            self.stream = sd.OutputStream(
                samplerate=self.sample_rate,
                channels=1,  # Mono output
                callback=self._audio_callback,
                blocksize=1024,  # Small buffer for low latency
                dtype=np.float32,
            )
            self.stream.start()
            if self.debug_mode:
                print(" > Audio stream started")
        except Exception as e:
            if self.debug_mode:
                print(f"Error starting audio stream: {e}")
            raise

    def stop_stream(self):
        """Stop and close the stream, then reset playback state and the queue.

        Errors while stopping are printed in debug mode and otherwise
        ignored; the stream reference is dropped either way.
        """
        if self.stream:
            try:
                self.stream.stop()
                self.stream.close()
                if self.debug_mode:
                    print(" > Audio stream stopped")
            except Exception as e:
                if self.debug_mode:
                    print(f"Error stopping audio stream: {e}")
            finally:
                self.stream = None

        self.is_playing = False
        with self.pause_lock:
            self.is_paused = False
        self.clear_queue()

    def play_audio(self, audio_array):
        """Add audio to the playback queue and make sure the stream runs.

        Args:
            audio_array: Mono samples as a numpy array or any array-like;
                ``None`` and empty input are ignored. Samples are converted
                to float32 and scaled down to a peak of 1.0 when they
                exceed it.
        """
        if audio_array is None:
            return

        samples = np.asarray(audio_array, dtype=np.float32)
        if samples.size == 0:
            return

        # Normalize if needed (peak computed once)
        peak = float(np.max(np.abs(samples)))
        if peak > 1.0:
            samples = samples / peak

        self.audio_queue.put(samples)
        self.is_playing = True

        # Start stream if not already running
        if self.stream is None:
            self.start_stream()

    def pause(self):
        """Pause audio playback immediately.

        Returns:
            True if playback was active and is now paused, False otherwise
        """
        with self.pause_lock:
            if not self.is_playing or self.is_paused:
                return False
            self.is_paused = True
            if self.debug_mode:
                print(" > Audio paused immediately")
            return True

    def resume(self):
        """Resume audio playback immediately.

        Returns:
            True if the player was paused and has been resumed, False otherwise
        """
        with self.pause_lock:
            if not self.is_paused:
                return False
            self.is_paused = False
            if self.debug_mode:
                print(" > Audio resumed immediately")
            return True

    def is_paused_state(self):
        """Check if audio is currently paused."""
        with self.pause_lock:
            return self.is_paused

    def clear_queue(self):
        """Discard all queued audio and the chunk currently being played."""
        while True:
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break

        # Reset current audio buffer
        self.current_audio = None
        self.current_position = 0
280
+
281
+
282
def chunk_long_text(text, max_chunk_size=300):
    """Split very long text into manageable chunks at natural boundaries.

    Text is packed paragraph by paragraph (split on blank lines). A chunk
    that grows past the limit is re-split at sentence boundaries
    (``.``, ``!`` or ``?`` followed by whitespace). Any chunk that is still
    too long, such as one sentence with no terminator, is wrapped at word
    boundaries, and a single over-long word is hard-split, so no returned
    chunk exceeds ``max_chunk_size`` when that limit is positive.

    Args:
        text: Input text string
        max_chunk_size: Maximum characters per chunk (default 300)

    Returns:
        List of non-empty text chunks in reading order. Text that already
        fits is returned as ``[text]`` unchanged.
    """
    if len(text) <= max_chunk_size:
        return [text]

    import textwrap  # only needed on the long-text path

    chunks = []
    current_chunk = ""

    # First try to split by paragraphs
    for para in text.split('\n\n'):
        if current_chunk and len(current_chunk) + len(para) > max_chunk_size:
            # Adding this paragraph would exceed the limit: flush first
            chunks.append(current_chunk.strip())
            current_chunk = para
        elif current_chunk:
            current_chunk += "\n\n" + para
        else:
            current_chunk = para

        # If the pending chunk is too long, split it by sentences
        if len(current_chunk) > max_chunk_size:
            # The capturing group keeps each terminator+whitespace as its
            # own item, so bodies and separators alternate in the result.
            pieces = re.split(r'([.!?]+\s+)', current_chunk)
            separators = pieces[1::2] + [""]
            temp_chunk = ""

            for body, separator in zip(pieces[0::2], separators):
                sentence = body + separator
                if temp_chunk and len(temp_chunk) + len(sentence) > max_chunk_size:
                    chunks.append(temp_chunk.strip())
                    temp_chunk = sentence
                else:
                    temp_chunk += sentence

            # The unfinished tail keeps accumulating with later paragraphs
            current_chunk = temp_chunk

    # Add remaining text
    if current_chunk:
        chunks.append(current_chunk.strip())

    # Enforce the limit on chunks that had no sentence boundary to split at,
    # and drop chunks that ended up empty after stripping.
    bounded = []
    for chunk in chunks:
        if not chunk:
            continue
        if max_chunk_size > 0 and len(chunk) > max_chunk_size:
            bounded.extend(
                textwrap.wrap(chunk, width=max_chunk_size, break_on_hyphens=False)
            )
        else:
            bounded.append(chunk)

    return bounded if bounded else [text]
338
+
339
+
340
class TTSEngine:
    """Text-to-speech engine with interrupt capability.

    Speech is synthesized chunk by chunk in a background thread and handed
    to a ``NonBlockingAudioPlayer``, so playback starts while later chunks
    are still being generated and can be paused, resumed or stopped at any
    time.
    """

    def __init__(self, model_name="tts_models/en/ljspeech/vits", debug_mode=False, streaming=True):
        """Initialize the TTS engine.

        Args:
            model_name: TTS model to use (default: vits - best quality, requires espeak-ng)
            debug_mode: Enable debug output
            streaming: Stored on the instance for compatibility; synthesis
                always streams chunk by chunk through the audio player

        Note:
            VITS model (default) requires espeak-ng for best quality:
            - macOS: brew install espeak-ng
            - Linux: sudo apt-get install espeak-ng
            - Windows: See installation guide in README

            If espeak-ng is not available, will auto-fallback to fast_pitch
        """
        # Set up debug mode
        self.debug_mode = debug_mode
        self.streaming = streaming

        # Callback to notify when TTS starts/stops (for pausing voice recognition)
        self.on_playback_start = None
        self.on_playback_end = None

        if debug_mode:
            print(f" > Loading TTS model: {model_name}")
            self.tts = self._load_model(model_name, sys.stdout)
        else:
            # Suppress all TTS logging
            for logger_name in ('TTS', 'TTS.utils.audio', 'TTS.utils.io', 'numba'):
                logging.getLogger(logger_name).setLevel(logging.ERROR)

            # Disable stdout during TTS loading
            os.environ['TTS_VERBOSE'] = '0'

            # Temporarily redirect stdout to suppress TTS init messages; the
            # stream that was actually replaced is the one restored.
            console = sys.stdout
            with open(os.devnull, 'w') as null_out:
                sys.stdout = null_out
                try:
                    self.tts = self._load_model(model_name, console)
                finally:
                    sys.stdout = console

        # Initialize non-blocking audio player for immediate pause/resume
        self.audio_player = NonBlockingAudioPlayer(sample_rate=22050, debug_mode=debug_mode)
        self.audio_player.playback_complete_callback = self._on_playback_complete

        # Legacy playback state (for compatibility with existing code).
        # Nothing in this class starts a legacy playback thread any more;
        # stop(), pause() and resume() still honor one if it is set.
        self.is_playing = False
        self.stop_flag = threading.Event()
        self.pause_flag = threading.Event()
        self.pause_flag.set()  # Initially not paused (set means "not paused")
        self.playback_thread = None
        self.start_time = 0
        self.audio_queue = []  # Queue for streaming playback
        self.queue_lock = threading.Lock()  # Thread-safe queue access

        # Pause/resume state
        self.pause_lock = threading.Lock()  # Thread-safe pause operations
        self.is_paused_state = False  # Explicit paused state tracking

        # Book-keeping for the utterance currently being spoken
        self._state_lock = threading.Lock()
        self._utterance = None

    def _load_model(self, model_name, console):
        """Load the requested model, falling back to fast_pitch on espeak errors.

        Args:
            model_name: TTS model to load
            console: Stream that receives the user-facing fallback notice

        Returns:
            The loaded TTS model object.

        Raises:
            Exception: Any loading error that does not mention espeak or
                phonemes is re-raised unchanged.
        """
        try:
            return TTS(model_name=model_name, progress_bar=self.debug_mode)
        except Exception as e:
            error_msg = str(e).lower()
            # Only espeak/phoneme related failures trigger the fallback
            if "espeak" not in error_msg and "phoneme" not in error_msg:
                raise

            print("\n" + "="*70, file=console)
            print("⚠️  VITS Model Requires espeak-ng (Not Found)", file=console)
            print("="*70, file=console)
            print("\nFor BEST voice quality, install espeak-ng:", file=console)
            print("  • macOS:   brew install espeak-ng", file=console)
            print("  • Linux:   sudo apt-get install espeak-ng", file=console)
            print("  • Windows: conda install espeak-ng (or see README)", file=console)
            print("\nFalling back to fast_pitch (lower quality, but works)", file=console)
            print("="*70 + "\n", file=console)

            # Fallback to fast_pitch
            return TTS(model_name="tts_models/en/ljspeech/fast_pitch", progress_bar=self.debug_mode)

    def _begin_utterance(self, callback):
        """Register a new utterance and return its state record.

        The record holds a ``cancel`` event, ``synthesis_done``, ``failed``
        and ``finished`` flags, and the caller's completion ``callback``.
        """
        utterance = {
            "cancel": threading.Event(),
            "synthesis_done": False,
            "failed": False,
            "finished": False,
            "callback": callback,
        }
        with self._state_lock:
            self._utterance = utterance
        return utterance

    def _finish_utterance(self, utterance):
        """Mark an utterance finished and send notifications exactly once.

        Cancelled or already finished utterances are ignored. The completion
        callback is skipped when synthesis failed or the stop flag is set.
        """
        with self._state_lock:
            if utterance["finished"] or utterance["cancel"].is_set():
                return
            utterance["finished"] = True

        self.is_playing = False
        if self.on_playback_end:
            self.on_playback_end()

        callback = utterance["callback"]
        if callback and not utterance["failed"] and not self.stop_flag.is_set():
            callback()

    def _on_playback_complete(self):
        """Callback when the audio player has nothing left to play.

        The queue can run empty while later chunks are still being
        synthesized, so the utterance only ends once synthesis is done and
        the player has not been given new audio in the meantime.
        """
        with self._state_lock:
            utterance = self._utterance
            still_running = utterance is not None and (
                not utterance["synthesis_done"] or self.audio_player.is_playing
            )

        if utterance is None:
            # Player used without an utterance: plain notification
            self.is_playing = False
            if self.on_playback_end:
                self.on_playback_end()
            return

        if not still_running:
            self._finish_utterance(utterance)

    def _synthesis_worker(self, text_chunks, speed, utterance):
        """Synthesize each chunk and queue it for playback (background thread).

        Args:
            text_chunks: Text chunks to synthesize, in order
            speed: Speed multiplier applied by time-stretching when not 1.0
            utterance: State record returned by ``_begin_utterance``
        """
        cancel = utterance["cancel"]
        total = len(text_chunks)
        try:
            for number, chunk in enumerate(text_chunks, start=1):
                if cancel.is_set() or self.stop_flag.is_set():
                    break

                if self.debug_mode and total > 1:
                    print(f" > Processing chunk {number}/{total} ({len(chunk)} chars)...")

                # Generate audio for this chunk
                chunk_audio = self.tts.tts(chunk, split_sentences=True)
                if chunk_audio is None or len(chunk_audio) == 0:
                    continue

                samples = np.asarray(chunk_audio)

                # Apply speed adjustment
                if speed != 1.0:
                    samples = apply_speed_without_pitch_change(samples, speed)

                # Queue under the lock: stop() sets ``cancel`` under the
                # same lock, so a stopped utterance can never restart the
                # audio stream with a late chunk.
                with self._state_lock:
                    if cancel.is_set():
                        break
                    self.audio_player.play_audio(samples)

                if self.debug_mode:
                    print(f" > Chunk {number} queued ({len(samples)} samples)")

        except Exception as e:
            utterance["failed"] = True
            if self.debug_mode:
                print(f"Error in synthesis worker: {e}")
        finally:
            with self._state_lock:
                utterance["synthesis_done"] = True
                drained = not self.audio_player.is_playing
            # If the player is idle (nothing queued, or it drained before
            # synthesis ended) nobody else will end the utterance.
            if drained:
                self._finish_utterance(utterance)

    def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
        """Speak ``text`` through the NonBlockingAudioPlayer.

        Args:
            text: Text to convert to speech
            speed: Speed multiplier
            callback: Function to call when speech completed normally

        Returns:
            True if synthesis was started, False for empty text or when
            preparation failed.
        """
        # Stop any existing playback (also cancels a running synthesis)
        self.stop()

        if not text:
            return False

        try:
            # Preprocess text for better synthesis quality
            processed_text = preprocess_text(text)

            if self.debug_mode:
                print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
                print(f" > Text length: {len(processed_text)} chars")
                if speed != 1.0:
                    print(f" > Using speed multiplier: {speed}x")

            # For very long text, chunk it at natural boundaries
            text_chunks = chunk_long_text(processed_text, max_chunk_size=300)

            if self.debug_mode and len(text_chunks) > 1:
                print(f" > Split into {len(text_chunks)} chunks for processing")

            # Set playing state
            self.stop_flag.clear()
            utterance = self._begin_utterance(callback)
            self.is_playing = True
            self.is_paused_state = False

            # Call start callback
            if self.on_playback_start:
                self.on_playback_start()

            # Start synthesis in background thread
            synthesis_thread = threading.Thread(
                target=self._synthesis_worker,
                args=(text_chunks, speed, utterance),
                daemon=True,
            )
            synthesis_thread.start()

            return True

        except Exception as e:
            if self.debug_mode:
                print(f"Error in _speak_with_nonblocking_player: {e}")
            self.is_playing = False
            return False

    def speak(self, text, speed=1.0, callback=None):
        """Convert text to speech and play audio.

        The text is preprocessed, split into chunks of at most 300
        characters at natural boundaries, and each chunk is synthesized with
        sentence splitting enabled and queued for playback.

        Args:
            text: Text to convert to speech
            speed: Speed multiplier (0.5-2.0)
            callback: Function to call when speech is complete

        Returns:
            True if speech started, False if text was empty
        """
        return self._speak_with_nonblocking_player(text, speed, callback)

    def stop(self):
        """Stop current audio playback and cancel any running synthesis.

        Returns:
            True if playback was stopped, False if no playback was active
        """
        stopped = False

        # Cancel the active utterance so its worker queues nothing further.
        # NOTE(review): as before, stopping does not call on_playback_end -
        # confirm that callers resume voice recognition themselves.
        with self._state_lock:
            utterance = self._utterance
            if (
                utterance is not None
                and not utterance["finished"]
                and not utterance["cancel"].is_set()
            ):
                utterance["cancel"].set()
                stopped = True

        # Stop new non-blocking audio player
        if self.audio_player.is_playing:
            self.audio_player.stop_stream()
            stopped = True
            if self.debug_mode:
                print(" > TTS playback stopped (non-blocking)")

        # Stop legacy playback system
        if self.playback_thread and self.playback_thread.is_alive():
            self.stop_flag.set()
            self.pause_flag.set()  # Ensure we're not stuck in pause
            self.is_paused_state = False  # Reset paused state
            self.playback_thread.join()
            self.playback_thread = None
            stopped = True

            if self.debug_mode:
                print(" > TTS playback interrupted (legacy)")

        # Reset state
        self.is_playing = False
        self.is_paused_state = False

        return stopped

    def pause(self):
        """Pause current speech playback.

        Returns:
            True if paused, False if no playback was active
        """
        # Try new non-blocking audio player first
        if self.audio_player.is_playing:
            result = self.audio_player.pause()
            if result:
                self.is_paused_state = True
                if self.debug_mode:
                    print(" > TTS paused immediately (non-blocking)")
            return result

        # Fallback to legacy system
        if self.playback_thread and self.playback_thread.is_alive() and self.is_playing:
            self.pause_flag.clear()  # Clear means "paused"
            self.is_paused_state = True  # Explicit state tracking

            if self.debug_mode:
                print(" > TTS paused (legacy method)")

            return True

        return False

    def resume(self):
        """Resume paused speech playback.

        Returns:
            True if resumed, False if not paused or no playback active
        """
        if not self.is_paused_state:
            return False

        # Try new non-blocking audio player first
        if self.audio_player.is_paused_state():
            if self.audio_player.resume():
                self.is_paused_state = False
                if self.debug_mode:
                    print(" > TTS resumed immediately (non-blocking)")
                return True

        # Fallback to legacy system
        if self.playback_thread and self.playback_thread.is_alive():
            # Thread is still alive, can resume
            self.pause_flag.set()  # Set means "not paused"
            self.is_paused_state = False  # Clear explicit state
            if self.debug_mode:
                print(" > TTS resumed (legacy method)")
            return True

        # Thread died while paused, nothing to resume
        self.is_paused_state = False  # Clear paused state
        if self.debug_mode:
            print(" > TTS was paused but playback already completed")
        return False

    def is_paused(self):
        """Check if TTS is currently paused.

        Returns:
            True if paused, False otherwise
        """
        return self.is_paused_state

    def is_active(self):
        """Check if TTS is currently playing.

        Returns:
            True if TTS is active, False otherwise
        """
        return self.is_playing