npcpy 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcpy/data/audio.py +58 -286
- npcpy/data/image.py +15 -15
- npcpy/data/web.py +2 -2
- npcpy/gen/audio_gen.py +172 -2
- npcpy/gen/image_gen.py +113 -62
- npcpy/gen/response.py +239 -0
- npcpy/llm_funcs.py +73 -71
- npcpy/memory/command_history.py +117 -69
- npcpy/memory/kg_vis.py +74 -74
- npcpy/npc_compiler.py +261 -26
- npcpy/npc_sysenv.py +4 -1
- npcpy/serve.py +393 -91
- npcpy/work/desktop.py +31 -5
- npcpy-1.3.23.dist-info/METADATA +416 -0
- {npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/RECORD +18 -18
- npcpy-1.3.21.dist-info/METADATA +0 -1039
- {npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/WHEEL +0 -0
- {npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/top_level.txt +0 -0
npcpy/data/audio.py
CHANGED
|
@@ -6,45 +6,22 @@ import time
|
|
|
6
6
|
import queue
|
|
7
7
|
import re
|
|
8
8
|
import json
|
|
9
|
-
|
|
10
9
|
import subprocess
|
|
10
|
+
import logging
|
|
11
11
|
|
|
12
|
+
from typing import Optional, List, Dict, Any
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Audio constants
|
|
12
17
|
try:
|
|
13
|
-
import torch
|
|
14
18
|
import pyaudio
|
|
15
|
-
import wave
|
|
16
|
-
from typing import Optional, List, Dict, Any
|
|
17
|
-
from gtts import gTTS
|
|
18
|
-
from faster_whisper import WhisperModel
|
|
19
|
-
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
|
|
20
|
-
|
|
21
|
-
import pygame
|
|
22
|
-
|
|
23
19
|
FORMAT = pyaudio.paInt16
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
is_speaking = False
|
|
30
|
-
should_stop_speaking = False
|
|
31
|
-
tts_sequence = 0
|
|
32
|
-
recording_data = []
|
|
33
|
-
buffer_data = []
|
|
34
|
-
is_recording = False
|
|
35
|
-
last_speech_time = 0
|
|
36
|
-
running = True
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
audio_queue = queue.Queue()
|
|
40
|
-
tts_queue = queue.PriorityQueue()
|
|
41
|
-
cleanup_files = []
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
pygame.mixer.quit()
|
|
45
|
-
pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=512)
|
|
46
|
-
except:
|
|
47
|
-
print("audio dependencies not installed")
|
|
20
|
+
except ImportError:
|
|
21
|
+
FORMAT = 8 # paInt16 value fallback
|
|
22
|
+
CHANNELS = 1
|
|
23
|
+
RATE = 16000
|
|
24
|
+
CHUNK = 512
|
|
48
25
|
|
|
49
26
|
|
|
50
27
|
def convert_mp3_to_wav(mp3_file, wav_file):
|
|
@@ -90,49 +67,9 @@ def check_ffmpeg():
|
|
|
90
67
|
return False
|
|
91
68
|
|
|
92
69
|
|
|
93
|
-
def get_context_string():
|
|
94
|
-
context = []
|
|
95
|
-
for exchange in history:
|
|
96
|
-
context.append(f"User: {exchange['user']}")
|
|
97
|
-
context.append(f"Assistant: {exchange['assistant']}")
|
|
98
|
-
return "\n".join(context)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def cleanup_temp_files():
|
|
103
|
-
global cleanup_files
|
|
104
|
-
for file in list(cleanup_files):
|
|
105
|
-
try:
|
|
106
|
-
if os.path.exists(file):
|
|
107
|
-
os.remove(file)
|
|
108
|
-
cleanup_files.remove(file)
|
|
109
|
-
except Exception:
|
|
110
|
-
pass
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def interrupt_speech():
|
|
114
|
-
global should_stop_speaking
|
|
115
|
-
should_stop_speaking = True
|
|
116
|
-
pygame.mixer.music.stop()
|
|
117
|
-
pygame.mixer.music.unload()
|
|
118
|
-
|
|
119
|
-
while not tts_queue.empty():
|
|
120
|
-
try:
|
|
121
|
-
_, temp_filename = tts_queue.get_nowait()
|
|
122
|
-
try:
|
|
123
|
-
if os.path.exists(temp_filename):
|
|
124
|
-
os.remove(temp_filename)
|
|
125
|
-
except:
|
|
126
|
-
if temp_filename not in cleanup_files:
|
|
127
|
-
cleanup_files.append(temp_filename)
|
|
128
|
-
except queue.Empty:
|
|
129
|
-
break
|
|
130
|
-
|
|
131
|
-
global tts_sequence
|
|
132
|
-
tts_sequence = 0
|
|
133
|
-
|
|
134
|
-
|
|
135
70
|
# Module-level hand-off queue between the capture callback and whoever
# consumes the recorded frames. It must outlive a single callback invocation:
# creating the queue inside the callback would discard every frame as soon as
# the call returns.
audio_queue = queue.Queue()


def audio_callback(in_data, frame_count, time_info, status):
    """Stream callback that forwards each captured buffer to ``audio_queue``.

    The signature follows the four-argument callback form used by PyAudio
    streams; ``frame_count``, ``time_info`` and ``status`` are accepted but
    not used.

    Args:
        in_data: Captured audio buffer for this callback invocation.
        frame_count: Unused.
        time_info: Unused.
        status: Unused.

    Returns:
        A ``(in_data, flag)`` tuple where ``flag`` is ``pyaudio.paContinue``.

    Note:
        The queue is unbounded, so a consumer has to drain ``audio_queue``
        while a stream is running.
    """
    try:
        import pyaudio

        continue_flag = pyaudio.paContinue
    except ImportError:
        # Same idea as the module's numeric FORMAT fallback: PortAudio defines
        # paContinue as 0, so the callback stays usable without pyaudio.
        continue_flag = 0

    audio_queue.put(in_data)
    return (in_data, continue_flag)
|
|
138
75
|
|
|
@@ -571,218 +508,67 @@ def get_available_stt_engines() -> dict:
|
|
|
571
508
|
|
|
572
509
|
|
|
573
510
|
|
|
574
|
-
def load_history():
|
|
575
|
-
global history
|
|
576
|
-
try:
|
|
577
|
-
if os.path.exists(memory_file):
|
|
578
|
-
with open(memory_file, "r") as f:
|
|
579
|
-
history = json.load(f)
|
|
580
|
-
except Exception as e:
|
|
581
|
-
print(f"Error loading conversation history: {e}")
|
|
582
|
-
history = []
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
def save_history():
|
|
586
|
-
try:
|
|
587
|
-
with open(memory_file, "w") as f:
|
|
588
|
-
json.dump(history, f)
|
|
589
|
-
except Exception as e:
|
|
590
|
-
print(f"Error saving conversation history: {e}")
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
def add_exchange(user_input, assistant_response):
|
|
594
|
-
global history
|
|
595
|
-
exchange = {
|
|
596
|
-
"user": user_input,
|
|
597
|
-
"assistant": assistant_response,
|
|
598
|
-
"timestamp": time.time(),
|
|
599
|
-
}
|
|
600
|
-
history.append(exchange)
|
|
601
|
-
if len(history) > max_history:
|
|
602
|
-
history.pop(0)
|
|
603
|
-
save_history()
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
def get_context_string():
|
|
607
|
-
context = []
|
|
608
|
-
for exchange in history:
|
|
609
|
-
context.append(f"User: {exchange['user']}")
|
|
610
|
-
context.append(f"Assistant: {exchange['assistant']}")
|
|
611
|
-
return "\n".join(context)
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
def cleanup_temp_files():
|
|
616
|
-
global cleanup_files
|
|
617
|
-
for file in list(cleanup_files):
|
|
618
|
-
try:
|
|
619
|
-
if os.path.exists(file):
|
|
620
|
-
os.remove(file)
|
|
621
|
-
cleanup_files.remove(file)
|
|
622
|
-
except Exception:
|
|
623
|
-
pass
|
|
624
|
-
|
|
625
511
|
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
pygame.mixer.music.stop()
|
|
630
|
-
pygame.mixer.music.unload()
|
|
631
|
-
|
|
632
|
-
while not tts_queue.empty():
|
|
633
|
-
try:
|
|
634
|
-
_, temp_filename = tts_queue.get_nowait()
|
|
635
|
-
try:
|
|
636
|
-
if os.path.exists(temp_filename):
|
|
637
|
-
os.remove(temp_filename)
|
|
638
|
-
except:
|
|
639
|
-
if temp_filename not in cleanup_files:
|
|
640
|
-
cleanup_files.append(temp_filename)
|
|
641
|
-
except queue.Empty:
|
|
642
|
-
break
|
|
643
|
-
|
|
644
|
-
tts_sequence = 0
|
|
645
|
-
is_speaking = False
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
def audio_callback(in_data, frame_count, time_info, status):
|
|
649
|
-
audio_queue.put(in_data)
|
|
650
|
-
return (in_data, pyaudio.paContinue)
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
def play_audio_from_queue():
|
|
655
|
-
global is_speaking, cleanup_files, should_stop_speaking
|
|
656
|
-
next_sequence = 0
|
|
657
|
-
|
|
658
|
-
while True:
|
|
659
|
-
if should_stop_speaking:
|
|
660
|
-
pygame.mixer.music.stop()
|
|
661
|
-
pygame.mixer.music.unload()
|
|
662
|
-
|
|
663
|
-
while not tts_queue.empty():
|
|
664
|
-
try:
|
|
665
|
-
_, temp_filename = tts_queue.get_nowait()
|
|
666
|
-
try:
|
|
667
|
-
if os.path.exists(temp_filename):
|
|
668
|
-
os.remove(temp_filename)
|
|
669
|
-
except:
|
|
670
|
-
if temp_filename not in cleanup_files:
|
|
671
|
-
cleanup_files.append(temp_filename)
|
|
672
|
-
except queue.Empty:
|
|
673
|
-
break
|
|
674
|
-
|
|
675
|
-
next_sequence = 0
|
|
676
|
-
is_speaking = False
|
|
677
|
-
should_stop_speaking = False
|
|
678
|
-
time.sleep(0.1)
|
|
679
|
-
continue
|
|
680
|
-
|
|
681
|
-
try:
|
|
682
|
-
if not tts_queue.empty():
|
|
683
|
-
sequence, temp_filename = tts_queue.queue[0]
|
|
684
|
-
|
|
685
|
-
if sequence == next_sequence:
|
|
686
|
-
sequence, temp_filename = tts_queue.get()
|
|
687
|
-
is_speaking = True
|
|
688
|
-
|
|
689
|
-
try:
|
|
690
|
-
if len(cleanup_files) > 0 and not pygame.mixer.music.get_busy():
|
|
691
|
-
cleanup_temp_files()
|
|
692
|
-
|
|
693
|
-
if should_stop_speaking:
|
|
694
|
-
continue
|
|
695
|
-
|
|
696
|
-
pygame.mixer.music.load(temp_filename)
|
|
697
|
-
pygame.mixer.music.play()
|
|
698
|
-
|
|
699
|
-
while (
|
|
700
|
-
pygame.mixer.music.get_busy() and not should_stop_speaking
|
|
701
|
-
):
|
|
702
|
-
pygame.time.wait(50)
|
|
703
|
-
|
|
704
|
-
pygame.mixer.music.unload()
|
|
705
|
-
|
|
706
|
-
except Exception as e:
|
|
707
|
-
print(f"Audio playback error: {str(e)}")
|
|
708
|
-
finally:
|
|
709
|
-
try:
|
|
710
|
-
if os.path.exists(temp_filename):
|
|
711
|
-
os.remove(temp_filename)
|
|
712
|
-
except:
|
|
713
|
-
if temp_filename not in cleanup_files:
|
|
714
|
-
cleanup_files.append(temp_filename)
|
|
715
|
-
|
|
716
|
-
if not should_stop_speaking:
|
|
717
|
-
next_sequence += 1
|
|
718
|
-
is_speaking = False
|
|
719
|
-
|
|
720
|
-
time.sleep(0.05)
|
|
721
|
-
except Exception:
|
|
722
|
-
time.sleep(0.05)
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
import pygame
|
|
726
|
-
from gtts import gTTS
|
|
727
|
-
import tempfile
|
|
728
|
-
import os
|
|
729
|
-
import logging
|
|
730
|
-
|
|
731
|
-
logging.basicConfig(level=logging.ERROR)
|
|
732
|
-
logger = logging.getLogger(__name__)
|
|
733
|
-
|
|
734
|
-
import pyaudio
|
|
735
|
-
import wave
|
|
736
|
-
from gtts import gTTS
|
|
737
|
-
import tempfile
|
|
738
|
-
import os
|
|
739
|
-
import logging
|
|
512
|
+
# =============================================================================
|
|
513
|
+
# TTS Playback Helpers (use unified audio_gen.text_to_speech)
|
|
514
|
+
# =============================================================================
|
|
740
515
|
|
|
741
|
-
|
|
742
|
-
|
|
516
|
+
def create_and_queue_audio(text, state, engine="kokoro", voice=None):
    """Synthesize ``text`` with the unified TTS interface and play it.

    The audio bytes returned by ``npcpy.gen.audio_gen.text_to_speech`` are
    written to a temporary file, converted to WAV when the engine is treated
    as producing MP3, and handed to ``play_audio``. Errors are logged rather
    than raised.

    Args:
        text: Text to speak. Whitespace-only text is skipped.
        state: Dict used for TTS/recording coordination. This function writes
            'tts_is_speaking' and 'tts_just_finished'; ``state`` is also
            passed on to ``play_audio``.
        engine: TTS engine name (kokoro, qwen3, elevenlabs, openai, gemini, gtts).
        voice: Voice ID (engine-specific).

    Returns:
        None.
    """
    # Imported locally so the function does not depend on the module header
    # providing these names.
    import tempfile
    import uuid

    state["tts_is_speaking"] = True

    if not text.strip():
        # Nothing to say: clear the flag and leave 'tts_just_finished' untouched.
        state["tts_is_speaking"] = False
        return

    # Every temp file created below is recorded here so the ``finally`` clause
    # can remove it even when synthesis, conversion or playback fails.
    temp_paths = []
    try:
        from npcpy.gen.audio_gen import text_to_speech

        audio_bytes = text_to_speech(text, engine=engine, voice=voice)

        # These two engines are handled as MP3 output; everything else is
        # written out as WAV and played directly.
        suffix = '.mp3' if engine in ('elevenlabs', 'gtts') else '.wav'
        base_path = os.path.join(tempfile.gettempdir(), f"npc_tts_{uuid.uuid4()}")
        tmp_path = base_path + suffix
        temp_paths.append(tmp_path)
        with open(tmp_path, 'wb') as f:
            f.write(audio_bytes)

        play_path = tmp_path
        if suffix == '.mp3':
            # Derive the WAV name from the shared stem instead of a string
            # replace, which could also rewrite '.mp3' elsewhere in the path.
            play_path = base_path + '.wav'
            temp_paths.append(play_path)
            convert_mp3_to_wav(tmp_path, play_path)

        play_audio(play_path, state)
    except Exception as e:
        # Best-effort by design: a TTS failure must not take down the caller.
        logger.error("TTS error: %s", e)
    finally:
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                # Already gone or not removable; nothing more to do here.
                pass
        state["tts_is_speaking"] = False
        state["tts_just_finished"] = True
|
|
774
564
|
|
|
775
|
-
for file in [mp3_file, wav_file]:
|
|
776
|
-
try:
|
|
777
|
-
if os.path.exists(file):
|
|
778
|
-
os.remove(file)
|
|
779
|
-
except Exception as e:
|
|
780
|
-
print(f"Error removing temporary file {file}: {e}")
|
|
781
|
-
|
|
782
565
|
|
|
783
566
|
def play_audio(filename, state):
|
|
784
|
-
"""Play
|
|
785
|
-
|
|
567
|
+
"""Play a WAV file via pyaudio with state awareness."""
|
|
568
|
+
import pyaudio
|
|
569
|
+
import wave
|
|
570
|
+
|
|
571
|
+
PLAY_CHUNK = 4096
|
|
786
572
|
|
|
787
573
|
wf = wave.open(filename, "rb")
|
|
788
574
|
p = pyaudio.PyAudio()
|
|
@@ -794,33 +580,19 @@ def play_audio(filename, state):
|
|
|
794
580
|
output=True,
|
|
795
581
|
)
|
|
796
582
|
|
|
797
|
-
data = wf.readframes(
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
while data and state["running"]:
|
|
583
|
+
data = wf.readframes(PLAY_CHUNK)
|
|
584
|
+
while data and state.get("running", True):
|
|
801
585
|
stream.write(data)
|
|
802
|
-
data = wf.readframes(
|
|
586
|
+
data = wf.readframes(PLAY_CHUNK)
|
|
803
587
|
|
|
804
588
|
stream.stop_stream()
|
|
805
589
|
stream.close()
|
|
806
590
|
p.terminate()
|
|
807
591
|
|
|
808
|
-
try:
|
|
809
|
-
os.unlink(filename)
|
|
810
|
-
except:
|
|
811
|
-
pass
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
def process_response_chunk(text_chunk):
|
|
816
|
-
if not text_chunk.strip():
|
|
817
|
-
return
|
|
818
|
-
processed_text = process_text_for_tts(text_chunk)
|
|
819
|
-
create_and_queue_audio(processed_text)
|
|
820
|
-
|
|
821
592
|
|
|
822
593
|
def process_text_for_tts(text):
|
|
823
|
-
text
|
|
594
|
+
"""Clean text for TTS consumption."""
|
|
595
|
+
text = re.sub(r"[*<>{}()\[\]&%#@^~`]", "", text)
|
|
824
596
|
text = text.strip()
|
|
825
597
|
text = re.sub(r"(\w)\.(\w)\.", r"\1 \2 ", text)
|
|
826
598
|
text = re.sub(r"([.!?])(\w)", r"\1 \2", text)
|
npcpy/data/image.py
CHANGED
|
@@ -85,21 +85,21 @@ def capture_screenshot( full=False) -> Dict[str, str]:
|
|
|
85
85
|
subprocess.run(["screencapture", file_path], capture_output=True)
|
|
86
86
|
|
|
87
87
|
elif system == "Linux":
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
88
|
+
_took = False
|
|
89
|
+
# Try non-interactive tools first
|
|
90
|
+
for _cmd, _args in [
|
|
91
|
+
("grim", [file_path]), # Wayland
|
|
92
|
+
("scrot", [file_path]), # X11, non-interactive full
|
|
93
|
+
("import", ["-window", "root", file_path]), # ImageMagick X11
|
|
94
|
+
("gnome-screenshot", ["-f", file_path]), # GNOME (may show dialog on newer versions)
|
|
95
|
+
]:
|
|
96
|
+
if subprocess.run(["which", _cmd], capture_output=True).returncode == 0:
|
|
97
|
+
subprocess.run([_cmd] + _args, capture_output=True, timeout=10)
|
|
98
|
+
if os.path.exists(file_path):
|
|
99
|
+
_took = True
|
|
100
|
+
break
|
|
101
|
+
if not _took:
|
|
102
|
+
print("No supported screenshot tool found. Install scrot, grim, or imagemagick.")
|
|
103
103
|
|
|
104
104
|
elif system == "Windows":
|
|
105
105
|
|
npcpy/data/web.py
CHANGED
|
@@ -146,8 +146,8 @@ def search_perplexity(
|
|
|
146
146
|
):
|
|
147
147
|
if api_key is None:
|
|
148
148
|
api_key = os.environ.get("PERPLEXITY_API_KEY")
|
|
149
|
-
if api_key is None:
|
|
150
|
-
raise
|
|
149
|
+
if api_key is None:
|
|
150
|
+
raise ValueError("PERPLEXITY_API_KEY not set. Set it in your environment or ~/.npcshrc.")
|
|
151
151
|
|
|
152
152
|
|
|
153
153
|
url = "https://api.perplexity.ai/chat/completions"
|
npcpy/gen/audio_gen.py
CHANGED
|
@@ -4,6 +4,7 @@ Supports multiple TTS engines including real-time voice APIs.
|
|
|
4
4
|
|
|
5
5
|
TTS Engines:
|
|
6
6
|
- Kokoro: Local neural TTS (default)
|
|
7
|
+
- Qwen3-TTS: Local high-quality multilingual TTS (0.6B/1.7B)
|
|
7
8
|
- ElevenLabs: Cloud TTS with streaming
|
|
8
9
|
- OpenAI: Realtime voice API
|
|
9
10
|
- Gemini: Live API for real-time voice
|
|
@@ -13,6 +14,7 @@ Usage:
|
|
|
13
14
|
from npcpy.gen.audio_gen import text_to_speech
|
|
14
15
|
|
|
15
16
|
audio = text_to_speech("Hello world", engine="kokoro", voice="af_heart")
|
|
17
|
+
audio = text_to_speech("Hello world", engine="qwen3", voice="ryan")
|
|
16
18
|
|
|
17
19
|
For STT, see npcpy.data.audio
|
|
18
20
|
"""
|
|
@@ -477,6 +479,155 @@ def get_gemini_voices() -> list:
|
|
|
477
479
|
]
|
|
478
480
|
|
|
479
481
|
|
|
482
|
+
# =============================================================================
|
|
483
|
+
# Qwen3-TTS (Local High-Quality Multilingual)
|
|
484
|
+
# =============================================================================
|
|
485
|
+
|
|
486
|
+
# Holds at most one loaded model, keyed by (model_size, model_type, device).
_qwen3_model_cache = {}


def _qwen3_repo_id(model_size, model_type):
    """Return the Hugging Face repo id for a Qwen3-TTS size/type combination."""
    size_tag = "0.6B" if "0.6" in model_size else "1.7B"
    type_map = {
        "custom_voice": f"Qwen/Qwen3-TTS-12Hz-{size_tag}-CustomVoice",
        "base": f"Qwen/Qwen3-TTS-12Hz-{size_tag}-Base",
        # The voice-design repo id is fixed to 1.7B; model_size is ignored here.
        "voice_design": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    }
    # Unknown model types fall back to the custom-voice checkpoint.
    return type_map.get(model_type, type_map["custom_voice"])


def _get_qwen3_model(
    model_size: str = "1.7B",
    model_type: str = "custom_voice",
    device: str = "auto",
):
    """Load and cache a Qwen3-TTS model.

    Args:
        model_size: '0.6B' or '1.7B'; any value not containing "0.6" selects 1.7B.
        model_type: 'custom_voice', 'base' or 'voice_design'.
        device: 'auto', 'cuda', 'mps' or 'cpu'. 'auto' prefers CUDA, then MPS,
            then CPU.

    Returns:
        The loaded ``Qwen3TTSModel`` instance (served from the cache when the
        same arguments were used for the previous load).

    Raises:
        ImportError: If the ``qwen_tts`` package is not installed.
    """
    # The key uses the arguments as passed (e.g. "auto"), not the resolved device.
    cache_key = (model_size, model_type, device)
    if cache_key in _qwen3_model_cache:
        return _qwen3_model_cache[cache_key]

    # Check for the model class before touching the network, so a missing
    # package fails immediately instead of after a full snapshot download.
    try:
        from qwen_tts import Qwen3TTSModel
    except ImportError as err:
        raise ImportError(
            "qwen_tts package not found. Install from: "
            "https://github.com/QwenLM/Qwen3-TTS or pip install qwen-tts"
        ) from err

    import torch
    from huggingface_hub import snapshot_download

    if device == "auto":
        if torch.cuda.is_available():
            device = "cuda"
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    dtype = torch.bfloat16 if device != "cpu" else torch.float32

    repo_id = _qwen3_repo_id(model_size, model_type)

    # Try local cache first, then download
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "qwen-tts")
    model_dir = os.path.join(cache_dir, repo_id.split("/")[-1])

    # config.json is used as the marker that a snapshot is already present.
    if not os.path.exists(os.path.join(model_dir, "config.json")):
        os.makedirs(cache_dir, exist_ok=True)
        snapshot_download(repo_id=repo_id, local_dir=model_dir)

    model = Qwen3TTSModel.from_pretrained(
        model_dir, device_map=device, dtype=dtype
    )

    # Clear old entries if switching configs, so only one model stays referenced.
    _qwen3_model_cache.clear()
    _qwen3_model_cache[cache_key] = model
    return model
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def tts_qwen3(
    text: str,
    voice: str = "ryan",
    language: str = "auto",
    model_size: str = "1.7B",
    device: str = "auto",
    speed: float = 1.0,
    ref_audio: str = None,
    ref_text: str = None,
    instruct: str = None,
) -> bytes:
    """
    Generate speech using Qwen3-TTS local model.

    Supports three modes based on arguments (checked in this order):
    - Voice clone: Provide ref_audio (path) to clone a voice
    - Voice design: Provide instruct (text description) to design a voice
    - Custom voice (default): Use a preset speaker name

    Args:
        text: Text to synthesize
        voice: Speaker name for custom voice mode
            (aiden, dylan, eric, ono_anna, ryan, serena, sohee, uncle_fu, vivian).
            Lower-cased and spaces replaced with underscores before use;
            a falsy value falls back to "ryan".
        language: Language (auto, chinese, english, japanese, korean, french, etc.)
        model_size: '0.6B' or '1.7B'
        device: 'auto', 'cuda', 'mps', 'cpu'
        speed: Speech speed (not directly supported, reserved; currently unused)
        ref_audio: Path to reference audio for voice cloning
        ref_text: Transcript of reference audio (recommended for cloning)
        instruct: Natural language voice description for voice design mode

    Returns:
        WAV audio bytes

    Raises:
        ValueError: If the model returns no audio.
    """
    import soundfile as sf

    if ref_audio:
        model = _get_qwen3_model(model_size, "base", device)
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=language,
            ref_audio=ref_audio,
            ref_text=ref_text,
        )
    elif instruct:
        model = _get_qwen3_model(model_size, "voice_design", device)
        wavs, sr = model.generate_voice_design(
            text=text,
            language=language,
            instruct=instruct,
        )
    else:
        model = _get_qwen3_model(model_size, "custom_voice", device)
        # Guard against voice=None so normalisation cannot raise AttributeError.
        speaker = (voice or "ryan").lower().replace(" ", "_")
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=language,
            speaker=speaker,
        )

    # Explicit emptiness test: a bare ``not wavs`` raises for multi-element
    # arrays. NOTE(review): assumes wavs is a sized sequence of waveforms
    # whose first item is the utterance - confirm against qwen_tts.
    if wavs is None or len(wavs) == 0:
        raise ValueError("Qwen3-TTS generated no audio")

    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, wavs[0], sr, format='WAV')
    return wav_buffer.getvalue()
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def get_qwen3_voices() -> list:
    """List the preset Qwen3-TTS speakers.

    Returns:
        A new list of dicts, each with "id", "name" and "gender" keys.
    """
    presets = (
        ("aiden", "Aiden", "male"),
        ("dylan", "Dylan", "male"),
        ("eric", "Eric", "male"),
        ("ryan", "Ryan", "male"),
        ("serena", "Serena", "female"),
        ("vivian", "Vivian", "female"),
        ("sohee", "Sohee", "female"),
        ("ono_anna", "Ono Anna", "female"),
        ("uncle_fu", "Uncle Fu", "male"),
    )
    return [
        {"id": voice_id, "name": label, "gender": gender}
        for voice_id, label, gender in presets
    ]
|
|
629
|
+
|
|
630
|
+
|
|
480
631
|
# =============================================================================
|
|
481
632
|
# gTTS (Google Text-to-Speech) - Fallback
|
|
482
633
|
# =============================================================================
|
|
@@ -527,7 +678,7 @@ def text_to_speech(
|
|
|
527
678
|
|
|
528
679
|
Args:
|
|
529
680
|
text: Text to synthesize
|
|
530
|
-
engine: TTS engine (kokoro, elevenlabs, openai, gemini, gtts)
|
|
681
|
+
engine: TTS engine (kokoro, qwen3, elevenlabs, openai, gemini, gtts)
|
|
531
682
|
voice: Voice ID (engine-specific)
|
|
532
683
|
**kwargs: Engine-specific options
|
|
533
684
|
|
|
@@ -542,6 +693,10 @@ def text_to_speech(
|
|
|
542
693
|
lang_code = voices.get(voice, {}).get("lang", "a")
|
|
543
694
|
return tts_kokoro(text, voice=voice, lang_code=lang_code, **kwargs)
|
|
544
695
|
|
|
696
|
+
elif engine in ("qwen3", "qwen3-tts", "qwen"):
|
|
697
|
+
voice = voice or "ryan"
|
|
698
|
+
return tts_qwen3(text, voice=voice, **kwargs)
|
|
699
|
+
|
|
545
700
|
elif engine == "elevenlabs":
|
|
546
701
|
voice = voice or "JBFqnCBsd6RMkjVDRZzb"
|
|
547
702
|
return tts_elevenlabs(text, voice_id=voice, **kwargs)
|
|
@@ -568,6 +723,8 @@ def get_available_voices(engine: str = "kokoro") -> list:
|
|
|
568
723
|
|
|
569
724
|
if engine == "kokoro":
|
|
570
725
|
return get_kokoro_voices()
|
|
726
|
+
elif engine in ("qwen3", "qwen3-tts", "qwen"):
|
|
727
|
+
return get_qwen3_voices()
|
|
571
728
|
elif engine == "elevenlabs":
|
|
572
729
|
return get_elevenlabs_voices()
|
|
573
730
|
elif engine == "openai":
|
|
@@ -590,6 +747,13 @@ def get_available_engines() -> dict:
|
|
|
590
747
|
"description": "Local neural TTS (82M params)",
|
|
591
748
|
"install": "pip install kokoro soundfile"
|
|
592
749
|
},
|
|
750
|
+
"qwen3": {
|
|
751
|
+
"name": "Qwen3-TTS",
|
|
752
|
+
"type": "local",
|
|
753
|
+
"available": False,
|
|
754
|
+
"description": "Local high-quality multilingual TTS (0.6B/1.7B)",
|
|
755
|
+
"install": "pip install qwen-tts torch torchaudio transformers"
|
|
756
|
+
},
|
|
593
757
|
"elevenlabs": {
|
|
594
758
|
"name": "ElevenLabs",
|
|
595
759
|
"type": "cloud",
|
|
@@ -615,7 +779,7 @@ def get_available_engines() -> dict:
|
|
|
615
779
|
"name": "Google TTS",
|
|
616
780
|
"type": "cloud",
|
|
617
781
|
"available": False,
|
|
618
|
-
"description": "Free Google TTS"
|
|
782
|
+
"description": "Free Google TTS (fallback)"
|
|
619
783
|
}
|
|
620
784
|
}
|
|
621
785
|
|
|
@@ -625,6 +789,12 @@ def get_available_engines() -> dict:
|
|
|
625
789
|
except ImportError:
|
|
626
790
|
pass
|
|
627
791
|
|
|
792
|
+
try:
|
|
793
|
+
from qwen_tts import Qwen3TTSModel
|
|
794
|
+
engines["qwen3"]["available"] = True
|
|
795
|
+
except ImportError:
|
|
796
|
+
pass
|
|
797
|
+
|
|
628
798
|
if os.environ.get('ELEVENLABS_API_KEY'):
|
|
629
799
|
engines["elevenlabs"]["available"] = True
|
|
630
800
|
|