amd-gaia: amd_gaia-0.14.3-py3-none-any.whl → amd_gaia-0.15.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
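The file-level summary below can also be checked independently. Here is a minimal sketch using only the Python standard library, assuming both wheels have been fetched from the public registry first (for example with `pip download amd-gaia==0.14.3 --no-deps` and `pip download amd-gaia==0.15.1 --no-deps`):

```python
import zipfile

def wheel_files(path: str) -> set[str]:
    """Return the set of file paths contained in a wheel (zip) archive."""
    with zipfile.ZipFile(path) as whl:
        return set(whl.namelist())

old = wheel_files("amd_gaia-0.14.3-py3-none-any.whl")
new = wheel_files("amd_gaia-0.15.1-py3-none-any.whl")

# Files present in only one of the two wheels.
print("added:", *sorted(new - old), sep="\n  ")
print("removed:", *sorted(old - new), sep="\n  ")
```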
- {amd_gaia-0.14.3.dist-info → amd_gaia-0.15.1.dist-info}/METADATA +223 -223
- amd_gaia-0.15.1.dist-info/RECORD +178 -0
- {amd_gaia-0.14.3.dist-info → amd_gaia-0.15.1.dist-info}/entry_points.txt +1 -0
- {amd_gaia-0.14.3.dist-info → amd_gaia-0.15.1.dist-info}/licenses/LICENSE.md +20 -20
- gaia/__init__.py +29 -29
- gaia/agents/__init__.py +19 -19
- gaia/agents/base/__init__.py +9 -9
- gaia/agents/base/agent.py +2177 -2177
- gaia/agents/base/api_agent.py +120 -120
- gaia/agents/base/console.py +1841 -1841
- gaia/agents/base/errors.py +237 -237
- gaia/agents/base/mcp_agent.py +86 -86
- gaia/agents/base/tools.py +83 -83
- gaia/agents/blender/agent.py +556 -556
- gaia/agents/blender/agent_simple.py +133 -135
- gaia/agents/blender/app.py +211 -211
- gaia/agents/blender/app_simple.py +41 -41
- gaia/agents/blender/core/__init__.py +16 -16
- gaia/agents/blender/core/materials.py +506 -506
- gaia/agents/blender/core/objects.py +316 -316
- gaia/agents/blender/core/rendering.py +225 -225
- gaia/agents/blender/core/scene.py +220 -220
- gaia/agents/blender/core/view.py +146 -146
- gaia/agents/chat/__init__.py +9 -9
- gaia/agents/chat/agent.py +835 -835
- gaia/agents/chat/app.py +1058 -1058
- gaia/agents/chat/session.py +508 -508
- gaia/agents/chat/tools/__init__.py +15 -15
- gaia/agents/chat/tools/file_tools.py +96 -96
- gaia/agents/chat/tools/rag_tools.py +1729 -1729
- gaia/agents/chat/tools/shell_tools.py +436 -436
- gaia/agents/code/__init__.py +7 -7
- gaia/agents/code/agent.py +549 -549
- gaia/agents/code/cli.py +377 -0
- gaia/agents/code/models.py +135 -135
- gaia/agents/code/orchestration/__init__.py +24 -24
- gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
- gaia/agents/code/orchestration/checklist_generator.py +713 -713
- gaia/agents/code/orchestration/factories/__init__.py +9 -9
- gaia/agents/code/orchestration/factories/base.py +63 -63
- gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
- gaia/agents/code/orchestration/factories/python_factory.py +106 -106
- gaia/agents/code/orchestration/orchestrator.py +841 -841
- gaia/agents/code/orchestration/project_analyzer.py +391 -391
- gaia/agents/code/orchestration/steps/__init__.py +67 -67
- gaia/agents/code/orchestration/steps/base.py +188 -188
- gaia/agents/code/orchestration/steps/error_handler.py +314 -314
- gaia/agents/code/orchestration/steps/nextjs.py +828 -828
- gaia/agents/code/orchestration/steps/python.py +307 -307
- gaia/agents/code/orchestration/template_catalog.py +469 -469
- gaia/agents/code/orchestration/workflows/__init__.py +14 -14
- gaia/agents/code/orchestration/workflows/base.py +80 -80
- gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
- gaia/agents/code/orchestration/workflows/python.py +94 -94
- gaia/agents/code/prompts/__init__.py +11 -11
- gaia/agents/code/prompts/base_prompt.py +77 -77
- gaia/agents/code/prompts/code_patterns.py +2036 -2036
- gaia/agents/code/prompts/nextjs_prompt.py +40 -40
- gaia/agents/code/prompts/python_prompt.py +109 -109
- gaia/agents/code/schema_inference.py +365 -365
- gaia/agents/code/system_prompt.py +41 -41
- gaia/agents/code/tools/__init__.py +42 -42
- gaia/agents/code/tools/cli_tools.py +1138 -1138
- gaia/agents/code/tools/code_formatting.py +319 -319
- gaia/agents/code/tools/code_tools.py +769 -769
- gaia/agents/code/tools/error_fixing.py +1347 -1347
- gaia/agents/code/tools/external_tools.py +180 -180
- gaia/agents/code/tools/file_io.py +845 -845
- gaia/agents/code/tools/prisma_tools.py +190 -190
- gaia/agents/code/tools/project_management.py +1016 -1016
- gaia/agents/code/tools/testing.py +321 -321
- gaia/agents/code/tools/typescript_tools.py +122 -122
- gaia/agents/code/tools/validation_parsing.py +461 -461
- gaia/agents/code/tools/validation_tools.py +806 -806
- gaia/agents/code/tools/web_dev_tools.py +1758 -1758
- gaia/agents/code/validators/__init__.py +16 -16
- gaia/agents/code/validators/antipattern_checker.py +241 -241
- gaia/agents/code/validators/ast_analyzer.py +197 -197
- gaia/agents/code/validators/requirements_validator.py +145 -145
- gaia/agents/code/validators/syntax_validator.py +171 -171
- gaia/agents/docker/__init__.py +7 -7
- gaia/agents/docker/agent.py +642 -642
- gaia/agents/emr/__init__.py +8 -8
- gaia/agents/emr/agent.py +1506 -1506
- gaia/agents/emr/cli.py +1322 -1322
- gaia/agents/emr/constants.py +475 -475
- gaia/agents/emr/dashboard/__init__.py +4 -4
- gaia/agents/emr/dashboard/server.py +1974 -1974
- gaia/agents/jira/__init__.py +11 -11
- gaia/agents/jira/agent.py +894 -894
- gaia/agents/jira/jql_templates.py +299 -299
- gaia/agents/routing/__init__.py +7 -7
- gaia/agents/routing/agent.py +567 -570
- gaia/agents/routing/system_prompt.py +75 -75
- gaia/agents/summarize/__init__.py +11 -0
- gaia/agents/summarize/agent.py +885 -0
- gaia/agents/summarize/prompts.py +129 -0
- gaia/api/__init__.py +23 -23
- gaia/api/agent_registry.py +238 -238
- gaia/api/app.py +305 -305
- gaia/api/openai_server.py +575 -575
- gaia/api/schemas.py +186 -186
- gaia/api/sse_handler.py +373 -373
- gaia/apps/__init__.py +4 -4
- gaia/apps/llm/__init__.py +6 -6
- gaia/apps/llm/app.py +173 -169
- gaia/apps/summarize/app.py +116 -633
- gaia/apps/summarize/html_viewer.py +133 -133
- gaia/apps/summarize/pdf_formatter.py +284 -284
- gaia/audio/__init__.py +2 -2
- gaia/audio/audio_client.py +439 -439
- gaia/audio/audio_recorder.py +269 -269
- gaia/audio/kokoro_tts.py +599 -599
- gaia/audio/whisper_asr.py +432 -432
- gaia/chat/__init__.py +16 -16
- gaia/chat/app.py +430 -430
- gaia/chat/prompts.py +522 -522
- gaia/chat/sdk.py +1228 -1225
- gaia/cli.py +5481 -5621
- gaia/database/__init__.py +10 -10
- gaia/database/agent.py +176 -176
- gaia/database/mixin.py +290 -290
- gaia/database/testing.py +64 -64
- gaia/eval/batch_experiment.py +2332 -2332
- gaia/eval/claude.py +542 -542
- gaia/eval/config.py +37 -37
- gaia/eval/email_generator.py +512 -512
- gaia/eval/eval.py +3179 -3179
- gaia/eval/groundtruth.py +1130 -1130
- gaia/eval/transcript_generator.py +582 -582
- gaia/eval/webapp/README.md +167 -167
- gaia/eval/webapp/package-lock.json +875 -875
- gaia/eval/webapp/package.json +20 -20
- gaia/eval/webapp/public/app.js +3402 -3402
- gaia/eval/webapp/public/index.html +87 -87
- gaia/eval/webapp/public/styles.css +3661 -3661
- gaia/eval/webapp/server.js +415 -415
- gaia/eval/webapp/test-setup.js +72 -72
- gaia/llm/__init__.py +9 -2
- gaia/llm/base_client.py +60 -0
- gaia/llm/exceptions.py +12 -0
- gaia/llm/factory.py +70 -0
- gaia/llm/lemonade_client.py +3236 -3221
- gaia/llm/lemonade_manager.py +294 -294
- gaia/llm/providers/__init__.py +9 -0
- gaia/llm/providers/claude.py +108 -0
- gaia/llm/providers/lemonade.py +120 -0
- gaia/llm/providers/openai_provider.py +79 -0
- gaia/llm/vlm_client.py +382 -382
- gaia/logger.py +189 -189
- gaia/mcp/agent_mcp_server.py +245 -245
- gaia/mcp/blender_mcp_client.py +138 -138
- gaia/mcp/blender_mcp_server.py +648 -648
- gaia/mcp/context7_cache.py +332 -332
- gaia/mcp/external_services.py +518 -518
- gaia/mcp/mcp_bridge.py +811 -550
- gaia/mcp/servers/__init__.py +6 -6
- gaia/mcp/servers/docker_mcp.py +83 -83
- gaia/perf_analysis.py +361 -0
- gaia/rag/__init__.py +10 -10
- gaia/rag/app.py +293 -293
- gaia/rag/demo.py +304 -304
- gaia/rag/pdf_utils.py +235 -235
- gaia/rag/sdk.py +2194 -2194
- gaia/security.py +163 -163
- gaia/talk/app.py +289 -289
- gaia/talk/sdk.py +538 -538
- gaia/testing/__init__.py +87 -87
- gaia/testing/assertions.py +330 -330
- gaia/testing/fixtures.py +333 -333
- gaia/testing/mocks.py +493 -493
- gaia/util.py +46 -46
- gaia/utils/__init__.py +33 -33
- gaia/utils/file_watcher.py +675 -675
- gaia/utils/parsing.py +223 -223
- gaia/version.py +100 -100
- amd_gaia-0.14.3.dist-info/RECORD +0 -168
- gaia/agents/code/app.py +0 -266
- gaia/llm/llm_client.py +0 -729
- {amd_gaia-0.14.3.dist-info → amd_gaia-0.15.1.dist-info}/WHEEL +0 -0
- {amd_gaia-0.14.3.dist-info → amd_gaia-0.15.1.dist-info}/top_level.txt +0 -0
gaia/audio/kokoro_tts.py
CHANGED
@@ -1,599 +1,599 @@

The side-by-side diff viewer did not survive extraction here: each of the file's 599 lines appears once as removed and once as re-added, interleaved with line numbers and column separators, and the before and after contents are identical as shown. The change is therefore presumably whitespace-only (line endings or similar), which is consistent with the equal added and removed line counts reported for most files in the summary above. The file's contents are reproduced once below.

```python
# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT

import queue
import threading
import time

import numpy as np
import psutil

try:
    import sounddevice as sd
except ImportError:
    sd = None

try:
    import soundfile as sf
except ImportError:
    sf = None

try:
    from kokoro import KPipeline
except ImportError:
    KPipeline = None

from gaia.logger import get_logger


class KokoroTTS:
    log = get_logger(__name__)

    def __init__(self):
        # Check for required dependencies
        missing = []
        if sd is None:
            missing.append("sounddevice")
        if sf is None:
            missing.append("soundfile")
        if KPipeline is None:
            missing.append("kokoro>=0.3.1")

        if missing:
            error_msg = (
                f"\n❌ Error: Missing required talk dependencies: {', '.join(missing)}\n\n"
                f"Please install the talk dependencies:\n"
                f'  uv pip install -e ".[talk]"\n\n'
                f"Or install packages directly:\n"
                f"  uv pip install {' '.join(missing)}\n"
            )
            raise ImportError(error_msg)

        self.log = self.__class__.log

        # Initialize Kokoro pipeline with American English
        self.pipeline = KPipeline(lang_code="a")  # 'a' for American English

        # Available voice configurations with metadata
        self.available_voices = {
            # American English Voices 🇺🇸
            "af_alloy": {
                "name": "American Female - Alloy",
                "quality": "C",
                "duration": "MM",
            },
            "af_aoede": {
                "name": "American Female - Aoede",
                "quality": "C+",
                "duration": "H",
            },
            "af_bella": {
                "name": "American Female - Bella",
                "quality": "A-",
                "duration": "HH",
            },
            "af_jessica": {
                "name": "American Female - Jessica",
                "quality": "D",
                "duration": "MM",
            },
            "af_kore": {
                "name": "American Female - Kore",
                "quality": "C+",
                "duration": "H",
            },
            "af_nicole": {
                "name": "American Female - Nicole",
                "quality": "B-",
                "duration": "HH",
            },
            "af_nova": {
                "name": "American Female - Nova",
                "quality": "C",
                "duration": "MM",
            },
            "af_river": {
                "name": "American Female - River",
                "quality": "D",
                "duration": "MM",
            },
            "af_sarah": {
                "name": "American Female - Sarah",
                "quality": "C+",
                "duration": "H",
            },
            "af_sky": {
                "name": "American Female - Sky",
                "quality": "C-",
                "duration": "M",
            },
            "am_adam": {
                "name": "American Male - Adam",
                "quality": "F+",
                "duration": "H",
            },
            "am_echo": {
                "name": "American Male - Echo",
                "quality": "D",
                "duration": "MM",
            },
            "am_eric": {
                "name": "American Male - Eric",
                "quality": "D",
                "duration": "MM",
            },
            "am_fenrir": {
                "name": "American Male - Fenrir",
                "quality": "C+",
                "duration": "H",
            },
            "am_liam": {
                "name": "American Male - Liam",
                "quality": "D",
                "duration": "MM",
            },
            "am_michael": {
                "name": "American Male - Michael",
                "quality": "C+",
                "duration": "H",
            },
            "am_onyx": {
                "name": "American Male - Onyx",
                "quality": "D",
                "duration": "MM",
            },
            "am_puck": {
                "name": "American Male - Puck",
                "quality": "C+",
                "duration": "H",
            },
            # British English Voices 🇬🇧
            "bf_alice": {
                "name": "British Female - Alice",
                "quality": "D",
                "duration": "MM",
            },
            "bf_emma": {
                "name": "British Female - Emma",
                "quality": "B-",
                "duration": "HH",
            },
            "bf_isabella": {
                "name": "British Female - Isabella",
                "quality": "C",
                "duration": "MM",
            },
            "bf_lily": {
                "name": "British Female - Lily",
                "quality": "D",
                "duration": "MM",
            },
            "bm_daniel": {
                "name": "British Male - Daniel",
                "quality": "D",
                "duration": "MM",
            },
            "bm_fable": {
                "name": "British Male - Fable",
                "quality": "C",
                "duration": "MM",
            },
            "bm_george": {
                "name": "British Male - George",
                "quality": "C",
                "duration": "MM",
            },
            "bm_lewis": {
                "name": "British Male - Lewis",
                "quality": "D+",
                "duration": "H",
            },
        }

        # Default to highest quality voice (Bella)
        self.voice_name = "af_bella"
        self.chunk_size = 150  # Optimal token chunk size for best quality
        self.log.debug(
            f"Loaded voice: {self.voice_name} - {self.available_voices[self.voice_name]['name']} (Quality: {self.available_voices[self.voice_name]['quality']})"
        )

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess text to add appropriate pauses and improve speech flow.
        Removes asterisks and adds pause markers.
        """
        # First remove all asterisks from the text
        text = text.replace("*", "")

        # Add pauses after bullet points and numbered lists
        lines = text.split("\n")
        processed_lines = []

        for line in lines:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            # Check for various list formats and add pauses
            if (
                line.startswith(("•", "-", "*"))  # Bullet points
                or (
                    len(line) > 2 and line[0].isdigit() and line[1] == "."
                )  # Numbered lists
                or (len(line) > 2 and line[0].isalpha() and line[1] in [")", "."])
            ):  # Lettered lists
                # For list items, ensure we add pause regardless of existing punctuation
                if line[-1] in ".!?:":
                    line = line[:-1]  # Remove existing punctuation
                line = line.replace(")", "...")  # Add pause after list items
                processed_lines.append(f"{line}...")
            else:
                # Add a period at the end of non-empty lines if they don't already have ending punctuation
                if not line[-1] in ".!?:":
                    processed_lines.append(line + ".")
                else:
                    processed_lines.append(line)

        return " ".join(processed_lines)  # Join with spaces instead of newlines

    def generate_speech(
        self, text: str, stream_callback=None
    ) -> tuple[list[float], str, dict]:
        """Generate speech from text using Kokoro TTS with quality optimizations."""
        self.log.debug(f"Generating speech for text of length {len(text)}")

        process = psutil.Process()
        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.time()

        # Generate audio using the pipeline with chunking for optimal quality
        audio_chunks = []
        phonemes = []
        total_duration = 0

        # Split text into chunks of optimal size (100-200 tokens)
        sentences = text.split(".")
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_length = len(sentence.split())

            if current_length + sentence_length > self.chunk_size:
                # Process current chunk
                chunk_text = ". ".join(current_chunk) + "."
                generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
                for _, phoneme_seq, audio in generator:
                    audio_chunks.append(audio)
                    phonemes.append(phoneme_seq)
                    chunk_duration = len(audio) / 24000
                    total_duration += chunk_duration

                    if stream_callback and callable(stream_callback):
                        stream_callback(audio)

                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        # Process remaining chunk if any
        if current_chunk:
            chunk_text = ". ".join(current_chunk) + "."
            generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
            for _, phoneme_seq, audio in generator:
                audio_chunks.append(audio)
                phonemes.append(phoneme_seq)
                chunk_duration = len(audio) / 24000
                total_duration += chunk_duration

                if stream_callback and callable(stream_callback):
                    stream_callback(audio)

        # Combine all audio chunks
        audio = np.concatenate(audio_chunks)
        combined_phonemes = " ".join(phonemes)

        end_time = time.time()
        end_memory = process.memory_info().rss / 1024 / 1024
        processing_time = end_time - start_time
        peak_memory = end_memory - start_memory

        stats = {
            "processing_time": round(processing_time, 3),
            "audio_duration": round(total_duration, 3),
            "realtime_ratio": round(processing_time / total_duration, 2),
            "peak_memory": round(peak_memory, 2),
        }

        return audio, combined_phonemes, stats

    def generate_speech_streaming(
        self, text_queue: queue.Queue, status_callback=None, interrupt_event=None
    ) -> None:
        """Optimized streaming TTS with separate processing and playback threads."""
        self.log.debug("Starting speech streaming")
        buffer = ""
        audio_buffer = queue.Queue(maxsize=100)  # Buffer for processed audio chunks

        # Initialize audio stream
        stream = sd.OutputStream(
            samplerate=24000,
            channels=1,
            dtype=np.float32,
            blocksize=2400,  # 100ms buffer
            latency="low",
        )
        stream.start()
        self.log.debug("Audio stream initialized")

        # Playback thread function
        def audio_playback_thread():
            try:
                while True:
                    try:
                        audio_chunk = audio_buffer.get(timeout=0.1)
                        if audio_chunk is None:  # Exit signal
                            if status_callback:
                                status_callback(False)
                            break
                        if interrupt_event and interrupt_event.is_set():
                            break
                        if status_callback:
                            status_callback(True)
                        stream.write(np.array(audio_chunk, dtype=np.float32))
                    except queue.Empty:
                        continue
            except Exception as e:
                self.log.error(f"Error in playback thread: {e}")
                if status_callback:
                    status_callback(False)
            finally:
                stream.stop()
                stream.close()
                if status_callback:
                    status_callback(False)

        # Start playback thread
        playback_thread = threading.Thread(target=audio_playback_thread)
        playback_thread.daemon = True
        playback_thread.start()

        try:
            while True:
                try:
                    chunk = text_queue.get(timeout=0.1)

                    if chunk == "__END__" or (
                        interrupt_event and interrupt_event.is_set()
                    ):
                        if buffer.strip():
                            # Process final buffer
                            processed_text = self.preprocess_text(buffer.strip())
                            if processed_text:  # Only process if there's actual text
                                self.generate_speech(
                                    processed_text, stream_callback=audio_buffer.put
                                )
                        audio_buffer.put(None)  # Signal playback thread to exit
                        break

                    buffer += chunk

                    # Find complete sentences for immediate processing
                    sentences = buffer.split(".")
                    if len(sentences) > 1:
                        # Process complete sentences immediately
                        text_to_process = ".".join(sentences[:-1]) + "."
                        if (
                            text_to_process.strip()
                        ):  # Only process if there's actual text
                            processed_text = self.preprocess_text(text_to_process)
                            if processed_text:  # Double check after preprocessing
                                self.generate_speech(
                                    processed_text, stream_callback=audio_buffer.put
                                )
                        buffer = sentences[-1]

                except queue.Empty:
                    continue

        except Exception as e:
            self.log.error(f"Error in streaming: {e}")
            audio_buffer.put(None)  # Ensure playback thread exits
        finally:
            audio_buffer.put(None)  # Ensure playback thread exits
            playback_thread.join(timeout=2.0)

    def set_voice(self, voice_name: str) -> None:
        """Change the current voice."""
        self.log.info(f"Changing voice to: {voice_name}")
        if voice_name not in self.available_voices:
            self.log.error(f"Unknown voice '{voice_name}'")
            raise ValueError(
                f"Unknown voice '{voice_name}'. Available voices: {list(self.available_voices.keys())}"
            )

        self.voice_name = voice_name
        self.log.info(
            f"Changed voice to: {voice_name} - {self.available_voices[voice_name]['name']} (Quality: {self.available_voices[voice_name]['quality']})"
        )

    def list_available_voices(self) -> dict[str, dict]:
        """Get all available voice names and their descriptions."""
        return self.available_voices

    # Test methods remain largely unchanged, just updated to use new generate_speech method
    def test_preprocessing(self, test_text: str) -> str:
        """Test the text preprocessing functionality."""
        try:
            processed_text = self.preprocess_text(test_text)
            print("\nOriginal text:")
            print(test_text)
            print("\nProcessed text:")
            print(processed_text)
            return processed_text
        except Exception as e:
            self.log.error(f"Error during preprocessing test: {e}")
            return None

    def test_generate_audio_file(
        self, test_text: str, output_file: str = "output.wav"
    ) -> None:
        """Test basic audio generation and file saving."""
        try:
            print("\nGenerating audio...")
            audio, _, stats = self.generate_speech(test_text)

            # Save audio to file
            sf.write(output_file, np.array(audio), 24000)
            print(f"Saved audio to: {output_file}")

            print("\nPerformance stats:")
            print(f"- Processing time: {stats['processing_time']:.3f}s")
            print(f"- Audio duration: {stats['audio_duration']:.3f}s")
            print(f"- Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")
            print(f"- Peak memory usage: {stats['peak_memory']:.2f} MB")
        except Exception as e:
            self.log.error(f"Error during audio generation test: {e}")

    def test_streaming_playback(self, test_text: str) -> None:
        """Test streaming audio generation with progress display."""
        try:
            # Setup audio stream
            stream = sd.OutputStream(samplerate=24000, channels=1, dtype=np.float32)
            stream.start()

            # Create audio queue and initialize tracking variables
            audio_queue = queue.Queue(maxsize=100)
            words = test_text.split()
            total_words = len(words)
            total_chunks = 0
            current_processing_chunk = 0
            current_playback_chunk = 0
            spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
            spinner_idx = 0

            # Count total chunks
            def count_chunks(_):
                nonlocal total_chunks
                total_chunks += 1

            print("\nAnalyzing text length...")
            self.generate_speech(test_text, stream_callback=count_chunks)

            # Define and start streaming thread
            def stream_audio():
                nonlocal current_playback_chunk, spinner_idx
                while True:
                    try:
                        chunk = audio_queue.get()
                        if chunk is None:
                            break

                        chunk_array = np.array(chunk, dtype=np.float32)
                        stream.write(chunk_array)
                        current_playback_chunk += 1

                        # Update progress display
                        word_position = int(
                            (current_playback_chunk / total_chunks) * total_words
                        )
                        current_text = " ".join(
                            words[
                                max(0, word_position - 5) : min(
                                    total_words, word_position + 5
                                )
                            ]
                        )
                        current_text = current_text[:60].ljust(60)

                        process_progress = int(
                            (current_processing_chunk / total_chunks) * 50
                        )
                        playback_progress = int(
                            (current_playback_chunk / total_chunks) * 50
                        )
                        spinner_idx = (spinner_idx + 1) % len(spinner_chars)

                        print("\033[K", end="")
                        print(
                            f"\r{spinner_chars[spinner_idx]} Processing: [{'=' * process_progress}{' ' * (50-process_progress)}] {(current_processing_chunk/total_chunks)*100:.1f}%"
                        )
                        print(
                            f"{spinner_chars[spinner_idx]} Playback: [{'=' * playback_progress}{' ' * (50-playback_progress)}] {(current_playback_chunk/total_chunks)*100:.1f}%"
                        )
                        print(
                            f"{spinner_chars[spinner_idx]} Current: {current_text}",
                            end="\033[2A\r",
                        )

                        audio_queue.task_done()
                    except queue.Empty:
                        continue

            print("\nGenerating and streaming audio...")
            print("\n\n")
            stream_thread = threading.Thread(target=stream_audio)
            stream_thread.start()

            def process_chunk(chunk):
                nonlocal current_processing_chunk
                current_processing_chunk += 1
                audio_queue.put(chunk)

            processed_text = self.preprocess_text(test_text)
            _, _, stats = self.generate_speech(
                processed_text, stream_callback=process_chunk
            )

            audio_queue.put(None)
            stream_thread.join()

            print("\n\n\n")
            stream.stop()
            stream.close()

            print("\nStreaming test completed")
            print(f"Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")

        except Exception as e:
            self.log.error(f"Error during streaming test: {e}")


def main():
    """Run all TTS tests."""
    test_text = """
Let's play a game of trivia. I'll ask you a series of questions on a particular topic, and you try to answer them to the best of your ability. We can keep track of your score and see how well you do.

Here's your first question:

**Question 1:** Which American author wrote the classic novel "To Kill a Mockingbird"?

A) F. Scott Fitzgerald
B) Harper Lee
C) Jane Austen
D) J. K. Rowling
E) Edgar Allan Poe

Let me know your answer!
"""

    tts = KokoroTTS()

    print("Running preprocessing test...")
    processed_text = tts.test_preprocessing(test_text)

    print("\nRunning streaming test...")
    tts.test_streaming_playback(processed_text)

    print("\nRunning audio generation test...")
    tts.test_generate_audio_file(processed_text)


if __name__ == "__main__":
    main()
```
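For context, here is a minimal usage sketch of the class as shipped. The input text is illustrative; the constructor raises ImportError unless the optional talk dependencies (sounddevice, soundfile, kokoro) are installed:

```python
from gaia.audio.kokoro_tts import KokoroTTS

tts = KokoroTTS()           # defaults to the af_bella voice
tts.set_voice("af_nicole")  # any key from list_available_voices()

# Returns the raw 24 kHz audio, the phoneme sequence, and timing stats.
audio, phonemes, stats = tts.generate_speech("Hello from GAIA.")
print(f"Synthesized {stats['audio_duration']}s of audio "
      f"in {stats['processing_time']}s")
```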