amd-gaia 0.15.0__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/METADATA +222 -223
  2. amd_gaia-0.15.2.dist-info/RECORD +182 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/WHEEL +1 -1
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/entry_points.txt +1 -0
  5. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/licenses/LICENSE.md +20 -20
  6. gaia/__init__.py +29 -29
  7. gaia/agents/__init__.py +19 -19
  8. gaia/agents/base/__init__.py +9 -9
  9. gaia/agents/base/agent.py +2132 -2177
  10. gaia/agents/base/api_agent.py +119 -120
  11. gaia/agents/base/console.py +1967 -1841
  12. gaia/agents/base/errors.py +237 -237
  13. gaia/agents/base/mcp_agent.py +86 -86
  14. gaia/agents/base/tools.py +88 -83
  15. gaia/agents/blender/__init__.py +7 -0
  16. gaia/agents/blender/agent.py +553 -556
  17. gaia/agents/blender/agent_simple.py +133 -135
  18. gaia/agents/blender/app.py +211 -211
  19. gaia/agents/blender/app_simple.py +41 -41
  20. gaia/agents/blender/core/__init__.py +16 -16
  21. gaia/agents/blender/core/materials.py +506 -506
  22. gaia/agents/blender/core/objects.py +316 -316
  23. gaia/agents/blender/core/rendering.py +225 -225
  24. gaia/agents/blender/core/scene.py +220 -220
  25. gaia/agents/blender/core/view.py +146 -146
  26. gaia/agents/chat/__init__.py +9 -9
  27. gaia/agents/chat/agent.py +809 -835
  28. gaia/agents/chat/app.py +1065 -1058
  29. gaia/agents/chat/session.py +508 -508
  30. gaia/agents/chat/tools/__init__.py +15 -15
  31. gaia/agents/chat/tools/file_tools.py +96 -96
  32. gaia/agents/chat/tools/rag_tools.py +1744 -1729
  33. gaia/agents/chat/tools/shell_tools.py +437 -436
  34. gaia/agents/code/__init__.py +7 -7
  35. gaia/agents/code/agent.py +549 -549
  36. gaia/agents/code/cli.py +377 -0
  37. gaia/agents/code/models.py +135 -135
  38. gaia/agents/code/orchestration/__init__.py +24 -24
  39. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  40. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  41. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  42. gaia/agents/code/orchestration/factories/base.py +63 -63
  43. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  44. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  45. gaia/agents/code/orchestration/orchestrator.py +841 -841
  46. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  47. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  48. gaia/agents/code/orchestration/steps/base.py +188 -188
  49. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  50. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  51. gaia/agents/code/orchestration/steps/python.py +307 -307
  52. gaia/agents/code/orchestration/template_catalog.py +469 -469
  53. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  54. gaia/agents/code/orchestration/workflows/base.py +80 -80
  55. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  56. gaia/agents/code/orchestration/workflows/python.py +94 -94
  57. gaia/agents/code/prompts/__init__.py +11 -11
  58. gaia/agents/code/prompts/base_prompt.py +77 -77
  59. gaia/agents/code/prompts/code_patterns.py +2034 -2036
  60. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  61. gaia/agents/code/prompts/python_prompt.py +109 -109
  62. gaia/agents/code/schema_inference.py +365 -365
  63. gaia/agents/code/system_prompt.py +41 -41
  64. gaia/agents/code/tools/__init__.py +42 -42
  65. gaia/agents/code/tools/cli_tools.py +1138 -1138
  66. gaia/agents/code/tools/code_formatting.py +319 -319
  67. gaia/agents/code/tools/code_tools.py +769 -769
  68. gaia/agents/code/tools/error_fixing.py +1347 -1347
  69. gaia/agents/code/tools/external_tools.py +180 -180
  70. gaia/agents/code/tools/file_io.py +845 -845
  71. gaia/agents/code/tools/prisma_tools.py +190 -190
  72. gaia/agents/code/tools/project_management.py +1016 -1016
  73. gaia/agents/code/tools/testing.py +321 -321
  74. gaia/agents/code/tools/typescript_tools.py +122 -122
  75. gaia/agents/code/tools/validation_parsing.py +461 -461
  76. gaia/agents/code/tools/validation_tools.py +806 -806
  77. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  78. gaia/agents/code/validators/__init__.py +16 -16
  79. gaia/agents/code/validators/antipattern_checker.py +241 -241
  80. gaia/agents/code/validators/ast_analyzer.py +197 -197
  81. gaia/agents/code/validators/requirements_validator.py +145 -145
  82. gaia/agents/code/validators/syntax_validator.py +171 -171
  83. gaia/agents/docker/__init__.py +7 -7
  84. gaia/agents/docker/agent.py +643 -642
  85. gaia/agents/emr/__init__.py +8 -8
  86. gaia/agents/emr/agent.py +1504 -1506
  87. gaia/agents/emr/cli.py +1322 -1322
  88. gaia/agents/emr/constants.py +475 -475
  89. gaia/agents/emr/dashboard/__init__.py +4 -4
  90. gaia/agents/emr/dashboard/server.py +1972 -1974
  91. gaia/agents/jira/__init__.py +11 -11
  92. gaia/agents/jira/agent.py +894 -894
  93. gaia/agents/jira/jql_templates.py +299 -299
  94. gaia/agents/routing/__init__.py +7 -7
  95. gaia/agents/routing/agent.py +567 -570
  96. gaia/agents/routing/system_prompt.py +75 -75
  97. gaia/agents/summarize/__init__.py +11 -0
  98. gaia/agents/summarize/agent.py +885 -0
  99. gaia/agents/summarize/prompts.py +129 -0
  100. gaia/api/__init__.py +23 -23
  101. gaia/api/agent_registry.py +238 -238
  102. gaia/api/app.py +305 -305
  103. gaia/api/openai_server.py +575 -575
  104. gaia/api/schemas.py +186 -186
  105. gaia/api/sse_handler.py +373 -373
  106. gaia/apps/__init__.py +4 -4
  107. gaia/apps/llm/__init__.py +6 -6
  108. gaia/apps/llm/app.py +184 -169
  109. gaia/apps/summarize/app.py +116 -633
  110. gaia/apps/summarize/html_viewer.py +133 -133
  111. gaia/apps/summarize/pdf_formatter.py +284 -284
  112. gaia/audio/__init__.py +2 -2
  113. gaia/audio/audio_client.py +439 -439
  114. gaia/audio/audio_recorder.py +269 -269
  115. gaia/audio/kokoro_tts.py +599 -599
  116. gaia/audio/whisper_asr.py +432 -432
  117. gaia/chat/__init__.py +16 -16
  118. gaia/chat/app.py +428 -430
  119. gaia/chat/prompts.py +522 -522
  120. gaia/chat/sdk.py +1228 -1225
  121. gaia/cli.py +5659 -5632
  122. gaia/database/__init__.py +10 -10
  123. gaia/database/agent.py +176 -176
  124. gaia/database/mixin.py +290 -290
  125. gaia/database/testing.py +64 -64
  126. gaia/eval/batch_experiment.py +2332 -2332
  127. gaia/eval/claude.py +542 -542
  128. gaia/eval/config.py +37 -37
  129. gaia/eval/email_generator.py +512 -512
  130. gaia/eval/eval.py +3179 -3179
  131. gaia/eval/groundtruth.py +1130 -1130
  132. gaia/eval/transcript_generator.py +582 -582
  133. gaia/eval/webapp/README.md +167 -167
  134. gaia/eval/webapp/package-lock.json +875 -875
  135. gaia/eval/webapp/package.json +20 -20
  136. gaia/eval/webapp/public/app.js +3402 -3402
  137. gaia/eval/webapp/public/index.html +87 -87
  138. gaia/eval/webapp/public/styles.css +3661 -3661
  139. gaia/eval/webapp/server.js +415 -415
  140. gaia/eval/webapp/test-setup.js +72 -72
  141. gaia/installer/__init__.py +23 -0
  142. gaia/installer/init_command.py +1275 -0
  143. gaia/installer/lemonade_installer.py +619 -0
  144. gaia/llm/__init__.py +10 -2
  145. gaia/llm/base_client.py +60 -0
  146. gaia/llm/exceptions.py +12 -0
  147. gaia/llm/factory.py +70 -0
  148. gaia/llm/lemonade_client.py +3421 -3221
  149. gaia/llm/lemonade_manager.py +294 -294
  150. gaia/llm/providers/__init__.py +9 -0
  151. gaia/llm/providers/claude.py +108 -0
  152. gaia/llm/providers/lemonade.py +118 -0
  153. gaia/llm/providers/openai_provider.py +79 -0
  154. gaia/llm/vlm_client.py +382 -382
  155. gaia/logger.py +189 -189
  156. gaia/mcp/agent_mcp_server.py +245 -245
  157. gaia/mcp/blender_mcp_client.py +138 -138
  158. gaia/mcp/blender_mcp_server.py +648 -648
  159. gaia/mcp/context7_cache.py +332 -332
  160. gaia/mcp/external_services.py +518 -518
  161. gaia/mcp/mcp_bridge.py +811 -550
  162. gaia/mcp/servers/__init__.py +6 -6
  163. gaia/mcp/servers/docker_mcp.py +83 -83
  164. gaia/perf_analysis.py +361 -0
  165. gaia/rag/__init__.py +10 -10
  166. gaia/rag/app.py +293 -293
  167. gaia/rag/demo.py +304 -304
  168. gaia/rag/pdf_utils.py +235 -235
  169. gaia/rag/sdk.py +2194 -2194
  170. gaia/security.py +183 -163
  171. gaia/talk/app.py +287 -289
  172. gaia/talk/sdk.py +538 -538
  173. gaia/testing/__init__.py +87 -87
  174. gaia/testing/assertions.py +330 -330
  175. gaia/testing/fixtures.py +333 -333
  176. gaia/testing/mocks.py +493 -493
  177. gaia/util.py +46 -46
  178. gaia/utils/__init__.py +33 -33
  179. gaia/utils/file_watcher.py +675 -675
  180. gaia/utils/parsing.py +223 -223
  181. gaia/version.py +100 -100
  182. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  183. gaia/agents/code/app.py +0 -266
  184. gaia/llm/llm_client.py +0 -723
  185. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/top_level.txt +0 -0
gaia/audio/kokoro_tts.py CHANGED
@@ -1,599 +1,599 @@
1
- # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
2
- # SPDX-License-Identifier: MIT
3
-
4
- import queue
5
- import threading
6
- import time
7
-
8
- import numpy as np
9
- import psutil
10
-
11
- try:
12
- import sounddevice as sd
13
- except ImportError:
14
- sd = None
15
-
16
- try:
17
- import soundfile as sf
18
- except ImportError:
19
- sf = None
20
-
21
- try:
22
- from kokoro import KPipeline
23
- except ImportError:
24
- KPipeline = None
25
-
26
- from gaia.logger import get_logger
27
-
28
-
29
- class KokoroTTS:
30
- log = get_logger(__name__)
31
-
32
- def __init__(self):
33
- # Check for required dependencies
34
- missing = []
35
- if sd is None:
36
- missing.append("sounddevice")
37
- if sf is None:
38
- missing.append("soundfile")
39
- if KPipeline is None:
40
- missing.append("kokoro>=0.3.1")
41
-
42
- if missing:
43
- error_msg = (
44
- f"\n❌ Error: Missing required talk dependencies: {', '.join(missing)}\n\n"
45
- f"Please install the talk dependencies:\n"
46
- f' uv pip install -e ".[talk]"\n\n'
47
- f"Or install packages directly:\n"
48
- f" uv pip install {' '.join(missing)}\n"
49
- )
50
- raise ImportError(error_msg)
51
-
52
- self.log = self.__class__.log
53
-
54
- # Initialize Kokoro pipeline with American English
55
- self.pipeline = KPipeline(lang_code="a") # 'a' for American English
56
-
57
- # Available voice configurations with metadata
58
- self.available_voices = {
59
- # American English Voices 🇺🇸
60
- "af_alloy": {
61
- "name": "American Female - Alloy",
62
- "quality": "C",
63
- "duration": "MM",
64
- },
65
- "af_aoede": {
66
- "name": "American Female - Aoede",
67
- "quality": "C+",
68
- "duration": "H",
69
- },
70
- "af_bella": {
71
- "name": "American Female - Bella",
72
- "quality": "A-",
73
- "duration": "HH",
74
- },
75
- "af_jessica": {
76
- "name": "American Female - Jessica",
77
- "quality": "D",
78
- "duration": "MM",
79
- },
80
- "af_kore": {
81
- "name": "American Female - Kore",
82
- "quality": "C+",
83
- "duration": "H",
84
- },
85
- "af_nicole": {
86
- "name": "American Female - Nicole",
87
- "quality": "B-",
88
- "duration": "HH",
89
- },
90
- "af_nova": {
91
- "name": "American Female - Nova",
92
- "quality": "C",
93
- "duration": "MM",
94
- },
95
- "af_river": {
96
- "name": "American Female - River",
97
- "quality": "D",
98
- "duration": "MM",
99
- },
100
- "af_sarah": {
101
- "name": "American Female - Sarah",
102
- "quality": "C+",
103
- "duration": "H",
104
- },
105
- "af_sky": {
106
- "name": "American Female - Sky",
107
- "quality": "C-",
108
- "duration": "M",
109
- },
110
- "am_adam": {
111
- "name": "American Male - Adam",
112
- "quality": "F+",
113
- "duration": "H",
114
- },
115
- "am_echo": {
116
- "name": "American Male - Echo",
117
- "quality": "D",
118
- "duration": "MM",
119
- },
120
- "am_eric": {
121
- "name": "American Male - Eric",
122
- "quality": "D",
123
- "duration": "MM",
124
- },
125
- "am_fenrir": {
126
- "name": "American Male - Fenrir",
127
- "quality": "C+",
128
- "duration": "H",
129
- },
130
- "am_liam": {
131
- "name": "American Male - Liam",
132
- "quality": "D",
133
- "duration": "MM",
134
- },
135
- "am_michael": {
136
- "name": "American Male - Michael",
137
- "quality": "C+",
138
- "duration": "H",
139
- },
140
- "am_onyx": {
141
- "name": "American Male - Onyx",
142
- "quality": "D",
143
- "duration": "MM",
144
- },
145
- "am_puck": {
146
- "name": "American Male - Puck",
147
- "quality": "C+",
148
- "duration": "H",
149
- },
150
- # British English Voices 🇬🇧
151
- "bf_alice": {
152
- "name": "British Female - Alice",
153
- "quality": "D",
154
- "duration": "MM",
155
- },
156
- "bf_emma": {
157
- "name": "British Female - Emma",
158
- "quality": "B-",
159
- "duration": "HH",
160
- },
161
- "bf_isabella": {
162
- "name": "British Female - Isabella",
163
- "quality": "C",
164
- "duration": "MM",
165
- },
166
- "bf_lily": {
167
- "name": "British Female - Lily",
168
- "quality": "D",
169
- "duration": "MM",
170
- },
171
- "bm_daniel": {
172
- "name": "British Male - Daniel",
173
- "quality": "D",
174
- "duration": "MM",
175
- },
176
- "bm_fable": {
177
- "name": "British Male - Fable",
178
- "quality": "C",
179
- "duration": "MM",
180
- },
181
- "bm_george": {
182
- "name": "British Male - George",
183
- "quality": "C",
184
- "duration": "MM",
185
- },
186
- "bm_lewis": {
187
- "name": "British Male - Lewis",
188
- "quality": "D+",
189
- "duration": "H",
190
- },
191
- }
192
-
193
- # Default to highest quality voice (Bella)
194
- self.voice_name = "af_bella"
195
- self.chunk_size = 150 # Optimal token chunk size for best quality
196
- self.log.debug(
197
- f"Loaded voice: {self.voice_name} - {self.available_voices[self.voice_name]['name']} (Quality: {self.available_voices[self.voice_name]['quality']})"
198
- )
199
-
200
- def preprocess_text(self, text: str) -> str:
201
- """
202
- Preprocess text to add appropriate pauses and improve speech flow.
203
- Removes asterisks and adds pause markers.
204
- """
205
- # First remove all asterisks from the text
206
- text = text.replace("*", "")
207
-
208
- # Add pauses after bullet points and numbered lists
209
- lines = text.split("\n")
210
- processed_lines = []
211
-
212
- for line in lines:
213
- line = line.strip()
214
- if not line: # Skip empty lines
215
- continue
216
-
217
- # Check for various list formats and add pauses
218
- if (
219
- line.startswith(("•", "-", "*")) # Bullet points
220
- or (
221
- len(line) > 2 and line[0].isdigit() and line[1] == "."
222
- ) # Numbered lists
223
- or (len(line) > 2 and line[0].isalpha() and line[1] in [")", "."])
224
- ): # Lettered lists
225
- # For list items, ensure we add pause regardless of existing punctuation
226
- if line[-1] in ".!?:":
227
- line = line[:-1] # Remove existing punctuation
228
- line = line.replace(")", "...") # Add pause after list items
229
- processed_lines.append(f"{line}...")
230
- else:
231
- # Add a period at the end of non-empty lines if they don't already have ending punctuation
232
- if not line[-1] in ".!?:":
233
- processed_lines.append(line + ".")
234
- else:
235
- processed_lines.append(line)
236
-
237
- return " ".join(processed_lines) # Join with spaces instead of newlines
238
-
239
- def generate_speech(
240
- self, text: str, stream_callback=None
241
- ) -> tuple[list[float], str, dict]:
242
- """Generate speech from text using Kokoro TTS with quality optimizations."""
243
- self.log.debug(f"Generating speech for text of length {len(text)}")
244
-
245
- process = psutil.Process()
246
- start_memory = process.memory_info().rss / 1024 / 1024
247
- start_time = time.time()
248
-
249
- # Generate audio using the pipeline with chunking for optimal quality
250
- audio_chunks = []
251
- phonemes = []
252
- total_duration = 0
253
-
254
- # Split text into chunks of optimal size (100-200 tokens)
255
- sentences = text.split(".")
256
- current_chunk = []
257
- current_length = 0
258
-
259
- for sentence in sentences:
260
- sentence = sentence.strip()
261
- if not sentence:
262
- continue
263
-
264
- sentence_length = len(sentence.split())
265
-
266
- if current_length + sentence_length > self.chunk_size:
267
- # Process current chunk
268
- chunk_text = ". ".join(current_chunk) + "."
269
- generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
270
- for _, phoneme_seq, audio in generator:
271
- audio_chunks.append(audio)
272
- phonemes.append(phoneme_seq)
273
- chunk_duration = len(audio) / 24000
274
- total_duration += chunk_duration
275
-
276
- if stream_callback and callable(stream_callback):
277
- stream_callback(audio)
278
-
279
- current_chunk = [sentence]
280
- current_length = sentence_length
281
- else:
282
- current_chunk.append(sentence)
283
- current_length += sentence_length
284
-
285
- # Process remaining chunk if any
286
- if current_chunk:
287
- chunk_text = ". ".join(current_chunk) + "."
288
- generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
289
- for _, phoneme_seq, audio in generator:
290
- audio_chunks.append(audio)
291
- phonemes.append(phoneme_seq)
292
- chunk_duration = len(audio) / 24000
293
- total_duration += chunk_duration
294
-
295
- if stream_callback and callable(stream_callback):
296
- stream_callback(audio)
297
-
298
- # Combine all audio chunks
299
- audio = np.concatenate(audio_chunks)
300
- combined_phonemes = " ".join(phonemes)
301
-
302
- end_time = time.time()
303
- end_memory = process.memory_info().rss / 1024 / 1024
304
- processing_time = end_time - start_time
305
- peak_memory = end_memory - start_memory
306
-
307
- stats = {
308
- "processing_time": round(processing_time, 3),
309
- "audio_duration": round(total_duration, 3),
310
- "realtime_ratio": round(processing_time / total_duration, 2),
311
- "peak_memory": round(peak_memory, 2),
312
- }
313
-
314
- return audio, combined_phonemes, stats
315
-
316
- def generate_speech_streaming(
317
- self, text_queue: queue.Queue, status_callback=None, interrupt_event=None
318
- ) -> None:
319
- """Optimized streaming TTS with separate processing and playback threads."""
320
- self.log.debug("Starting speech streaming")
321
- buffer = ""
322
- audio_buffer = queue.Queue(maxsize=100) # Buffer for processed audio chunks
323
-
324
- # Initialize audio stream
325
- stream = sd.OutputStream(
326
- samplerate=24000,
327
- channels=1,
328
- dtype=np.float32,
329
- blocksize=2400, # 100ms buffer
330
- latency="low",
331
- )
332
- stream.start()
333
- self.log.debug("Audio stream initialized")
334
-
335
- # Playback thread function
336
- def audio_playback_thread():
337
- try:
338
- while True:
339
- try:
340
- audio_chunk = audio_buffer.get(timeout=0.1)
341
- if audio_chunk is None: # Exit signal
342
- if status_callback:
343
- status_callback(False)
344
- break
345
- if interrupt_event and interrupt_event.is_set():
346
- break
347
- if status_callback:
348
- status_callback(True)
349
- stream.write(np.array(audio_chunk, dtype=np.float32))
350
- except queue.Empty:
351
- continue
352
- except Exception as e:
353
- self.log.error(f"Error in playback thread: {e}")
354
- if status_callback:
355
- status_callback(False)
356
- finally:
357
- stream.stop()
358
- stream.close()
359
- if status_callback:
360
- status_callback(False)
361
-
362
- # Start playback thread
363
- playback_thread = threading.Thread(target=audio_playback_thread)
364
- playback_thread.daemon = True
365
- playback_thread.start()
366
-
367
- try:
368
- while True:
369
- try:
370
- chunk = text_queue.get(timeout=0.1)
371
-
372
- if chunk == "__END__" or (
373
- interrupt_event and interrupt_event.is_set()
374
- ):
375
- if buffer.strip():
376
- # Process final buffer
377
- processed_text = self.preprocess_text(buffer.strip())
378
- if processed_text: # Only process if there's actual text
379
- self.generate_speech(
380
- processed_text, stream_callback=audio_buffer.put
381
- )
382
- audio_buffer.put(None) # Signal playback thread to exit
383
- break
384
-
385
- buffer += chunk
386
-
387
- # Find complete sentences for immediate processing
388
- sentences = buffer.split(".")
389
- if len(sentences) > 1:
390
- # Process complete sentences immediately
391
- text_to_process = ".".join(sentences[:-1]) + "."
392
- if (
393
- text_to_process.strip()
394
- ): # Only process if there's actual text
395
- processed_text = self.preprocess_text(text_to_process)
396
- if processed_text: # Double check after preprocessing
397
- self.generate_speech(
398
- processed_text, stream_callback=audio_buffer.put
399
- )
400
- buffer = sentences[-1]
401
-
402
- except queue.Empty:
403
- continue
404
-
405
- except Exception as e:
406
- self.log.error(f"Error in streaming: {e}")
407
- audio_buffer.put(None) # Ensure playback thread exits
408
- finally:
409
- audio_buffer.put(None) # Ensure playback thread exits
410
- playback_thread.join(timeout=2.0)
411
-
412
- def set_voice(self, voice_name: str) -> None:
413
- """Change the current voice."""
414
- self.log.info(f"Changing voice to: {voice_name}")
415
- if voice_name not in self.available_voices:
416
- self.log.error(f"Unknown voice '{voice_name}'")
417
- raise ValueError(
418
- f"Unknown voice '{voice_name}'. Available voices: {list(self.available_voices.keys())}"
419
- )
420
-
421
- self.voice_name = voice_name
422
- self.log.info(
423
- f"Changed voice to: {voice_name} - {self.available_voices[voice_name]['name']} (Quality: {self.available_voices[voice_name]['quality']})"
424
- )
425
-
426
- def list_available_voices(self) -> dict[str, dict]:
427
- """Get all available voice names and their descriptions."""
428
- return self.available_voices
429
-
430
- # Test methods remain largely unchanged, just updated to use new generate_speech method
431
- def test_preprocessing(self, test_text: str) -> str:
432
- """Test the text preprocessing functionality."""
433
- try:
434
- processed_text = self.preprocess_text(test_text)
435
- print("\nOriginal text:")
436
- print(test_text)
437
- print("\nProcessed text:")
438
- print(processed_text)
439
- return processed_text
440
- except Exception as e:
441
- self.log.error(f"Error during preprocessing test: {e}")
442
- return None
443
-
444
- def test_generate_audio_file(
445
- self, test_text: str, output_file: str = "output.wav"
446
- ) -> None:
447
- """Test basic audio generation and file saving."""
448
- try:
449
- print("\nGenerating audio...")
450
- audio, _, stats = self.generate_speech(test_text)
451
-
452
- # Save audio to file
453
- sf.write(output_file, np.array(audio), 24000)
454
- print(f"Saved audio to: {output_file}")
455
-
456
- print("\nPerformance stats:")
457
- print(f"- Processing time: {stats['processing_time']:.3f}s")
458
- print(f"- Audio duration: {stats['audio_duration']:.3f}s")
459
- print(f"- Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")
460
- print(f"- Peak memory usage: {stats['peak_memory']:.2f} MB")
461
- except Exception as e:
462
- self.log.error(f"Error during audio generation test: {e}")
463
-
464
- def test_streaming_playback(self, test_text: str) -> None:
465
- """Test streaming audio generation with progress display."""
466
- try:
467
- # Setup audio stream
468
- stream = sd.OutputStream(samplerate=24000, channels=1, dtype=np.float32)
469
- stream.start()
470
-
471
- # Create audio queue and initialize tracking variables
472
- audio_queue = queue.Queue(maxsize=100)
473
- words = test_text.split()
474
- total_words = len(words)
475
- total_chunks = 0
476
- current_processing_chunk = 0
477
- current_playback_chunk = 0
478
- spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
479
- spinner_idx = 0
480
-
481
- # Count total chunks
482
- def count_chunks(_):
483
- nonlocal total_chunks
484
- total_chunks += 1
485
-
486
- print("\nAnalyzing text length...")
487
- self.generate_speech(test_text, stream_callback=count_chunks)
488
-
489
- # Define and start streaming thread
490
- def stream_audio():
491
- nonlocal current_playback_chunk, spinner_idx
492
- while True:
493
- try:
494
- chunk = audio_queue.get()
495
- if chunk is None:
496
- break
497
-
498
- chunk_array = np.array(chunk, dtype=np.float32)
499
- stream.write(chunk_array)
500
- current_playback_chunk += 1
501
-
502
- # Update progress display
503
- word_position = int(
504
- (current_playback_chunk / total_chunks) * total_words
505
- )
506
- current_text = " ".join(
507
- words[
508
- max(0, word_position - 5) : min(
509
- total_words, word_position + 5
510
- )
511
- ]
512
- )
513
- current_text = current_text[:60].ljust(60)
514
-
515
- process_progress = int(
516
- (current_processing_chunk / total_chunks) * 50
517
- )
518
- playback_progress = int(
519
- (current_playback_chunk / total_chunks) * 50
520
- )
521
- spinner_idx = (spinner_idx + 1) % len(spinner_chars)
522
-
523
- print("\033[K", end="")
524
- print(
525
- f"\r{spinner_chars[spinner_idx]} Processing: [{'=' * process_progress}{' ' * (50-process_progress)}] {(current_processing_chunk/total_chunks)*100:.1f}%"
526
- )
527
- print(
528
- f"{spinner_chars[spinner_idx]} Playback: [{'=' * playback_progress}{' ' * (50-playback_progress)}] {(current_playback_chunk/total_chunks)*100:.1f}%"
529
- )
530
- print(
531
- f"{spinner_chars[spinner_idx]} Current: {current_text}",
532
- end="\033[2A\r",
533
- )
534
-
535
- audio_queue.task_done()
536
- except queue.Empty:
537
- continue
538
-
539
- print("\nGenerating and streaming audio...")
540
- print("\n\n")
541
- stream_thread = threading.Thread(target=stream_audio)
542
- stream_thread.start()
543
-
544
- def process_chunk(chunk):
545
- nonlocal current_processing_chunk
546
- current_processing_chunk += 1
547
- audio_queue.put(chunk)
548
-
549
- processed_text = self.preprocess_text(test_text)
550
- _, _, stats = self.generate_speech(
551
- processed_text, stream_callback=process_chunk
552
- )
553
-
554
- audio_queue.put(None)
555
- stream_thread.join()
556
-
557
- print("\n\n\n")
558
- stream.stop()
559
- stream.close()
560
-
561
- print("\nStreaming test completed")
562
- print(f"Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")
563
-
564
- except Exception as e:
565
- self.log.error(f"Error during streaming test: {e}")
566
-
567
-
568
- def main():
569
- """Run all TTS tests."""
570
- test_text = """
571
- Let's play a game of trivia. I'll ask you a series of questions on a particular topic, and you try to answer them to the best of your ability. We can keep track of your score and see how well you do.
572
-
573
- Here's your first question:
574
-
575
- **Question 1:** Which American author wrote the classic novel "To Kill a Mockingbird"?
576
-
577
- A) F. Scott Fitzgerald
578
- B) Harper Lee
579
- C) Jane Austen
580
- D) J. K. Rowling
581
- E) Edgar Allan Poe
582
-
583
- Let me know your answer!
584
- """
585
-
586
- tts = KokoroTTS()
587
-
588
- print("Running preprocessing test...")
589
- processed_text = tts.test_preprocessing(test_text)
590
-
591
- print("\nRunning streaming test...")
592
- tts.test_streaming_playback(processed_text)
593
-
594
- print("\nRunning audio generation test...")
595
- tts.test_generate_audio_file(processed_text)
596
-
597
-
598
- if __name__ == "__main__":
599
- main()
1
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ import queue
5
+ import threading
6
+ import time
7
+
8
+ import numpy as np
9
+ import psutil
10
+
11
+ try:
12
+ import sounddevice as sd
13
+ except ImportError:
14
+ sd = None
15
+
16
+ try:
17
+ import soundfile as sf
18
+ except ImportError:
19
+ sf = None
20
+
21
+ try:
22
+ from kokoro import KPipeline
23
+ except ImportError:
24
+ KPipeline = None
25
+
26
+ from gaia.logger import get_logger
27
+
28
+
29
+ class KokoroTTS:
30
+ log = get_logger(__name__)
31
+
32
+ def __init__(self):
33
+ # Check for required dependencies
34
+ missing = []
35
+ if sd is None:
36
+ missing.append("sounddevice")
37
+ if sf is None:
38
+ missing.append("soundfile")
39
+ if KPipeline is None:
40
+ missing.append("kokoro>=0.3.1")
41
+
42
+ if missing:
43
+ error_msg = (
44
+ f"\n❌ Error: Missing required talk dependencies: {', '.join(missing)}\n\n"
45
+ f"Please install the talk dependencies:\n"
46
+ f' uv pip install -e ".[talk]"\n\n'
47
+ f"Or install packages directly:\n"
48
+ f" uv pip install {' '.join(missing)}\n"
49
+ )
50
+ raise ImportError(error_msg)
51
+
52
+ self.log = self.__class__.log
53
+
54
+ # Initialize Kokoro pipeline with American English
55
+ self.pipeline = KPipeline(lang_code="a") # 'a' for American English
56
+
57
+ # Available voice configurations with metadata
58
+ self.available_voices = {
59
+ # American English Voices 🇺🇸
60
+ "af_alloy": {
61
+ "name": "American Female - Alloy",
62
+ "quality": "C",
63
+ "duration": "MM",
64
+ },
65
+ "af_aoede": {
66
+ "name": "American Female - Aoede",
67
+ "quality": "C+",
68
+ "duration": "H",
69
+ },
70
+ "af_bella": {
71
+ "name": "American Female - Bella",
72
+ "quality": "A-",
73
+ "duration": "HH",
74
+ },
75
+ "af_jessica": {
76
+ "name": "American Female - Jessica",
77
+ "quality": "D",
78
+ "duration": "MM",
79
+ },
80
+ "af_kore": {
81
+ "name": "American Female - Kore",
82
+ "quality": "C+",
83
+ "duration": "H",
84
+ },
85
+ "af_nicole": {
86
+ "name": "American Female - Nicole",
87
+ "quality": "B-",
88
+ "duration": "HH",
89
+ },
90
+ "af_nova": {
91
+ "name": "American Female - Nova",
92
+ "quality": "C",
93
+ "duration": "MM",
94
+ },
95
+ "af_river": {
96
+ "name": "American Female - River",
97
+ "quality": "D",
98
+ "duration": "MM",
99
+ },
100
+ "af_sarah": {
101
+ "name": "American Female - Sarah",
102
+ "quality": "C+",
103
+ "duration": "H",
104
+ },
105
+ "af_sky": {
106
+ "name": "American Female - Sky",
107
+ "quality": "C-",
108
+ "duration": "M",
109
+ },
110
+ "am_adam": {
111
+ "name": "American Male - Adam",
112
+ "quality": "F+",
113
+ "duration": "H",
114
+ },
115
+ "am_echo": {
116
+ "name": "American Male - Echo",
117
+ "quality": "D",
118
+ "duration": "MM",
119
+ },
120
+ "am_eric": {
121
+ "name": "American Male - Eric",
122
+ "quality": "D",
123
+ "duration": "MM",
124
+ },
125
+ "am_fenrir": {
126
+ "name": "American Male - Fenrir",
127
+ "quality": "C+",
128
+ "duration": "H",
129
+ },
130
+ "am_liam": {
131
+ "name": "American Male - Liam",
132
+ "quality": "D",
133
+ "duration": "MM",
134
+ },
135
+ "am_michael": {
136
+ "name": "American Male - Michael",
137
+ "quality": "C+",
138
+ "duration": "H",
139
+ },
140
+ "am_onyx": {
141
+ "name": "American Male - Onyx",
142
+ "quality": "D",
143
+ "duration": "MM",
144
+ },
145
+ "am_puck": {
146
+ "name": "American Male - Puck",
147
+ "quality": "C+",
148
+ "duration": "H",
149
+ },
150
+ # British English Voices 🇬🇧
151
+ "bf_alice": {
152
+ "name": "British Female - Alice",
153
+ "quality": "D",
154
+ "duration": "MM",
155
+ },
156
+ "bf_emma": {
157
+ "name": "British Female - Emma",
158
+ "quality": "B-",
159
+ "duration": "HH",
160
+ },
161
+ "bf_isabella": {
162
+ "name": "British Female - Isabella",
163
+ "quality": "C",
164
+ "duration": "MM",
165
+ },
166
+ "bf_lily": {
167
+ "name": "British Female - Lily",
168
+ "quality": "D",
169
+ "duration": "MM",
170
+ },
171
+ "bm_daniel": {
172
+ "name": "British Male - Daniel",
173
+ "quality": "D",
174
+ "duration": "MM",
175
+ },
176
+ "bm_fable": {
177
+ "name": "British Male - Fable",
178
+ "quality": "C",
179
+ "duration": "MM",
180
+ },
181
+ "bm_george": {
182
+ "name": "British Male - George",
183
+ "quality": "C",
184
+ "duration": "MM",
185
+ },
186
+ "bm_lewis": {
187
+ "name": "British Male - Lewis",
188
+ "quality": "D+",
189
+ "duration": "H",
190
+ },
191
+ }
192
+
193
+ # Default to highest quality voice (Bella)
194
+ self.voice_name = "af_bella"
195
+ self.chunk_size = 150 # Optimal token chunk size for best quality
196
+ self.log.debug(
197
+ f"Loaded voice: {self.voice_name} - {self.available_voices[self.voice_name]['name']} (Quality: {self.available_voices[self.voice_name]['quality']})"
198
+ )
199
+
200
+ def preprocess_text(self, text: str) -> str:
201
+ """
202
+ Preprocess text to add appropriate pauses and improve speech flow.
203
+ Removes asterisks and adds pause markers.
204
+ """
205
+ # First remove all asterisks from the text
206
+ text = text.replace("*", "")
207
+
208
+ # Add pauses after bullet points and numbered lists
209
+ lines = text.split("\n")
210
+ processed_lines = []
211
+
212
+ for line in lines:
213
+ line = line.strip()
214
+ if not line: # Skip empty lines
215
+ continue
216
+
217
+ # Check for various list formats and add pauses
218
+ if (
219
+ line.startswith(("•", "-", "*")) # Bullet points
220
+ or (
221
+ len(line) > 2 and line[0].isdigit() and line[1] == "."
222
+ ) # Numbered lists
223
+ or (len(line) > 2 and line[0].isalpha() and line[1] in [")", "."])
224
+ ): # Lettered lists
225
+ # For list items, ensure we add pause regardless of existing punctuation
226
+ if line[-1] in ".!?:":
227
+ line = line[:-1] # Remove existing punctuation
228
+ line = line.replace(")", "...") # Add pause after list items
229
+ processed_lines.append(f"{line}...")
230
+ else:
231
+ # Add a period at the end of non-empty lines if they don't already have ending punctuation
232
+ if not line[-1] in ".!?:":
233
+ processed_lines.append(line + ".")
234
+ else:
235
+ processed_lines.append(line)
236
+
237
+ return " ".join(processed_lines) # Join with spaces instead of newlines
238
+
239
+ def generate_speech(
240
+ self, text: str, stream_callback=None
241
+ ) -> tuple[list[float], str, dict]:
242
+ """Generate speech from text using Kokoro TTS with quality optimizations."""
243
+ self.log.debug(f"Generating speech for text of length {len(text)}")
244
+
245
+ process = psutil.Process()
246
+ start_memory = process.memory_info().rss / 1024 / 1024
247
+ start_time = time.time()
248
+
249
+ # Generate audio using the pipeline with chunking for optimal quality
250
+ audio_chunks = []
251
+ phonemes = []
252
+ total_duration = 0
253
+
254
+ # Split text into chunks of optimal size (100-200 tokens)
255
+ sentences = text.split(".")
256
+ current_chunk = []
257
+ current_length = 0
258
+
259
+ for sentence in sentences:
260
+ sentence = sentence.strip()
261
+ if not sentence:
262
+ continue
263
+
264
+ sentence_length = len(sentence.split())
265
+
266
+ if current_length + sentence_length > self.chunk_size:
267
+ # Process current chunk
268
+ chunk_text = ". ".join(current_chunk) + "."
269
+ generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
270
+ for _, phoneme_seq, audio in generator:
271
+ audio_chunks.append(audio)
272
+ phonemes.append(phoneme_seq)
273
+ chunk_duration = len(audio) / 24000
274
+ total_duration += chunk_duration
275
+
276
+ if stream_callback and callable(stream_callback):
277
+ stream_callback(audio)
278
+
279
+ current_chunk = [sentence]
280
+ current_length = sentence_length
281
+ else:
282
+ current_chunk.append(sentence)
283
+ current_length += sentence_length
284
+
285
+ # Process remaining chunk if any
286
+ if current_chunk:
287
+ chunk_text = ". ".join(current_chunk) + "."
288
+ generator = self.pipeline(chunk_text, voice=self.voice_name, speed=1)
289
+ for _, phoneme_seq, audio in generator:
290
+ audio_chunks.append(audio)
291
+ phonemes.append(phoneme_seq)
292
+ chunk_duration = len(audio) / 24000
293
+ total_duration += chunk_duration
294
+
295
+ if stream_callback and callable(stream_callback):
296
+ stream_callback(audio)
297
+
298
+ # Combine all audio chunks
299
+ audio = np.concatenate(audio_chunks)
300
+ combined_phonemes = " ".join(phonemes)
301
+
302
+ end_time = time.time()
303
+ end_memory = process.memory_info().rss / 1024 / 1024
304
+ processing_time = end_time - start_time
305
+ peak_memory = end_memory - start_memory
306
+
307
+ stats = {
308
+ "processing_time": round(processing_time, 3),
309
+ "audio_duration": round(total_duration, 3),
310
+ "realtime_ratio": round(processing_time / total_duration, 2),
311
+ "peak_memory": round(peak_memory, 2),
312
+ }
313
+
314
+ return audio, combined_phonemes, stats
315
+
316
    def generate_speech_streaming(
        self, text_queue: queue.Queue, status_callback=None, interrupt_event=None
    ) -> None:
        """Optimized streaming TTS with separate processing and playback threads.

        Consumes text fragments from *text_queue* until the "__END__" sentinel
        arrives, synthesizing complete sentences as soon as they accumulate and
        handing the audio to a dedicated playback thread via an internal queue.

        Args:
            text_queue: Queue of incoming text fragments; "__END__" terminates.
            status_callback: Optional callable(bool) — called with True while
                audio is being played and False once playback stops.
            interrupt_event: Optional threading.Event; when set, processing and
                playback stop as soon as possible.
        """
        self.log.debug("Starting speech streaming")
        buffer = ""  # Accumulates text until a full sentence is available
        audio_buffer = queue.Queue(maxsize=100)  # Buffer for processed audio chunks

        # Initialize audio stream
        stream = sd.OutputStream(
            samplerate=24000,
            channels=1,
            dtype=np.float32,
            blocksize=2400,  # 100ms buffer
            latency="low",
        )
        stream.start()
        self.log.debug("Audio stream initialized")

        # Playback thread function: drains audio_buffer onto the output
        # stream; a None entry is the exit sentinel. The stream is always
        # stopped and closed on the way out (finally).
        def audio_playback_thread():
            try:
                while True:
                    try:
                        audio_chunk = audio_buffer.get(timeout=0.1)
                        if audio_chunk is None:  # Exit signal
                            if status_callback:
                                status_callback(False)
                            break
                        if interrupt_event and interrupt_event.is_set():
                            break
                        if status_callback:
                            status_callback(True)
                        stream.write(np.array(audio_chunk, dtype=np.float32))
                    except queue.Empty:
                        # Timeout just means no chunk yet — keep polling.
                        continue
            except Exception as e:
                self.log.error(f"Error in playback thread: {e}")
                if status_callback:
                    status_callback(False)
            finally:
                stream.stop()
                stream.close()
                if status_callback:
                    status_callback(False)

        # Start playback thread (daemon so it never blocks interpreter exit)
        playback_thread = threading.Thread(target=audio_playback_thread)
        playback_thread.daemon = True
        playback_thread.start()

        try:
            while True:
                try:
                    chunk = text_queue.get(timeout=0.1)

                    if chunk == "__END__" or (
                        interrupt_event and interrupt_event.is_set()
                    ):
                        if buffer.strip():
                            # Process final buffer
                            processed_text = self.preprocess_text(buffer.strip())
                            if processed_text:  # Only process if there's actual text
                                self.generate_speech(
                                    processed_text, stream_callback=audio_buffer.put
                                )
                        audio_buffer.put(None)  # Signal playback thread to exit
                        break

                    buffer += chunk

                    # Find complete sentences for immediate processing
                    sentences = buffer.split(".")
                    if len(sentences) > 1:
                        # Process complete sentences immediately
                        text_to_process = ".".join(sentences[:-1]) + "."
                        if (
                            text_to_process.strip()
                        ):  # Only process if there's actual text
                            processed_text = self.preprocess_text(text_to_process)
                            if processed_text:  # Double check after preprocessing
                                self.generate_speech(
                                    processed_text, stream_callback=audio_buffer.put
                                )
                        # Keep the trailing incomplete sentence for next round
                        buffer = sentences[-1]

                except queue.Empty:
                    continue

        except Exception as e:
            self.log.error(f"Error in streaming: {e}")
            audio_buffer.put(None)  # Ensure playback thread exits
        finally:
            audio_buffer.put(None)  # Ensure playback thread exits
            playback_thread.join(timeout=2.0)
411
+
412
+ def set_voice(self, voice_name: str) -> None:
413
+ """Change the current voice."""
414
+ self.log.info(f"Changing voice to: {voice_name}")
415
+ if voice_name not in self.available_voices:
416
+ self.log.error(f"Unknown voice '{voice_name}'")
417
+ raise ValueError(
418
+ f"Unknown voice '{voice_name}'. Available voices: {list(self.available_voices.keys())}"
419
+ )
420
+
421
+ self.voice_name = voice_name
422
+ self.log.info(
423
+ f"Changed voice to: {voice_name} - {self.available_voices[voice_name]['name']} (Quality: {self.available_voices[voice_name]['quality']})"
424
+ )
425
+
426
+ def list_available_voices(self) -> dict[str, dict]:
427
+ """Get all available voice names and their descriptions."""
428
+ return self.available_voices
429
+
430
    # Test helpers: exercise preprocessing, file output, and streaming playback
431
+ def test_preprocessing(self, test_text: str) -> str:
432
+ """Test the text preprocessing functionality."""
433
+ try:
434
+ processed_text = self.preprocess_text(test_text)
435
+ print("\nOriginal text:")
436
+ print(test_text)
437
+ print("\nProcessed text:")
438
+ print(processed_text)
439
+ return processed_text
440
+ except Exception as e:
441
+ self.log.error(f"Error during preprocessing test: {e}")
442
+ return None
443
+
444
+ def test_generate_audio_file(
445
+ self, test_text: str, output_file: str = "output.wav"
446
+ ) -> None:
447
+ """Test basic audio generation and file saving."""
448
+ try:
449
+ print("\nGenerating audio...")
450
+ audio, _, stats = self.generate_speech(test_text)
451
+
452
+ # Save audio to file
453
+ sf.write(output_file, np.array(audio), 24000)
454
+ print(f"Saved audio to: {output_file}")
455
+
456
+ print("\nPerformance stats:")
457
+ print(f"- Processing time: {stats['processing_time']:.3f}s")
458
+ print(f"- Audio duration: {stats['audio_duration']:.3f}s")
459
+ print(f"- Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")
460
+ print(f"- Peak memory usage: {stats['peak_memory']:.2f} MB")
461
+ except Exception as e:
462
+ self.log.error(f"Error during audio generation test: {e}")
463
+
464
    def test_streaming_playback(self, test_text: str) -> None:
        """Test streaming audio generation with progress display.

        Synthesizes *test_text* twice: a silent first pass that only counts
        how many audio chunks it yields (so progress can be scaled), then a
        second pass that streams chunks to the sound device while redrawing
        processing/playback progress bars and the words currently spoken.
        """
        try:
            # Setup audio stream
            stream = sd.OutputStream(samplerate=24000, channels=1, dtype=np.float32)
            stream.start()

            # Create audio queue and initialize tracking variables
            audio_queue = queue.Queue(maxsize=100)
            words = test_text.split()
            total_words = len(words)
            total_chunks = 0
            current_processing_chunk = 0
            current_playback_chunk = 0
            spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
            spinner_idx = 0

            # Count total chunks (first, silent pass)
            def count_chunks(_):
                nonlocal total_chunks
                total_chunks += 1

            print("\nAnalyzing text length...")
            self.generate_speech(test_text, stream_callback=count_chunks)

            # Define and start streaming thread: pulls chunks off the queue,
            # plays them, and redraws the three-line progress display.
            def stream_audio():
                nonlocal current_playback_chunk, spinner_idx
                while True:
                    try:
                        chunk = audio_queue.get()
                        if chunk is None:  # Exit sentinel from the producer
                            break

                        chunk_array = np.array(chunk, dtype=np.float32)
                        stream.write(chunk_array)
                        current_playback_chunk += 1

                        # Update progress display: estimate which words are
                        # currently playing from the chunk position.
                        word_position = int(
                            (current_playback_chunk / total_chunks) * total_words
                        )
                        current_text = " ".join(
                            words[
                                max(0, word_position - 5) : min(
                                    total_words, word_position + 5
                                )
                            ]
                        )
                        current_text = current_text[:60].ljust(60)

                        # Scale both counters onto 50-character bars.
                        process_progress = int(
                            (current_processing_chunk / total_chunks) * 50
                        )
                        playback_progress = int(
                            (current_playback_chunk / total_chunks) * 50
                        )
                        spinner_idx = (spinner_idx + 1) % len(spinner_chars)

                        # \033[K clears the line; \033[2A\r moves the cursor
                        # back up two lines so the bars redraw in place.
                        print("\033[K", end="")
                        print(
                            f"\r{spinner_chars[spinner_idx]} Processing: [{'=' * process_progress}{' ' * (50-process_progress)}] {(current_processing_chunk/total_chunks)*100:.1f}%"
                        )
                        print(
                            f"{spinner_chars[spinner_idx]} Playback: [{'=' * playback_progress}{' ' * (50-playback_progress)}] {(current_playback_chunk/total_chunks)*100:.1f}%"
                        )
                        print(
                            f"{spinner_chars[spinner_idx]} Current: {current_text}",
                            end="\033[2A\r",
                        )

                        audio_queue.task_done()
                    except queue.Empty:
                        continue

            print("\nGenerating and streaming audio...")
            print("\n\n")
            stream_thread = threading.Thread(target=stream_audio)
            stream_thread.start()

            # Second pass: forward each synthesized chunk to the player.
            def process_chunk(chunk):
                nonlocal current_processing_chunk
                current_processing_chunk += 1
                audio_queue.put(chunk)

            processed_text = self.preprocess_text(test_text)
            _, _, stats = self.generate_speech(
                processed_text, stream_callback=process_chunk
            )

            # Signal end of audio and wait for playback to drain.
            audio_queue.put(None)
            stream_thread.join()

            print("\n\n\n")
            stream.stop()
            stream.close()

            print("\nStreaming test completed")
            print(f"Realtime ratio: {stats['realtime_ratio']:.2f}x (lower is better)")

        except Exception as e:
            self.log.error(f"Error during streaming test: {e}")
566
+
567
+
568
def main():
    """Exercise the KokoroTTS test helpers end-to-end."""
    test_text = """
    Let's play a game of trivia. I'll ask you a series of questions on a particular topic, and you try to answer them to the best of your ability. We can keep track of your score and see how well you do.

    Here's your first question:

    **Question 1:** Which American author wrote the classic novel "To Kill a Mockingbird"?

    A) F. Scott Fitzgerald
    B) Harper Lee
    C) Jane Austen
    D) J. K. Rowling
    E) Edgar Allan Poe

    Let me know your answer!
    """

    engine = KokoroTTS()

    # 1. Text normalization — its output feeds the remaining tests.
    print("Running preprocessing test...")
    prepared = engine.test_preprocessing(test_text)

    # 2. Live playback with progress display.
    print("\nRunning streaming test...")
    engine.test_streaming_playback(prepared)

    # 3. Offline synthesis to a WAV file.
    print("\nRunning audio generation test...")
    engine.test_generate_audio_file(prepared)
596
+
597
+
598
# Allow running this module directly as a TTS smoke test.
if __name__ == "__main__":
    main()