npcpy 1.3.9-py3-none-any.whl → 1.3.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcpy/data/audio.py +360 -0
- npcpy/gen/audio_gen.py +693 -13
- npcpy/llm_funcs.py +1 -10
- npcpy/memory/command_history.py +26 -6
- npcpy/serve.py +728 -80
- {npcpy-1.3.9.dist-info → npcpy-1.3.11.dist-info}/METADATA +1 -1
- {npcpy-1.3.9.dist-info → npcpy-1.3.11.dist-info}/RECORD +10 -10
- {npcpy-1.3.9.dist-info → npcpy-1.3.11.dist-info}/WHEEL +0 -0
- {npcpy-1.3.9.dist-info → npcpy-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.3.9.dist-info → npcpy-1.3.11.dist-info}/top_level.txt +0 -0
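
Most of the delta is the new TTS layer in npcpy/gen/audio_gen.py, reproduced in full below. As orientation, here is a minimal sketch of how a caller might probe which engines are usable before synthesizing anything; it assumes only the get_available_engines and get_available_voices helpers that this diff adds (the import path is the one given in the new module's docstring).

    # Sketch: list usable TTS engines via the helpers added in this release.
    from npcpy.gen.audio_gen import get_available_engines, get_available_voices

    engines = get_available_engines()
    usable = [name for name, info in engines.items() if info["available"]]
    print("usable engines:", usable)

    if usable:
        engine = usable[0]
        voices = get_available_voices(engine)
        print(engine, "voices:", [v["id"] for v in voices])
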
npcpy/gen/audio_gen.py
CHANGED
@@ -1,24 +1,704 @@
+"""
+Audio Generation Module for NPC
+Supports multiple TTS engines including real-time voice APIs.
+
+TTS Engines:
+- Kokoro: Local neural TTS (default)
+- ElevenLabs: Cloud TTS with streaming
+- OpenAI: Realtime voice API
+- Gemini: Live API for real-time voice
+- gTTS: Google TTS fallback
+
+Usage:
+    from npcpy.gen.audio_gen import text_to_speech
+
+    audio = text_to_speech("Hello world", engine="kokoro", voice="af_heart")
+
+For STT, see npcpy.data.audio
+"""
+
 import os
-
-
-
-
-
+import io
+import base64
+import json
+import asyncio
+import tempfile
+from typing import Optional, Callable, Any
+
+
+# =============================================================================
+# Kokoro TTS (Local Neural)
+# =============================================================================
+
+def tts_kokoro(
+    text: str,
+    voice: str = "af_heart",
+    lang_code: str = "a",
+    speed: float = 1.0
+) -> bytes:
+    """
+    Generate speech using Kokoro local neural TTS.
+
+    Args:
+        text: Text to synthesize
+        voice: Voice ID (af_heart, am_adam, bf_emma, etc.)
+        lang_code: 'a' for American, 'b' for British
+        speed: Speech speed multiplier
+
+    Returns:
+        WAV audio bytes
+    """
+    from kokoro import KPipeline
+    import soundfile as sf
+    import numpy as np
+
+    pipeline = KPipeline(lang_code=lang_code)
+
+    audio_chunks = []
+    for _, _, audio in pipeline(text, voice=voice, speed=speed):
+        audio_chunks.append(audio)
+
+    if not audio_chunks:
+        raise ValueError("No audio generated")
+
+    full_audio = np.concatenate(audio_chunks)
+
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, full_audio, 24000, format='WAV')
+    wav_buffer.seek(0)
+    return wav_buffer.read()
+
+
+def get_kokoro_voices() -> list:
+    """Get available Kokoro voices."""
+    return [
+        {"id": "af_heart", "name": "Heart", "gender": "female", "lang": "a"},
+        {"id": "af_bella", "name": "Bella", "gender": "female", "lang": "a"},
+        {"id": "af_sarah", "name": "Sarah", "gender": "female", "lang": "a"},
+        {"id": "af_nicole", "name": "Nicole", "gender": "female", "lang": "a"},
+        {"id": "af_sky", "name": "Sky", "gender": "female", "lang": "a"},
+        {"id": "am_adam", "name": "Adam", "gender": "male", "lang": "a"},
+        {"id": "am_michael", "name": "Michael", "gender": "male", "lang": "a"},
+        {"id": "bf_emma", "name": "Emma", "gender": "female", "lang": "b"},
+        {"id": "bf_isabella", "name": "Isabella", "gender": "female", "lang": "b"},
+        {"id": "bm_george", "name": "George", "gender": "male", "lang": "b"},
+        {"id": "bm_lewis", "name": "Lewis", "gender": "male", "lang": "b"},
+    ]
+
+
+# =============================================================================
+# ElevenLabs TTS
+# =============================================================================
+
+def tts_elevenlabs(
+    text: str,
+    api_key: Optional[str] = None,
+    voice_id: str = 'JBFqnCBsd6RMkjVDRZzb',
+    model_id: str = 'eleven_multilingual_v2',
+    output_format: str = 'mp3_44100_128'
+) -> bytes:
+    """
+    Generate speech using ElevenLabs API.
+
+    Returns:
+        MP3 audio bytes
+    """
     if api_key is None:
         api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+    if not api_key:
+        raise ValueError("ELEVENLABS_API_KEY not set")
+
     from elevenlabs.client import ElevenLabs
-    from elevenlabs import play

-    client = ElevenLabs(
-        api_key=api_key,
-    )
+    client = ElevenLabs(api_key=api_key)

-
+    audio_generator = client.text_to_speech.convert(
         text=text,
         voice_id=voice_id,
         model_id=model_id,
-        output_format=
+        output_format=output_format
     )

-
-
+    return b''.join(chunk for chunk in audio_generator)
+
+
+async def tts_elevenlabs_stream(
+    text: str,
+    api_key: Optional[str] = None,
+    voice_id: str = 'JBFqnCBsd6RMkjVDRZzb',
+    model_id: str = 'eleven_turbo_v2_5',
+    on_chunk: Optional[Callable[[bytes], None]] = None
+) -> bytes:
+    """
+    Stream TTS via ElevenLabs WebSocket for lowest latency.
+
+    Args:
+        text: Text to synthesize
+        api_key: ElevenLabs API key
+        voice_id: Voice to use
+        model_id: Model (eleven_turbo_v2_5 for fastest)
+        on_chunk: Callback for each audio chunk
+
+    Returns:
+        Complete audio bytes
+    """
+    import websockets
+
+    if api_key is None:
+        api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+    uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?model_id={model_id}"
+
+    all_audio = []
+
+    async with websockets.connect(uri) as ws:
+        await ws.send(json.dumps({
+            "text": " ",
+            "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
+            "xi_api_key": api_key
+        }))
+
+        await ws.send(json.dumps({"text": text}))
+        await ws.send(json.dumps({"text": ""}))
+
+        async for message in ws:
+            data = json.loads(message)
+            if "audio" in data:
+                chunk = base64.b64decode(data["audio"])
+                all_audio.append(chunk)
+                if on_chunk:
+                    on_chunk(chunk)
+            if data.get("isFinal"):
+                break
+
+    return b''.join(all_audio)
+
+
+def get_elevenlabs_voices(api_key: Optional[str] = None) -> list:
+    """Get available ElevenLabs voices."""
+    if api_key is None:
+        api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+    if not api_key:
+        return []
+
+    try:
+        from elevenlabs.client import ElevenLabs
+        client = ElevenLabs(api_key=api_key)
+        voices = client.voices.get_all()
+        return [{"id": v.voice_id, "name": v.name} for v in voices.voices]
+    except Exception:
+        return []
+
+
+# =============================================================================
+# OpenAI Realtime Voice API
+# =============================================================================
+
+async def openai_realtime_connect(
+    api_key: Optional[str] = None,
+    model: str = "gpt-4o-realtime-preview-2024-12-17",
+    voice: str = "alloy",
+    instructions: str = "You are a helpful assistant."
+):
+    """
+    Connect to OpenAI Realtime API.
+
+    Returns:
+        WebSocket connection
+    """
+    import websockets
+
+    api_key = api_key or os.environ.get('OPENAI_API_KEY')
+
+    url = f"wss://api.openai.com/v1/realtime?model={model}"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "OpenAI-Beta": "realtime=v1"
+    }
+
+    ws = await websockets.connect(url, extra_headers=headers)
+
+    await ws.send(json.dumps({
+        "type": "session.update",
+        "session": {
+            "modalities": ["text", "audio"],
+            "instructions": instructions,
+            "voice": voice,
+            "input_audio_format": "pcm16",
+            "output_audio_format": "pcm16",
+            "input_audio_transcription": {"model": "whisper-1"},
+            "turn_detection": {
+                "type": "server_vad",
+                "threshold": 0.5,
+                "prefix_padding_ms": 300,
+                "silence_duration_ms": 500
+            }
+        }
+    }))
+
+    while True:
+        msg = await ws.recv()
+        event = json.loads(msg)
+        if event.get("type") == "session.created":
+            break
+        elif event.get("type") == "error":
+            await ws.close()
+            raise Exception(f"OpenAI Realtime error: {event}")
+
+    return ws
+
+
+async def openai_realtime_send_audio(ws, audio_data: bytes):
+    """Send audio to OpenAI Realtime (PCM16, 24kHz, mono)."""
+    await ws.send(json.dumps({
+        "type": "input_audio_buffer.append",
+        "audio": base64.b64encode(audio_data).decode()
+    }))
+
+
+async def openai_realtime_send_text(ws, text: str):
+    """Send text message to OpenAI Realtime."""
+    await ws.send(json.dumps({
+        "type": "conversation.item.create",
+        "item": {
+            "type": "message",
+            "role": "user",
+            "content": [{"type": "input_text", "text": text}]
+        }
+    }))
+    await ws.send(json.dumps({"type": "response.create"}))
+
+
+async def openai_realtime_receive(ws, on_audio=None, on_text=None):
+    """
+    Receive response from OpenAI Realtime.
+
+    Args:
+        ws: WebSocket connection
+        on_audio: Callback for audio chunks (bytes)
+        on_text: Callback for text chunks (str)
+
+    Returns:
+        Tuple of (full_audio_bytes, full_text)
+    """
+    audio_chunks = []
+    text_chunks = []
+
+    async for message in ws:
+        event = json.loads(message)
+        event_type = event.get("type", "")
+
+        if event_type == "response.audio.delta":
+            audio = base64.b64decode(event.get("delta", ""))
+            audio_chunks.append(audio)
+            if on_audio:
+                on_audio(audio)
+
+        elif event_type == "response.text.delta":
+            text = event.get("delta", "")
+            text_chunks.append(text)
+            if on_text:
+                on_text(text)
+
+        elif event_type == "response.done":
+            break
+
+    return b''.join(audio_chunks), ''.join(text_chunks)
+
+
+async def tts_openai_realtime(
+    text: str,
+    api_key: Optional[str] = None,
+    voice: str = "alloy",
+    on_chunk: Optional[Callable[[bytes], None]] = None
+) -> bytes:
+    """
+    Use OpenAI Realtime API for TTS.
+
+    Returns PCM16 audio at 24kHz.
+    """
+    ws = await openai_realtime_connect(api_key=api_key, voice=voice)
+    try:
+        await openai_realtime_send_text(ws, f"Please repeat exactly: {text}")
+        audio, _ = await openai_realtime_receive(ws, on_audio=on_chunk)
+        return audio
+    finally:
+        await ws.close()
+
+
+def get_openai_voices() -> list:
+    """Get available OpenAI Realtime voices."""
+    return [
+        {"id": "alloy", "name": "Alloy"},
+        {"id": "echo", "name": "Echo"},
+        {"id": "shimmer", "name": "Shimmer"},
+        {"id": "ash", "name": "Ash"},
+        {"id": "ballad", "name": "Ballad"},
+        {"id": "coral", "name": "Coral"},
+        {"id": "sage", "name": "Sage"},
+        {"id": "verse", "name": "Verse"},
+    ]
+
+
+# =============================================================================
+# Google Gemini Live API
+# =============================================================================
+
+async def gemini_live_connect(
+    api_key: Optional[str] = None,
+    model: str = "gemini-2.0-flash-exp",
+    voice: str = "Puck",
+    system_instruction: str = "You are a helpful assistant."
+):
+    """
+    Connect to Gemini Live API.
+
+    Returns:
+        WebSocket connection
+    """
+    import websockets
+
+    api_key = api_key or os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY')
+
+    url = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+    ws = await websockets.connect(url)
+
+    await ws.send(json.dumps({
+        "setup": {
+            "model": f"models/{model}",
+            "generation_config": {
+                "response_modalities": ["AUDIO"],
+                "speech_config": {
+                    "voice_config": {
+                        "prebuilt_voice_config": {"voice_name": voice}
+                    }
+                }
+            },
+            "system_instruction": {"parts": [{"text": system_instruction}]}
+        }
+    }))
+
+    response = await ws.recv()
+    data = json.loads(response)
+    if "setupComplete" not in data:
+        await ws.close()
+        raise Exception(f"Gemini Live setup failed: {data}")
+
+    return ws
+
+
+async def gemini_live_send_audio(ws, audio_data: bytes, mime_type: str = "audio/pcm"):
+    """Send audio to Gemini Live."""
+    await ws.send(json.dumps({
+        "realtime_input": {
+            "media_chunks": [{
+                "data": base64.b64encode(audio_data).decode(),
+                "mime_type": mime_type
+            }]
+        }
+    }))
+
+
+async def gemini_live_send_text(ws, text: str):
+    """Send text message to Gemini Live."""
+    await ws.send(json.dumps({
+        "client_content": {
+            "turns": [{"role": "user", "parts": [{"text": text}]}],
+            "turn_complete": True
+        }
+    }))
+
+
+async def gemini_live_receive(ws, on_audio=None, on_text=None):
+    """
+    Receive response from Gemini Live.
+
+    Returns:
+        Tuple of (full_audio_bytes, full_text)
+    """
+    audio_chunks = []
+    text_chunks = []
+
+    async for message in ws:
+        data = json.loads(message)
+
+        if "serverContent" in data:
+            content = data["serverContent"]
+
+            if "modelTurn" in content:
+                for part in content["modelTurn"].get("parts", []):
+                    if "inlineData" in part:
+                        audio = base64.b64decode(part["inlineData"].get("data", ""))
+                        audio_chunks.append(audio)
+                        if on_audio:
+                            on_audio(audio)
+                    elif "text" in part:
+                        text_chunks.append(part["text"])
+                        if on_text:
+                            on_text(part["text"])
+
+            if content.get("turnComplete"):
+                break
+
+    return b''.join(audio_chunks), ''.join(text_chunks)
+
+
+async def tts_gemini_live(
+    text: str,
+    api_key: Optional[str] = None,
+    voice: str = "Puck",
+    on_chunk: Optional[Callable[[bytes], None]] = None
+) -> bytes:
+    """
+    Use Gemini Live API for TTS.
+
+    Returns PCM audio.
+    """
+    ws = await gemini_live_connect(api_key=api_key, voice=voice)
+    try:
+        await gemini_live_send_text(ws, f"Please repeat exactly: {text}")
+        audio, _ = await gemini_live_receive(ws, on_audio=on_chunk)
+        return audio
+    finally:
+        await ws.close()
+
+
+def get_gemini_voices() -> list:
+    """Get available Gemini Live voices."""
+    return [
+        {"id": "Puck", "name": "Puck"},
+        {"id": "Charon", "name": "Charon"},
+        {"id": "Kore", "name": "Kore"},
+        {"id": "Fenrir", "name": "Fenrir"},
+        {"id": "Aoede", "name": "Aoede"},
+    ]
+
+
+# =============================================================================
+# gTTS (Google Text-to-Speech) - Fallback
+# =============================================================================
+
+def tts_gtts(text: str, lang: str = "en") -> bytes:
+    """
+    Generate speech using gTTS.
+
+    Returns MP3 audio bytes.
+    """
+    from gtts import gTTS
+
+    tts = gTTS(text=text, lang=lang)
+
+    mp3_buffer = io.BytesIO()
+    tts.write_to_fp(mp3_buffer)
+    mp3_buffer.seek(0)
+    return mp3_buffer.read()
+
+
+def get_gtts_voices() -> list:
+    """Get available gTTS languages."""
+    return [
+        {"id": "en", "name": "English"},
+        {"id": "es", "name": "Spanish"},
+        {"id": "fr", "name": "French"},
+        {"id": "de", "name": "German"},
+        {"id": "it", "name": "Italian"},
+        {"id": "pt", "name": "Portuguese"},
+        {"id": "ja", "name": "Japanese"},
+        {"id": "ko", "name": "Korean"},
+        {"id": "zh-CN", "name": "Chinese"},
+    ]
+
+
+# =============================================================================
+# Unified Interface
+# =============================================================================
+
+def text_to_speech(
+    text: str,
+    engine: str = "kokoro",
+    voice: Optional[str] = None,
+    **kwargs
+) -> bytes:
+    """
+    Unified TTS interface.
+
+    Args:
+        text: Text to synthesize
+        engine: TTS engine (kokoro, elevenlabs, openai, gemini, gtts)
+        voice: Voice ID (engine-specific)
+        **kwargs: Engine-specific options
+
+    Returns:
+        Audio bytes (format depends on engine)
+    """
+    engine = engine.lower()
+
+    if engine == "kokoro":
+        voice = voice or "af_heart"
+        voices = {v["id"]: v for v in get_kokoro_voices()}
+        lang_code = voices.get(voice, {}).get("lang", "a")
+        return tts_kokoro(text, voice=voice, lang_code=lang_code, **kwargs)
+
+    elif engine == "elevenlabs":
+        voice = voice or "JBFqnCBsd6RMkjVDRZzb"
+        return tts_elevenlabs(text, voice_id=voice, **kwargs)
+
+    elif engine == "openai":
+        voice = voice or "alloy"
+        return asyncio.run(tts_openai_realtime(text, voice=voice, **kwargs))
+
+    elif engine == "gemini":
+        voice = voice or "Puck"
+        return asyncio.run(tts_gemini_live(text, voice=voice, **kwargs))
+
+    elif engine == "gtts":
+        lang = voice if voice and len(voice) <= 5 else "en"
+        return tts_gtts(text, lang=lang)
+
+    else:
+        raise ValueError(f"Unknown TTS engine: {engine}")
+
+
+def get_available_voices(engine: str = "kokoro") -> list:
+    """Get available voices for an engine."""
+    engine = engine.lower()
+
+    if engine == "kokoro":
+        return get_kokoro_voices()
+    elif engine == "elevenlabs":
+        return get_elevenlabs_voices()
+    elif engine == "openai":
+        return get_openai_voices()
+    elif engine == "gemini":
+        return get_gemini_voices()
+    elif engine == "gtts":
+        return get_gtts_voices()
+    else:
+        return []
+
+
+def get_available_engines() -> dict:
+    """Get info about available TTS engines."""
+    engines = {
+        "kokoro": {
+            "name": "Kokoro",
+            "type": "local",
+            "available": False,
+            "description": "Local neural TTS (82M params)",
+            "install": "pip install kokoro soundfile"
+        },
+        "elevenlabs": {
+            "name": "ElevenLabs",
+            "type": "cloud",
+            "available": False,
+            "description": "High-quality cloud TTS",
+            "requires": "ELEVENLABS_API_KEY"
+        },
+        "openai": {
+            "name": "OpenAI Realtime",
+            "type": "cloud",
+            "available": False,
+            "description": "OpenAI real-time voice API",
+            "requires": "OPENAI_API_KEY"
+        },
+        "gemini": {
+            "name": "Gemini Live",
+            "type": "cloud",
+            "available": False,
+            "description": "Google Gemini real-time voice",
+            "requires": "GOOGLE_API_KEY or GEMINI_API_KEY"
+        },
+        "gtts": {
+            "name": "Google TTS",
+            "type": "cloud",
+            "available": False,
+            "description": "Free Google TTS"
+        }
+    }
+
+    try:
+        from kokoro import KPipeline
+        engines["kokoro"]["available"] = True
+    except ImportError:
+        pass
+
+    if os.environ.get('ELEVENLABS_API_KEY'):
+        engines["elevenlabs"]["available"] = True
+
+    if os.environ.get('OPENAI_API_KEY'):
+        engines["openai"]["available"] = True
+
+    if os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY'):
+        engines["gemini"]["available"] = True
+
+    try:
+        from gtts import gTTS
+        engines["gtts"]["available"] = True
+    except ImportError:
+        pass
+
+    return engines
+
+
+# =============================================================================
+# Audio Utilities
+# =============================================================================
+
+def pcm16_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1) -> bytes:
+    """Convert raw PCM16 audio to WAV format."""
+    import struct
+
+    wav_buffer = io.BytesIO()
+    wav_buffer.write(b'RIFF')
+    wav_buffer.write(struct.pack('<I', 36 + len(pcm_data)))
+    wav_buffer.write(b'WAVE')
+    wav_buffer.write(b'fmt ')
+    wav_buffer.write(struct.pack('<I', 16))
+    wav_buffer.write(struct.pack('<H', 1))
+    wav_buffer.write(struct.pack('<H', channels))
+    wav_buffer.write(struct.pack('<I', sample_rate))
+    wav_buffer.write(struct.pack('<I', sample_rate * channels * 2))
+    wav_buffer.write(struct.pack('<H', channels * 2))
+    wav_buffer.write(struct.pack('<H', 16))
+    wav_buffer.write(b'data')
+    wav_buffer.write(struct.pack('<I', len(pcm_data)))
+    wav_buffer.write(pcm_data)
+
+    wav_buffer.seek(0)
+    return wav_buffer.read()
+
+
+def wav_to_pcm16(wav_data: bytes) -> tuple:
+    """Extract PCM16 data from WAV. Returns (pcm_data, sample_rate)."""
+    import struct
+
+    if wav_data[:4] != b'RIFF' or wav_data[8:12] != b'WAVE':
+        raise ValueError("Invalid WAV data")
+
+    pos = 12
+    sample_rate = 24000
+    while pos < len(wav_data) - 8:
+        chunk_id = wav_data[pos:pos+4]
+        chunk_size = struct.unpack('<I', wav_data[pos+4:pos+8])[0]
+
+        if chunk_id == b'fmt ':
+            sample_rate = struct.unpack('<I', wav_data[pos+12:pos+16])[0]
+        elif chunk_id == b'data':
+            return wav_data[pos+8:pos+8+chunk_size], sample_rate
+
+        pos += 8 + chunk_size
+
+    raise ValueError("No data chunk found in WAV")
+
+
+def audio_to_base64(audio_data: bytes) -> str:
+    """Encode audio to base64 string."""
+    return base64.b64encode(audio_data).decode('utf-8')
+
+
+def base64_to_audio(b64_string: str) -> bytes:
+    """Decode base64 string to audio bytes."""
+    return base64.b64decode(b64_string)