npcpy 1.3.10-py3-none-any.whl → 1.3.12-py3-none-any.whl

This diff shows the content changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
npcpy/gen/audio_gen.py CHANGED
@@ -1,24 +1,704 @@
+ """
+ Audio Generation Module for NPC
+ Supports multiple TTS engines including real-time voice APIs.
+
+ TTS Engines:
+ - Kokoro: Local neural TTS (default)
+ - ElevenLabs: Cloud TTS with streaming
+ - OpenAI: Realtime voice API
+ - Gemini: Live API for real-time voice
+ - gTTS: Google TTS fallback
+
+ Usage:
+     from npcpy.gen.audio_gen import text_to_speech
+
+     audio = text_to_speech("Hello world", engine="kokoro", voice="af_heart")
+
+ For STT, see npcpy.data.audio
+ """
+
  import os
- def tts_elevenlabs(text,
-                    api_key=None,
-                    voice_id='JBFqnCBsd6RMkjVDRZzb',
-                    model_id='eleven_multilingual_v2',
-                    output_format= 'mp3_44100_128'):
+ import io
+ import base64
+ import json
+ import asyncio
+ import tempfile
+ from typing import Optional, Callable, Any
+
+
+ # =============================================================================
+ # Kokoro TTS (Local Neural)
+ # =============================================================================
+
+ def tts_kokoro(
+     text: str,
+     voice: str = "af_heart",
+     lang_code: str = "a",
+     speed: float = 1.0
+ ) -> bytes:
+     """
+     Generate speech using Kokoro local neural TTS.
+
+     Args:
+         text: Text to synthesize
+         voice: Voice ID (af_heart, am_adam, bf_emma, etc.)
+         lang_code: 'a' for American, 'b' for British
+         speed: Speech speed multiplier
+
+     Returns:
+         WAV audio bytes
+     """
+     from kokoro import KPipeline
+     import soundfile as sf
+     import numpy as np
+
+     pipeline = KPipeline(lang_code=lang_code)
+
+     audio_chunks = []
+     for _, _, audio in pipeline(text, voice=voice, speed=speed):
+         audio_chunks.append(audio)
+
+     if not audio_chunks:
+         raise ValueError("No audio generated")
+
+     full_audio = np.concatenate(audio_chunks)
+
+     wav_buffer = io.BytesIO()
+     sf.write(wav_buffer, full_audio, 24000, format='WAV')
+     wav_buffer.seek(0)
+     return wav_buffer.read()
+
+
+ def get_kokoro_voices() -> list:
+     """Get available Kokoro voices."""
+     return [
+         {"id": "af_heart", "name": "Heart", "gender": "female", "lang": "a"},
+         {"id": "af_bella", "name": "Bella", "gender": "female", "lang": "a"},
+         {"id": "af_sarah", "name": "Sarah", "gender": "female", "lang": "a"},
+         {"id": "af_nicole", "name": "Nicole", "gender": "female", "lang": "a"},
+         {"id": "af_sky", "name": "Sky", "gender": "female", "lang": "a"},
+         {"id": "am_adam", "name": "Adam", "gender": "male", "lang": "a"},
+         {"id": "am_michael", "name": "Michael", "gender": "male", "lang": "a"},
+         {"id": "bf_emma", "name": "Emma", "gender": "female", "lang": "b"},
+         {"id": "bf_isabella", "name": "Isabella", "gender": "female", "lang": "b"},
+         {"id": "bm_george", "name": "George", "gender": "male", "lang": "b"},
+         {"id": "bm_lewis", "name": "Lewis", "gender": "male", "lang": "b"},
+     ]
+
+
+ # =============================================================================
+ # ElevenLabs TTS
+ # =============================================================================
+
+ def tts_elevenlabs(
+     text: str,
+     api_key: Optional[str] = None,
+     voice_id: str = 'JBFqnCBsd6RMkjVDRZzb',
+     model_id: str = 'eleven_multilingual_v2',
+     output_format: str = 'mp3_44100_128'
+ ) -> bytes:
+     """
+     Generate speech using ElevenLabs API.
+
+     Returns:
+         MP3 audio bytes
+     """
      if api_key is None:
          api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+     if not api_key:
+         raise ValueError("ELEVENLABS_API_KEY not set")
+
      from elevenlabs.client import ElevenLabs
-     from elevenlabs import play

-     client = ElevenLabs(
-         api_key=api_key,
-     )
+     client = ElevenLabs(api_key=api_key)

-     audio = client.text_to_speech.convert(
+     audio_generator = client.text_to_speech.convert(
          text=text,
          voice_id=voice_id,
          model_id=model_id,
-         output_format= output_format
+         output_format=output_format
      )

-     play(audio)
-     return audio
+     return b''.join(chunk for chunk in audio_generator)
+
+
+ async def tts_elevenlabs_stream(
+     text: str,
+     api_key: Optional[str] = None,
+     voice_id: str = 'JBFqnCBsd6RMkjVDRZzb',
+     model_id: str = 'eleven_turbo_v2_5',
+     on_chunk: Optional[Callable[[bytes], None]] = None
+ ) -> bytes:
+     """
+     Stream TTS via ElevenLabs WebSocket for lowest latency.
+
+     Args:
+         text: Text to synthesize
+         api_key: ElevenLabs API key
+         voice_id: Voice to use
+         model_id: Model (eleven_turbo_v2_5 for fastest)
+         on_chunk: Callback for each audio chunk
+
+     Returns:
+         Complete audio bytes
+     """
+     import websockets
+
+     if api_key is None:
+         api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+     uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?model_id={model_id}"
+
+     all_audio = []
+
+     async with websockets.connect(uri) as ws:
+         await ws.send(json.dumps({
+             "text": " ",
+             "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
+             "xi_api_key": api_key
+         }))
+
+         await ws.send(json.dumps({"text": text}))
+         await ws.send(json.dumps({"text": ""}))
+
+         async for message in ws:
+             data = json.loads(message)
+             if "audio" in data:
+                 chunk = base64.b64decode(data["audio"])
+                 all_audio.append(chunk)
+                 if on_chunk:
+                     on_chunk(chunk)
+             if data.get("isFinal"):
+                 break
+
+     return b''.join(all_audio)
+
+
+ def get_elevenlabs_voices(api_key: Optional[str] = None) -> list:
+     """Get available ElevenLabs voices."""
+     if api_key is None:
+         api_key = os.environ.get('ELEVENLABS_API_KEY')
+
+     if not api_key:
+         return []
+
+     try:
+         from elevenlabs.client import ElevenLabs
+         client = ElevenLabs(api_key=api_key)
+         voices = client.voices.get_all()
+         return [{"id": v.voice_id, "name": v.name} for v in voices.voices]
+     except Exception:
+         return []
+
+
+ # =============================================================================
+ # OpenAI Realtime Voice API
+ # =============================================================================
+
+ async def openai_realtime_connect(
+     api_key: Optional[str] = None,
+     model: str = "gpt-4o-realtime-preview-2024-12-17",
+     voice: str = "alloy",
+     instructions: str = "You are a helpful assistant."
+ ):
+     """
+     Connect to OpenAI Realtime API.
+
+     Returns:
+         WebSocket connection
+     """
+     import websockets
+
+     api_key = api_key or os.environ.get('OPENAI_API_KEY')
+
+     url = f"wss://api.openai.com/v1/realtime?model={model}"
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "OpenAI-Beta": "realtime=v1"
+     }
+
+     ws = await websockets.connect(url, extra_headers=headers)
+
+     await ws.send(json.dumps({
+         "type": "session.update",
+         "session": {
+             "modalities": ["text", "audio"],
+             "instructions": instructions,
+             "voice": voice,
+             "input_audio_format": "pcm16",
+             "output_audio_format": "pcm16",
+             "input_audio_transcription": {"model": "whisper-1"},
+             "turn_detection": {
+                 "type": "server_vad",
+                 "threshold": 0.5,
+                 "prefix_padding_ms": 300,
+                 "silence_duration_ms": 500
+             }
+         }
+     }))
+
+     while True:
+         msg = await ws.recv()
+         event = json.loads(msg)
+         if event.get("type") == "session.created":
+             break
+         elif event.get("type") == "error":
+             await ws.close()
+             raise Exception(f"OpenAI Realtime error: {event}")
+
+     return ws
+
+
+ async def openai_realtime_send_audio(ws, audio_data: bytes):
+     """Send audio to OpenAI Realtime (PCM16, 24kHz, mono)."""
+     await ws.send(json.dumps({
+         "type": "input_audio_buffer.append",
+         "audio": base64.b64encode(audio_data).decode()
+     }))
+
+
+ async def openai_realtime_send_text(ws, text: str):
+     """Send text message to OpenAI Realtime."""
+     await ws.send(json.dumps({
+         "type": "conversation.item.create",
+         "item": {
+             "type": "message",
+             "role": "user",
+             "content": [{"type": "input_text", "text": text}]
+         }
+     }))
+     await ws.send(json.dumps({"type": "response.create"}))
+
+
+ async def openai_realtime_receive(ws, on_audio=None, on_text=None):
+     """
+     Receive response from OpenAI Realtime.
+
+     Args:
+         ws: WebSocket connection
+         on_audio: Callback for audio chunks (bytes)
+         on_text: Callback for text chunks (str)
+
+     Returns:
+         Tuple of (full_audio_bytes, full_text)
+     """
+     audio_chunks = []
+     text_chunks = []
+
+     async for message in ws:
+         event = json.loads(message)
+         event_type = event.get("type", "")
+
+         if event_type == "response.audio.delta":
+             audio = base64.b64decode(event.get("delta", ""))
+             audio_chunks.append(audio)
+             if on_audio:
+                 on_audio(audio)
+
+         elif event_type == "response.text.delta":
+             text = event.get("delta", "")
+             text_chunks.append(text)
+             if on_text:
+                 on_text(text)
+
+         elif event_type == "response.done":
+             break
+
+     return b''.join(audio_chunks), ''.join(text_chunks)
+
+
+ async def tts_openai_realtime(
+     text: str,
+     api_key: Optional[str] = None,
+     voice: str = "alloy",
+     on_chunk: Optional[Callable[[bytes], None]] = None
+ ) -> bytes:
+     """
+     Use OpenAI Realtime API for TTS.
+
+     Returns PCM16 audio at 24kHz.
+     """
+     ws = await openai_realtime_connect(api_key=api_key, voice=voice)
+     try:
+         await openai_realtime_send_text(ws, f"Please repeat exactly: {text}")
+         audio, _ = await openai_realtime_receive(ws, on_audio=on_chunk)
+         return audio
+     finally:
+         await ws.close()
+
+
+ def get_openai_voices() -> list:
+     """Get available OpenAI Realtime voices."""
+     return [
+         {"id": "alloy", "name": "Alloy"},
+         {"id": "echo", "name": "Echo"},
+         {"id": "shimmer", "name": "Shimmer"},
+         {"id": "ash", "name": "Ash"},
+         {"id": "ballad", "name": "Ballad"},
+         {"id": "coral", "name": "Coral"},
+         {"id": "sage", "name": "Sage"},
+         {"id": "verse", "name": "Verse"},
+     ]
+
+
+ # =============================================================================
+ # Google Gemini Live API
+ # =============================================================================
+
+ async def gemini_live_connect(
+     api_key: Optional[str] = None,
+     model: str = "gemini-2.0-flash-exp",
+     voice: str = "Puck",
+     system_instruction: str = "You are a helpful assistant."
+ ):
+     """
+     Connect to Gemini Live API.
+
+     Returns:
+         WebSocket connection
+     """
+     import websockets
+
+     api_key = api_key or os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY')
+
+     url = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={api_key}"
+
+     ws = await websockets.connect(url)
+
+     await ws.send(json.dumps({
+         "setup": {
+             "model": f"models/{model}",
+             "generation_config": {
+                 "response_modalities": ["AUDIO"],
+                 "speech_config": {
+                     "voice_config": {
+                         "prebuilt_voice_config": {"voice_name": voice}
+                     }
+                 }
+             },
+             "system_instruction": {"parts": [{"text": system_instruction}]}
+         }
+     }))
+
+     response = await ws.recv()
+     data = json.loads(response)
+     if "setupComplete" not in data:
+         await ws.close()
+         raise Exception(f"Gemini Live setup failed: {data}")
+
+     return ws
+
+
+ async def gemini_live_send_audio(ws, audio_data: bytes, mime_type: str = "audio/pcm"):
+     """Send audio to Gemini Live."""
+     await ws.send(json.dumps({
+         "realtime_input": {
+             "media_chunks": [{
+                 "data": base64.b64encode(audio_data).decode(),
+                 "mime_type": mime_type
+             }]
+         }
+     }))
+
+
+ async def gemini_live_send_text(ws, text: str):
+     """Send text message to Gemini Live."""
+     await ws.send(json.dumps({
+         "client_content": {
+             "turns": [{"role": "user", "parts": [{"text": text}]}],
+             "turn_complete": True
+         }
+     }))
+
+
+ async def gemini_live_receive(ws, on_audio=None, on_text=None):
+     """
+     Receive response from Gemini Live.
+
+     Returns:
+         Tuple of (full_audio_bytes, full_text)
+     """
+     audio_chunks = []
+     text_chunks = []
+
+     async for message in ws:
+         data = json.loads(message)
+
+         if "serverContent" in data:
+             content = data["serverContent"]
+
+             if "modelTurn" in content:
+                 for part in content["modelTurn"].get("parts", []):
+                     if "inlineData" in part:
+                         audio = base64.b64decode(part["inlineData"].get("data", ""))
+                         audio_chunks.append(audio)
+                         if on_audio:
+                             on_audio(audio)
+                     elif "text" in part:
+                         text_chunks.append(part["text"])
+                         if on_text:
+                             on_text(part["text"])
+
+             if content.get("turnComplete"):
+                 break
+
+     return b''.join(audio_chunks), ''.join(text_chunks)
+
+
+ async def tts_gemini_live(
+     text: str,
+     api_key: Optional[str] = None,
+     voice: str = "Puck",
+     on_chunk: Optional[Callable[[bytes], None]] = None
+ ) -> bytes:
+     """
+     Use Gemini Live API for TTS.
+
+     Returns PCM audio.
+     """
+     ws = await gemini_live_connect(api_key=api_key, voice=voice)
+     try:
+         await gemini_live_send_text(ws, f"Please repeat exactly: {text}")
+         audio, _ = await gemini_live_receive(ws, on_audio=on_chunk)
+         return audio
+     finally:
+         await ws.close()
+
+
+ def get_gemini_voices() -> list:
+     """Get available Gemini Live voices."""
+     return [
+         {"id": "Puck", "name": "Puck"},
+         {"id": "Charon", "name": "Charon"},
+         {"id": "Kore", "name": "Kore"},
+         {"id": "Fenrir", "name": "Fenrir"},
+         {"id": "Aoede", "name": "Aoede"},
+     ]
+
+
+ # =============================================================================
+ # gTTS (Google Text-to-Speech) - Fallback
+ # =============================================================================
+
+ def tts_gtts(text: str, lang: str = "en") -> bytes:
+     """
+     Generate speech using gTTS.
+
+     Returns MP3 audio bytes.
+     """
+     from gtts import gTTS
+
+     tts = gTTS(text=text, lang=lang)
+
+     mp3_buffer = io.BytesIO()
+     tts.write_to_fp(mp3_buffer)
+     mp3_buffer.seek(0)
+     return mp3_buffer.read()
+
+
+ def get_gtts_voices() -> list:
+     """Get available gTTS languages."""
+     return [
+         {"id": "en", "name": "English"},
+         {"id": "es", "name": "Spanish"},
+         {"id": "fr", "name": "French"},
+         {"id": "de", "name": "German"},
+         {"id": "it", "name": "Italian"},
+         {"id": "pt", "name": "Portuguese"},
+         {"id": "ja", "name": "Japanese"},
+         {"id": "ko", "name": "Korean"},
+         {"id": "zh-CN", "name": "Chinese"},
+     ]
+
+
+ # =============================================================================
+ # Unified Interface
+ # =============================================================================
+
+ def text_to_speech(
+     text: str,
+     engine: str = "kokoro",
+     voice: Optional[str] = None,
+     **kwargs
+ ) -> bytes:
+     """
+     Unified TTS interface.
+
+     Args:
+         text: Text to synthesize
+         engine: TTS engine (kokoro, elevenlabs, openai, gemini, gtts)
+         voice: Voice ID (engine-specific)
+         **kwargs: Engine-specific options
+
+     Returns:
+         Audio bytes (format depends on engine)
+     """
+     engine = engine.lower()
+
+     if engine == "kokoro":
+         voice = voice or "af_heart"
+         voices = {v["id"]: v for v in get_kokoro_voices()}
+         lang_code = voices.get(voice, {}).get("lang", "a")
+         return tts_kokoro(text, voice=voice, lang_code=lang_code, **kwargs)
+
+     elif engine == "elevenlabs":
+         voice = voice or "JBFqnCBsd6RMkjVDRZzb"
+         return tts_elevenlabs(text, voice_id=voice, **kwargs)
+
+     elif engine == "openai":
+         voice = voice or "alloy"
+         return asyncio.run(tts_openai_realtime(text, voice=voice, **kwargs))
+
+     elif engine == "gemini":
+         voice = voice or "Puck"
+         return asyncio.run(tts_gemini_live(text, voice=voice, **kwargs))
+
+     elif engine == "gtts":
+         lang = voice if voice and len(voice) <= 5 else "en"
+         return tts_gtts(text, lang=lang)
+
+     else:
+         raise ValueError(f"Unknown TTS engine: {engine}")
+
+
+ def get_available_voices(engine: str = "kokoro") -> list:
+     """Get available voices for an engine."""
+     engine = engine.lower()
+
+     if engine == "kokoro":
+         return get_kokoro_voices()
+     elif engine == "elevenlabs":
+         return get_elevenlabs_voices()
+     elif engine == "openai":
+         return get_openai_voices()
+     elif engine == "gemini":
+         return get_gemini_voices()
+     elif engine == "gtts":
+         return get_gtts_voices()
+     else:
+         return []
+
+
+ def get_available_engines() -> dict:
+     """Get info about available TTS engines."""
+     engines = {
+         "kokoro": {
+             "name": "Kokoro",
+             "type": "local",
+             "available": False,
+             "description": "Local neural TTS (82M params)",
+             "install": "pip install kokoro soundfile"
+         },
+         "elevenlabs": {
+             "name": "ElevenLabs",
+             "type": "cloud",
+             "available": False,
+             "description": "High-quality cloud TTS",
+             "requires": "ELEVENLABS_API_KEY"
+         },
+         "openai": {
+             "name": "OpenAI Realtime",
+             "type": "cloud",
+             "available": False,
+             "description": "OpenAI real-time voice API",
+             "requires": "OPENAI_API_KEY"
+         },
+         "gemini": {
+             "name": "Gemini Live",
+             "type": "cloud",
+             "available": False,
+             "description": "Google Gemini real-time voice",
+             "requires": "GOOGLE_API_KEY or GEMINI_API_KEY"
+         },
+         "gtts": {
+             "name": "Google TTS",
+             "type": "cloud",
+             "available": False,
+             "description": "Free Google TTS"
+         }
+     }
+
+     try:
+         from kokoro import KPipeline
+         engines["kokoro"]["available"] = True
+     except ImportError:
+         pass
+
+     if os.environ.get('ELEVENLABS_API_KEY'):
+         engines["elevenlabs"]["available"] = True
+
+     if os.environ.get('OPENAI_API_KEY'):
+         engines["openai"]["available"] = True
+
+     if os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY'):
+         engines["gemini"]["available"] = True
+
+     try:
+         from gtts import gTTS
+         engines["gtts"]["available"] = True
+     except ImportError:
+         pass
+
+     return engines
+
+
+ # =============================================================================
+ # Audio Utilities
+ # =============================================================================
+
+ def pcm16_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1) -> bytes:
+     """Convert raw PCM16 audio to WAV format."""
+     import struct
+
+     wav_buffer = io.BytesIO()
+     wav_buffer.write(b'RIFF')
+     wav_buffer.write(struct.pack('<I', 36 + len(pcm_data)))
+     wav_buffer.write(b'WAVE')
+     wav_buffer.write(b'fmt ')
+     wav_buffer.write(struct.pack('<I', 16))
+     wav_buffer.write(struct.pack('<H', 1))
+     wav_buffer.write(struct.pack('<H', channels))
+     wav_buffer.write(struct.pack('<I', sample_rate))
+     wav_buffer.write(struct.pack('<I', sample_rate * channels * 2))
+     wav_buffer.write(struct.pack('<H', channels * 2))
+     wav_buffer.write(struct.pack('<H', 16))
+     wav_buffer.write(b'data')
+     wav_buffer.write(struct.pack('<I', len(pcm_data)))
+     wav_buffer.write(pcm_data)
+
+     wav_buffer.seek(0)
+     return wav_buffer.read()
+
+
+ def wav_to_pcm16(wav_data: bytes) -> tuple:
+     """Extract PCM16 data from WAV. Returns (pcm_data, sample_rate)."""
+     import struct
+
+     if wav_data[:4] != b'RIFF' or wav_data[8:12] != b'WAVE':
+         raise ValueError("Invalid WAV data")
+
+     pos = 12
+     sample_rate = 24000
+     while pos < len(wav_data) - 8:
+         chunk_id = wav_data[pos:pos+4]
+         chunk_size = struct.unpack('<I', wav_data[pos+4:pos+8])[0]
+
+         if chunk_id == b'fmt ':
+             sample_rate = struct.unpack('<I', wav_data[pos+12:pos+16])[0]
+         elif chunk_id == b'data':
+             return wav_data[pos+8:pos+8+chunk_size], sample_rate
+
+         pos += 8 + chunk_size
+
+     raise ValueError("No data chunk found in WAV")
+
+
+ def audio_to_base64(audio_data: bytes) -> str:
+     """Encode audio to base64 string."""
+     return base64.b64encode(audio_data).decode('utf-8')
+
+
+ def base64_to_audio(b64_string: str) -> bytes:
+     """Decode base64 string to audio bytes."""
+     return base64.b64decode(b64_string)
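
Below is a brief, hypothetical usage sketch (not part of the package diff) showing how the unified interface added in 1.3.12 might be exercised. It only uses functions visible in the diff above; the engine-selection logic, output file name, and the WAV-wrapping step for the realtime engines are illustrative assumptions.

    # Hypothetical usage sketch -- illustrative only, not shipped in the package.
    from npcpy.gen.audio_gen import (
        get_available_engines,
        pcm16_to_wav,
        text_to_speech,
    )

    # Pick the first engine that reports itself as available; fall back to gtts.
    engines = get_available_engines()
    engine = next(
        (name for name, info in engines.items() if info["available"]),
        "gtts",
    )

    audio = text_to_speech("Hello from npcpy", engine=engine)

    # Output format depends on the engine: kokoro returns WAV, elevenlabs/gtts
    # return MP3, and the openai/gemini realtime paths return raw PCM16 at 24 kHz,
    # which can be wrapped into a WAV container with the helper above.
    if engine in ("openai", "gemini"):
        audio = pcm16_to_wav(audio, sample_rate=24000)

    extension = "mp3" if engine in ("elevenlabs", "gtts") else "wav"
    with open(f"tts_output.{extension}", "wb") as f:
        f.write(audio)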