npcpy 1.3.10-py3-none-any.whl → 1.3.11-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- npcpy/data/audio.py +360 -0
- npcpy/gen/audio_gen.py +693 -13
- npcpy/llm_funcs.py +1 -10
- npcpy/memory/command_history.py +26 -6
- npcpy/serve.py +712 -63
- {npcpy-1.3.10.dist-info → npcpy-1.3.11.dist-info}/METADATA +1 -1
- {npcpy-1.3.10.dist-info → npcpy-1.3.11.dist-info}/RECORD +10 -10
- {npcpy-1.3.10.dist-info → npcpy-1.3.11.dist-info}/WHEEL +0 -0
- {npcpy-1.3.10.dist-info → npcpy-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.3.10.dist-info → npcpy-1.3.11.dist-info}/top_level.txt +0 -0
npcpy/data/audio.py
CHANGED
@@ -210,6 +210,366 @@ def transcribe_audio_file(file_path: str, language=None) -> str:
     return ""
 
 
+# =============================================================================
+# Speech-to-Text: Multi-Engine Support
+# =============================================================================
+
+def stt_whisper(
+    audio_data: bytes,
+    model_size: str = "base",
+    language: str = None,
+    device: str = "auto"
+) -> dict:
+    """
+    Transcribe audio using local Whisper (faster-whisper).
+
+    Args:
+        audio_data: Audio bytes (WAV, MP3, etc.)
+        model_size: Model size (tiny, base, small, medium, large-v3)
+        language: Language code or None for auto-detect
+        device: 'cpu', 'cuda', or 'auto'
+
+    Returns:
+        Dict with 'text', 'language', 'segments'
+    """
+    from faster_whisper import WhisperModel
+
+    if device == "auto":
+        try:
+            import torch
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        except ImportError:
+            device = "cpu"
+
+    compute_type = "float16" if device == "cuda" else "int8"
+    model = WhisperModel(model_size, device=device, compute_type=compute_type)
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_data)
+        temp_path = f.name
+
+    try:
+        segments, info = model.transcribe(
+            temp_path,
+            language=language,
+            beam_size=5,
+            vad_filter=True
+        )
+
+        segment_list = []
+        text_parts = []
+        for segment in segments:
+            segment_list.append({
+                "start": segment.start,
+                "end": segment.end,
+                "text": segment.text
+            })
+            text_parts.append(segment.text)
+
+        return {
+            "text": " ".join(text_parts).strip(),
+            "language": info.language,
+            "language_probability": info.language_probability,
+            "segments": segment_list
+        }
+    finally:
+        os.unlink(temp_path)
+
+
+def stt_openai(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "whisper-1",
+    language: str = None,
+    response_format: str = "verbose_json",
+    filename: str = "audio.wav"
+) -> dict:
+    """
+    Transcribe audio using OpenAI Whisper API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: OpenAI API key
+        model: Model name (whisper-1)
+        language: Optional language hint
+        response_format: json, text, srt, verbose_json, vtt
+        filename: Filename hint for format detection
+
+    Returns:
+        Dict with 'text', 'language', 'segments' (if verbose_json)
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('OPENAI_API_KEY')
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY not set")
+
+    url = "https://api.openai.com/v1/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    files = {"file": (filename, audio_data)}
+    data = {"model": model, "response_format": response_format}
+    if language:
+        data["language"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    if response_format == "verbose_json":
+        result = response.json()
+        return {
+            "text": result.get("text", "").strip(),
+            "language": result.get("language", "en"),
+            "duration": result.get("duration"),
+            "segments": result.get("segments", [])
+        }
+    elif response_format == "json":
+        return {"text": response.json().get("text", "").strip()}
+    else:
+        return {"text": response.text.strip()}
+
+
+def stt_gemini(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "gemini-1.5-flash",
+    language: str = None,
+    mime_type: str = "audio/wav"
+) -> dict:
+    """
+    Transcribe audio using Gemini API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: Google/Gemini API key
+        model: Gemini model
+        language: Language hint
+        mime_type: Audio MIME type
+
+    Returns:
+        Dict with 'text'
+    """
+    import google.generativeai as genai
+
+    api_key = api_key or os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY')
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY or GEMINI_API_KEY not set")
+
+    genai.configure(api_key=api_key)
+    model_obj = genai.GenerativeModel(model)
+
+    prompt = "Transcribe this audio exactly. Output only the transcription, nothing else."
+    if language:
+        prompt = f"Transcribe this audio in {language}. Output only the transcription, nothing else."
+
+    response = model_obj.generate_content([
+        prompt,
+        {"mime_type": mime_type, "data": audio_data}
+    ])
+
+    return {"text": response.text.strip()}
+
+
+def stt_elevenlabs(
+    audio_data: bytes,
+    api_key: str = None,
+    model_id: str = "scribe_v1",
+    language: str = None
+) -> dict:
+    """
+    Transcribe audio using ElevenLabs Scribe API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: ElevenLabs API key
+        model_id: Model (scribe_v1)
+        language: Language code (ISO 639-1)
+
+    Returns:
+        Dict with 'text', 'language', 'words'
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('ELEVENLABS_API_KEY')
+    if not api_key:
+        raise ValueError("ELEVENLABS_API_KEY not set")
+
+    url = "https://api.elevenlabs.io/v1/speech-to-text"
+    headers = {"xi-api-key": api_key}
+
+    files = {"file": ("audio.wav", audio_data, "audio/wav")}
+    data = {"model_id": model_id}
+    if language:
+        data["language_code"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    result = response.json()
+    return {
+        "text": result.get("text", "").strip(),
+        "language": result.get("language_code"),
+        "words": result.get("words", [])
+    }
+
+
+def stt_groq(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "whisper-large-v3",
+    language: str = None
+) -> dict:
+    """
+    Transcribe audio using Groq's Whisper API (very fast).
+
+    Args:
+        audio_data: Audio bytes
+        api_key: Groq API key
+        model: whisper-large-v3 or whisper-large-v3-turbo
+        language: Language code
+
+    Returns:
+        Dict with 'text'
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('GROQ_API_KEY')
+    if not api_key:
+        raise ValueError("GROQ_API_KEY not set")
+
+    url = "https://api.groq.com/openai/v1/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    files = {"file": ("audio.wav", audio_data, "audio/wav")}
+    data = {"model": model}
+    if language:
+        data["language"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    return {"text": response.json().get("text", "").strip()}
+
+
+def speech_to_text(
+    audio_data: bytes,
+    engine: str = "whisper",
+    language: str = None,
+    **kwargs
+) -> dict:
+    """
+    Unified STT interface.
+
+    Args:
+        audio_data: Audio bytes (WAV, MP3, etc.)
+        engine: STT engine (whisper, openai, gemini, elevenlabs, groq)
+        language: Language hint
+        **kwargs: Engine-specific options
+
+    Returns:
+        Dict with at least 'text' key
+    """
+    engine = engine.lower()
+
+    if engine == "whisper" or engine == "faster-whisper":
+        try:
+            return stt_whisper(audio_data, language=language, **kwargs)
+        except ImportError:
+            # Fallback to openai whisper
+            import whisper
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                f.write(audio_data)
+                temp_path = f.name
+            try:
+                model = whisper.load_model(kwargs.get("model_size", "base"))
+                result = model.transcribe(temp_path, language=language)
+                return {"text": result["text"].strip(), "language": result.get("language", "en")}
+            finally:
+                os.unlink(temp_path)
+
+    elif engine == "openai":
+        return stt_openai(audio_data, language=language, **kwargs)
+
+    elif engine == "gemini":
+        return stt_gemini(audio_data, language=language, **kwargs)
+
+    elif engine == "elevenlabs":
+        return stt_elevenlabs(audio_data, language=language, **kwargs)
+
+    elif engine == "groq":
+        return stt_groq(audio_data, language=language, **kwargs)
+
+    else:
+        raise ValueError(f"Unknown STT engine: {engine}")
+
+
+def get_available_stt_engines() -> dict:
+    """Get info about available STT engines."""
+    engines = {
+        "whisper": {
+            "name": "Whisper (Local)",
+            "type": "local",
+            "available": False,
+            "description": "OpenAI Whisper running locally",
+            "install": "pip install faster-whisper"
+        },
+        "openai": {
+            "name": "OpenAI Whisper API",
+            "type": "cloud",
+            "available": False,
+            "description": "OpenAI's cloud Whisper API",
+            "requires": "OPENAI_API_KEY"
+        },
+        "gemini": {
+            "name": "Gemini",
+            "type": "cloud",
+            "available": False,
+            "description": "Google Gemini transcription",
+            "requires": "GOOGLE_API_KEY or GEMINI_API_KEY"
+        },
+        "elevenlabs": {
+            "name": "ElevenLabs Scribe",
+            "type": "cloud",
+            "available": False,
+            "description": "ElevenLabs speech-to-text",
+            "requires": "ELEVENLABS_API_KEY"
+        },
+        "groq": {
+            "name": "Groq Whisper",
+            "type": "cloud",
+            "available": False,
+            "description": "Ultra-fast Whisper via Groq",
+            "requires": "GROQ_API_KEY"
+        }
+    }
+
+    # Check local whisper
+    try:
+        from faster_whisper import WhisperModel
+        engines["whisper"]["available"] = True
+    except ImportError:
+        try:
+            import whisper
+            engines["whisper"]["available"] = True
+        except ImportError:
+            pass
+
+    # Check API keys
+    if os.environ.get('OPENAI_API_KEY'):
+        engines["openai"]["available"] = True
+
+    if os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY'):
+        engines["gemini"]["available"] = True
+
+    if os.environ.get('ELEVENLABS_API_KEY'):
+        engines["elevenlabs"]["available"] = True
+
+    if os.environ.get('GROQ_API_KEY'):
+        engines["groq"]["available"] = True
+
+    return engines
+
+
 
 def load_history():
     global history