npcpy 1.3.9__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
npcpy/data/audio.py CHANGED
@@ -210,6 +210,366 @@ def transcribe_audio_file(file_path: str, language=None) -> str:
     return ""
 
 
+# =============================================================================
+# Speech-to-Text: Multi-Engine Support
+# =============================================================================
+
+def stt_whisper(
+    audio_data: bytes,
+    model_size: str = "base",
+    language: str = None,
+    device: str = "auto"
+) -> dict:
+    """
+    Transcribe audio using local Whisper (faster-whisper).
+
+    Args:
+        audio_data: Audio bytes (WAV, MP3, etc.)
+        model_size: Model size (tiny, base, small, medium, large-v3)
+        language: Language code or None for auto-detect
+        device: 'cpu', 'cuda', or 'auto'
+
+    Returns:
+        Dict with 'text', 'language', 'segments'
+    """
+    from faster_whisper import WhisperModel
+
+    if device == "auto":
+        try:
+            import torch
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        except ImportError:
+            device = "cpu"
+
+    compute_type = "float16" if device == "cuda" else "int8"
+    model = WhisperModel(model_size, device=device, compute_type=compute_type)
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_data)
+        temp_path = f.name
+
+    try:
+        segments, info = model.transcribe(
+            temp_path,
+            language=language,
+            beam_size=5,
+            vad_filter=True
+        )
+
+        segment_list = []
+        text_parts = []
+        for segment in segments:
+            segment_list.append({
+                "start": segment.start,
+                "end": segment.end,
+                "text": segment.text
+            })
+            text_parts.append(segment.text)
+
+        return {
+            "text": " ".join(text_parts).strip(),
+            "language": info.language,
+            "language_probability": info.language_probability,
+            "segments": segment_list
+        }
+    finally:
+        os.unlink(temp_path)
+
+
+def stt_openai(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "whisper-1",
+    language: str = None,
+    response_format: str = "verbose_json",
+    filename: str = "audio.wav"
+) -> dict:
+    """
+    Transcribe audio using OpenAI Whisper API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: OpenAI API key
+        model: Model name (whisper-1)
+        language: Optional language hint
+        response_format: json, text, srt, verbose_json, vtt
+        filename: Filename hint for format detection
+
+    Returns:
+        Dict with 'text', 'language', 'segments' (if verbose_json)
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('OPENAI_API_KEY')
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY not set")
+
+    url = "https://api.openai.com/v1/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    files = {"file": (filename, audio_data)}
+    data = {"model": model, "response_format": response_format}
+    if language:
+        data["language"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    if response_format == "verbose_json":
+        result = response.json()
+        return {
+            "text": result.get("text", "").strip(),
+            "language": result.get("language", "en"),
+            "duration": result.get("duration"),
+            "segments": result.get("segments", [])
+        }
+    elif response_format == "json":
+        return {"text": response.json().get("text", "").strip()}
+    else:
+        return {"text": response.text.strip()}
+
+
+def stt_gemini(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "gemini-1.5-flash",
+    language: str = None,
+    mime_type: str = "audio/wav"
+) -> dict:
+    """
+    Transcribe audio using Gemini API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: Google/Gemini API key
+        model: Gemini model
+        language: Language hint
+        mime_type: Audio MIME type
+
+    Returns:
+        Dict with 'text'
+    """
+    import google.generativeai as genai
+
+    api_key = api_key or os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY')
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY or GEMINI_API_KEY not set")
+
+    genai.configure(api_key=api_key)
+    model_obj = genai.GenerativeModel(model)
+
+    prompt = "Transcribe this audio exactly. Output only the transcription, nothing else."
+    if language:
+        prompt = f"Transcribe this audio in {language}. Output only the transcription, nothing else."
+
+    response = model_obj.generate_content([
+        prompt,
+        {"mime_type": mime_type, "data": audio_data}
+    ])
+
+    return {"text": response.text.strip()}
+
+
+def stt_elevenlabs(
+    audio_data: bytes,
+    api_key: str = None,
+    model_id: str = "scribe_v1",
+    language: str = None
+) -> dict:
+    """
+    Transcribe audio using ElevenLabs Scribe API.
+
+    Args:
+        audio_data: Audio bytes
+        api_key: ElevenLabs API key
+        model_id: Model (scribe_v1)
+        language: Language code (ISO 639-1)
+
+    Returns:
+        Dict with 'text', 'language', 'words'
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('ELEVENLABS_API_KEY')
+    if not api_key:
+        raise ValueError("ELEVENLABS_API_KEY not set")
+
+    url = "https://api.elevenlabs.io/v1/speech-to-text"
+    headers = {"xi-api-key": api_key}
+
+    files = {"file": ("audio.wav", audio_data, "audio/wav")}
+    data = {"model_id": model_id}
+    if language:
+        data["language_code"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    result = response.json()
+    return {
+        "text": result.get("text", "").strip(),
+        "language": result.get("language_code"),
+        "words": result.get("words", [])
+    }
+
+
+def stt_groq(
+    audio_data: bytes,
+    api_key: str = None,
+    model: str = "whisper-large-v3",
+    language: str = None
+) -> dict:
+    """
+    Transcribe audio using Groq's Whisper API (very fast).
+
+    Args:
+        audio_data: Audio bytes
+        api_key: Groq API key
+        model: whisper-large-v3 or whisper-large-v3-turbo
+        language: Language code
+
+    Returns:
+        Dict with 'text'
+    """
+    import requests
+
+    api_key = api_key or os.environ.get('GROQ_API_KEY')
+    if not api_key:
+        raise ValueError("GROQ_API_KEY not set")
+
+    url = "https://api.groq.com/openai/v1/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    files = {"file": ("audio.wav", audio_data, "audio/wav")}
+    data = {"model": model}
+    if language:
+        data["language"] = language
+
+    response = requests.post(url, headers=headers, files=files, data=data)
+    response.raise_for_status()
+
+    return {"text": response.json().get("text", "").strip()}
+
+
+def speech_to_text(
+    audio_data: bytes,
+    engine: str = "whisper",
+    language: str = None,
+    **kwargs
+) -> dict:
+    """
+    Unified STT interface.
+
+    Args:
+        audio_data: Audio bytes (WAV, MP3, etc.)
+        engine: STT engine (whisper, openai, gemini, elevenlabs, groq)
+        language: Language hint
+        **kwargs: Engine-specific options
+
+    Returns:
+        Dict with at least 'text' key
+    """
+    engine = engine.lower()
+
+    if engine == "whisper" or engine == "faster-whisper":
+        try:
+            return stt_whisper(audio_data, language=language, **kwargs)
+        except ImportError:
+            # Fallback to openai whisper
+            import whisper
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                f.write(audio_data)
+                temp_path = f.name
+            try:
+                model = whisper.load_model(kwargs.get("model_size", "base"))
+                result = model.transcribe(temp_path, language=language)
+                return {"text": result["text"].strip(), "language": result.get("language", "en")}
+            finally:
+                os.unlink(temp_path)
+
+    elif engine == "openai":
+        return stt_openai(audio_data, language=language, **kwargs)
+
+    elif engine == "gemini":
+        return stt_gemini(audio_data, language=language, **kwargs)
+
+    elif engine == "elevenlabs":
+        return stt_elevenlabs(audio_data, language=language, **kwargs)
+
+    elif engine == "groq":
+        return stt_groq(audio_data, language=language, **kwargs)
+
+    else:
+        raise ValueError(f"Unknown STT engine: {engine}")
+
+
+def get_available_stt_engines() -> dict:
+    """Get info about available STT engines."""
+    engines = {
+        "whisper": {
+            "name": "Whisper (Local)",
+            "type": "local",
+            "available": False,
+            "description": "OpenAI Whisper running locally",
+            "install": "pip install faster-whisper"
+        },
+        "openai": {
+            "name": "OpenAI Whisper API",
+            "type": "cloud",
+            "available": False,
+            "description": "OpenAI's cloud Whisper API",
+            "requires": "OPENAI_API_KEY"
+        },
+        "gemini": {
+            "name": "Gemini",
+            "type": "cloud",
+            "available": False,
+            "description": "Google Gemini transcription",
+            "requires": "GOOGLE_API_KEY or GEMINI_API_KEY"
+        },
+        "elevenlabs": {
+            "name": "ElevenLabs Scribe",
+            "type": "cloud",
+            "available": False,
+            "description": "ElevenLabs speech-to-text",
+            "requires": "ELEVENLABS_API_KEY"
+        },
+        "groq": {
+            "name": "Groq Whisper",
+            "type": "cloud",
+            "available": False,
+            "description": "Ultra-fast Whisper via Groq",
+            "requires": "GROQ_API_KEY"
+        }
+    }
+
+    # Check local whisper
+    try:
+        from faster_whisper import WhisperModel
+        engines["whisper"]["available"] = True
+    except ImportError:
+        try:
+            import whisper
+            engines["whisper"]["available"] = True
+        except ImportError:
+            pass
+
+    # Check API keys
+    if os.environ.get('OPENAI_API_KEY'):
+        engines["openai"]["available"] = True
+
+    if os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY'):
+        engines["gemini"]["available"] = True
+
+    if os.environ.get('ELEVENLABS_API_KEY'):
+        engines["elevenlabs"]["available"] = True
+
+    if os.environ.get('GROQ_API_KEY'):
+        engines["groq"]["available"] = True
+
+    return engines
+
+
 
 def load_history():
     global history
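
The added code exposes one dispatch function, speech_to_text, that routes to the requested backend, plus get_available_stt_engines for checking which backends can run. The sketch below is illustrative only: it assumes the 1.3.11 wheel is installed, that these functions are importable from npcpy.data.audio (the file shown in this diff), and that at least one backend (faster-whisper locally, or an API key such as OPENAI_API_KEY) is configured; sample.wav is a placeholder path.

# Usage sketch under the assumptions stated above.
from npcpy.data.audio import get_available_stt_engines, speech_to_text

# Report which engines are usable in this environment and what the others need.
for name, info in get_available_stt_engines().items():
    status = "available" if info["available"] else "needs " + info.get("requires", info.get("install", "setup"))
    print(f"{name:12s} {status}")

# Read raw audio bytes from a placeholder file and transcribe with the default local engine.
with open("sample.wav", "rb") as f:
    audio_bytes = f.read()

result = speech_to_text(audio_bytes, engine="whisper", language="en", model_size="base")
print(result["text"])

Engine-specific keyword arguments (model_size here; api_key, model, or response_format for the cloud backends) are passed through **kwargs to the matching stt_* helper.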