@codexstar/pi-listen 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/transcribe.py ADDED
@@ -0,0 +1,497 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Local & cloud STT transcriber for pi-voice.
4
+
5
+ Backends (in priority order):
6
+ faster-whisper — CTranslate2 Whisper (pip install faster-whisper)
7
+ moonshine — Moonshine v2 ONNX (pip install useful-moonshine[onnx])
8
+ whisper-cpp — whisper.cpp CLI (brew install whisper-cpp)
9
+ deepgram — Deepgram Nova 3 API (DEEPGRAM_API_KEY env var)
10
+ parakeet — NVIDIA Parakeet v3 via NeMo (pip install nemo_toolkit[asr])
11
+
12
+ Usage:
13
+ transcribe.py <audio_file>
14
+ transcribe.py --backend faster-whisper --model small <audio_file>
15
+ transcribe.py --backend deepgram <audio_file>
16
+ transcribe.py --list-backends
17
+ transcribe.py --list-models --backend faster-whisper
18
+
19
+ Output: JSON {"text": "...", "duration": 1.23, "backend": "...", "model": "..."}
20
+ """
21
+ import sys
22
+ import json
23
+ import time
24
+ import argparse
25
+ import os
26
+ import shutil
27
+ from pathlib import Path
28
+
29
+
30
+ def _existing_dirs(paths: list[str]) -> list[Path]:
31
+ result: list[Path] = []
32
+ for p in paths:
33
+ path = Path(os.path.expanduser(p))
34
+ if path.exists() and path.is_dir():
35
+ result.append(path)
36
+ return result
37
+
38
+
39
def huggingface_cache_dirs() -> list[Path]:
    """Return existing Hugging Face hub cache directories.

    Environment overrides are listed first (HUGGINGFACE_HUB_CACHE, then
    HF_HOME/hub, then TRANSFORMERS_CACHE), followed by the default
    ``~/.cache/huggingface/hub``; non-existent entries are filtered out.
    """
    candidates: list[str] = []
    hub_cache = os.environ.get("HUGGINGFACE_HUB_CACHE")
    if hub_cache:
        candidates.append(hub_cache)
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        candidates.append(str(Path(hf_home) / "hub"))
    transformers_cache = os.environ.get("TRANSFORMERS_CACHE")
    if transformers_cache:
        candidates.append(transformers_cache)
    # Default location when no env var overrides the cache.
    candidates.append("~/.cache/huggingface/hub")
    return _existing_dirs(candidates)
49
+
50
+
51
def repo_id_to_cache_dir(repo_id: str) -> str:
    """Translate an HF repo id ("org/name") into its hub cache folder name."""
    slug = repo_id.replace("/", "--")
    return "models--" + slug
53
+
54
+
55
def huggingface_repo_exists(repo_ids: list[str]) -> bool:
    """Return True if any of *repo_ids* has a non-empty entry in any HF cache.

    A populated ``snapshots`` directory is the primary signal; a non-empty
    repo directory (e.g. blobs/refs only) is accepted as a weaker fallback.
    """
    for cache_dir in huggingface_cache_dirs():
        for repo_id in repo_ids:
            repo_dir = cache_dir / repo_id_to_cache_dir(repo_id)
            snapshots = repo_dir / "snapshots"
            if snapshots.exists() and any(snapshots.iterdir()):
                return True
            has_content = repo_dir.exists() and any(repo_dir.iterdir())
            if has_content:
                return True
    return False
65
+
66
+
67
def detect_installed_models(model_names: list[str], detector) -> list[str]:
    """Filter *model_names* down to those *detector* reports as installed.

    A detector that raises for a model is treated as "not installed" for
    that model; remaining models are still probed.
    """
    installed: list[str] = []
    for name in model_names:
        try:
            found = bool(detector(name))
        except Exception:
            found = False
        if found:
            installed.append(name)
    return installed
76
+
77
+
78
+ # ─── Backend: faster-whisper ──────────────────────────────────────────────────
79
+
80
# Model sizes selectable for the faster-whisper backend; each resolves to a
# Hugging Face repo via FASTER_WHISPER_MODEL_REPO_MAP / faster_whisper_repo_ids.
FASTER_WHISPER_MODELS = [
    "tiny", "tiny.en",
    "base", "base.en",
    "small", "small.en",
    "medium", "medium.en",
    "large-v3", "large-v3-turbo",
    "distil-small.en", "distil-medium.en", "distil-large-v3",
]
88
+
89
def is_faster_whisper_available() -> bool:
    """Report whether the faster-whisper package can be imported."""
    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        return False
    return True
95
+
96
+
97
# Static fallback mapping from model size to Hugging Face repo id, used by
# faster_whisper_repo_ids when the installed library's own registry is
# unavailable.
FASTER_WHISPER_MODEL_REPO_MAP = {
    "tiny": "Systran/faster-whisper-tiny",
    "tiny.en": "Systran/faster-whisper-tiny.en",
    "base": "Systran/faster-whisper-base",
    "base.en": "Systran/faster-whisper-base.en",
    "small": "Systran/faster-whisper-small",
    "small.en": "Systran/faster-whisper-small.en",
    "medium": "Systran/faster-whisper-medium",
    "medium.en": "Systran/faster-whisper-medium.en",
    "large-v3": "Systran/faster-whisper-large-v3",
    # large-v3-turbo is published by a different org than the Systran repos.
    "large-v3-turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
    "distil-small.en": "Systran/faster-distil-whisper-small.en",
    "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
    "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
}
112
+
113
+
114
def faster_whisper_repo_ids(model_name: str) -> list[str]:
    """Best-effort mapping from a faster-whisper model size to HF repo ids.

    Resolution order: the installed library's own registry, then the static
    FASTER_WHISPER_MODEL_REPO_MAP, then the Systran naming convention.
    """
    try:
        from faster_whisper.utils import _MODELS  # type: ignore
        hit = _MODELS.get(model_name)
        if hit:
            return [hit]
    except Exception:
        pass

    hit = FASTER_WHISPER_MODEL_REPO_MAP.get(model_name)
    if hit:
        return [hit]

    # Unknown sizes: guess per the Systran naming convention.
    if model_name.startswith("distil-"):
        suffix = model_name.replace("distil-", "distil-whisper-")
        return [f"Systran/faster-{suffix}"]
    return [f"Systran/faster-whisper-{model_name}"]
130
+
131
+
132
def is_faster_whisper_model_installed(model_name: str) -> bool:
    """True when any candidate HF repo for *model_name* is in the local cache."""
    repo_ids = faster_whisper_repo_ids(model_name)
    return huggingface_repo_exists(repo_ids)
134
+
135
+
136
def transcribe_faster_whisper(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe *audio_path* with faster-whisper on CPU (int8 weights).

    "auto" is mapped to ``language=None`` so the model self-detects the
    language.  The reported duration covers transcription only — the model
    is constructed before timing starts.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(model_name, device="cpu", compute_type="int8")
    start = time.time()
    lang = None if language == "auto" else language
    segments, info = model.transcribe(
        audio_path,
        language=lang,
        beam_size=1,
        vad_filter=True,
    )
    pieces = [segment.text.strip() for segment in segments]
    return {
        "text": " ".join(pieces),
        "duration": round(time.time() - start, 2),
        "backend": "faster-whisper",
        "model": model_name,
        "language": info.language,
    }
154
+
155
+
156
+ # ─── Backend: moonshine ──────────────────────────────────────────────────────
157
+
158
# Model ids offered for the moonshine backend (mapped to Useful Sensors
# HF repos by moonshine_repo_ids).
MOONSHINE_MODELS = ["moonshine/tiny", "moonshine/base"]
159
+
160
def is_moonshine_available() -> bool:
    """True if moonshine_onnx imports, or failing that a `moonshine` CLI exists."""
    try:
        import moonshine_onnx  # noqa: F401
    except ImportError:
        return shutil.which("moonshine") is not None
    return True
166
+
167
+
168
def moonshine_repo_ids(model_name: str) -> list[str]:
    """Candidate Hugging Face repo ids for a moonshine model name.

    "moonshine/tiny" -> UsefulSensors/moonshine-tiny and its -onnx variant;
    only the first path segment is stripped.
    """
    slug = model_name.split("/", 1)[-1]
    base = f"UsefulSensors/moonshine-{slug}"
    return [base, base + "-onnx"]
174
+
175
+
176
def is_moonshine_model_installed(model_name: str) -> bool:
    """True when the model is present in the HF cache or moonshine's own cache.

    Checks the Hugging Face hub cache first, then two possible directory
    layouts under ``~/.cache/moonshine``.
    """
    if huggingface_repo_exists(moonshine_repo_ids(model_name)):
        return True
    local_candidates = [
        f"~/.cache/moonshine/{model_name.replace('/', '-')}",
        f"~/.cache/moonshine/{model_name.split('/', 1)[-1]}",
    ]
    # _existing_dirs already filters to directories that exist, so the
    # previous per-path .exists() re-check was redundant.
    return bool(_existing_dirs(local_candidates))
185
+
186
+
187
def transcribe_moonshine(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe *audio_path* with Moonshine.

    Tries the moonshine_onnx Python API first; if any of its optional
    dependencies fail to import (moonshine_onnx, soundfile, or librosa for
    resampling), falls back to the `moonshine` CLI.  *language* is accepted
    for signature parity with the other backends but is not used here.
    """
    start = time.time()

    try:
        # Try Python API first
        from moonshine_onnx import transcribe as ms_transcribe
        import soundfile as sf
        audio, sr = sf.read(audio_path)
        if sr != 16000:
            # Resample to the 16 kHz rate this pipeline feeds the model.
            import librosa
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        tokens = ms_transcribe(audio, model=model_name)
        # The API may return a list of tokens or a single string.
        text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
    except ImportError:
        # Fall back to CLI
        import subprocess
        result = subprocess.run(
            ["moonshine", "transcribe", "--model", model_name, audio_path],
            capture_output=True, text=True, timeout=30
        )
        # NOTE(review): returncode is not checked — a failed CLI run yields
        # an empty transcript instead of an error; confirm this is intended.
        text = result.stdout.strip()

    return {
        "text": text,
        "duration": round(time.time() - start, 2),
        "backend": "moonshine",
        "model": model_name,
    }
215
+
216
+
217
+ # ─── Backend: whisper-cpp ─────────────────────────────────────────────────────
218
+
219
# ggml model sizes resolvable on disk by whisper_cpp_model_candidates().
WHISPER_CPP_MODELS = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]
220
+
221
def whisper_cpp_model_candidates(model_name: str) -> list[str]:
    """Filesystem paths where a ggml model file for whisper.cpp may live.

    Covers the user cache plus the Apple-silicon and Intel Homebrew share
    directories, in that order.
    """
    filename = f"ggml-{model_name}.bin"
    return [
        os.path.expanduser(f"~/.cache/whisper-cpp/{filename}"),
        f"/opt/homebrew/share/whisper-cpp/models/{filename}",
        f"/usr/local/share/whisper-cpp/models/{filename}",
    ]
227
+
228
+
229
def is_whisper_cpp_available() -> bool:
    """True when a `whisper-cpp` (or legacy `main`) executable is on PATH."""
    return any(shutil.which(exe) for exe in ("whisper-cpp", "main"))
231
+
232
+
233
def is_whisper_cpp_model_installed(model_name: str) -> bool:
    """True when a ggml file for *model_name* exists at any known location."""
    for candidate in whisper_cpp_model_candidates(model_name):
        if Path(candidate).exists():
            return True
    return False
235
+
236
+
237
def transcribe_whisper_cpp(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe *audio_path* by shelling out to the whisper.cpp CLI.

    *model_name* may be a direct path to a ggml model file or a size name
    ("small", ...) resolved against the standard cache/Homebrew locations.

    Raises:
        RuntimeError: if the CLI exits non-zero.  Previously a failed run
            was silently swallowed and produced an empty transcript.
        subprocess.TimeoutExpired: if the CLI exceeds 60 seconds.
    """
    import subprocess

    # Prefer the modern binary name; "main" is the legacy whisper.cpp name.
    cmd = "whisper-cpp" if shutil.which("whisper-cpp") else "main"
    start = time.time()

    # Resolve a bare model size to an on-disk ggml file.
    model_path = model_name
    if not os.path.exists(model_path):
        for candidate in whisper_cpp_model_candidates(model_name):
            if os.path.exists(candidate):
                model_path = candidate
                break

    result = subprocess.run(
        [cmd, "-m", model_path, "-l", language, "--no-timestamps", "-f", audio_path],
        capture_output=True, text=True, timeout=60
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"whisper.cpp failed (exit {result.returncode}): {result.stderr.strip()}"
        )

    text = result.stdout.strip()
    # Some builds still emit "[timestamp]"-prefixed lines despite
    # --no-timestamps: keep clean lines first, else strip the "]" prefix.
    lines = [l.strip() for l in text.split("\n") if l.strip() and not l.strip().startswith("[")]
    if not lines:
        lines = [l.split("]", 1)[-1].strip() for l in text.split("\n") if "]" in l]

    return {
        "text": " ".join(lines),
        "duration": round(time.time() - start, 2),
        "backend": "whisper-cpp",
        "model": model_name,
    }
267
+
268
+
269
+ # ─── Backend: deepgram (cloud) ───────────────────────────────────────────────
270
+
271
# Model identifiers passed to Deepgram's /v1/listen endpoint.
DEEPGRAM_MODELS = ["nova-3", "nova-2", "whisper-large", "whisper-medium", "whisper-small"]
272
+
273
def is_deepgram_available() -> bool:
    """True when DEEPGRAM_API_KEY is set to a non-empty value."""
    api_key = os.environ.get("DEEPGRAM_API_KEY")
    return api_key is not None and api_key != ""
275
+
276
def transcribe_deepgram(audio_path: str, model_name: str, language: str) -> dict:
    """Send *audio_path* to Deepgram's prerecorded-audio REST API.

    Requires DEEPGRAM_API_KEY in the environment (KeyError if unset).
    Uploads the raw file bytes with a fixed audio/wav content type and a
    30-second request timeout.
    """
    import urllib.request

    api_key = os.environ["DEEPGRAM_API_KEY"]
    start = time.time()

    audio_data = Path(audio_path).read_bytes()

    query = f"model={model_name}&language={language}&smart_format=true"
    request = urllib.request.Request(
        f"https://api.deepgram.com/v1/listen?{query}",
        data=audio_data,
        headers={
            "Authorization": f"Token {api_key}",
            "Content-Type": "audio/wav",
        },
    )

    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.loads(response.read())

    transcript = payload["results"]["channels"][0]["alternatives"][0]["transcript"]
    return {
        "text": transcript,
        "duration": round(time.time() - start, 2),
        "backend": "deepgram",
        "model": model_name,
    }
307
+
308
+
309
+ # ─── Backend: parakeet (NVIDIA NeMo) ─────────────────────────────────────────
310
+
311
# Full Hugging Face repo ids of NVIDIA Parakeet models loadable via NeMo.
PARAKEET_MODELS = [
    "nvidia/parakeet-tdt-0.6b-v2",
    "nvidia/parakeet-ctc-0.6b",
    "nvidia/parakeet-tdt-1.1b",
]
316
+
317
def is_parakeet_available() -> bool:
    """True when NVIDIA NeMo's ASR collection can be imported."""
    try:
        import nemo.collections.asr  # noqa: F401
    except ImportError:
        return False
    return True
323
+
324
+
325
def parakeet_repo_ids(model_name: str) -> list[str]:
    """Parakeet model names are already full HF repo ids; pass through as-is."""
    return [model_name]
327
+
328
+
329
def is_parakeet_model_installed(model_name: str) -> bool:
    """True when the model is present in the HF cache or NeMo's torch cache.

    Checks the Hugging Face hub cache first, then two possible directory
    layouts under ``~/.cache/torch/NeMo``.
    """
    if huggingface_repo_exists(parakeet_repo_ids(model_name)):
        return True
    local_candidates = [
        f"~/.cache/torch/NeMo/{model_name.replace('/', '--')}",
        f"~/.cache/torch/NeMo/{model_name.split('/', 1)[-1]}",
    ]
    # _existing_dirs already filters to directories that exist, so the
    # previous per-path .exists() re-check was redundant.
    return bool(_existing_dirs(local_candidates))
337
+
338
+
339
def transcribe_parakeet(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe *audio_path* with an NVIDIA NeMo Parakeet model.

    *language* is accepted for signature parity but not used here.  The
    first transcription result may itself be a list, so it is unwrapped
    before being stringified.
    """
    import nemo.collections.asr as nemo_asr

    start = time.time()
    model = nemo_asr.models.ASRModel.from_pretrained(model_name)
    first = model.transcribe([audio_path])[0]
    if isinstance(first, list):
        first = first[0]
    return {
        "text": str(first),
        "duration": round(time.time() - start, 2),
        "backend": "parakeet",
        "model": model_name,
    }
352
+
353
+
354
+ # ─── Registry ────────────────────────────────────────────────────────────────
355
+
356
# Registry describing each STT backend:
#   fn                — transcription entry point (audio_path, model, language)
#   available         — zero-arg availability probe
#   models            — model names the backend accepts
#   installed_models  — zero-arg best-effort local cache detection
#   default_model     — used when the caller does not request a model
#   install           — human-readable install instruction
#   install_detection — label for how installed_models detects models
#   type              — "local" or "cloud"
BACKENDS = {
    "faster-whisper": {
        "fn": transcribe_faster_whisper,
        "available": is_faster_whisper_available,
        "models": FASTER_WHISPER_MODELS,
        "installed_models": lambda: detect_installed_models(FASTER_WHISPER_MODELS, is_faster_whisper_model_installed),
        "default_model": "small",
        "install": "pip install faster-whisper",
        "install_detection": "huggingface-cache",
        "type": "local",
    },
    "moonshine": {
        "fn": transcribe_moonshine,
        "available": is_moonshine_available,
        "models": MOONSHINE_MODELS,
        "installed_models": lambda: detect_installed_models(MOONSHINE_MODELS, is_moonshine_model_installed),
        "default_model": "moonshine/base",
        "install": "pip install useful-moonshine[onnx]",
        "install_detection": "moonshine-cache-heuristic",
        "type": "local",
    },
    "whisper-cpp": {
        "fn": transcribe_whisper_cpp,
        "available": is_whisper_cpp_available,
        "models": WHISPER_CPP_MODELS,
        "installed_models": lambda: detect_installed_models(WHISPER_CPP_MODELS, is_whisper_cpp_model_installed),
        "default_model": "small",
        "install": "brew install whisper-cpp",
        "install_detection": "whisper-cpp-model-paths",
        "type": "local",
    },
    # Cloud backend: nothing is cached locally, so installed_models is empty.
    "deepgram": {
        "fn": transcribe_deepgram,
        "available": is_deepgram_available,
        "models": DEEPGRAM_MODELS,
        "installed_models": lambda: [],
        "default_model": "nova-3",
        "install": "Set DEEPGRAM_API_KEY env var (free: deepgram.com)",
        "install_detection": "api-key",
        "type": "cloud",
    },
    "parakeet": {
        "fn": transcribe_parakeet,
        "available": is_parakeet_available,
        "models": PARAKEET_MODELS,
        "installed_models": lambda: detect_installed_models(PARAKEET_MODELS, is_parakeet_model_installed),
        "default_model": "nvidia/parakeet-tdt-0.6b-v2",
        "install": "pip install nemo_toolkit[asr]",
        "install_detection": "huggingface-or-nemo-cache",
        "type": "local",
    },
}
408
+
409
+
410
def detect_backend() -> str:
    """Auto-detect a usable backend, trying candidates in a fixed order.

    Returns "none" when no backend reports itself available.
    """
    for candidate in ("faster-whisper", "moonshine", "whisper-cpp", "deepgram", "parakeet"):
        if BACKENDS[candidate]["available"]():
            return candidate
    return "none"
417
+
418
+
419
def resolve_backend_and_model(requested_backend: str | None, requested_model: str | None) -> tuple[str, str | None]:
    """Resolve the effective (backend, model) pair for a request.

    With no backend (or "auto"), one is auto-detected.  Unknown or "none"
    backends pass through untouched so the caller can report them.  An unset
    model falls back to the backend default; so does a model that does not
    belong to an auto-detected backend (an explicit backend keeps whatever
    model the user asked for).
    """
    auto = requested_backend in (None, "auto")
    backend = detect_backend() if auto else requested_backend

    if backend == "none" or backend not in BACKENDS:
        return backend, requested_model

    default_model = BACKENDS[backend]["default_model"]
    if not requested_model:
        return backend, default_model
    if auto and requested_model not in BACKENDS[backend]["models"]:
        return backend, default_model
    return backend, requested_model
432
+
433
+
434
def main():
    """CLI entry point: parse arguments, then list backends/models or transcribe.

    All output is JSON on stdout; failures exit with status 1.
    """
    parser = argparse.ArgumentParser(description="Local & cloud STT transcriber for pi-voice")
    parser.add_argument("audio_file", nargs="?", help="Path to audio file (WAV)")
    parser.add_argument("--backend", choices=list(BACKENDS.keys()), default=None)
    parser.add_argument("--model", default=None, help="Model name/size")
    parser.add_argument("--language", default="en", help="Language code (or 'auto')")
    parser.add_argument("--list-backends", action="store_true", help="List available backends")
    parser.add_argument("--list-models", action="store_true", help="List models for a backend")
    args = parser.parse_args()

    if args.list_backends:
        report = []
        for name, info in BACKENDS.items():
            is_available = info["available"]()
            report.append({
                "name": name,
                "available": is_available,
                "type": info["type"],
                "default_model": info["default_model"],
                # Only surface install instructions when the backend is missing.
                "install": None if is_available else info["install"],
                "install_detection": info.get("install_detection", "unknown"),
                "models": info["models"],
                "installed_models": info["installed_models"]() if "installed_models" in info else [],
            })
        print(json.dumps(report, indent=2))
        return

    if args.list_models:
        backend = args.backend or detect_backend()
        if backend == "none" or backend not in BACKENDS:
            print(json.dumps({"error": f"Unknown backend: {backend}"}))
            sys.exit(1)
        print(json.dumps({
            "backend": backend,
            "models": BACKENDS[backend]["models"],
            "default": BACKENDS[backend]["default_model"],
        }, indent=2))
        return

    # Transcription mode requires an audio file.
    if not args.audio_file:
        parser.print_help()
        sys.exit(1)

    backend, model = resolve_backend_and_model(args.backend, args.model)
    if backend == "none":
        print(json.dumps({
            "error": "No STT backend found",
            "install_options": {name: info["install"] for name, info in BACKENDS.items()},
        }))
        sys.exit(1)

    try:
        print(json.dumps(BACKENDS[backend]["fn"](args.audio_file, model, args.language)))
    except Exception as e:
        print(json.dumps({"error": str(e), "backend": backend, "model": model}))
        sys.exit(1)
494
+
495
+
496
if __name__ == "__main__":
    # Allow the module to be executed directly as a CLI script.
    main()