@codexstar/pi-listen 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +283 -0
- package/daemon.py +517 -0
- package/docs/API.md +273 -0
- package/docs/ARCHITECTURE.md +114 -0
- package/docs/backends.md +196 -0
- package/docs/plans/2026-03-12-pi-voice-master-plan.md +613 -0
- package/docs/plans/2026-03-12-pi-voice-model-aware-execution-plan.md +256 -0
- package/docs/plans/2026-03-12-pi-voice-onboarding-remediation-plan.md +391 -0
- package/docs/plans/pi-voice-model-aware-review.md +196 -0
- package/docs/plans/pi-voice-model-detection-qa-plan.md +226 -0
- package/docs/plans/pi-voice-model-detection-research.md +483 -0
- package/docs/plans/pi-voice-onboarding-ux-plan.md +388 -0
- package/docs/plans/pi-voice-release-validation-plan.md +386 -0
- package/docs/plans/pi-voice-remaining-implementation-plan.md +524 -0
- package/docs/plans/pi-voice-review-findings.md +227 -0
- package/docs/plans/pi-voice-technical-remediation-plan.md +613 -0
- package/docs/qa-matrix.md +69 -0
- package/docs/qa-results.md +357 -0
- package/docs/troubleshooting.md +265 -0
- package/extensions/voice/config.ts +206 -0
- package/extensions/voice/diagnostics.ts +212 -0
- package/extensions/voice/install.ts +62 -0
- package/extensions/voice/onboarding.ts +315 -0
- package/extensions/voice.ts +1149 -0
- package/package.json +48 -0
- package/scripts/setup-macos.sh +374 -0
- package/scripts/setup-windows.ps1 +271 -0
- package/transcribe.py +497 -0
package/transcribe.py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Local & cloud STT transcriber for pi-voice.
|
|
4
|
+
|
|
5
|
+
Backends (in priority order):
|
|
6
|
+
faster-whisper — CTranslate2 Whisper (pip install faster-whisper)
|
|
7
|
+
moonshine — Moonshine v2 ONNX (pip install useful-moonshine[onnx])
|
|
8
|
+
whisper-cpp — whisper.cpp CLI (brew install whisper-cpp)
|
|
9
|
+
deepgram — Deepgram Nova 3 API (DEEPGRAM_API_KEY env var)
|
|
10
|
+
parakeet — NVIDIA Parakeet v3 via NeMo (pip install nemo_toolkit[asr])
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
transcribe.py <audio_file>
|
|
14
|
+
transcribe.py --backend faster-whisper --model small <audio_file>
|
|
15
|
+
transcribe.py --backend deepgram <audio_file>
|
|
16
|
+
transcribe.py --list-backends
|
|
17
|
+
transcribe.py --list-models --backend faster-whisper
|
|
18
|
+
|
|
19
|
+
Output: JSON {"text": "...", "duration": 1.23, "backend": "...", "model": "..."}
|
|
20
|
+
"""
|
|
21
|
+
import sys
|
|
22
|
+
import json
|
|
23
|
+
import time
|
|
24
|
+
import argparse
|
|
25
|
+
import os
|
|
26
|
+
import shutil
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _existing_dirs(paths: list[str]) -> list[Path]:
|
|
31
|
+
result: list[Path] = []
|
|
32
|
+
for p in paths:
|
|
33
|
+
path = Path(os.path.expanduser(p))
|
|
34
|
+
if path.exists() and path.is_dir():
|
|
35
|
+
result.append(path)
|
|
36
|
+
return result
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def huggingface_cache_dirs() -> list[Path]:
    """Existing Hugging Face hub cache directories, env overrides first.

    Honors HUGGINGFACE_HUB_CACHE, HF_HOME (``<HF_HOME>/hub``) and
    TRANSFORMERS_CACHE before the default ``~/.cache/huggingface/hub``.
    """
    candidates: list[str] = []
    hub_cache = os.environ.get("HUGGINGFACE_HUB_CACHE")
    if hub_cache:
        candidates.append(hub_cache)
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        candidates.append(str(Path(hf_home) / "hub"))
    transformers_cache = os.environ.get("TRANSFORMERS_CACHE")
    if transformers_cache:
        candidates.append(transformers_cache)
    candidates.append("~/.cache/huggingface/hub")
    return _existing_dirs(candidates)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def repo_id_to_cache_dir(repo_id: str) -> str:
    """Map an HF repo id (``org/name``) to its hub cache folder name."""
    sanitized = repo_id.replace("/", "--")
    return "models--" + sanitized
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def huggingface_repo_exists(repo_ids: list[str]) -> bool:
    """Return True if any repo id has a non-empty entry in any HF cache dir.

    A repo counts as present when its ``snapshots`` subfolder (checked first)
    or the repo folder itself exists and contains at least one entry.
    """
    for cache_root in huggingface_cache_dirs():
        for rid in repo_ids:
            repo_path = cache_root / repo_id_to_cache_dir(rid)
            for probe in (repo_path / "snapshots", repo_path):
                if probe.exists() and any(probe.iterdir()):
                    return True
    return False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def detect_installed_models(model_names: list[str], detector) -> list[str]:
    """Filter *model_names* down to those *detector* reports as installed.

    Any exception raised by *detector* is treated as "not installed" rather
    than propagated, so one broken probe cannot sink the whole listing.
    """
    installed: list[str] = []
    for candidate in model_names:
        try:
            found = bool(detector(candidate))
        except Exception:
            found = False
        if found:
            installed.append(candidate)
    return installed
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ─── Backend: faster-whisper ──────────────────────────────────────────────────
|
|
79
|
+
|
|
80
|
+
# Model sizes offered for the faster-whisper (CTranslate2) backend,
# including English-only (.en) and distilled variants.
FASTER_WHISPER_MODELS = [
    "tiny", "tiny.en",
    "base", "base.en",
    "small", "small.en",
    "medium", "medium.en",
    "large-v3", "large-v3-turbo",
    "distil-small.en", "distil-medium.en", "distil-large-v3",
]
|
|
88
|
+
|
|
89
|
+
def is_faster_whisper_available() -> bool:
    """True when the ``faster_whisper`` package can be imported."""
    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        return False
    return True
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Static fallback mapping from model size to Hugging Face repo id, used when
# faster-whisper's own internal table cannot be imported.
FASTER_WHISPER_MODEL_REPO_MAP = {
    "tiny": "Systran/faster-whisper-tiny",
    "tiny.en": "Systran/faster-whisper-tiny.en",
    "base": "Systran/faster-whisper-base",
    "base.en": "Systran/faster-whisper-base.en",
    "small": "Systran/faster-whisper-small",
    "small.en": "Systran/faster-whisper-small.en",
    "medium": "Systran/faster-whisper-medium",
    "medium.en": "Systran/faster-whisper-medium.en",
    "large-v3": "Systran/faster-whisper-large-v3",
    "large-v3-turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
    "distil-small.en": "Systran/faster-distil-whisper-small.en",
    "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
    "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def faster_whisper_repo_ids(model_name: str) -> list[str]:
    """Resolve a faster-whisper model size to candidate HF repo ids.

    Prefers the table bundled with faster-whisper itself, then the static
    fallback map, then a naming-convention guess.
    """
    try:
        from faster_whisper.utils import _MODELS  # type: ignore
        mapped = _MODELS.get(model_name)
    except Exception:
        mapped = None
    if mapped:
        return [mapped]

    mapped = FASTER_WHISPER_MODEL_REPO_MAP.get(model_name)
    if mapped:
        return [mapped]

    if model_name.startswith("distil-"):
        guess = model_name.replace("distil-", "distil-whisper-")
        return [f"Systran/faster-{guess}"]
    return [f"Systran/faster-whisper-{model_name}"]
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def is_faster_whisper_model_installed(model_name: str) -> bool:
    """True when any candidate HF repo for *model_name* is already cached."""
    candidates = faster_whisper_repo_ids(model_name)
    return huggingface_repo_exists(candidates)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def transcribe_faster_whisper(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe *audio_path* with faster-whisper on CPU (int8 weights).

    ``language == "auto"`` enables the model's own language detection.
    Returns the standard result dict plus the detected ``language``.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(model_name, device="cpu", compute_type="int8")
    started = time.time()
    lang_arg = None if language == "auto" else language
    segments, info = model.transcribe(
        audio_path,
        language=lang_arg,
        beam_size=1,
        vad_filter=True,
    )
    pieces = [segment.text.strip() for segment in segments]
    return {
        "text": " ".join(pieces),
        "duration": round(time.time() - started, 2),
        "backend": "faster-whisper",
        "model": model_name,
        "language": info.language,
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ─── Backend: moonshine ──────────────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
# Moonshine model ids in the "moonshine/<size>" form the Python API expects.
MOONSHINE_MODELS = ["moonshine/tiny", "moonshine/base"]
|
|
159
|
+
|
|
160
|
+
def is_moonshine_available() -> bool:
    """True when the moonshine ONNX package or the ``moonshine`` CLI exists."""
    try:
        import moonshine_onnx  # noqa: F401
    except ImportError:
        return shutil.which("moonshine") is not None
    return True
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def moonshine_repo_ids(model_name: str) -> list[str]:
    """Candidate HF repo ids for a moonshine model (plain and ``-onnx``)."""
    slug = model_name.split("/", 1)[-1]
    base = f"UsefulSensors/moonshine-{slug}"
    return [base, base + "-onnx"]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def is_moonshine_model_installed(model_name: str) -> bool:
    """True when a moonshine model is present locally.

    Checks the Hugging Face hub cache first, then the ``~/.cache/moonshine``
    layouts (flattened "moonshine-tiny" and bare "tiny" folder names).
    """
    if huggingface_repo_exists(moonshine_repo_ids(model_name)):
        return True
    local_candidates = [
        f"~/.cache/moonshine/{model_name.replace('/', '-')}",
        f"~/.cache/moonshine/{model_name.split('/', 1)[-1]}",
    ]
    # _existing_dirs already returns only existing directories, so a
    # non-empty result IS the answer (the original re-tested .exists()
    # on each entry redundantly).
    return bool(_existing_dirs(local_candidates))
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def transcribe_moonshine(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe with Moonshine: Python ONNX API first, CLI as fallback.

    *language* is accepted for interface parity with the other backends but
    is never passed to Moonshine (the body does not reference it).
    """
    start = time.time()

    try:
        # Try Python API first
        from moonshine_onnx import transcribe as ms_transcribe
        import soundfile as sf
        audio, sr = sf.read(audio_path)
        if sr != 16000:
            # Moonshine expects 16 kHz input; resample anything else.
            import librosa
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        tokens = ms_transcribe(audio, model=model_name)
        # The API may return either a token/segment list or a plain string.
        text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
    except ImportError:
        # Fall back to CLI (any of moonshine_onnx/soundfile/librosa missing).
        import subprocess
        result = subprocess.run(
            ["moonshine", "transcribe", "--model", model_name, audio_path],
            capture_output=True, text=True, timeout=30
        )
        # NOTE(review): returncode is not checked here — a failed CLI run
        # yields an empty transcript silently.
        text = result.stdout.strip()

    return {
        "text": text,
        "duration": round(time.time() - start, 2),
        "backend": "moonshine",
        "model": model_name,
    }
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ─── Backend: whisper-cpp ─────────────────────────────────────────────────────
|
|
218
|
+
|
|
219
|
+
# ggml model sizes recognized by the whisper.cpp CLI backend.
WHISPER_CPP_MODELS = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]
|
|
220
|
+
|
|
221
|
+
def whisper_cpp_model_candidates(model_name: str) -> list[str]:
    """Filesystem locations where the ggml file for *model_name* may live."""
    filename = f"ggml-{model_name}.bin"
    return [
        os.path.expanduser(f"~/.cache/whisper-cpp/{filename}"),
        f"/opt/homebrew/share/whisper-cpp/models/{filename}",
        f"/usr/local/share/whisper-cpp/models/{filename}",
    ]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def is_whisper_cpp_available() -> bool:
    """True when a whisper.cpp binary (modern ``whisper-cpp`` or the legacy
    ``main`` name) is on PATH."""
    return any(shutil.which(binary) for binary in ("whisper-cpp", "main"))
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def is_whisper_cpp_model_installed(model_name: str) -> bool:
    """True when a ggml model file exists at any known whisper.cpp location."""
    for candidate in whisper_cpp_model_candidates(model_name):
        if Path(candidate).exists():
            return True
    return False
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def transcribe_whisper_cpp(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe via the whisper.cpp CLI.

    *model_name* may be a path to a ggml file or a size name that is resolved
    against the known model locations.

    Raises:
        RuntimeError: if the CLI exits non-zero. (Previously a failed run was
            swallowed and produced an empty transcript silently.)
        subprocess.TimeoutExpired: if the CLI exceeds 60 seconds.
    """
    import subprocess
    cmd = "whisper-cpp" if shutil.which("whisper-cpp") else "main"
    start = time.time()

    # whisper.cpp expects a model *path*; resolve bare size names against the
    # cache / Homebrew locations.
    model_path = model_name
    if not os.path.exists(model_path):
        for candidate in whisper_cpp_model_candidates(model_name):
            if os.path.exists(candidate):
                model_path = candidate
                break

    result = subprocess.run(
        [cmd, "-m", model_path, "-l", language, "--no-timestamps", "-f", audio_path],
        capture_output=True, text=True, timeout=60
    )
    if result.returncode != 0:
        detail = result.stderr.strip() or result.stdout.strip()
        raise RuntimeError(f"whisper.cpp failed (exit {result.returncode}): {detail}")

    text = result.stdout.strip()
    # Some builds still emit "[t0 --> t1]" prefixes despite --no-timestamps:
    # prefer unprefixed lines; otherwise strip the bracketed prefix per line.
    lines = [l.strip() for l in text.split("\n") if l.strip() and not l.strip().startswith("[")]
    if not lines:
        lines = [l.split("]", 1)[-1].strip() for l in text.split("\n") if "]" in l]

    return {
        "text": " ".join(lines),
        "duration": round(time.time() - start, 2),
        "backend": "whisper-cpp",
        "model": model_name,
    }
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# ─── Backend: deepgram (cloud) ───────────────────────────────────────────────
|
|
270
|
+
|
|
271
|
+
# Deepgram hosted model names accepted by the /v1/listen endpoint.
DEEPGRAM_MODELS = ["nova-3", "nova-2", "whisper-large", "whisper-medium", "whisper-small"]
|
|
272
|
+
|
|
273
|
+
def is_deepgram_available() -> bool:
    """True when a non-empty DEEPGRAM_API_KEY is set in the environment."""
    key = os.environ.get("DEEPGRAM_API_KEY")
    return key is not None and key != ""
|
|
275
|
+
|
|
276
|
+
def transcribe_deepgram(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe via the Deepgram /v1/listen REST API.

    Requires DEEPGRAM_API_KEY in the environment; the audio file is uploaded
    whole as ``audio/wav``.

    Raises:
        KeyError: if DEEPGRAM_API_KEY is not set.
        urllib.error.HTTPError / URLError: on API or network failure.
    """
    import urllib.parse
    import urllib.request

    api_key = os.environ["DEEPGRAM_API_KEY"]
    start = time.time()

    with open(audio_path, "rb") as f:
        audio_data = f.read()

    # URL-encode query values so model/language strings can never break the
    # URL (the original interpolated them raw into the query string).
    params = urllib.parse.urlencode({
        "model": model_name,
        "language": language,
        "smart_format": "true",
    })
    url = f"https://api.deepgram.com/v1/listen?{params}"

    req = urllib.request.Request(
        url,
        data=audio_data,
        headers={
            "Authorization": f"Token {api_key}",
            "Content-Type": "audio/wav",
        },
    )

    with urllib.request.urlopen(req, timeout=30) as resp:
        result = json.loads(resp.read())

    # First alternative of the first channel carries the transcript.
    text = result["results"]["channels"][0]["alternatives"][0]["transcript"]
    return {
        "text": text,
        "duration": round(time.time() - start, 2),
        "backend": "deepgram",
        "model": model_name,
    }
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ─── Backend: parakeet (NVIDIA NeMo) ─────────────────────────────────────────
|
|
310
|
+
|
|
311
|
+
# NVIDIA Parakeet model ids; these double as Hugging Face repo ids.
PARAKEET_MODELS = [
    "nvidia/parakeet-tdt-0.6b-v2",
    "nvidia/parakeet-ctc-0.6b",
    "nvidia/parakeet-tdt-1.1b",
]
|
|
316
|
+
|
|
317
|
+
def is_parakeet_available() -> bool:
    """True when NVIDIA NeMo's ASR collection can be imported."""
    try:
        import nemo.collections.asr  # noqa: F401
    except ImportError:
        return False
    return True
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def parakeet_repo_ids(model_name: str) -> list[str]:
    """Parakeet model names are already HF repo ids; wrap in a list for the
    shared cache-lookup interface."""
    repo_id = model_name
    return [repo_id]
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def is_parakeet_model_installed(model_name: str) -> bool:
    """True when the model is cached in the HF hub or NeMo's torch cache.

    Checks the Hugging Face cache first, then the ``~/.cache/torch/NeMo``
    layouts ("org--name" and bare "name" folder forms).
    """
    if huggingface_repo_exists(parakeet_repo_ids(model_name)):
        return True
    local_candidates = [
        f"~/.cache/torch/NeMo/{model_name.replace('/', '--')}",
        f"~/.cache/torch/NeMo/{model_name.split('/', 1)[-1]}",
    ]
    # _existing_dirs already returns only existing directories, so truthiness
    # is the whole check (the original re-tested .exists() redundantly).
    return bool(_existing_dirs(local_candidates))
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def transcribe_parakeet(audio_path: str, model_name: str, language: str) -> dict:
    """Transcribe with an NVIDIA NeMo Parakeet model.

    *language* is accepted for interface parity with other backends but is
    not used (the body never references it).
    """
    import nemo.collections.asr as nemo_asr

    started = time.time()
    model = nemo_asr.models.ASRModel.from_pretrained(model_name)
    hypothesis = model.transcribe([audio_path])[0]
    # Some NeMo versions return a nested list; unwrap the first hypothesis.
    if isinstance(hypothesis, list):
        hypothesis = hypothesis[0]
    return {
        "text": str(hypothesis),
        "duration": round(time.time() - started, 2),
        "backend": "parakeet",
        "model": model_name,
    }
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ─── Registry ────────────────────────────────────────────────────────────────
|
|
355
|
+
|
|
356
|
+
# Backend registry. Each entry describes one STT backend:
#   fn                — transcribe function: (audio_path, model, language) -> dict
#   available         — zero-arg probe: is the backend usable right now?
#   models            — every model name the backend accepts
#   installed_models  — zero-arg probe returning the subset already downloaded
#   default_model     — model used when the caller does not specify one
#   install           — human-readable install hint shown when unavailable
#   install_detection — label for how installed_models detects downloads
#   type              — "local" (on-device) or "cloud" (remote API)
BACKENDS = {
    "faster-whisper": {
        "fn": transcribe_faster_whisper,
        "available": is_faster_whisper_available,
        "models": FASTER_WHISPER_MODELS,
        "installed_models": lambda: detect_installed_models(FASTER_WHISPER_MODELS, is_faster_whisper_model_installed),
        "default_model": "small",
        "install": "pip install faster-whisper",
        "install_detection": "huggingface-cache",
        "type": "local",
    },
    "moonshine": {
        "fn": transcribe_moonshine,
        "available": is_moonshine_available,
        "models": MOONSHINE_MODELS,
        "installed_models": lambda: detect_installed_models(MOONSHINE_MODELS, is_moonshine_model_installed),
        "default_model": "moonshine/base",
        "install": "pip install useful-moonshine[onnx]",
        "install_detection": "moonshine-cache-heuristic",
        "type": "local",
    },
    "whisper-cpp": {
        "fn": transcribe_whisper_cpp,
        "available": is_whisper_cpp_available,
        "models": WHISPER_CPP_MODELS,
        "installed_models": lambda: detect_installed_models(WHISPER_CPP_MODELS, is_whisper_cpp_model_installed),
        "default_model": "small",
        "install": "brew install whisper-cpp",
        "install_detection": "whisper-cpp-model-paths",
        "type": "local",
    },
    "deepgram": {
        "fn": transcribe_deepgram,
        "available": is_deepgram_available,
        "models": DEEPGRAM_MODELS,
        # Cloud backend: nothing is ever "downloaded" locally.
        "installed_models": lambda: [],
        "default_model": "nova-3",
        "install": "Set DEEPGRAM_API_KEY env var (free: deepgram.com)",
        "install_detection": "api-key",
        "type": "cloud",
    },
    "parakeet": {
        "fn": transcribe_parakeet,
        "available": is_parakeet_available,
        "models": PARAKEET_MODELS,
        "installed_models": lambda: detect_installed_models(PARAKEET_MODELS, is_parakeet_model_installed),
        "default_model": "nvidia/parakeet-tdt-0.6b-v2",
        "install": "pip install nemo_toolkit[asr]",
        "install_detection": "huggingface-or-nemo-cache",
        "type": "local",
    },
}
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def detect_backend() -> str:
    """Return the first available backend in priority order, or "none".

    Priority prefers local backends, fastest first, with cloud (deepgram)
    ahead of the heavyweight NeMo backend.
    """
    for candidate in ("faster-whisper", "moonshine", "whisper-cpp", "deepgram", "parakeet"):
        if BACKENDS[candidate]["available"]():
            return candidate
    return "none"
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def resolve_backend_and_model(requested_backend: str | None, requested_model: str | None) -> tuple[str, str | None]:
    """Pick the effective (backend, model) pair for a transcription request.

    None/"auto" backend triggers auto-detection. An unknown or "none" backend
    is returned unchanged so the caller can report it. A missing model — or,
    in auto mode, a model the chosen backend does not list — falls back to
    that backend's default.
    """
    auto_mode = requested_backend in (None, "auto")
    backend = detect_backend() if auto_mode else requested_backend
    if backend == "none" or backend not in BACKENDS:
        return backend, requested_model

    info = BACKENDS[backend]
    default = info["default_model"]
    if requested_model in (None, ""):
        return backend, default
    if auto_mode and requested_model not in info["models"]:
        return backend, default
    return backend, requested_model
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def main():
    """CLI entry point: parse args, then dispatch to listing or transcription.

    All output — results and errors alike — is JSON on stdout so the calling
    process can parse it; failures exit with status 1.
    """
    parser = argparse.ArgumentParser(description="Local & cloud STT transcriber for pi-voice")
    parser.add_argument("audio_file", nargs="?", help="Path to audio file (WAV)")
    parser.add_argument("--backend", choices=list(BACKENDS.keys()), default=None)
    parser.add_argument("--model", default=None, help="Model name/size")
    parser.add_argument("--language", default="en", help="Language code (or 'auto')")
    parser.add_argument("--list-backends", action="store_true", help="List available backends")
    parser.add_argument("--list-models", action="store_true", help="List models for a backend")
    args = parser.parse_args()

    # --list-backends: JSON report of every backend's availability, models,
    # and install hint (hint is null once the backend is available).
    if args.list_backends:
        result = []
        for name, info in BACKENDS.items():
            available = info["available"]()
            installed_models = info["installed_models"]() if "installed_models" in info else []
            result.append({
                "name": name,
                "available": available,
                "type": info["type"],
                "default_model": info["default_model"],
                "install": info["install"] if not available else None,
                "install_detection": info.get("install_detection", "unknown"),
                "models": info["models"],
                "installed_models": installed_models,
            })
        print(json.dumps(result, indent=2))
        return

    # --list-models: model catalog for the requested (or auto-detected) backend.
    if args.list_models:
        backend = args.backend or detect_backend()
        if backend == "none" or backend not in BACKENDS:
            print(json.dumps({"error": f"Unknown backend: {backend}"}))
            sys.exit(1)
        print(json.dumps({
            "backend": backend,
            "models": BACKENDS[backend]["models"],
            "default": BACKENDS[backend]["default_model"],
        }, indent=2))
        return

    # Transcription mode requires an audio file argument.
    if not args.audio_file:
        parser.print_help()
        sys.exit(1)

    backend, model = resolve_backend_and_model(args.backend, args.model)
    if backend == "none":
        # Nothing installed at all: report every backend's install option.
        print(json.dumps({
            "error": "No STT backend found",
            "install_options": {name: info["install"] for name, info in BACKENDS.items()},
        }))
        sys.exit(1)

    info = BACKENDS[backend]

    try:
        result = info["fn"](args.audio_file, model, args.language)
        print(json.dumps(result))
    except Exception as e:
        # Any backend failure becomes a JSON error payload, exit code 1.
        print(json.dumps({"error": str(e), "backend": backend, "model": model}))
        sys.exit(1)
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|