dulus 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent.py +363 -0
- backend/__init__.py +63 -0
- backend/compressor.py +261 -0
- backend/context.py +329 -0
- backend/githook.py +166 -0
- backend/marketplace.py +141 -0
- backend/mempalace_bridge.py +182 -0
- backend/personas.py +297 -0
- backend/plugins.py +222 -0
- backend/server.py +411 -0
- backend/tasks.py +213 -0
- batch_api.py +307 -0
- checkpoint/__init__.py +27 -0
- checkpoint/hooks.py +90 -0
- checkpoint/store.py +314 -0
- checkpoint/types.py +80 -0
- claude_code_watcher.py +214 -0
- clipboard_utils.py +246 -0
- cloudsave.py +159 -0
- common.py +177 -0
- compaction.py +378 -0
- config.py +180 -0
- context.py +241 -0
- dulus-0.2.0.dist-info/METADATA +600 -0
- dulus-0.2.0.dist-info/RECORD +101 -0
- dulus-0.2.0.dist-info/WHEEL +5 -0
- dulus-0.2.0.dist-info/entry_points.txt +2 -0
- dulus-0.2.0.dist-info/licenses/LICENSE +674 -0
- dulus-0.2.0.dist-info/licenses/license_manager.py +187 -0
- dulus-0.2.0.dist-info/top_level.txt +36 -0
- dulus.py +8455 -0
- dulus_gui.py +331 -0
- dulus_mcp/__init__.py +43 -0
- dulus_mcp/client.py +546 -0
- dulus_mcp/config.py +133 -0
- dulus_mcp/tools.py +131 -0
- dulus_mcp/types.py +124 -0
- gui/__init__.py +18 -0
- gui/agent_bridge.py +283 -0
- gui/chat_widget.py +448 -0
- gui/main_window.py +485 -0
- gui/personas.py +230 -0
- gui/session_utils.py +189 -0
- gui/settings_dialog.py +146 -0
- gui/sidebar.py +515 -0
- gui/tasks_view.py +499 -0
- gui/themes.py +256 -0
- gui/tool_panel.py +94 -0
- input.py +1030 -0
- license_manager.py +187 -0
- memory/__init__.py +93 -0
- memory/audit.py +51 -0
- memory/consolidator.py +312 -0
- memory/context.py +270 -0
- memory/offload.py +148 -0
- memory/palace.py +127 -0
- memory/scan.py +146 -0
- memory/sessions.py +100 -0
- memory/store.py +395 -0
- memory/tools.py +408 -0
- memory/types.py +114 -0
- memory/vector_search.py +92 -0
- multi_agent/__init__.py +23 -0
- multi_agent/subagent.py +501 -0
- multi_agent/tools.py +393 -0
- offload_helper.py +183 -0
- plugin/__init__.py +22 -0
- plugin/autoadapter.py +1641 -0
- plugin/loader.py +156 -0
- plugin/recommend.py +211 -0
- plugin/store.py +387 -0
- plugin/types.py +147 -0
- providers.py +3750 -0
- skill/__init__.py +14 -0
- skill/builtin.py +100 -0
- skill/clawhub.py +270 -0
- skill/executor.py +66 -0
- skill/loader.py +199 -0
- skill/tools.py +110 -0
- skills.py +14 -0
- spinner.py +42 -0
- string_utils.py +42 -0
- subagent.py +11 -0
- task/__init__.py +12 -0
- task/store.py +199 -0
- task/tools.py +265 -0
- task/types.py +92 -0
- tmux_offloader.py +177 -0
- tmux_tools.py +410 -0
- tool_registry.py +214 -0
- tools.py +2694 -0
- ui/__init__.py +1 -0
- ui/input.py +464 -0
- ui/render.py +272 -0
- voice/__init__.py +56 -0
- voice/keyterms.py +179 -0
- voice/recorder.py +263 -0
- voice/stt.py +408 -0
- voice/tts.py +570 -0
- webchat.py +432 -0
- webchat_server.py +1761 -0
voice/recorder.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""Audio capture for voice input.
|
|
2
|
+
|
|
3
|
+
Backend priority (tried in order):
|
|
4
|
+
1. sounddevice — cross-platform, pure-Python wrapper around PortAudio.
|
|
5
|
+
Best option: works on macOS, Linux, Windows.
|
|
6
|
+
pip install sounddevice
|
|
7
|
+
2. arecord — Linux ALSA utility. No pip install needed.
|
|
8
|
+
3. sox rec — SoX command-line recorder. Supports silence detection.
|
|
9
|
+
sudo apt install sox / brew install sox
|
|
10
|
+
|
|
11
|
+
All backends capture raw PCM: 16 kHz, 16-bit signed little-endian, mono.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import io
|
|
17
|
+
import shutil
|
|
18
|
+
import subprocess
|
|
19
|
+
import threading
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
SAMPLE_RATE = 16000
|
|
23
|
+
CHANNELS = 1
|
|
24
|
+
DTYPE = "int16"
|
|
25
|
+
BYTES_PER_SAMPLE = 2 # int16
|
|
26
|
+
|
|
27
|
+
# Silence detection parameters
|
|
28
|
+
SILENCE_THRESHOLD_RMS = 0.012 # fraction of int16 max (0..1)
|
|
29
|
+
SILENCE_DURATION_SECS = 1.8 # stop after this many seconds of silence
|
|
30
|
+
CHUNK_SECS = 0.08 # 80 ms chunks for RMS poll
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _has_cmd(cmd: str) -> bool:
    """Tell whether an executable named *cmd* can be located on PATH."""
    resolved = shutil.which(cmd)
    return resolved is not None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ── Availability ──────────────────────────────────────────────────────────
|
|
38
|
+
|
|
39
|
+
def check_recording_availability() -> tuple[bool, str | None]:
    """Report whether any audio capture backend can be used.

    Returns:
        ``(True, None)`` when sounddevice imports cleanly or when the
        ``arecord`` or ``rec`` executable is on PATH; otherwise
        ``(False, message)`` where *message* lists the install options.
    """
    # ImportError: package not installed. OSError: PortAudio library missing.
    try:
        import sounddevice  # noqa: F401
    except (ImportError, OSError):
        sounddevice_ok = False
    else:
        sounddevice_ok = True

    # Command-line fallbacks are only probed when sounddevice is unusable:
    # ALSA's arecord first, then SoX's rec.
    if sounddevice_ok or _has_cmd("arecord") or _has_cmd("rec"):
        return True, None

    install_hint = (
        "No audio recording backend found.\n"
        "Install one of:\n"
        " pip install sounddevice (recommended, cross-platform)\n"
        " sudo apt install alsa-utils (Linux — provides arecord)\n"
        " sudo apt install sox / brew install sox (SoX rec)"
    )
    return False, install_hint
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ── sounddevice backend ───────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
def list_input_devices() -> list[dict]:
    """List the audio devices that expose at least one input channel.

    Returns:
        One ``{"index": ..., "name": ...}`` dict per input-capable device,
        where ``index`` is the device's position in ``sd.query_devices()``.
    """
    import sounddevice as sd

    return [
        {"index": position, "name": info["name"]}
        for position, info in enumerate(sd.query_devices())
        if info["max_input_channels"] > 0
    ]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _record_sounddevice(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
    device_index: "int | None" = None,
) -> bytes:
    """Capture microphone audio through sounddevice until silence or timeout.

    Audio is delivered to a stream callback in CHUNK_SECS-sized blocks. Each
    block is stored and its RMS level (normalised to 0..1) is compared with
    SILENCE_THRESHOLD_RMS.

    Args:
        max_seconds: Hard upper bound on the recording length.
        on_energy: Optional callable invoked with the RMS of every block.
        device_index: Optional device passed to ``sd.InputStream``; when
            ``None`` the ``device`` argument is not supplied at all.

    Returns:
        The concatenated raw PCM bytes of all captured blocks.
    """
    import sounddevice as sd
    import numpy as np

    chunk_samples = int(SAMPLE_RATE * CHUNK_SECS)
    silence_chunks_needed = int(SILENCE_DURATION_SECS / CHUNK_SECS)
    max_chunks = int(max_seconds / CHUNK_SECS)
    # Number of above-threshold blocks required before silence may end the take.
    min_speech_chunks = 3

    chunks: list[bytes] = []
    silence_count = 0
    speech_count = 0
    done_evt = threading.Event()

    def callback(indata: "np.ndarray", frames: int, time_info, status) -> None:
        nonlocal silence_count, speech_count
        mono = indata[:, 0].copy()
        chunks.append(mono.tobytes())

        # RMS energy (normalised 0..1)
        rms = float(np.sqrt(np.mean(mono.astype(np.float32) ** 2))) / 32768.0
        if on_energy:
            on_energy(rms)

        if rms < SILENCE_THRESHOLD_RMS:
            silence_count += 1
        else:
            silence_count = 0
            speech_count += 1

        # Only auto-stop on silence *after* real signal was heard. Counting
        # every block (as opposed to blocks with signal) would end the
        # recording when the user simply pauses before starting to speak.
        has_speech = speech_count >= min_speech_chunks
        silence_elapsed = has_speech and silence_count >= silence_chunks_needed
        if silence_elapsed or len(chunks) >= max_chunks:
            done_evt.set()
            raise sd.CallbackStop()

    stream_kwargs = dict(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype=DTYPE,
        blocksize=chunk_samples,
        callback=callback,
    )
    if device_index is not None:
        stream_kwargs["device"] = device_index

    with sd.InputStream(**stream_kwargs):
        # Small grace period on top of max_seconds in case the callback never
        # signals completion.
        done_evt.wait(timeout=max_seconds + 2)

    return b"".join(chunks)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ── arecord backend (Linux ALSA) ──────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
def _record_arecord(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
) -> bytes:
    """Record via arecord. Silence detection is done in Python on the piped PCM.

    Args:
        max_seconds: Passed to arecord as ``-d`` so the tool stops by itself.
        on_energy: Optional callable invoked with the normalised RMS (0..1)
            of every chunk read from the pipe.

    Returns:
        The concatenated raw PCM bytes (whole samples only).

    Raises:
        ImportError: If numpy is not installed.
    """
    import numpy as np

    cmd = [
        "arecord",
        "-f", "S16_LE",
        "-r", str(SAMPLE_RATE),
        "-c", str(CHANNELS),
        "-t", "raw",
        "-q",
        "-d", str(max_seconds),
        "-",
    ]

    chunk_bytes = int(SAMPLE_RATE * CHUNK_SECS) * BYTES_PER_SAMPLE
    silence_chunks_needed = int(SILENCE_DURATION_SECS / CHUNK_SECS)
    # Number of above-threshold chunks required before silence may end the take.
    min_speech_chunks = 3

    chunks: list[bytes] = []
    silence_count = 0
    speech_count = 0

    # The context manager closes the stdout pipe and reaps the child on exit,
    # including after a kill().
    with subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    ) as proc:
        try:
            while True:
                raw = proc.stdout.read(chunk_bytes)
                # Drop a trailing partial sample; np.frombuffer would raise
                # ValueError on a buffer that is not a multiple of 2 bytes.
                raw = raw[: len(raw) - (len(raw) % BYTES_PER_SAMPLE)]
                if not raw:
                    break
                chunks.append(raw)

                arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                rms = float(np.sqrt(np.mean(arr ** 2))) / 32768.0
                if on_energy:
                    on_energy(rms)

                if rms < SILENCE_THRESHOLD_RMS:
                    silence_count += 1
                else:
                    silence_count = 0
                    speech_count += 1

                # Stop on silence only once some real signal has been heard,
                # so a pause before speaking does not end the recording.
                has_speech = speech_count >= min_speech_chunks
                if has_speech and silence_count >= silence_chunks_needed:
                    break
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=2)
            except subprocess.TimeoutExpired:
                proc.kill()

    return b"".join(chunks)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ── SoX rec backend ───────────────────────────────────────────────────────
|
|
191
|
+
|
|
192
|
+
def _record_sox(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
) -> bytes:
    """Capture audio with the SoX ``rec`` tool, using its ``silence`` effect.

    The silence handling is delegated entirely to SoX, so ``on_energy`` is
    accepted for signature parity with the other backends but never called.

    Args:
        max_seconds: Timeout applied to the ``rec`` subprocess.
        on_energy: Unused by this backend.

    Returns:
        Whatever ``rec`` wrote to stdout; on timeout, the output captured so
        far (or ``b""`` when none is available).
    """
    threshold = "3%"
    lead_in = "0.1"
    trail_out = str(SILENCE_DURATION_SECS)

    # Raw signed 16-bit PCM at the module's sample rate and channel count.
    output_format = [
        "-t", "raw",
        "-r", str(SAMPLE_RATE),
        "-e", "signed",
        "-b", "16",
        "-c", str(CHANNELS),
    ]
    silence_effect = [
        "silence",
        "1", lead_in, threshold,
        "1", trail_out, threshold,
    ]
    cmd = ["rec", "-q", "--buffer", "1024", *output_format, "-", *silence_effect]

    # max_seconds is enforced through the subprocess timeout.
    try:
        completed = subprocess.run(cmd, capture_output=True, timeout=max_seconds)
    except subprocess.TimeoutExpired as timed_out:
        return timed_out.stdout or b""
    return completed.stdout
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ── Public entry point ────────────────────────────────────────────────────
|
|
229
|
+
|
|
230
|
+
def record_until_silence(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
    device_index: "int | None" = None,
) -> bytes:
    """Record from microphone until silence or max_seconds.

    Tries backends in order: sounddevice → arecord → sox rec. The arecord
    backend needs numpy for its silence detection; when numpy is missing the
    function falls through to SoX instead of calling arecord.

    Args:
        max_seconds: Upper bound on the recording length.
        on_energy: Optional callable receiving per-chunk RMS levels (the SoX
            backend accepts but does not use it).
        device_index: Input device, honoured by the sounddevice backend only.

    Returns:
        Raw PCM bytes: int16, 16 kHz, mono.

    Raises:
        RuntimeError: If no usable backend is available.
    """
    try:
        import sounddevice  # noqa: F401
        return _record_sounddevice(
            max_seconds=max_seconds,
            on_energy=on_energy,
            device_index=device_index,
        )
    except (ImportError, OSError):
        pass

    arecord_lacks_numpy = False
    if _has_cmd("arecord"):
        try:
            import numpy  # noqa: F401
        except ImportError:
            # _record_arecord imports numpy itself, so calling it here would
            # only raise ImportError again. Fall through to SoX instead.
            arecord_lacks_numpy = True
        else:
            return _record_arecord(max_seconds=max_seconds, on_energy=on_energy)

    if _has_cmd("rec"):
        return _record_sox(max_seconds=max_seconds, on_energy=on_energy)

    if arecord_lacks_numpy:
        raise RuntimeError(
            "arecord is installed but its silence detection needs numpy.\n"
            "Install numpy: pip install numpy\n"
            "Or install SoX: sudo apt install sox"
        )

    raise RuntimeError(
        "No audio recording backend found.\n"
        "Install sounddevice: pip install sounddevice\n"
        "Or install arecord: sudo apt install alsa-utils\n"
        "Or install SoX: sudo apt install sox"
    )
|
voice/stt.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""Speech-to-text (STT) backends.
|
|
2
|
+
|
|
3
|
+
Backend priority (tried in order):
|
|
4
|
+
1. NVIDIA Riva — cloud, whisper-large-v3 via gRPC, needs NVIDIA_API_KEY.
|
|
5
|
+
pip install nvidia-riva-client
|
|
6
|
+
2. faster-whisper — local, offline, fast, best for coding vocab.
|
|
7
|
+
pip install faster-whisper
|
|
8
|
+
3. openai-whisper — local, offline, original OpenAI Whisper library.
|
|
9
|
+
pip install openai-whisper
|
|
10
|
+
4. OpenAI Whisper API — cloud, needs OPENAI_API_KEY.
|
|
11
|
+
pip install openai (already in requirements)
|
|
12
|
+
|
|
13
|
+
All backends receive raw PCM (int16, 16 kHz, mono) and return a text string.
|
|
14
|
+
Keyterms are passed as initial_prompt to local Whisper backends so that
|
|
15
|
+
coding-domain vocabulary (grep, MCP, TypeScript, …) is recognised correctly.
|
|
16
|
+
Riva does not accept initial_prompt; keyterms are ignored on that path.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import io
|
|
22
|
+
import os
|
|
23
|
+
import struct
|
|
24
|
+
import tempfile
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import List, Optional
|
|
27
|
+
|
|
28
|
+
from .recorder import SAMPLE_RATE, CHANNELS, BYTES_PER_SAMPLE
|
|
29
|
+
|
|
30
|
+
# ── Cached model handles ──────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
_faster_whisper_model = None
|
|
33
|
+
_openai_whisper_model = None
|
|
34
|
+
|
|
35
|
+
# Model size: "tiny", "base", "small", "medium", "large-v2", "large-v3"
|
|
36
|
+
# The default below is "medium"; a smaller size such as "base" trades accuracy for speed.
|
|
37
|
+
# Override with env var DULUS_WHISPER_MODEL.
|
|
38
|
+
DEFAULT_MODEL_SIZE = os.environ.get("DULUS_WHISPER_MODEL", "medium")
|
|
39
|
+
|
|
40
|
+
# ── NVIDIA Riva (whisper-large-v3 via NVCF gRPC) ─────────────────────────
|
|
41
|
+
RIVA_SERVER = os.environ.get("DULUS_RIVA_SERVER", "grpc.nvcf.nvidia.com:443")
|
|
42
|
+
RIVA_FUNCTION_ID = os.environ.get("DULUS_RIVA_FUNCTION_ID",
|
|
43
|
+
"b702f636-f60c-4a3d-a6f4-f3568c13bd7d")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _riva_available() -> bool:
    """Return True only when NVIDIA_API_KEY is set and ``riva.client`` imports."""
    has_key = bool(os.environ.get("NVIDIA_API_KEY"))
    if has_key:
        # The key alone is not enough; the client library must be importable.
        try:
            import riva.client  # noqa: F401
        except ImportError:
            return False
        return True
    return False
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _transcribe_nvidia_riva(
    pcm_bytes: bytes,
    language: Optional[str],
    translate: bool = False,
) -> str:
    """Send audio to the NVIDIA NVCF Riva ASR service and return the text.

    The raw PCM is wrapped in a WAV container before upload.

    Args:
        pcm_bytes: Raw PCM audio to transcribe.
        language: Language code; ``None``, empty or ``"auto"`` is sent as
            ``"multi"``.
        translate: When true, adds the custom configuration
            ``"task:translate"`` to the request.

    Returns:
        The first alternative of every result, joined with single spaces.

    Raises:
        KeyError: If NVIDIA_API_KEY is not set in the environment.
    """
    import riva.client

    metadata = [
        ("function-id", RIVA_FUNCTION_ID),
        ("authorization", f"Bearer {os.environ['NVIDIA_API_KEY']}"),
    ]
    # Positional arguments: ssl_cert, use_ssl, server address, call metadata.
    auth = riva.client.Auth(None, True, RIVA_SERVER, metadata)
    service = riva.client.ASRService(auth)

    if language and language != "auto":
        lang_code = language
    else:
        lang_code = "multi"

    config = riva.client.RecognitionConfig(
        encoding=riva.client.AudioEncoding.LINEAR_PCM,
        sample_rate_hertz=SAMPLE_RATE,
        audio_channel_count=CHANNELS,
        language_code=lang_code,
        max_alternatives=1,
        enable_automatic_punctuation=True,
    )
    if translate:
        riva.client.add_custom_configuration_to_config(config, "task:translate")

    response = service.offline_recognize(_pcm_to_wav(pcm_bytes), config)
    transcripts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return " ".join(transcripts).strip()
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ── OGG/audio file → PCM conversion ──────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
def _audio_file_to_pcm(audio_bytes: bytes, suffix: str = ".ogg") -> bytes:
    """Convert an audio file (OGG, MP3, etc.) to raw int16 PCM via ffmpeg.

    The bytes are written to a temporary file, which is always removed again,
    and ffmpeg streams the converted audio to stdout.

    Args:
        audio_bytes: Content of the encoded audio file.
        suffix: File extension given to the temporary input file.

    Returns:
        Raw PCM at SAMPLE_RATE / CHANNELS as produced by ffmpeg.

    Raises:
        RuntimeError: If ffmpeg exits non-zero or exceeds the 30 s timeout.
        FileNotFoundError: If the ffmpeg executable is not installed.
    """
    import subprocess

    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_in = tmp.name
    try:
        # Write inside the try so a failed write still removes the file.
        with tmp:
            tmp.write(audio_bytes)
        try:
            r = subprocess.run(
                ["ffmpeg", "-y", "-i", tmp_in, "-f", "s16le", "-ar", str(SAMPLE_RATE),
                 "-ac", str(CHANNELS), "-acodec", "pcm_s16le", "-"],
                capture_output=True, timeout=30,
            )
        except subprocess.TimeoutExpired as exc:
            # Surface timeouts as RuntimeError, the failure type callers handle.
            raise RuntimeError("ffmpeg failed: timed out after 30s") from exc
        if r.returncode != 0:
            # stderr is bytes; decode it and keep the tail, where ffmpeg
            # reports the actual error after its banner.
            detail = r.stderr.decode("utf-8", errors="replace").strip()
            raise RuntimeError(f"ffmpeg failed: {detail[-200:]}")
        return r.stdout
    finally:
        Path(tmp_in).unlink(missing_ok=True)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# ── WAV helper ────────────────────────────────────────────────────────────
|
|
122
|
+
|
|
123
|
+
def _pcm_to_wav(pcm_bytes: bytes) -> bytes:
    """Wrap raw int16 PCM in a minimal 44-byte-header WAV container.

    Args:
        pcm_bytes: Raw PCM samples at SAMPLE_RATE / CHANNELS / BYTES_PER_SAMPLE.
            A trailing partial frame is dropped so the data chunk stays
            frame-aligned.

    Returns:
        The RIFF/WAVE header followed by the PCM payload.
    """
    block_align = CHANNELS * BYTES_PER_SAMPLE
    byte_rate = SAMPLE_RATE * block_align

    # Keep whole frames only; a dangling byte would give an odd-sized chunk.
    data_size = len(pcm_bytes) - (len(pcm_bytes) % block_align)
    payload = pcm_bytes[:data_size]

    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + data_size,        # RIFF chunk size: rest of header + data
        b"WAVE",
        b"fmt ",
        16,                    # fmt chunk size
        1,                     # PCM format
        CHANNELS,
        SAMPLE_RATE,
        byte_rate,
        block_align,
        BYTES_PER_SAMPLE * 8,  # bits per sample
        b"data",
        data_size,
    )
    return header + payload
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ── Availability ──────────────────────────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
def check_stt_availability() -> tuple[bool, str | None]:
    """Report whether any speech-to-text backend can be used.

    Checks, in order: Riva, faster-whisper, openai-whisper, then the presence
    of OPENAI_API_KEY.

    Returns:
        ``(True, None)`` for the first usable backend, otherwise
        ``(False, message)`` with install instructions.
    """
    if _riva_available():
        return True, None

    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return True, None

    try:
        import whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return True, None

    if os.environ.get("OPENAI_API_KEY"):
        return True, None

    install_hint = (
        "No STT backend available.\n"
        "Install one of:\n"
        " pip install nvidia-riva-client (cloud, whisper-large-v3 — set NVIDIA_API_KEY)\n"
        " pip install faster-whisper (local, recommended)\n"
        " pip install openai-whisper (local, original)\n"
        " Set OPENAI_API_KEY to use the OpenAI Whisper cloud API"
    )
    return False, install_hint
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def get_stt_backend_name() -> str:
    """Name the STT backend that would be picked, or ``"(none)"``.

    Uses the same probing order as the availability check: Riva,
    faster-whisper, openai-whisper, OpenAI API key.
    """
    if _riva_available():
        return "NVIDIA Riva (whisper-large-v3, cloud)"

    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return f"faster-whisper ({DEFAULT_MODEL_SIZE})"

    try:
        import whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return f"openai-whisper ({DEFAULT_MODEL_SIZE})"

    api_key = os.environ.get("OPENAI_API_KEY")
    return "OpenAI Whisper API" if api_key else "(none)"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ── faster-whisper ────────────────────────────────────────────────────────
|
|
197
|
+
|
|
198
|
+
def _get_faster_whisper_model():
    """Return the module-wide faster-whisper model, creating it on first use.

    The model is built with DEFAULT_MODEL_SIZE and cached in
    ``_faster_whisper_model``.
    """
    global _faster_whisper_model
    if _faster_whisper_model is not None:
        return _faster_whisper_model

    from faster_whisper import WhisperModel

    # float16 on CUDA when a GPU is detected, int8 on CPU otherwise.
    if _has_cuda():
        device, compute = "cuda", "float16"
    else:
        device, compute = "cpu", "int8"

    _faster_whisper_model = WhisperModel(
        DEFAULT_MODEL_SIZE,
        device=device,
        compute_type=compute,
    )
    return _faster_whisper_model
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _has_cuda() -> bool:
    """Detect whether a CUDA device is available.

    torch is consulted first when installed. Otherwise CTranslate2 is asked
    how many CUDA devices it can see. Any failure of the CTranslate2 probe is
    treated as "no CUDA".
    """
    try:
        import torch
        return bool(torch.cuda.is_available())
    except ImportError:
        pass
    try:
        import ctranslate2
        # get_supported_compute_types() returns compute-type names such as
        # "float16", never "cuda", so count devices instead.
        return ctranslate2.get_cuda_device_count() > 0
    except Exception:
        return False
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _transcribe_faster_whisper(
    pcm_bytes: bytes,
    keyterms: List[str],
    language: Optional[str],
) -> str:
    """Transcribe raw PCM with the cached faster-whisper model.

    Args:
        pcm_bytes: Raw int16 PCM; a trailing partial sample is ignored.
        keyterms: Vocabulary hints turned into the ``initial_prompt``.
        language: Language code; ``None``, empty or ``"auto"`` leaves the
            language unspecified.

    Returns:
        The text of all segments joined with spaces, stripped.
    """
    import numpy as np

    model = _get_faster_whisper_model()

    # Whole samples only: np.frombuffer raises ValueError on an odd length.
    usable = len(pcm_bytes) - (len(pcm_bytes) % BYTES_PER_SAMPLE)
    # Convert int16 PCM to float32 normalised array
    audio = np.frombuffer(pcm_bytes[:usable], dtype=np.int16).astype(np.float32) / 32768.0

    # Pass None rather than "" when there are no keyterms, matching the
    # openai-whisper path, which omits the prompt entirely in that case.
    initial_prompt = _keyterms_to_prompt(keyterms) or None
    lang = None if not language or language == "auto" else language

    segments, _info = model.transcribe(
        audio,
        language=lang,
        initial_prompt=initial_prompt,
        vad_filter=True,  # skip silent regions
        vad_parameters=dict(
            min_silence_duration_ms=300,
        ),
    )
    return " ".join(seg.text for seg in segments).strip()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ── openai-whisper ────────────────────────────────────────────────────────
|
|
254
|
+
|
|
255
|
+
def _get_openai_whisper_model():
    """Return the module-wide openai-whisper model, loading it on first use.

    The model is loaded with DEFAULT_MODEL_SIZE and cached in
    ``_openai_whisper_model``.
    """
    global _openai_whisper_model
    if _openai_whisper_model is not None:
        return _openai_whisper_model

    import whisper

    _openai_whisper_model = whisper.load_model(DEFAULT_MODEL_SIZE)
    return _openai_whisper_model
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _transcribe_openai_whisper(
    pcm_bytes: bytes,
    keyterms: List[str],
    language: Optional[str],
) -> str:
    """Transcribe raw PCM with the cached openai-whisper model.

    Args:
        pcm_bytes: Raw int16 PCM.
        keyterms: Vocabulary hints; passed as ``initial_prompt`` when non-empty.
        language: Language code; passed through unless empty or ``"auto"``.

    Returns:
        The ``"text"`` field of the result, stripped (empty if absent).
    """
    import numpy as np

    model = _get_openai_whisper_model()

    # int16 samples scaled into float32
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    audio = samples.astype(np.float32) / 32768.0

    # Only include the options that actually carry a value.
    options: dict = {}
    prompt = _keyterms_to_prompt(keyterms)
    if prompt:
        options["initial_prompt"] = prompt
    if language and language != "auto":
        options["language"] = language

    outcome = model.transcribe(audio, **options)
    return outcome.get("text", "").strip()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
# ── OpenAI Whisper API ────────────────────────────────────────────────────
|
|
283
|
+
|
|
284
|
+
def _transcribe_openai_api(
    pcm_bytes: bytes,
    language: Optional[str],
) -> str:
    """Transcribe raw PCM through the OpenAI transcription API (``whisper-1``).

    The PCM is wrapped in a WAV container and uploaded as ``audio.wav``.

    Args:
        pcm_bytes: Raw int16 PCM.
        language: Language code; sent only when set and not ``"auto"``.

    Returns:
        The transcript text, stripped.
    """
    from openai import OpenAI

    upload = ("audio.wav", io.BytesIO(_pcm_to_wav(pcm_bytes)), "audio/wav")
    request: dict = {"model": "whisper-1", "file": upload}
    if language and language != "auto":
        request["language"] = language

    client = OpenAI()  # uses OPENAI_API_KEY from env
    response = client.audio.transcriptions.create(**request)
    return response.text.strip()
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ── Keyterms → prompt ─────────────────────────────────────────────────────
|
|
302
|
+
|
|
303
|
+
def _keyterms_to_prompt(keyterms: List[str]) -> str:
    """Build a Whisper ``initial_prompt`` string from vocabulary hints.

    Only the first 40 terms are used, joined with ``", "``, to keep the
    prompt short. An empty or missing list yields an empty string.
    """
    selected = keyterms[:40] if keyterms else []
    return ", ".join(selected)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ── Public entry point ────────────────────────────────────────────────────
|
|
316
|
+
|
|
317
|
+
def transcribe(
    pcm_bytes: bytes,
    keyterms: Optional[List[str]] = None,
    language: str = "auto",
) -> str:
    """Turn raw PCM audio into text using the first backend that works.

    Backends are tried in this order: NVIDIA Riva (when configured),
    faster-whisper, openai-whisper, and finally the OpenAI API when
    OPENAI_API_KEY is set. A Riva failure is printed and the local backends
    are tried next.

    Args:
        pcm_bytes: Raw int16 PCM, 16 kHz, mono.
        keyterms: Vocabulary hints handed to the local Whisper backends.
        language: Language code, or ``'auto'`` for detection.

    Returns:
        The transcribed text; an empty string when ``pcm_bytes`` is empty.

    Raises:
        RuntimeError: If none of the backends is available.
    """
    if not pcm_bytes:
        return ""

    terms = keyterms if keyterms else []
    lang = language if language != "auto" else None

    # Cloud Riva first, when configured. Any error here (network, quota,
    # auth, ...) is reported and the local backends take over.
    if _riva_available():
        try:
            return _transcribe_nvidia_riva(pcm_bytes, lang)
        except Exception as e:
            print(f" [STT] Riva failed, falling back: {e}")

    def _via_faster_whisper() -> str:
        import faster_whisper  # noqa: F401
        return _transcribe_faster_whisper(pcm_bytes, terms, lang)

    def _via_openai_whisper() -> str:
        import whisper  # noqa: F401
        return _transcribe_openai_whisper(pcm_bytes, terms, lang)

    # Local backends: an ImportError means "not installed", so move on.
    for attempt in (_via_faster_whisper, _via_openai_whisper):
        try:
            return attempt()
        except ImportError:
            continue

    # Cloud API as the last resort.
    if os.environ.get("OPENAI_API_KEY"):
        return _transcribe_openai_api(pcm_bytes, lang)

    raise RuntimeError(
        "No STT backend available.\n"
        "Install nvidia-riva-client (set NVIDIA_API_KEY), faster-whisper,\n"
        "or set OPENAI_API_KEY to use the OpenAI Whisper cloud API."
    )
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def transcribe_audio_file(
    audio_bytes: bytes,
    suffix: str = ".ogg",
    language: str = "auto",
) -> str:
    """Transcribe an audio file (OGG, MP3, etc.) to text.

    The file is first converted to PCM via ffmpeg and run through the regular
    STT pipeline. When the conversion fails (ffmpeg missing, erroring or
    timing out) or the pipeline raises RuntimeError, the original file is
    uploaded to the OpenAI Whisper API instead, if OPENAI_API_KEY is set.

    Args:
        audio_bytes: Content of the encoded audio file.
        suffix: File extension of the audio, including the leading dot.
        language: Language code, or ``'auto'`` for detection.

    Returns:
        The transcribed text.

    Raises:
        RuntimeError: If neither path can transcribe the file.
    """
    import mimetypes
    import subprocess

    # Try ffmpeg conversion → STT pipeline. A timeout counts as a conversion
    # failure as well, so the cloud fallback still gets its chance.
    try:
        pcm = _audio_file_to_pcm(audio_bytes, suffix)
    except (RuntimeError, FileNotFoundError, subprocess.TimeoutExpired):
        pcm = None

    if pcm is not None:
        try:
            return transcribe(pcm, language=language)
        except RuntimeError:
            pass  # STT pipeline failed, fall through to cloud API

    # Fallback: upload the original file to the OpenAI Whisper API.
    if os.environ.get("OPENAI_API_KEY"):
        from openai import OpenAI

        filename = f"audio{suffix}"
        # Derive the MIME type from the suffix instead of always claiming OGG;
        # keep "audio/ogg" as the default for unknown extensions.
        content_type = mimetypes.guess_type(filename)[0] or "audio/ogg"

        client = OpenAI()
        kwargs: dict = {
            "model": "whisper-1",
            "file": (filename, io.BytesIO(audio_bytes), content_type),
        }
        lang = None if language == "auto" else language
        if lang:
            kwargs["language"] = lang
        transcript = client.audio.transcriptions.create(**kwargs)
        return transcript.text.strip()

    raise RuntimeError(
        "Cannot transcribe audio file.\n"
        "Install ffmpeg for local conversion, or set OPENAI_API_KEY for cloud STT."
    )
|