dulus 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. agent.py +363 -0
  2. backend/__init__.py +63 -0
  3. backend/compressor.py +261 -0
  4. backend/context.py +329 -0
  5. backend/githook.py +166 -0
  6. backend/marketplace.py +141 -0
  7. backend/mempalace_bridge.py +182 -0
  8. backend/personas.py +297 -0
  9. backend/plugins.py +222 -0
  10. backend/server.py +411 -0
  11. backend/tasks.py +213 -0
  12. batch_api.py +307 -0
  13. checkpoint/__init__.py +27 -0
  14. checkpoint/hooks.py +90 -0
  15. checkpoint/store.py +314 -0
  16. checkpoint/types.py +80 -0
  17. claude_code_watcher.py +214 -0
  18. clipboard_utils.py +246 -0
  19. cloudsave.py +159 -0
  20. common.py +177 -0
  21. compaction.py +378 -0
  22. config.py +180 -0
  23. context.py +241 -0
  24. dulus-0.2.0.dist-info/METADATA +600 -0
  25. dulus-0.2.0.dist-info/RECORD +101 -0
  26. dulus-0.2.0.dist-info/WHEEL +5 -0
  27. dulus-0.2.0.dist-info/entry_points.txt +2 -0
  28. dulus-0.2.0.dist-info/licenses/LICENSE +674 -0
  29. dulus-0.2.0.dist-info/licenses/license_manager.py +187 -0
  30. dulus-0.2.0.dist-info/top_level.txt +36 -0
  31. dulus.py +8455 -0
  32. dulus_gui.py +331 -0
  33. dulus_mcp/__init__.py +43 -0
  34. dulus_mcp/client.py +546 -0
  35. dulus_mcp/config.py +133 -0
  36. dulus_mcp/tools.py +131 -0
  37. dulus_mcp/types.py +124 -0
  38. gui/__init__.py +18 -0
  39. gui/agent_bridge.py +283 -0
  40. gui/chat_widget.py +448 -0
  41. gui/main_window.py +485 -0
  42. gui/personas.py +230 -0
  43. gui/session_utils.py +189 -0
  44. gui/settings_dialog.py +146 -0
  45. gui/sidebar.py +515 -0
  46. gui/tasks_view.py +499 -0
  47. gui/themes.py +256 -0
  48. gui/tool_panel.py +94 -0
  49. input.py +1030 -0
  50. license_manager.py +187 -0
  51. memory/__init__.py +93 -0
  52. memory/audit.py +51 -0
  53. memory/consolidator.py +312 -0
  54. memory/context.py +270 -0
  55. memory/offload.py +148 -0
  56. memory/palace.py +127 -0
  57. memory/scan.py +146 -0
  58. memory/sessions.py +100 -0
  59. memory/store.py +395 -0
  60. memory/tools.py +408 -0
  61. memory/types.py +114 -0
  62. memory/vector_search.py +92 -0
  63. multi_agent/__init__.py +23 -0
  64. multi_agent/subagent.py +501 -0
  65. multi_agent/tools.py +393 -0
  66. offload_helper.py +183 -0
  67. plugin/__init__.py +22 -0
  68. plugin/autoadapter.py +1641 -0
  69. plugin/loader.py +156 -0
  70. plugin/recommend.py +211 -0
  71. plugin/store.py +387 -0
  72. plugin/types.py +147 -0
  73. providers.py +3750 -0
  74. skill/__init__.py +14 -0
  75. skill/builtin.py +100 -0
  76. skill/clawhub.py +270 -0
  77. skill/executor.py +66 -0
  78. skill/loader.py +199 -0
  79. skill/tools.py +110 -0
  80. skills.py +14 -0
  81. spinner.py +42 -0
  82. string_utils.py +42 -0
  83. subagent.py +11 -0
  84. task/__init__.py +12 -0
  85. task/store.py +199 -0
  86. task/tools.py +265 -0
  87. task/types.py +92 -0
  88. tmux_offloader.py +177 -0
  89. tmux_tools.py +410 -0
  90. tool_registry.py +214 -0
  91. tools.py +2694 -0
  92. ui/__init__.py +1 -0
  93. ui/input.py +464 -0
  94. ui/render.py +272 -0
  95. voice/__init__.py +56 -0
  96. voice/keyterms.py +179 -0
  97. voice/recorder.py +263 -0
  98. voice/stt.py +408 -0
  99. voice/tts.py +570 -0
  100. webchat.py +432 -0
  101. webchat_server.py +1761 -0
voice/recorder.py ADDED
@@ -0,0 +1,263 @@
1
+ """Audio capture for voice input.
2
+
3
+ Backend priority (tried in order):
4
+ 1. sounddevice — cross-platform, pure-Python wrapper around PortAudio.
5
+ Best option: works on macOS, Linux, Windows.
6
+ pip install sounddevice
7
+ 2. arecord — Linux ALSA utility. No pip install needed.
8
+ 3. sox rec — SoX command-line recorder. Supports silence detection.
9
+ sudo apt install sox / brew install sox
10
+
11
+ All backends capture raw PCM: 16 kHz, 16-bit signed little-endian, mono.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import io
17
+ import shutil
18
+ import subprocess
19
+ import threading
20
+ from pathlib import Path
21
+
22
+ SAMPLE_RATE = 16000
23
+ CHANNELS = 1
24
+ DTYPE = "int16"
25
+ BYTES_PER_SAMPLE = 2 # int16
26
+
27
+ # Silence detection parameters
28
+ SILENCE_THRESHOLD_RMS = 0.012 # fraction of int16 max (0..1)
29
+ SILENCE_DURATION_SECS = 1.8 # stop after this many seconds of silence
30
+ CHUNK_SECS = 0.08 # 80 ms chunks for RMS poll
31
+
32
+
33
def _has_cmd(cmd: str) -> bool:
    """Report whether *cmd* resolves to an executable via ``shutil.which``."""
    located = shutil.which(cmd)
    return located is not None
35
+
36
+
37
+ # ── Availability ──────────────────────────────────────────────────────────
38
+
39
def check_recording_availability() -> tuple[bool, str | None]:
    """Tell whether at least one audio capture backend can be used.

    Returns:
        ``(True, None)`` when ``sounddevice`` imports cleanly, or when the
        ``arecord`` or ``rec`` executable is found; otherwise ``(False, msg)``
        where *msg* lists the install options.
    """
    # ImportError = package not installed; OSError = PortAudio library missing.
    try:
        import sounddevice  # noqa: F401
    except (ImportError, OSError):
        sounddevice_ok = False
    else:
        sounddevice_ok = True

    # Command-line fallbacks, probed in priority order: ALSA arecord, SoX rec.
    if sounddevice_ok or any(_has_cmd(tool) for tool in ("arecord", "rec")):
        return True, None

    reason = (
        "No audio recording backend found.\n"
        "Install one of:\n"
        " pip install sounddevice (recommended, cross-platform)\n"
        " sudo apt install alsa-utils (Linux — provides arecord)\n"
        " sudo apt install sox / brew install sox (SoX rec)"
    )
    return False, reason
63
+
64
+
65
+ # ── sounddevice backend ───────────────────────────────────────────────────
66
+
67
def list_input_devices() -> list[dict]:
    """Enumerate the capture-capable devices reported by sounddevice.

    Returns:
        One ``{"index": ..., "name": ...}`` dict per entry of
        ``sd.query_devices()`` whose ``max_input_channels`` is positive;
        ``index`` is the entry's position in that listing.
    """
    import sounddevice as sd

    return [
        {"index": position, "name": info["name"]}
        for position, info in enumerate(sd.query_devices())
        if info["max_input_channels"] > 0
    ]
76
+
77
+
78
def _record_sounddevice(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
    device_index: "int | None" = None,
) -> bytes:
    """Capture microphone audio through sounddevice until silence or timeout.

    An ``sd.InputStream`` is opened with SAMPLE_RATE / CHANNELS / DTYPE and a
    block size of one CHUNK_SECS chunk. The stream callback stores each chunk,
    computes its normalised RMS and decides when to stop.

    Args:
        max_seconds: Upper bound on the recording length.
        on_energy: Optional callable invoked with each chunk's RMS (0..1).
        device_index: Optional device passed to the stream as ``device``.

    Returns:
        The concatenated raw PCM bytes of all captured chunks.
    """
    import sounddevice as sd
    import numpy as np

    chunk_samples = int(SAMPLE_RATE * CHUNK_SECS)
    silence_chunks_needed = int(SILENCE_DURATION_SECS / CHUNK_SECS)
    max_chunks = int(max_seconds / CHUNK_SECS)

    chunks: list[bytes] = []
    silence_count = 0  # consecutive chunks below the silence threshold
    speech_count = 0   # total chunks at or above the silence threshold
    done_evt = threading.Event()

    def callback(indata: "np.ndarray", frames: int, time_info, status) -> None:
        nonlocal silence_count, speech_count
        mono = indata[:, 0].copy()
        chunks.append(mono.tobytes())

        # RMS energy (normalised 0..1)
        rms = float(np.sqrt(np.mean(mono.astype(np.float32) ** 2))) / 32768.0
        if on_energy:
            on_energy(rms)

        if rms < SILENCE_THRESHOLD_RMS:
            silence_count += 1
        else:
            silence_count = 0
            speech_count += 1

        # Only auto-stop on silence *after* at least 3 chunks carried signal.
        # Counting every chunk here would end the recording during the quiet
        # lead-in, before the speaker has said anything.
        has_speech = speech_count >= 3
        heard_trailing_silence = has_speech and silence_count >= silence_chunks_needed
        if heard_trailing_silence or len(chunks) >= max_chunks:
            done_evt.set()
            raise sd.CallbackStop()

    stream_kwargs = dict(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype=DTYPE,
        blocksize=chunk_samples,
        callback=callback,
    )
    if device_index is not None:
        stream_kwargs["device"] = device_index
    with sd.InputStream(**stream_kwargs):
        # Small grace period on top of max_seconds so the callback's own
        # max_chunks limit normally fires first.
        done_evt.wait(timeout=max_seconds + 2)

    return b"".join(chunks)
131
+
132
+
133
+ # ── arecord backend (Linux ALSA) ──────────────────────────────────────────
134
+
135
def _record_arecord(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
) -> bytes:
    """Record via arecord. Silence detection done in Python on the piped PCM.

    arecord is started with raw S16_LE output on stdout and ``-d max_seconds``
    as its duration limit. The pipe is read one CHUNK_SECS chunk at a time and
    each chunk's normalised RMS drives the stop decision.

    Args:
        max_seconds: Value passed to arecord's ``-d`` option.
        on_energy: Optional callable invoked with each chunk's RMS (0..1).

    Returns:
        The concatenated raw PCM bytes (whole int16 samples only).

    Raises:
        ImportError: If numpy is not installed.
    """
    import numpy as np

    cmd = [
        "arecord",
        "-f", "S16_LE",
        "-r", str(SAMPLE_RATE),
        "-c", str(CHANNELS),
        "-t", "raw",
        "-q",
        "-d", str(max_seconds),
        "-",
    ]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    chunk_bytes = int(SAMPLE_RATE * CHUNK_SECS) * BYTES_PER_SAMPLE
    silence_chunks_needed = int(SILENCE_DURATION_SECS / CHUNK_SECS)

    chunks: list[bytes] = []
    silence_count = 0  # consecutive chunks below the silence threshold
    speech_count = 0   # total chunks at or above the silence threshold

    try:
        while True:
            raw = proc.stdout.read(chunk_bytes)
            if not raw:
                break

            # The last read before EOF may be short; drop a dangling partial
            # sample so np.frombuffer (and downstream consumers) only ever see
            # whole int16 values.
            usable = len(raw) - (len(raw) % BYTES_PER_SAMPLE)
            if not usable:
                break
            raw = raw[:usable]
            chunks.append(raw)

            arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
            rms = float(np.sqrt(np.mean(arr ** 2))) / 32768.0
            if on_energy:
                on_energy(rms)

            if rms < SILENCE_THRESHOLD_RMS:
                silence_count += 1
            else:
                silence_count = 0
                speech_count += 1

            # Require real signal before honouring trailing silence; counting
            # every chunk would stop during the quiet lead-in.
            has_speech = speech_count >= 3
            if has_speech and silence_count >= silence_chunks_needed:
                break
    finally:
        proc.terminate()
        try:
            proc.wait(timeout=2)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the killed process so it does not linger
        proc.stdout.close()

    return b"".join(chunks)
188
+
189
+
190
+ # ── SoX rec backend ───────────────────────────────────────────────────────
191
+
192
def _record_sox(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
) -> bytes:
    """Capture raw PCM with the SoX ``rec`` command.

    Silence handling is delegated to SoX's ``silence`` effect, configured with
    a 3% threshold, a 0.1 s leading duration and SILENCE_DURATION_SECS as the
    trailing duration. ``on_energy`` is accepted for signature parity with the
    other backends but is never called here.

    Returns:
        Whatever ``rec`` wrote to stdout, or the partial output (possibly
        empty) if the process had to be cut off after *max_seconds*.
    """
    threshold = "3%"
    lead_in = "0.1"
    trailing = str(SILENCE_DURATION_SECS)

    output_format = [
        "-q",
        "--buffer", "1024",
        "-t", "raw",
        "-r", str(SAMPLE_RATE),
        "-e", "signed",
        "-b", "16",
        "-c", str(CHANNELS),
        "-",
    ]
    silence_effect = ["silence", "1", lead_in, threshold, "1", trailing, threshold]
    argv = ["rec", *output_format, *silence_effect]

    # max_seconds is enforced through the subprocess timeout.
    try:
        finished = subprocess.run(argv, capture_output=True, timeout=max_seconds)
    except subprocess.TimeoutExpired as expired:
        return expired.stdout or b""
    return finished.stdout
226
+
227
+
228
+ # ── Public entry point ────────────────────────────────────────────────────
229
+
230
def record_until_silence(
    max_seconds: int = 30,
    on_energy: "callable | None" = None,
    device_index: "int | None" = None,
) -> bytes:
    """Record from microphone until silence or max_seconds.

    Tries backends in order: sounddevice → arecord → sox rec. The arecord
    backend needs numpy for its RMS computation, so it is skipped when numpy
    cannot be imported and the SoX backend is tried instead.

    Args:
        max_seconds: Upper bound on the recording length.
        on_energy: Optional callable receiving per-chunk RMS values; forwarded
            to whichever backend is used.
        device_index: Optional input device, honoured by the sounddevice
            backend only.

    Returns:
        Raw PCM bytes: int16, 16 kHz, mono.

    Raises:
        RuntimeError: If no backend is available.
    """
    try:
        import sounddevice  # noqa: F401
        return _record_sounddevice(max_seconds=max_seconds, on_energy=on_energy, device_index=device_index)
    except (ImportError, OSError):
        pass

    arecord_needs_numpy = False
    if _has_cmd("arecord"):
        try:
            import numpy  # noqa: F401
        except ImportError:
            # numpy missing — _record_arecord cannot run at all, so really
            # fall through to sox instead of calling it again.
            arecord_needs_numpy = True
        else:
            return _record_arecord(max_seconds=max_seconds, on_energy=on_energy)

    if _has_cmd("rec"):
        return _record_sox(max_seconds=max_seconds, on_energy=on_energy)

    message = (
        "No audio recording backend found.\n"
        "Install sounddevice: pip install sounddevice\n"
        "Or install arecord: sudo apt install alsa-utils\n"
        "Or install SoX: sudo apt install sox"
    )
    if arecord_needs_numpy:
        message += "\n(arecord is installed but needs numpy: pip install numpy)"
    raise RuntimeError(message)
voice/stt.py ADDED
@@ -0,0 +1,408 @@
1
+ """Speech-to-text (STT) backends.
2
+
3
+ Backend priority (tried in order):
4
+ 1. NVIDIA Riva — cloud, whisper-large-v3 via gRPC, needs NVIDIA_API_KEY.
5
+ pip install nvidia-riva-client
6
+ 2. faster-whisper — local, offline, fast, best for coding vocab.
7
+ pip install faster-whisper
8
+ 3. openai-whisper — local, offline, original OpenAI Whisper library.
9
+ pip install openai-whisper
10
+ 4. OpenAI Whisper API — cloud, needs OPENAI_API_KEY.
11
+ pip install openai (already in requirements)
12
+
13
+ All backends receive raw PCM (int16, 16 kHz, mono) and return a text string.
14
+ Keyterms are passed as initial_prompt to local Whisper backends so that
15
+ coding-domain vocabulary (grep, MCP, TypeScript, …) is recognised correctly.
16
+ Riva does not accept initial_prompt; keyterms are ignored on that path.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import io
22
+ import os
23
+ import struct
24
+ import tempfile
25
+ from pathlib import Path
26
+ from typing import List, Optional
27
+
28
+ from .recorder import SAMPLE_RATE, CHANNELS, BYTES_PER_SAMPLE
29
+
30
+ # ── Cached model handles ──────────────────────────────────────────────────
31
+
32
+ _faster_whisper_model = None
33
+ _openai_whisper_model = None
34
+
35
+ # Model size: "tiny", "base", "small", "medium", "large-v2", "large-v3"
36
+ # The default is "medium"; "base" is a lighter option that balances speed and accuracy.
37
+ # Override with env var DULUS_WHISPER_MODEL.
38
+ DEFAULT_MODEL_SIZE = os.environ.get("DULUS_WHISPER_MODEL", "medium")
39
+
40
+ # ── NVIDIA Riva (whisper-large-v3 via NVCF gRPC) ─────────────────────────
41
+ RIVA_SERVER = os.environ.get("DULUS_RIVA_SERVER", "grpc.nvcf.nvidia.com:443")
42
+ RIVA_FUNCTION_ID = os.environ.get("DULUS_RIVA_FUNCTION_ID",
43
+ "b702f636-f60c-4a3d-a6f4-f3568c13bd7d")
44
+
45
+
46
def _riva_available() -> bool:
    """Return True only when NVIDIA_API_KEY is set and ``riva.client`` imports."""
    has_key = bool(os.environ.get("NVIDIA_API_KEY"))
    if has_key:
        try:
            import riva.client  # noqa: F401
        except ImportError:
            return False
        return True
    return False
55
+
56
+
57
def _transcribe_nvidia_riva(
    pcm_bytes: bytes,
    language: Optional[str],
    translate: bool = False,
) -> str:
    """Send audio to the NVIDIA Riva ASR service and return the transcript.

    The raw PCM is wrapped in a WAV container before being submitted with
    ``offline_recognize``. A missing language or ``"auto"`` is sent as the
    language code ``"multi"``. With ``translate=True`` the custom
    configuration ``"task:translate"`` is added to the request config.

    Returns:
        The first alternative of every result, joined with single spaces and
        stripped.

    Raises:
        KeyError: If NVIDIA_API_KEY is not present in the environment.
    """
    import riva.client

    rc = riva.client
    api_key = os.environ["NVIDIA_API_KEY"]
    metadata = [
        ("function-id", RIVA_FUNCTION_ID),
        ("authorization", f"Bearer {api_key}"),
    ]
    # Positional Auth arguments: ssl_cert, use_ssl, server URI, metadata.
    service = rc.ASRService(rc.Auth(None, True, RIVA_SERVER, metadata))

    auto_detect = (not language) or language == "auto"
    config = rc.RecognitionConfig(
        encoding=rc.AudioEncoding.LINEAR_PCM,
        sample_rate_hertz=SAMPLE_RATE,
        audio_channel_count=CHANNELS,
        language_code="multi" if auto_detect else language,
        max_alternatives=1,
        enable_automatic_punctuation=True,
    )
    if translate:
        rc.add_custom_configuration_to_config(config, "task:translate")

    response = service.offline_recognize(_pcm_to_wav(pcm_bytes), config)
    best_guesses = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return " ".join(best_guesses).strip()
97
+
98
+
99
+ # ── OGG/audio file → PCM conversion ──────────────────────────────────────
100
+
101
def _audio_file_to_pcm(audio_bytes: bytes, suffix: str = ".ogg") -> bytes:
    """Convert an audio file (OGG, MP3, etc.) to raw int16 PCM (16kHz mono) via ffmpeg.

    The input is written to a temporary file with the given *suffix*, ffmpeg
    decodes it to ``s16le`` at SAMPLE_RATE / CHANNELS on stdout, and the
    temporary file is removed afterwards.

    Args:
        audio_bytes: Encoded audio file contents.
        suffix: File extension used for the temporary input file.

    Returns:
        The raw PCM bytes ffmpeg wrote to stdout.

    Raises:
        RuntimeError: If ffmpeg exits non-zero or exceeds the 30 s timeout.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    import subprocess

    # Take the name before writing so the file is cleaned up even when the
    # write itself fails (delete=False leaves removal to us).
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_in = tmp.name
    try:
        with tmp:
            tmp.write(audio_bytes)
        try:
            r = subprocess.run(
                ["ffmpeg", "-y", "-i", tmp_in, "-f", "s16le", "-ar", str(SAMPLE_RATE),
                 "-ac", str(CHANNELS), "-acodec", "pcm_s16le", "-"],
                stdin=subprocess.DEVNULL,  # keep ffmpeg from reading the caller's stdin
                capture_output=True, timeout=30,
            )
        except subprocess.TimeoutExpired as exc:
            # Surface timeouts as RuntimeError so callers that fall back on
            # conversion failure treat them like any other ffmpeg error.
            raise RuntimeError("ffmpeg failed: timed out after 30 seconds") from exc
        if r.returncode != 0:
            # stderr starts with the version banner; the actual error is at
            # the end, so report the tail as decoded text.
            detail = r.stderr.decode("utf-8", errors="replace").strip()
            raise RuntimeError(f"ffmpeg failed: {detail[-200:]}")
        return r.stdout
    finally:
        Path(tmp_in).unlink(missing_ok=True)
119
+
120
+
121
+ # ── WAV helper ────────────────────────────────────────────────────────────
122
+
123
def _pcm_to_wav(pcm_bytes: bytes) -> bytes:
    """Prefix raw int16 PCM with a 44-byte little-endian WAV (RIFF) header.

    The header is built in three pieces — RIFF descriptor, ``fmt `` chunk and
    ``data`` chunk header — using SAMPLE_RATE, CHANNELS and BYTES_PER_SAMPLE.
    """
    payload_len = len(pcm_bytes)
    frame_bytes = CHANNELS * BYTES_PER_SAMPLE  # block align: bytes per sample frame

    riff_descriptor = struct.pack("<4sI4s", b"RIFF", 36 + payload_len, b"WAVE")
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,                         # fmt chunk size
        1,                          # PCM format tag
        CHANNELS,
        SAMPLE_RATE,
        SAMPLE_RATE * frame_bytes,  # byte rate
        frame_bytes,
        16,                         # bits per sample
    )
    data_header = struct.pack("<4sI", b"data", payload_len)

    return b"".join((riff_descriptor, fmt_chunk, data_header, pcm_bytes))
146
+
147
+
148
+ # ── Availability ──────────────────────────────────────────────────────────
149
+
150
def check_stt_availability() -> tuple[bool, str | None]:
    """Tell whether any speech-to-text backend can be used.

    Probes, in order: Riva (client library plus API key), the
    ``faster_whisper`` module, the ``whisper`` module, and finally the
    OPENAI_API_KEY environment variable.

    Returns:
        ``(True, None)`` on the first probe that succeeds, otherwise
        ``(False, msg)`` where *msg* lists the install options.
    """
    if _riva_available():
        return True, None

    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return True, None

    try:
        import whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return True, None

    if os.environ.get("OPENAI_API_KEY"):
        return True, None

    reason = (
        "No STT backend available.\n"
        "Install one of:\n"
        " pip install nvidia-riva-client (cloud, whisper-large-v3 — set NVIDIA_API_KEY)\n"
        " pip install faster-whisper (local, recommended)\n"
        " pip install openai-whisper (local, original)\n"
        " Set OPENAI_API_KEY to use the OpenAI Whisper cloud API"
    )
    return False, reason
175
+
176
+
177
def get_stt_backend_name() -> str:
    """Describe the first usable STT backend, or ``"(none)"`` if there is none.

    The probes run in the order Riva, ``faster_whisper``, ``whisper``,
    OPENAI_API_KEY; the local backends include DEFAULT_MODEL_SIZE in the label.
    """
    if _riva_available():
        return "NVIDIA Riva (whisper-large-v3, cloud)"

    try:
        import faster_whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return f"faster-whisper ({DEFAULT_MODEL_SIZE})"

    try:
        import whisper  # noqa: F401
    except ImportError:
        pass
    else:
        return f"openai-whisper ({DEFAULT_MODEL_SIZE})"

    return "OpenAI Whisper API" if os.environ.get("OPENAI_API_KEY") else "(none)"
194
+
195
+
196
+ # ── faster-whisper ────────────────────────────────────────────────────────
197
+
198
def _get_faster_whisper_model():
    """Return the module-wide faster-whisper model, creating it on first use.

    The model is built with DEFAULT_MODEL_SIZE; it runs on ``cuda`` with
    ``float16`` when ``_has_cuda()`` is true and on ``cpu`` with ``int8``
    otherwise. The instance is stored in ``_faster_whisper_model``.
    """
    global _faster_whisper_model
    if _faster_whisper_model is not None:
        return _faster_whisper_model

    from faster_whisper import WhisperModel

    if _has_cuda():
        device, compute = "cuda", "float16"
    else:
        device, compute = "cpu", "int8"

    _faster_whisper_model = WhisperModel(
        DEFAULT_MODEL_SIZE,
        device=device,
        compute_type=compute,
    )
    return _faster_whisper_model
211
+
212
+
213
def _has_cuda() -> bool:
    """Best-effort check for a usable CUDA device.

    Asks torch when it is installed; otherwise asks ctranslate2 how many CUDA
    devices it can see. Any failure of the ctranslate2 probe (including the
    package being absent) is treated as "no CUDA".
    """
    try:
        import torch
        return bool(torch.cuda.is_available())
    except ImportError:
        pass
    try:
        import ctranslate2
        # get_supported_compute_types() returns compute-type names such as
        # "float16", never the device name, so testing for "cuda" in it was
        # always False. The device count is the direct answer.
        return ctranslate2.get_cuda_device_count() > 0
    except Exception:
        return False
224
+
225
+
226
def _transcribe_faster_whisper(
    pcm_bytes: bytes,
    keyterms: List[str],
    language: Optional[str],
) -> str:
    """Transcribe PCM audio with the cached faster-whisper model.

    The int16 samples are scaled to float32 by dividing by 32768, the keyterms
    are passed as ``initial_prompt``, and the VAD filter is enabled with a
    300 ms minimum silence duration. A missing language or ``"auto"`` is sent
    as ``language=None``.

    Returns:
        The text of all segments joined with single spaces and stripped.
    """
    import numpy as np

    whisper_model = _get_faster_whisper_model()

    # int16 PCM → float32 samples
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    prompt = _keyterms_to_prompt(keyterms)
    forced_language = language if (language and language != "auto") else None

    segments, _unused_info = whisper_model.transcribe(
        samples,
        language=forced_language,
        initial_prompt=prompt,
        vad_filter=True,  # skip silent regions
        vad_parameters={"min_silence_duration_ms": 300},
    )
    pieces = [segment.text for segment in segments]
    return " ".join(pieces).strip()
251
+
252
+
253
+ # ── openai-whisper ────────────────────────────────────────────────────────
254
+
255
def _get_openai_whisper_model():
    """Return the module-wide openai-whisper model, loading it on first use.

    The model is loaded with ``whisper.load_model(DEFAULT_MODEL_SIZE)`` and
    stored in ``_openai_whisper_model``.
    """
    global _openai_whisper_model
    if _openai_whisper_model is not None:
        return _openai_whisper_model

    import whisper

    _openai_whisper_model = whisper.load_model(DEFAULT_MODEL_SIZE)
    return _openai_whisper_model
261
+
262
+
263
def _transcribe_openai_whisper(
    pcm_bytes: bytes,
    keyterms: List[str],
    language: Optional[str],
) -> str:
    """Transcribe PCM audio with the cached openai-whisper model.

    The int16 samples are scaled to float32 by dividing by 32768.
    ``initial_prompt`` is passed only when the keyterms produce a non-empty
    prompt, and ``language`` only when it is set and not ``"auto"``.

    Returns:
        The stripped ``"text"`` entry of the result (empty string if absent).
    """
    import numpy as np

    whisper_model = _get_openai_whisper_model()
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    options: dict = {}
    prompt = _keyterms_to_prompt(keyterms)
    if prompt:
        options["initial_prompt"] = prompt
    if language and language != "auto":
        options["language"] = language

    outcome = whisper_model.transcribe(samples, **options)
    return outcome.get("text", "").strip()
280
+
281
+
282
+ # ── OpenAI Whisper API ────────────────────────────────────────────────────
283
+
284
def _transcribe_openai_api(
    pcm_bytes: bytes,
    language: Optional[str],
) -> str:
    """Transcribe PCM audio through the OpenAI ``whisper-1`` endpoint.

    The PCM is wrapped in a WAV container and uploaded as ``audio.wav``;
    ``language`` is included in the request only when set and not ``"auto"``.

    Returns:
        The stripped transcript text.
    """
    from openai import OpenAI

    client = OpenAI()  # uses OPENAI_API_KEY from env
    upload = ("audio.wav", io.BytesIO(_pcm_to_wav(pcm_bytes)), "audio/wav")

    request: dict = {"model": "whisper-1", "file": upload}
    if language and language != "auto":
        request["language"] = language

    response = client.audio.transcriptions.create(**request)
    return response.text.strip()
299
+
300
+
301
+ # ── Keyterms → prompt ─────────────────────────────────────────────────────
302
+
303
def _keyterms_to_prompt(keyterms: List[str]) -> str:
    """Build a Whisper ``initial_prompt`` string from vocabulary hints.

    The first 40 terms are joined with ``", "``; an empty or missing list
    yields an empty string. The cap keeps the prompt short, since Whisper
    truncates it at roughly 224 tokens.
    """
    leading_terms = keyterms[:40] if keyterms else []
    return ", ".join(leading_terms)
313
+
314
+
315
+ # ── Public entry point ────────────────────────────────────────────────────
316
+
317
def transcribe(
    pcm_bytes: bytes,
    keyterms: Optional[List[str]] = None,
    language: str = "auto",
) -> str:
    """Transcribe raw PCM audio to text.

    Backends are tried in order: NVIDIA Riva (when configured), faster-whisper,
    openai-whisper, then the OpenAI Whisper API (when OPENAI_API_KEY is set).
    A Riva failure is printed and the local backends are tried next.

    Args:
        pcm_bytes: Raw int16 PCM, 16 kHz, mono.
        keyterms: Coding-domain vocabulary hints (improves accuracy); used by
            the local Whisper backends only.
        language: BCP-47 language code, or 'auto' (or empty) for detection.

    Returns:
        Transcribed text, or empty string if *pcm_bytes* is empty.

    Raises:
        RuntimeError: If no STT backend is available.
    """
    if not pcm_bytes:
        return ""

    terms = keyterms or []
    # Treat empty/None the same as "auto", matching what the backends do.
    lang = None if (not language or language == "auto") else language

    # NVIDIA Riva (whisper-large-v3, cloud) — preferred when configured
    if _riva_available():
        try:
            return _transcribe_nvidia_riva(pcm_bytes, lang)
        except Exception as e:
            # Network blip / quota / auth — fall through to local backends
            print(f" [STT] Riva failed, falling back: {e}")

    # faster-whisper (local)
    try:
        import faster_whisper  # noqa: F401
        return _transcribe_faster_whisper(pcm_bytes, terms, lang)
    except ImportError:
        pass

    # openai-whisper (local, fallback)
    try:
        import whisper  # noqa: F401
        return _transcribe_openai_whisper(pcm_bytes, terms, lang)
    except ImportError:
        pass

    # OpenAI Whisper API (cloud, last resort)
    if os.environ.get("OPENAI_API_KEY"):
        return _transcribe_openai_api(pcm_bytes, lang)

    # List every backend this function actually tries.
    raise RuntimeError(
        "No STT backend available.\n"
        "Install nvidia-riva-client (set NVIDIA_API_KEY), faster-whisper,\n"
        "or openai-whisper, or set OPENAI_API_KEY to use the OpenAI Whisper cloud API."
    )
369
+
370
+
371
def transcribe_audio_file(
    audio_bytes: bytes,
    suffix: str = ".ogg",
    language: str = "auto",
) -> str:
    """Transcribe an audio file (OGG, MP3, etc.) to text.

    Converts to PCM via ffmpeg, then runs through the STT pipeline. If the
    conversion fails (ffmpeg missing, erroring or timing out) or the pipeline
    raises RuntimeError, the original file is uploaded to the OpenAI Whisper
    API instead, provided OPENAI_API_KEY is set.

    Args:
        audio_bytes: Encoded audio file contents.
        suffix: File extension of the audio, used for the temporary file and
            the upload file name.
        language: Language code, or 'auto' for detection.

    Returns:
        The transcribed text.

    Raises:
        RuntimeError: If neither the local route nor the cloud fallback can be
            used; the underlying failure, if any, is chained as the cause.
    """
    import subprocess

    failure = None  # most recent local-route error, chained into the final raise

    # Try ffmpeg conversion → local STT
    try:
        pcm = _audio_file_to_pcm(audio_bytes, suffix)
    except (RuntimeError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
        # A hung ffmpeg should trigger the fallback just like a missing one.
        pcm = None
        failure = exc

    if pcm is not None:
        try:
            return transcribe(pcm, language=language)
        except RuntimeError as exc:
            failure = exc  # local STT backend failed, fall through to cloud API

    # Fallback: OpenAI Whisper API accepts OGG directly
    if os.environ.get("OPENAI_API_KEY"):
        from openai import OpenAI
        client = OpenAI()
        kwargs: dict = {"model": "whisper-1", "file": (f"audio{suffix}", io.BytesIO(audio_bytes), "audio/ogg")}
        lang = None if language == "auto" else language
        if lang:
            kwargs["language"] = lang
        transcript = client.audio.transcriptions.create(**kwargs)
        return transcript.text.strip()

    raise RuntimeError(
        "Cannot transcribe audio file.\n"
        "Install ffmpeg for local conversion, or set OPENAI_API_KEY for cloud STT."
    ) from failure