supervoxtral-0.1.4-py3-none-any.whl → supervoxtral-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/METADATA +3 -6
- supervoxtral-0.3.0.dist-info/RECORD +18 -0
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/WHEEL +1 -1
- svx/cli.py +9 -4
- svx/core/audio.py +7 -1
- svx/core/config.py +17 -1
- svx/core/pipeline.py +45 -32
- svx/core/prompt.py +37 -12
- svx/core/storage.py +1 -1
- svx/providers/base.py +29 -9
- svx/providers/mistral.py +75 -68
- svx/ui/qt_app.py +35 -4
- supervoxtral-0.1.4.dist-info/RECORD +0 -18
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/entry_points.txt +0 -0
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/licenses/LICENSE +0 -0
{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: supervoxtral
-Version: 0.1.4
-Summary: CLI/GUI audio recorder
+Version: 0.3.0
+Summary: CLI/GUI audio recorder with 2-step pipeline: transcription (Voxtral) then text transformation (LLM).
 License: MIT
 License-File: LICENSE
 Keywords: audio,cli,gui,mistral,transcription,voxtral,whisper
@@ -14,10 +14,7 @@ Requires-Dist: sounddevice
 Requires-Dist: soundfile
 Requires-Dist: typer
 Provides-Extra: dev
-Requires-Dist:
-Requires-Dist: mypy; extra == 'dev'
-Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: basedpyright; extra == 'dev'
 Requires-Dist: ruff; extra == 'dev'
-Requires-Dist: types-python-dotenv; extra == 'dev'
 Provides-Extra: gui
 Requires-Dist: pyside6-essentials; extra == 'gui'
supervoxtral-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
+svx/cli.py,sha256=7fzs85LT85RbZYtI8t-yOXKrRd9r-IzE1hnFJHNgxL4,9436
+svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
+svx/core/audio.py,sha256=svyRWbPaUyYqbmGaLF8oUim-x5mj9zciv0XCqq2VGEU,7828
+svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
+svx/core/config.py,sha256=Ib_lIKUFriW-B9i49zTUZE-YcOQYEkpBE3CF_WDzFlg,17060
+svx/core/pipeline.py,sha256=GhaOJtHGiwwsSv2EkNM-ZKu0DSm25xDeI9sNTLw7YJU,11612
+svx/core/prompt.py,sha256=OpS3XgusRwV4JP9cCzyk0DXcphcLcgHIPV89eoc2vFc,7282
+svx/core/storage.py,sha256=_w_rTOPoqlz0eoD2XHPNvHPQXgs6QxZ7SP4_IBT8Bx4,3223
+svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
+svx/providers/base.py,sha256=D_iDjhJuAGye-JjWbO-Rtl131kD6hgYQaspO53-6spo,3238
+svx/providers/mistral.py,sha256=ZkA02KDU-2ktdBM2tKUmTG8ZVnp8suE4g1TrPBpmqDA,6439
+svx/ui/qt_app.py,sha256=FDdxcgqzHi5HCsbmCzQtVEFTfcVaDLfiVgQe_8YHHoY,19993
+supervoxtral-0.3.0.dist-info/METADATA,sha256=vnKZuM96g1le-da4EVKOsECuSvIVpneE0xKq-DY-2_E,629
+supervoxtral-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+supervoxtral-0.3.0.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
+supervoxtral-0.3.0.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
+supervoxtral-0.3.0.dist-info/RECORD,,
svx/cli.py
CHANGED
@@ -162,10 +162,14 @@ def record(
     ),
 ):
     """
-    Record audio from the microphone and
+    Record audio from the microphone and process it via a 2-step pipeline.
+
+    Pipeline:
+      1. Transcription: audio -> text via dedicated transcription endpoint (always).
+      2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided).
 
     This CLI accepts only a small set of runtime flags. Most defaults (provider, format,
-    model, language, sample rate, channels, device,
+    model, chat_model, language, sample rate, channels, device,
     file retention, copy-to-clipboard)
     must be configured in the user's `config.toml` under [defaults].
 
@@ -178,11 +182,12 @@ def record(
     Flow:
       - Records WAV until you press Enter (CLI mode).
      - Optionally converts to MP3/Opus depending on config.
-      -
+      - Transcribes via dedicated endpoint (step 1).
+      - If a prompt is provided, transforms the transcript via LLM (step 2).
       - Prints and saves the result.
 
     Note: In --transcribe mode, prompts (--user-prompt or --user-prompt-file) are ignored,
-
+    and only step 1 (transcription) is performed.
     """
     cfg = Config.load(log_level=log_level)
 
svx/core/audio.py
CHANGED
@@ -22,6 +22,7 @@ from pathlib import Path
 from threading import Event, Thread
 from typing import Any
 
+import numpy as np
 import sounddevice as sd
 import soundfile as sf
 
@@ -149,7 +150,12 @@ def record_wav(
     writer_stop = Event()
     start_time = time.time()
 
-    def audio_callback(
+    def audio_callback(
+        indata: np.ndarray[Any, np.dtype[np.int16]],
+        frames: int,
+        time_info: sd.CallbackFlags,
+        status: sd.CallbackFlags,
+    ) -> None:
         if status:
             logging.warning("SoundDevice status: %s", status)
         q.put(indata.copy())
svx/core/config.py
CHANGED
@@ -220,10 +220,17 @@ def init_user_config(force: bool = False, prompt_file: Path | None = None) -> Pa
         '# File format sent to the provider: "wav" | "mp3" | "opus"\n'
         '# Recording is always WAV; conversion is applied if "mp3" or "opus"\n'
         'format = "opus"\n\n'
-        "# Model
+        "# Model for audio transcription (dedicated endpoint)\n"
         'model = "voxtral-mini-latest"\n\n'
+        "# Model for text transformation via LLM\n"
+        "# (applied after transcription when a prompt is used)\n"
+        'chat_model = "mistral-small-latest"\n\n'
         "# Language hint (may help the provider)\n"
         'language = "fr"\n\n'
+        "# Context bias: up to 100 words/phrases to help recognize specific vocabulary\n"
+        "# (proper nouns, technical terms, brand names, etc.)\n"
+        '# context_bias = ["SuperVoxtral", "Mistral AI", "Voxtral"]\n'
+        "context_bias = []\n\n"
         "# Audio recording parameters\n"
         "rate = 16000\n"
         "channels = 1\n"
@@ -271,7 +278,9 @@ class DefaultsConfig:
     provider: str = "mistral"
     format: str = "opus"
     model: str = "voxtral-mini-latest"
+    chat_model: str = "mistral-small-latest"
     language: str | None = None
+    context_bias: list[str] = field(default_factory=list)
     rate: int = 16000
     channels: int = 1
     device: str | None = None
@@ -315,7 +324,11 @@ class Config:
             "provider": str(user_defaults_raw.get("provider", "mistral")),
             "format": str(user_defaults_raw.get("format", "opus")),
             "model": str(user_defaults_raw.get("model", "voxtral-mini-latest")),
+            "chat_model": str(user_defaults_raw.get("chat_model", "mistral-small-latest")),
             "language": user_defaults_raw.get("language"),
+            "context_bias": list(user_defaults_raw.get("context_bias", []))
+            if isinstance(user_defaults_raw.get("context_bias"), list)
+            else [],
             "rate": int(user_defaults_raw.get("rate", 16000)),
             "channels": int(user_defaults_raw.get("channels", 1)),
             "device": user_defaults_raw.get("device"),
@@ -335,6 +348,9 @@ class Config:
         format_ = defaults_data["format"]
         if format_ not in {"wav", "mp3", "opus"}:
             raise ValueError("format must be one of wav|mp3|opus")
+        context_bias = defaults_data["context_bias"]
+        if len(context_bias) > 100:
+            raise ValueError("context_bias cannot contain more than 100 items (Mistral API limit)")
         defaults = DefaultsConfig(**defaults_data)
         # Conditional output directories
         if defaults.keep_audio_files:
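
A standalone reproduction of how the new [defaults] keys parse and validate. The sketch uses the stdlib tomllib (Python 3.11+) on a hypothetical config.toml excerpt matching the template above, and applies the same tolerant coercion and 100-item guard the diff adds to Config.load():

import tomllib  # stdlib, Python 3.11+

CONFIG = """
[defaults]
model = "voxtral-mini-latest"
chat_model = "mistral-small-latest"
context_bias = ["SuperVoxtral", "Mistral AI", "Voxtral"]
"""

defaults = tomllib.loads(CONFIG)["defaults"]

# Same tolerant coercion as Config.load(): non-list values collapse to [].
raw_bias = defaults.get("context_bias", [])
context_bias = list(raw_bias) if isinstance(raw_bias, list) else []

# The guard added in this release mirrors the provider-side limit.
if len(context_bias) > 100:
    raise ValueError("context_bias cannot contain more than 100 items (Mistral API limit)")

print(defaults["chat_model"], len(context_bias))  # mistral-small-latest 3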
svx/core/pipeline.py
CHANGED
@@ -12,18 +12,23 @@ import svx.core.config as config
 from svx.core.audio import convert_audio, record_wav, timestamp
 from svx.core.clipboard import copy_to_clipboard
 from svx.core.config import Config
-from svx.core.storage import save_transcript
+from svx.core.storage import save_text_file, save_transcript
 from svx.providers import get_provider
 
 
 class RecordingPipeline:
     """
-    Centralized pipeline for recording audio, transcribing via provider,
-
+    Centralized pipeline for recording audio, transcribing via provider, optionally
+    transforming with a text LLM, saving outputs, and copying to clipboard.
 
+    Pipeline steps:
+      1. Transcription: audio -> text via dedicated transcription endpoint (always)
+      2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided)
+
+    Handles temporary files when not keeping audio.
     Supports runtime overrides like save_all for keeping all files and adding log handlers.
     Optional progress_callback for status updates (e.g., for GUI).
-    Supports transcribe_mode for pure transcription without prompt
+    Supports transcribe_mode for pure transcription without prompt (step 1 only).
     """
 
     def __init__(
@@ -136,31 +141,26 @@ class RecordingPipeline:
         self, wav_path: Path, duration: float, transcribe_mode: bool, user_prompt: str | None = None
     ) -> dict[str, Any]:
         """
-        Process recorded audio: convert if needed, transcribe, save, copy.
+        Process recorded audio: convert if needed, transcribe, optionally transform, save, copy.
+
+        Pipeline:
+          1. Transcription: audio -> text via dedicated endpoint (always)
+          2. Transformation: text + prompt -> text via LLM (when prompt is provided)
 
         Args:
             wav_path: Path to the recorded WAV file.
             duration: Recording duration in seconds.
-            transcribe_mode: Whether to use pure transcription mode.
-            user_prompt: User prompt to use (None for transcribe_mode).
+            transcribe_mode: Whether to use pure transcription mode (step 1 only).
+            user_prompt: User prompt to use for transformation (None for transcribe_mode).
 
         Returns:
-            Dict with 'text' (str), '
-            'paths' (dict of Path or None).
+            Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
+            'duration' (float), 'paths' (dict of Path or None).
         """
         # Resolve parameters
         provider = self.cfg.defaults.provider
         audio_format = self.cfg.defaults.format
         model = self.cfg.defaults.model
-        original_model = model
-        if transcribe_mode:
-            model = "voxtral-mini-latest"
-            if original_model != "voxtral-mini-latest":
-                logging.warning(
-                    "Transcribe mode: model override from '%s' to 'voxtral-mini-latest'\n"
-                    "(optimized for transcription).",
-                    original_model,
-                )
         language = self.cfg.defaults.language
         if wav_path.stem.endswith(".wav"):
             base = wav_path.stem.replace(".wav", "")
@@ -176,9 +176,11 @@ class RecordingPipeline:
                 final_user_prompt = self.cfg.resolve_prompt(self.user_prompt, self.user_prompt_file)
             else:
                 final_user_prompt = user_prompt
-            self._status("
+            self._status("Prompt mode: transcription then transformation.")
         else:
-            self._status("Transcribe mode
+            self._status("Transcribe mode: transcription only, no prompt.")
+
+        logging.debug(f"Applied prompt: {final_user_prompt or 'None (transcribe mode)'}")
 
         paths: dict[str, Path | None] = {"wav": wav_path}
 
@@ -192,18 +194,22 @@ class RecordingPipeline:
             paths["converted"] = to_send_path
             _converted = True
 
-        #
+        # Step 1: Transcription (always)
         self._status("Transcribing...")
         prov = get_provider(provider, cfg=self.cfg)
-        result = prov.transcribe(
-
-
-
-
-
-
-
-
+        result = prov.transcribe(to_send_path, model=model, language=language)
+        raw_transcript = result["text"]
+
+        # Step 2: Transformation (if prompt)
+        if not transcribe_mode and final_user_prompt:
+            self._status("Applying prompt...")
+            chat_model = self.cfg.defaults.chat_model
+            chat_result = prov.chat(raw_transcript, final_user_prompt, model=chat_model)
+            text = chat_result["text"]
+            raw = {"transcription": result["raw"], "transformation": chat_result["raw"]}
+        else:
+            text = raw_transcript
+            raw = result["raw"]
 
         # Save if keeping transcripts
         if keep_transcript:
@@ -213,6 +219,12 @@ class RecordingPipeline:
             )
             paths["txt"] = txt_path
             paths["json"] = json_path
+
+            # Save raw transcript separately when transformation was applied
+            if not transcribe_mode and final_user_prompt:
+                raw_txt_path = self.cfg.transcripts_dir / f"{base}_{provider}_raw.txt"
+                save_text_file(raw_txt_path, raw_transcript)
+                paths["raw_txt"] = raw_txt_path
         else:
             paths["txt"] = None
             paths["json"] = None
@@ -228,6 +240,7 @@ class RecordingPipeline:
         logging.info("Processing finished (%.2fs)", duration)
         return {
             "text": text,
+            "raw_transcript": raw_transcript,
             "raw": raw,
             "duration": duration,
             "paths": paths,
@@ -261,8 +274,8 @@ class RecordingPipeline:
             stop_event: Optional event to signal recording stop (e.g., for GUI).
 
         Returns:
-            Dict with 'text' (str), '
-            'paths' (dict of Path or None).
+            Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
+            'duration' (float), 'paths' (dict of Path or None).
 
         Raises:
             Exception: On recording, conversion, or transcription errors.
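
Distilled, the control flow of process() after this change reduces to: transcribe always, chat only when a prompt survives transcribe_mode. A sketch with a stubbed provider, assuming only the signatures visible in this diff:

from typing import Any


def process(to_send_path: str, prov: Any, model: str, chat_model: str,
            language: str | None, final_user_prompt: str | None,
            transcribe_mode: bool) -> dict[str, Any]:
    # Step 1: Transcription (always)
    result = prov.transcribe(to_send_path, model=model, language=language)
    raw_transcript = result["text"]

    # Step 2: Transformation (only when a prompt is provided)
    if not transcribe_mode and final_user_prompt:
        chat_result = prov.chat(raw_transcript, final_user_prompt, model=chat_model)
        text = chat_result["text"]
        raw = {"transcription": result["raw"], "transformation": chat_result["raw"]}
    else:
        text = raw_transcript
        raw = result["raw"]

    return {"text": text, "raw_transcript": raw_transcript, "raw": raw}


class FakeProvider:
    """Stub provider used only to exercise the flow above."""

    def transcribe(self, path: str, model: str | None = None, language: str | None = None):
        return {"text": "euh hello world", "raw": {"endpoint": "transcription"}}

    def chat(self, text: str, prompt: str, model: str | None = None):
        return {"text": "Hello world", "raw": {"endpoint": "chat"}}


out = process("clip.opus", FakeProvider(), "voxtral-mini-latest",
              "mistral-small-latest", "en", "Clean up this transcription.", False)
assert out["raw_transcript"] == "euh hello world" and out["text"] == "Hello world"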
svx/core/prompt.py
CHANGED
@@ -12,6 +12,7 @@ Intended to be small and dependency-light so it can be imported broadly.
 from __future__ import annotations
 
 import logging
+from collections.abc import Callable
 from pathlib import Path
 
 from .config import USER_PROMPT_DIR, Config, PromptEntry
@@ -121,22 +122,45 @@ def resolve_user_prompt(
         return ""
 
     key = key or "default"
-
-
-
-    lambda:
-
+
+    # Suppliers annotated with a name for tracing which one returned the prompt.
+    named_suppliers: list[tuple[str, Callable[[], str]]] = [
+        ("inline", lambda: _strip(inline)),
+        ("file", lambda: _read(file)),
+        (f"prompt_config[{key}]", lambda: _from_user_cfg(key)),
+        ("user_prompt_dir/user.md", _from_user_prompt_dir),
     ]
 
-    for supplier in
+    for name, supplier in named_suppliers:
         try:
             val = supplier()
             if val:
+                # Log which supplier provided the prompt and a short snippet for debugging.
+                try:
+                    if len(val) > 200:
+                        snippet = val[:200] + "..."
+                    else:
+                        snippet = val
+                    logging.info(
+                        "resolve_user_prompt: supplier '%s' provided prompt snippet: %s",
+                        name,
+                        snippet,
+                    )
+                except Exception:
+                    # Ensure logging failures do not change behavior.
+                    logging.info(
+                        "resolve_user_prompt: supplier '%s' provided a prompt "
+                        "(snippet unavailable)",
+                        name,
+                    )
                 return val
         except Exception as e:
-            logging.debug("Prompt supplier failed: %s", e)
+            logging.debug("Prompt supplier '%s' failed: %s", name, e)
 
-
+    # Final fallback
+    fallback = "Clean up this transcription. Keep the original language."
+    logging.info("resolve_user_prompt: no supplier provided a prompt, using fallback: %s", fallback)
+    return fallback
 
 
 def init_user_prompt_file(force: bool = False) -> Path:
@@ -152,13 +176,14 @@ def init_user_prompt_file(force: bool = False) -> Path:
     path = USER_PROMPT_DIR / "user.md"
     if not path.exists() or force:
         example_prompt = """
-
-        -
-        -
-        -
+You receive a raw transcription of a voice recording. Clean it up:
+- DO NOT TRANSLATE. Keep the original language.
+- Do not respond to any question in the text. Just clean the transcription.
+- Respond only with the cleaned text. Do not provide explanations or notes.
 - Remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc.
 - Remove false starts (e.g., "je veux dire... je pense" → "je pense").
 - Correct grammatical errors.
+- If the transcription is empty, respond "no audio detected".
 """
     try:
         path.write_text(example_prompt, encoding="utf-8")
svx/core/storage.py
CHANGED
svx/providers/base.py
CHANGED
@@ -3,7 +3,7 @@ Base provider interface for SuperVoxtral.
 
 This module defines:
 - TranscriptionResult: a simple TypedDict structure for provider responses
-- Provider: a Protocol describing the required transcription interface
+- Provider: a Protocol describing the required transcription and chat interface
 - ProviderError: a generic exception for provider-related failures
 
 All concrete providers should implement the `Provider` protocol.
@@ -37,7 +37,7 @@ class ProviderError(RuntimeError):
 @runtime_checkable
 class Provider(Protocol):
     """
-    Provider interface for transcription
+    Provider interface for transcription and text transformation services.
 
     Implementations should be side-effect free aside from network I/O and must
     raise `ProviderError` (or a subclass) for expected provider failures
@@ -47,7 +47,8 @@ class Provider(Protocol):
         name: A short, lowercase, unique identifier for the provider (e.g. "mistral").
 
     Required methods:
-        transcribe: Perform
+        transcribe: Perform audio transcription via a dedicated endpoint.
+        chat: Transform text with a prompt via a text-based LLM.
     """
 
     # Short, unique name (e.g., "mistral", "whisper")
@@ -56,21 +57,16 @@ class Provider(Protocol):
     def transcribe(
         self,
         audio_path: Path,
-        user_prompt: str | None,
         model: str | None = None,
         language: str | None = None,
-        transcribe_mode: bool = False,
     ) -> TranscriptionResult:
         """
-        Transcribe
+        Transcribe `audio_path` using a dedicated transcription endpoint.
 
         Args:
             audio_path: Path to an audio file (wav/mp3/opus...) to send to the provider.
-            user_prompt: Optional user prompt to guide the transcription or analysis.
             model: Optional provider-specific model identifier.
             language: Optional language hint/constraint (e.g., "en", "fr").
-            transcribe_mode: Optional bool to enable specialized modes like pure
-                transcription (default False).
 
         Returns:
             TranscriptionResult including a human-readable `text` and
@@ -81,3 +77,27 @@ class Provider(Protocol):
             Exception: For unexpected failures (network issues, serialization, etc.).
         """
         ...
+
+    def chat(
+        self,
+        text: str,
+        prompt: str,
+        model: str | None = None,
+    ) -> TranscriptionResult:
+        """
+        Transform `text` using a text-based LLM with the given `prompt`.
+
+        Args:
+            text: Input text (e.g., raw transcription) to process.
+            prompt: System prompt guiding the transformation.
+            model: Optional provider-specific model identifier for the chat LLM.
+
+        Returns:
+            TranscriptionResult including the transformed `text` and
+            provider `raw` payload.
+
+        Raises:
+            ProviderError: For known/handled provider errors (e.g., missing API key).
+            Exception: For unexpected failures (network issues, serialization, etc.).
+        """
+        ...
svx/providers/mistral.py
CHANGED
@@ -1,22 +1,18 @@
 """
 Mistral provider implementation for SuperVoxtral.
 
-This module provides a concrete Provider that uses Mistral's
-
+This module provides a concrete Provider that uses Mistral's dedicated
+transcription endpoint (Voxtral) and text-based LLM chat for transformation.
 
 Requirements:
 - User config must define [providers.mistral].api_key in config.toml.
 - Package 'mistralai' installed and importable.
 
-The provider composes messages with:
-- User content including the audio (base64) and optional user prompt text.
-
 It returns a normalized TranscriptionResult: {"text": str, "raw": dict}.
 """
 
 from __future__ import annotations
 
-import base64
 import json
 import logging
 from pathlib import Path
@@ -29,14 +25,6 @@ from .base import Provider, ProviderError, TranscriptionResult
 __all__ = ["MistralProvider"]
 
 
-def _read_file_as_base64(path: Path) -> str:
-    """
-    Read a file and return its base64-encoded string.
-    """
-    data = Path(path).read_bytes()
-    return base64.b64encode(data).decode("utf-8")
-
-
 def _extract_text_from_response(resp: Any) -> str:
     """
     Attempt to robustly extract the textual content from a Mistral response.
@@ -89,9 +77,10 @@ def _normalize_raw_response(resp: Any) -> dict[str, Any]:
 
 class MistralProvider(Provider):
     """
-    Mistral
+    Mistral provider implementation.
 
-    Uses the
+    Uses the dedicated transcription endpoint for audio-to-text
+    and the chat endpoint for text transformation via LLM.
     """
 
     name = "mistral"
@@ -103,27 +92,21 @@ class MistralProvider(Provider):
         self.api_key = mistral_cfg.api_key
         if not self.api_key:
             raise ProviderError("Missing providers.mistral.api_key in user config (config.toml).")
+        self.context_bias = cfg.defaults.context_bias
 
     def transcribe(
         self,
         audio_path: Path,
-
-        model: str | None = "voxtral-small-latest",
+        model: str | None = "voxtral-mini-latest",
         language: str | None = None,
-        transcribe_mode: bool = False,
     ) -> TranscriptionResult:
         """
-        Transcribe
+        Transcribe audio using Mistral's dedicated transcription endpoint.
 
         Args:
             audio_path: Path to wav/mp3/opus file to send.
-
-
-            model: Voxtral model identifier (default: "voxtral-small-latest" for chat,
-                "voxtral-mini-latest" for transcribe).
-            language: Optional language hint for transcription (used only in
-                transcribe_mode).
-            transcribe_mode: If True, use dedicated transcription endpoint without prompt.
+            model: Voxtral model identifier (default: "voxtral-mini-latest").
+            language: Optional language hint for transcription.
 
         Returns:
             TranscriptionResult: {"text": text, "raw": raw_dict}
@@ -143,47 +126,71 @@ class MistralProvider(Provider):
 
         client = Mistral(api_key=self.api_key)
 
-
-
-
-
-
-
-
-
-
-
+        model_name = model or "voxtral-mini-latest"
+        logging.info(
+            "Calling Mistral transcription endpoint model=%s with audio=%s (%s),"
+            " language=%s, context_bias=%d items",
+            model_name,
+            Path(audio_path).name,
+            Path(audio_path).suffix,
+            language or "auto",
+            len(self.context_bias),
+        )
+        with open(audio_path, "rb") as f:
+            resp = client.audio.transcriptions.complete(
+                model=model_name,
+                file={"content": f, "file_name": Path(audio_path).name},
+                language=language,
+                context_bias=self.context_bias if self.context_bias else None,
             )
-
-
-                    model=model_name,
-                    file={"content": f, "file_name": Path(audio_path).name},
-                    language=language,
-                )
-            text = resp.text
-            raw = _normalize_raw_response(resp)
-        else:
-            audio_b64 = _read_file_as_base64(Path(audio_path))
-
-            # Compose messages (user only)
-            messages: list[dict[str, Any]] = []
-            user_content: list[dict[str, Any]] = [{"type": "input_audio", "input_audio": audio_b64}]
-            if user_prompt:
-                user_content.append({"type": "text", "text": user_prompt})
-            messages.append({"role": "user", "content": user_content})
-
-            # Execute request
-            model_name = model or "voxtral-small-latest"
-            logging.info(
-                "Calling Mistral chat-with-audio model=%s with audio=%s (%s)",
-                model_name,
-                Path(audio_path).name,
-                Path(audio_path).suffix,
-            )
-            resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
-
-            # Extract normalized text and raw payload
-            text = _extract_text_from_response(resp)
-            raw = _normalize_raw_response(resp)
+        text = resp.text
+        raw = _normalize_raw_response(resp)
 
         return TranscriptionResult(text=text, raw=raw)
+
+    def chat(
+        self,
+        text: str,
+        prompt: str,
+        model: str | None = None,
+    ) -> TranscriptionResult:
+        """
+        Transform text using Mistral's chat endpoint with a system prompt.
+
+        Args:
+            text: Input text (e.g., raw transcription) to process.
+            prompt: System prompt guiding the transformation.
+            model: Model identifier (default: None, caller should provide).
+
+        Returns:
+            TranscriptionResult: {"text": text, "raw": raw_dict}
+
+        Raises:
+            ProviderError: for expected configuration/import errors.
+        """
+        try:
+            from mistralai import Mistral
+        except Exception as e:
+            raise ProviderError(
+                "Failed to import 'mistralai'. Ensure the 'mistralai' package is installed."
+            ) from e
+
+        client = Mistral(api_key=self.api_key)
+
+        model_name = model or "mistral-small-latest"
+        logging.info(
+            "Calling Mistral chat endpoint model=%s for text transformation",
+            model_name,
+        )
+
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": text},
+        ]
+
+        resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
+
+        result_text = _extract_text_from_response(resp)
+        raw = _normalize_raw_response(resp)
+
+        return TranscriptionResult(text=result_text, raw=raw)
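
A hypothetical end-to-end usage of the two methods above; it assumes Config.load() works without arguments here and that MistralProvider accepts cfg the way get_provider(..., cfg=...) passes it, requires a configured api_key, and performs real API calls when run:

from pathlib import Path

from svx.core.config import Config
from svx.providers.mistral import MistralProvider

cfg = Config.load()
provider = MistralProvider(cfg=cfg)  # constructor signature assumed from get_provider usage

# Step 1: dedicated transcription endpoint; context_bias comes from cfg.defaults.
transcript = provider.transcribe(Path("clip.opus"), model="voxtral-mini-latest", language="fr")

# Step 2: chat endpoint rewrites the transcript under a system prompt.
cleaned = provider.chat(
    transcript["text"],
    "Clean up this transcription. Keep the original language.",
    model="mistral-small-latest",
)
print(cleaned["text"])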
svx/ui/qt_app.py
CHANGED
@@ -119,7 +119,7 @@ class WaveformWidget(QWidget):
     waveform to indicate recording activity. It is lightweight and self-contained.
     """
 
-    def __init__(self, parent=None, height: int = 64) -> None:
+    def __init__(self, parent: QWidget | None = None, height: int = 64) -> None:
         super().__init__(parent)
         self.setMinimumHeight(height)
         self.setMaximumHeight(height)
@@ -273,10 +273,39 @@ class RecorderWorker(QObject):
             self.canceled.emit()
             return
         self.status.emit("Processing in progress...")
+        # Wait for user to select mode in the GUI
         while self.mode is None:
             time.sleep(0.05)
+
+        # Log the selected mode/key for debugging prompt application
+        try:
+            logging.info("RecorderWorker: selected mode/key: %s", self.mode)
+        except Exception:
+            # ensure failures in logging don't break the worker
+            pass
+
         transcribe_mode = self.mode == "transcribe"
-
+        if transcribe_mode:
+            user_prompt = None
+        else:
+            # Resolve the user prompt for the selected key and log a short snippet
+            user_prompt = self._resolve_user_prompt(self.mode)
+            try:
+                if user_prompt:
+                    snippet = (
+                        user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt
+                    )
+                else:
+                    snippet = "<EMPTY>"
+                logging.info(
+                    "RecorderWorker: resolved prompt snippet for key '%s': %s",
+                    self.mode,
+                    snippet,
+                )
+            except Exception:
+                # avoid breaking the flow on logging errors
+                pass
+
         result = pipeline.process(wav_path, duration, transcribe_mode, user_prompt)
         keep_audio = self.save_all or self.cfg.defaults.keep_audio_files
         pipeline.clean(wav_path, result["paths"], keep_audio)
@@ -383,13 +412,15 @@ class RecorderWindow(QWidget):
         button_layout.addStretch()
         self._transcribe_btn = QPushButton("Transcribe")
         self._transcribe_btn.setToolTip("Stop and transcribe without prompt")
-        self._transcribe_btn.clicked.connect(
+        self._transcribe_btn.clicked.connect(
+            lambda checked=False, m="transcribe": self._on_mode_selected(m)
+        )
         button_layout.addWidget(self._transcribe_btn)
         self._prompt_buttons: dict[str, QPushButton] = {}
         for key in self.prompt_keys:
             btn = QPushButton(key.capitalize())
             btn.setToolTip(f"Stop and transcribe with '{key}' prompt")
-            btn.clicked.connect(lambda k=key: self._on_mode_selected(k))
+            btn.clicked.connect(lambda checked=False, k=key: self._on_mode_selected(k))
             self._prompt_buttons[key] = btn
             button_layout.addWidget(btn)
         self._cancel_btn = QPushButton("Cancel")
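
The connect() changes fix two classic pitfalls at once: QPushButton.clicked emits a checked boolean, which the old bare `lambda k=key:` would swallow into k, and closures created in a loop capture the loop variable late. A Qt-free sketch of the late-binding half:

# Late binding: every closure sees the final value of the loop variable.
callbacks = [lambda: key for key in ("transcribe", "clean", "email")]
print([cb() for cb in callbacks])  # ['email', 'email', 'email']

# Binding the value as a default argument freezes it per iteration, which is
# what `lambda checked=False, k=key: ...` does in the diff (checked absorbs
# the boolean that Qt's clicked signal passes as the first argument).
fixed = [lambda k=key: k for key in ("transcribe", "clean", "email")]
print([cb() for cb in fixed])  # ['transcribe', 'clean', 'email']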
supervoxtral-0.1.4.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
-svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
-svx/cli.py,sha256=3AirsBynuq2rcz-4C8hbZ69JztkgA7LTMGmL6ym7nyY,9167
-svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
-svx/core/audio.py,sha256=r0m5T1uzdsJ1j9YXgQ5clv15dvMwZBp_bk2aLpjnrkc,7684
-svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
-svx/core/config.py,sha256=e2tTGjjPcUYFctB28Ha90G-W44mF_0eWey1zpSyZkBo,16095
-svx/core/pipeline.py,sha256=nqvCgK5Pbyx18mfACrN_mIDt546Bh7fKA6MF4XG1hxM,10637
-svx/core/prompt.py,sha256=yO8UbpFg7n1IT7wFjSQ7NUTbrqxuwPhdnxkTH4Iu7XU,5967
-svx/core/storage.py,sha256=5_xKYEpvDhaixRxmSTBlyX_jt8ssjHwHzX9VodcrtJw,3213
-svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
-svx/providers/base.py,sha256=YoiI8KWVRGISh7dx9XXPr1Q1a7ZDu8vfeJFlPbcKr20,2695
-svx/providers/mistral.py,sha256=vrBatNZg0zGNkJ5Qfnfz6ZwP6QtBgIt9sT_w59zkSO0,6636
-svx/ui/qt_app.py,sha256=6LOMeMjkMmYylu6H_prDRmPDsL0s4PVMZqfbflByCMs,18808
-supervoxtral-0.1.4.dist-info/METADATA,sha256=0w_i5geOKu8F9x7eviNboDNt-PTy6FS3WHe3cCx4eHg,753
-supervoxtral-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-supervoxtral-0.1.4.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
-supervoxtral-0.1.4.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
-supervoxtral-0.1.4.dist-info/RECORD,,
{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/entry_points.txt
File without changes

{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/licenses/LICENSE
File without changes