supervoxtral 0.1.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: supervoxtral
3
- Version: 0.1.5
4
- Summary: CLI/GUI audio recorder and transcription client using Mistral Voxtral (chat with audio and transcription).
3
+ Version: 0.3.0
4
+ Summary: CLI/GUI audio recorder with 2-step pipeline: transcription (Voxtral) then text transformation (LLM).
5
5
  License: MIT
6
6
  License-File: LICENSE
7
7
  Keywords: audio,cli,gui,mistral,transcription,voxtral,whisper
@@ -14,10 +14,7 @@ Requires-Dist: sounddevice
14
14
  Requires-Dist: soundfile
15
15
  Requires-Dist: typer
16
16
  Provides-Extra: dev
17
- Requires-Dist: black; extra == 'dev'
18
- Requires-Dist: mypy; extra == 'dev'
19
- Requires-Dist: pytest; extra == 'dev'
17
+ Requires-Dist: basedpyright; extra == 'dev'
20
18
  Requires-Dist: ruff; extra == 'dev'
21
- Requires-Dist: types-python-dotenv; extra == 'dev'
22
19
  Provides-Extra: gui
23
20
  Requires-Dist: pyside6-essentials; extra == 'gui'
@@ -0,0 +1,18 @@
1
+ svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
2
+ svx/cli.py,sha256=7fzs85LT85RbZYtI8t-yOXKrRd9r-IzE1hnFJHNgxL4,9436
3
+ svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
4
+ svx/core/audio.py,sha256=svyRWbPaUyYqbmGaLF8oUim-x5mj9zciv0XCqq2VGEU,7828
5
+ svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
6
+ svx/core/config.py,sha256=Ib_lIKUFriW-B9i49zTUZE-YcOQYEkpBE3CF_WDzFlg,17060
7
+ svx/core/pipeline.py,sha256=GhaOJtHGiwwsSv2EkNM-ZKu0DSm25xDeI9sNTLw7YJU,11612
8
+ svx/core/prompt.py,sha256=OpS3XgusRwV4JP9cCzyk0DXcphcLcgHIPV89eoc2vFc,7282
9
+ svx/core/storage.py,sha256=_w_rTOPoqlz0eoD2XHPNvHPQXgs6QxZ7SP4_IBT8Bx4,3223
10
+ svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
11
+ svx/providers/base.py,sha256=D_iDjhJuAGye-JjWbO-Rtl131kD6hgYQaspO53-6spo,3238
12
+ svx/providers/mistral.py,sha256=ZkA02KDU-2ktdBM2tKUmTG8ZVnp8suE4g1TrPBpmqDA,6439
13
+ svx/ui/qt_app.py,sha256=FDdxcgqzHi5HCsbmCzQtVEFTfcVaDLfiVgQe_8YHHoY,19993
14
+ supervoxtral-0.3.0.dist-info/METADATA,sha256=vnKZuM96g1le-da4EVKOsECuSvIVpneE0xKq-DY-2_E,629
15
+ supervoxtral-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
16
+ supervoxtral-0.3.0.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
17
+ supervoxtral-0.3.0.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
18
+ supervoxtral-0.3.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
svx/cli.py CHANGED
@@ -162,10 +162,14 @@ def record(
162
162
  ),
163
163
  ):
164
164
  """
165
- Record audio from the microphone and send it to the selected provider.
165
+ Record audio from the microphone and process it via a 2-step pipeline.
166
+
167
+ Pipeline:
168
+ 1. Transcription: audio -> text via dedicated transcription endpoint (always).
169
+ 2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided).
166
170
 
167
171
  This CLI accepts only a small set of runtime flags. Most defaults (provider, format,
168
- model, language, sample rate, channels, device,
172
+ model, chat_model, language, sample rate, channels, device,
169
173
  file retention, copy-to-clipboard)
170
174
  must be configured in the user's `config.toml` under [defaults].
171
175
 
@@ -178,11 +182,12 @@ def record(
178
182
  Flow:
179
183
  - Records WAV until you press Enter (CLI mode).
180
184
  - Optionally converts to MP3/Opus depending on config.
181
- - Sends the file per provider rules.
185
+ - Transcribes via dedicated endpoint (step 1).
186
+ - If a prompt is provided, transforms the transcript via LLM (step 2).
182
187
  - Prints and saves the result.
183
188
 
184
189
  Note: In --transcribe mode, prompts (--user-prompt or --user-prompt-file) are ignored,
185
- as it uses a dedicated transcription endpoint without prompting.
190
+ and only step 1 (transcription) is performed.
186
191
  """
187
192
  cfg = Config.load(log_level=log_level)
188
193
 
svx/core/audio.py CHANGED
@@ -22,6 +22,7 @@ from pathlib import Path
22
22
  from threading import Event, Thread
23
23
  from typing import Any
24
24
 
25
+ import numpy as np
25
26
  import sounddevice as sd
26
27
  import soundfile as sf
27
28
 
@@ -149,7 +150,12 @@ def record_wav(
149
150
  writer_stop = Event()
150
151
  start_time = time.time()
151
152
 
152
- def audio_callback(indata, frames, time_info, status):
153
+ def audio_callback(
154
+ indata: np.ndarray[Any, np.dtype[np.int16]],
155
+ frames: int,
156
+ time_info: sd.CallbackFlags,
157
+ status: sd.CallbackFlags,
158
+ ) -> None:
153
159
  if status:
154
160
  logging.warning("SoundDevice status: %s", status)
155
161
  q.put(indata.copy())
svx/core/config.py CHANGED
@@ -220,10 +220,17 @@ def init_user_config(force: bool = False, prompt_file: Path | None = None) -> Pa
220
220
  '# File format sent to the provider: "wav" | "mp3" | "opus"\n'
221
221
  '# Recording is always WAV; conversion is applied if "mp3" or "opus"\n'
222
222
  'format = "opus"\n\n'
223
- "# Model to use on the provider side (example for Mistral Voxtral)\n"
223
+ "# Model for audio transcription (dedicated endpoint)\n"
224
224
  'model = "voxtral-mini-latest"\n\n'
225
+ "# Model for text transformation via LLM\n"
226
+ "# (applied after transcription when a prompt is used)\n"
227
+ 'chat_model = "mistral-small-latest"\n\n'
225
228
  "# Language hint (may help the provider)\n"
226
229
  'language = "fr"\n\n'
230
+ "# Context bias: up to 100 words/phrases to help recognize specific vocabulary\n"
231
+ "# (proper nouns, technical terms, brand names, etc.)\n"
232
+ '# context_bias = ["SuperVoxtral", "Mistral AI", "Voxtral"]\n'
233
+ "context_bias = []\n\n"
227
234
  "# Audio recording parameters\n"
228
235
  "rate = 16000\n"
229
236
  "channels = 1\n"
@@ -271,7 +278,9 @@ class DefaultsConfig:
271
278
  provider: str = "mistral"
272
279
  format: str = "opus"
273
280
  model: str = "voxtral-mini-latest"
281
+ chat_model: str = "mistral-small-latest"
274
282
  language: str | None = None
283
+ context_bias: list[str] = field(default_factory=list)
275
284
  rate: int = 16000
276
285
  channels: int = 1
277
286
  device: str | None = None
@@ -315,7 +324,11 @@ class Config:
315
324
  "provider": str(user_defaults_raw.get("provider", "mistral")),
316
325
  "format": str(user_defaults_raw.get("format", "opus")),
317
326
  "model": str(user_defaults_raw.get("model", "voxtral-mini-latest")),
327
+ "chat_model": str(user_defaults_raw.get("chat_model", "mistral-small-latest")),
318
328
  "language": user_defaults_raw.get("language"),
329
+ "context_bias": list(user_defaults_raw.get("context_bias", []))
330
+ if isinstance(user_defaults_raw.get("context_bias"), list)
331
+ else [],
319
332
  "rate": int(user_defaults_raw.get("rate", 16000)),
320
333
  "channels": int(user_defaults_raw.get("channels", 1)),
321
334
  "device": user_defaults_raw.get("device"),
@@ -335,6 +348,9 @@ class Config:
335
348
  format_ = defaults_data["format"]
336
349
  if format_ not in {"wav", "mp3", "opus"}:
337
350
  raise ValueError("format must be one of wav|mp3|opus")
351
+ context_bias = defaults_data["context_bias"]
352
+ if len(context_bias) > 100:
353
+ raise ValueError("context_bias cannot contain more than 100 items (Mistral API limit)")
338
354
  defaults = DefaultsConfig(**defaults_data)
339
355
  # Conditional output directories
340
356
  if defaults.keep_audio_files:
svx/core/pipeline.py CHANGED
@@ -12,18 +12,23 @@ import svx.core.config as config
12
12
  from svx.core.audio import convert_audio, record_wav, timestamp
13
13
  from svx.core.clipboard import copy_to_clipboard
14
14
  from svx.core.config import Config
15
- from svx.core.storage import save_transcript
15
+ from svx.core.storage import save_text_file, save_transcript
16
16
  from svx.providers import get_provider
17
17
 
18
18
 
19
19
  class RecordingPipeline:
20
20
  """
21
- Centralized pipeline for recording audio, transcribing via provider, saving outputs,
22
- and copying to clipboard. Handles temporary files when not keeping audio.
21
+ Centralized pipeline for recording audio, transcribing via provider, optionally
22
+ transforming with a text LLM, saving outputs, and copying to clipboard.
23
23
 
24
+ Pipeline steps:
25
+ 1. Transcription: audio -> text via dedicated transcription endpoint (always)
26
+ 2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided)
27
+
28
+ Handles temporary files when not keeping audio.
24
29
  Supports runtime overrides like save_all for keeping all files and adding log handlers.
25
30
  Optional progress_callback for status updates (e.g., for GUI).
26
- Supports transcribe_mode for pure transcription without prompt using dedicated endpoint.
31
+ Supports transcribe_mode for pure transcription without prompt (step 1 only).
27
32
  """
28
33
 
29
34
  def __init__(
@@ -136,31 +141,26 @@ class RecordingPipeline:
136
141
  self, wav_path: Path, duration: float, transcribe_mode: bool, user_prompt: str | None = None
137
142
  ) -> dict[str, Any]:
138
143
  """
139
- Process recorded audio: convert if needed, transcribe, save, copy.
144
+ Process recorded audio: convert if needed, transcribe, optionally transform, save, copy.
145
+
146
+ Pipeline:
147
+ 1. Transcription: audio -> text via dedicated endpoint (always)
148
+ 2. Transformation: text + prompt -> text via LLM (when prompt is provided)
140
149
 
141
150
  Args:
142
151
  wav_path: Path to the recorded WAV file.
143
152
  duration: Recording duration in seconds.
144
- transcribe_mode: Whether to use pure transcription mode.
145
- user_prompt: User prompt to use (None for transcribe_mode).
153
+ transcribe_mode: Whether to use pure transcription mode (step 1 only).
154
+ user_prompt: User prompt to use for transformation (None for transcribe_mode).
146
155
 
147
156
  Returns:
148
- Dict with 'text' (str), 'raw' (dict), 'duration' (float),
149
- 'paths' (dict of Path or None).
157
+ Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
158
+ 'duration' (float), 'paths' (dict of Path or None).
150
159
  """
151
160
  # Resolve parameters
152
161
  provider = self.cfg.defaults.provider
153
162
  audio_format = self.cfg.defaults.format
154
163
  model = self.cfg.defaults.model
155
- original_model = model
156
- if transcribe_mode:
157
- model = "voxtral-mini-latest"
158
- if original_model != "voxtral-mini-latest":
159
- logging.warning(
160
- "Transcribe mode: model override from '%s' to 'voxtral-mini-latest'\n"
161
- "(optimized for transcription).",
162
- original_model,
163
- )
164
164
  language = self.cfg.defaults.language
165
165
  if wav_path.stem.endswith(".wav"):
166
166
  base = wav_path.stem.replace(".wav", "")
@@ -176,9 +176,9 @@ class RecordingPipeline:
176
176
  final_user_prompt = self.cfg.resolve_prompt(self.user_prompt, self.user_prompt_file)
177
177
  else:
178
178
  final_user_prompt = user_prompt
179
- self._status("Transcribe mode not activated: using prompt.")
179
+ self._status("Prompt mode: transcription then transformation.")
180
180
  else:
181
- self._status("Transcribe mode activated: no prompt used.")
181
+ self._status("Transcribe mode: transcription only, no prompt.")
182
182
 
183
183
  logging.debug(f"Applied prompt: {final_user_prompt or 'None (transcribe mode)'}")
184
184
 
@@ -194,18 +194,22 @@ class RecordingPipeline:
194
194
  paths["converted"] = to_send_path
195
195
  _converted = True
196
196
 
197
- # Transcribe
197
+ # Step 1: Transcription (always)
198
198
  self._status("Transcribing...")
199
199
  prov = get_provider(provider, cfg=self.cfg)
200
- result = prov.transcribe(
201
- to_send_path,
202
- user_prompt=final_user_prompt,
203
- model=model,
204
- language=language,
205
- transcribe_mode=transcribe_mode,
206
- )
207
- text = result["text"]
208
- raw = result["raw"]
200
+ result = prov.transcribe(to_send_path, model=model, language=language)
201
+ raw_transcript = result["text"]
202
+
203
+ # Step 2: Transformation (if prompt)
204
+ if not transcribe_mode and final_user_prompt:
205
+ self._status("Applying prompt...")
206
+ chat_model = self.cfg.defaults.chat_model
207
+ chat_result = prov.chat(raw_transcript, final_user_prompt, model=chat_model)
208
+ text = chat_result["text"]
209
+ raw = {"transcription": result["raw"], "transformation": chat_result["raw"]}
210
+ else:
211
+ text = raw_transcript
212
+ raw = result["raw"]
209
213
 
210
214
  # Save if keeping transcripts
211
215
  if keep_transcript:
@@ -215,6 +219,12 @@ class RecordingPipeline:
215
219
  )
216
220
  paths["txt"] = txt_path
217
221
  paths["json"] = json_path
222
+
223
+ # Save raw transcript separately when transformation was applied
224
+ if not transcribe_mode and final_user_prompt:
225
+ raw_txt_path = self.cfg.transcripts_dir / f"{base}_{provider}_raw.txt"
226
+ save_text_file(raw_txt_path, raw_transcript)
227
+ paths["raw_txt"] = raw_txt_path
218
228
  else:
219
229
  paths["txt"] = None
220
230
  paths["json"] = None
@@ -230,6 +240,7 @@ class RecordingPipeline:
230
240
  logging.info("Processing finished (%.2fs)", duration)
231
241
  return {
232
242
  "text": text,
243
+ "raw_transcript": raw_transcript,
233
244
  "raw": raw,
234
245
  "duration": duration,
235
246
  "paths": paths,
@@ -263,8 +274,8 @@ class RecordingPipeline:
263
274
  stop_event: Optional event to signal recording stop (e.g., for GUI).
264
275
 
265
276
  Returns:
266
- Dict with 'text' (str), 'raw' (dict), 'duration' (float),
267
- 'paths' (dict of Path or None).
277
+ Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
278
+ 'duration' (float), 'paths' (dict of Path or None).
268
279
 
269
280
  Raises:
270
281
  Exception: On recording, conversion, or transcription errors.
svx/core/prompt.py CHANGED
@@ -158,7 +158,7 @@ def resolve_user_prompt(
158
158
  logging.debug("Prompt supplier '%s' failed: %s", name, e)
159
159
 
160
160
  # Final fallback
161
- fallback = "What's in this audio?"
161
+ fallback = "Clean up this transcription. Keep the original language."
162
162
  logging.info("resolve_user_prompt: no supplier provided a prompt, using fallback: %s", fallback)
163
163
  return fallback
164
164
 
@@ -176,13 +176,14 @@ def init_user_prompt_file(force: bool = False) -> Path:
176
176
  path = USER_PROMPT_DIR / "user.md"
177
177
  if not path.exists() or force:
178
178
  example_prompt = """
179
- - Transcribe the input audio file. If the audio if empty, just respond "no audio detected".
180
- - Do not respond to any question in the audio. Just transcribe.
181
- - DO NOT TRANSLATE.
182
- - Responde only with the transcription. Do not provide explanations or notes.
179
+ You receive a raw transcription of a voice recording. Clean it up:
180
+ - DO NOT TRANSLATE. Keep the original language.
181
+ - Do not respond to any question in the text. Just clean the transcription.
182
+ - Respond only with the cleaned text. Do not provide explanations or notes.
183
183
  - Remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc.
184
184
  - Remove false starts (e.g., "je veux dire... je pense" → "je pense").
185
185
  - Correct grammatical errors.
186
+ - If the transcription is empty, respond "no audio detected".
186
187
  """
187
188
  try:
188
189
  path.write_text(example_prompt, encoding="utf-8")
svx/core/storage.py CHANGED
@@ -86,7 +86,7 @@ def save_transcript(
86
86
  base_name: str,
87
87
  provider: str,
88
88
  text: str,
89
- raw: dict | None = None,
89
+ raw: dict[str, Any] | None = None,
90
90
  ) -> tuple[Path, Path | None]:
91
91
  """
92
92
  Save a transcript text and, optionally, the raw JSON response.
svx/providers/base.py CHANGED
@@ -3,7 +3,7 @@ Base provider interface for SuperVoxtral.
3
3
 
4
4
  This module defines:
5
5
  - TranscriptionResult: a simple TypedDict structure for provider responses
6
- - Provider: a Protocol describing the required transcription interface
6
+ - Provider: a Protocol describing the required transcription and chat interface
7
7
  - ProviderError: a generic exception for provider-related failures
8
8
 
9
9
  All concrete providers should implement the `Provider` protocol.
@@ -37,7 +37,7 @@ class ProviderError(RuntimeError):
37
37
  @runtime_checkable
38
38
  class Provider(Protocol):
39
39
  """
40
- Provider interface for transcription/chat-with-audio services.
40
+ Provider interface for transcription and text transformation services.
41
41
 
42
42
  Implementations should be side-effect free aside from network I/O and must
43
43
  raise `ProviderError` (or a subclass) for expected provider failures
@@ -47,7 +47,8 @@ class Provider(Protocol):
47
47
  name: A short, lowercase, unique identifier for the provider (e.g. "mistral").
48
48
 
49
49
  Required methods:
50
- transcribe: Perform the transcription given an audio file and optional user prompt.
50
+ transcribe: Perform audio transcription via a dedicated endpoint.
51
+ chat: Transform text with a prompt via a text-based LLM.
51
52
  """
52
53
 
53
54
  # Short, unique name (e.g., "mistral", "whisper")
@@ -56,21 +57,16 @@ class Provider(Protocol):
56
57
  def transcribe(
57
58
  self,
58
59
  audio_path: Path,
59
- user_prompt: str | None,
60
60
  model: str | None = None,
61
61
  language: str | None = None,
62
- transcribe_mode: bool = False,
63
62
  ) -> TranscriptionResult:
64
63
  """
65
- Transcribe or process `audio_path` and return a normalized result.
64
+ Transcribe `audio_path` using a dedicated transcription endpoint.
66
65
 
67
66
  Args:
68
67
  audio_path: Path to an audio file (wav/mp3/opus...) to send to the provider.
69
- user_prompt: Optional user prompt to guide the transcription or analysis.
70
68
  model: Optional provider-specific model identifier.
71
69
  language: Optional language hint/constraint (e.g., "en", "fr").
72
- transcribe_mode: Optional bool to enable specialized modes like pure
73
- transcription (default False).
74
70
 
75
71
  Returns:
76
72
  TranscriptionResult including a human-readable `text` and
@@ -81,3 +77,27 @@ class Provider(Protocol):
81
77
  Exception: For unexpected failures (network issues, serialization, etc.).
82
78
  """
83
79
  ...
80
+
81
+ def chat(
82
+ self,
83
+ text: str,
84
+ prompt: str,
85
+ model: str | None = None,
86
+ ) -> TranscriptionResult:
87
+ """
88
+ Transform `text` using a text-based LLM with the given `prompt`.
89
+
90
+ Args:
91
+ text: Input text (e.g., raw transcription) to process.
92
+ prompt: System prompt guiding the transformation.
93
+ model: Optional provider-specific model identifier for the chat LLM.
94
+
95
+ Returns:
96
+ TranscriptionResult including the transformed `text` and
97
+ provider `raw` payload.
98
+
99
+ Raises:
100
+ ProviderError: For known/handled provider errors (e.g., missing API key).
101
+ Exception: For unexpected failures (network issues, serialization, etc.).
102
+ """
103
+ ...
svx/providers/mistral.py CHANGED
@@ -1,22 +1,18 @@
1
1
  """
2
2
  Mistral provider implementation for SuperVoxtral.
3
3
 
4
- This module provides a concrete Provider that uses Mistral's
5
- "chat with audio" capability (Voxtral) to process audio and return text.
4
+ This module provides a concrete Provider that uses Mistral's dedicated
5
+ transcription endpoint (Voxtral) and text-based LLM chat for transformation.
6
6
 
7
7
  Requirements:
8
8
  - User config must define [providers.mistral].api_key in config.toml.
9
9
  - Package 'mistralai' installed and importable.
10
10
 
11
- The provider composes messages with:
12
- - User content including the audio (base64) and optional user prompt text.
13
-
14
11
  It returns a normalized TranscriptionResult: {"text": str, "raw": dict}.
15
12
  """
16
13
 
17
14
  from __future__ import annotations
18
15
 
19
- import base64
20
16
  import json
21
17
  import logging
22
18
  from pathlib import Path
@@ -29,14 +25,6 @@ from .base import Provider, ProviderError, TranscriptionResult
29
25
  __all__ = ["MistralProvider"]
30
26
 
31
27
 
32
- def _read_file_as_base64(path: Path) -> str:
33
- """
34
- Read a file and return its base64-encoded string.
35
- """
36
- data = Path(path).read_bytes()
37
- return base64.b64encode(data).decode("utf-8")
38
-
39
-
40
28
  def _extract_text_from_response(resp: Any) -> str:
41
29
  """
42
30
  Attempt to robustly extract the textual content from a Mistral response.
@@ -89,9 +77,10 @@ def _normalize_raw_response(resp: Any) -> dict[str, Any]:
89
77
 
90
78
  class MistralProvider(Provider):
91
79
  """
92
- Mistral Voxtral provider implementation.
80
+ Mistral provider implementation.
93
81
 
94
- Uses the Mistral Python SDK to call `chat.with_audio` endpoint.
82
+ Uses the dedicated transcription endpoint for audio-to-text
83
+ and the chat endpoint for text transformation via LLM.
95
84
  """
96
85
 
97
86
  name = "mistral"
@@ -103,27 +92,21 @@ class MistralProvider(Provider):
103
92
  self.api_key = mistral_cfg.api_key
104
93
  if not self.api_key:
105
94
  raise ProviderError("Missing providers.mistral.api_key in user config (config.toml).")
95
+ self.context_bias = cfg.defaults.context_bias
106
96
 
107
97
  def transcribe(
108
98
  self,
109
99
  audio_path: Path,
110
- user_prompt: str | None,
111
- model: str | None = "voxtral-small-latest",
100
+ model: str | None = "voxtral-mini-latest",
112
101
  language: str | None = None,
113
- transcribe_mode: bool = False,
114
102
  ) -> TranscriptionResult:
115
103
  """
116
- Transcribe/process audio using Mistral's chat-with-audio or transcription endpoint.
104
+ Transcribe audio using Mistral's dedicated transcription endpoint.
117
105
 
118
106
  Args:
119
107
  audio_path: Path to wav/mp3/opus file to send.
120
- user_prompt: Optional user prompt to include with the audio
121
- (ignored in transcribe_mode).
122
- model: Voxtral model identifier (default: "voxtral-small-latest" for chat,
123
- "voxtral-mini-latest" for transcribe).
124
- language: Optional language hint for transcription (used only in
125
- transcribe_mode).
126
- transcribe_mode: If True, use dedicated transcription endpoint without prompt.
108
+ model: Voxtral model identifier (default: "voxtral-mini-latest").
109
+ language: Optional language hint for transcription.
127
110
 
128
111
  Returns:
129
112
  TranscriptionResult: {"text": text, "raw": raw_dict}
@@ -143,47 +126,71 @@ class MistralProvider(Provider):
143
126
 
144
127
  client = Mistral(api_key=self.api_key)
145
128
 
146
- if transcribe_mode:
147
- if user_prompt:
148
- logging.warning("Transcribe mode: user_prompt is ignored.")
149
- model_name = model or "voxtral-mini-latest"
150
- logging.info(
151
- "Calling Mistral transcription endpoint model=%s with audio=%s (%s), language=%s",
152
- model_name,
153
- Path(audio_path).name,
154
- Path(audio_path).suffix,
155
- language or "auto",
129
+ model_name = model or "voxtral-mini-latest"
130
+ logging.info(
131
+ "Calling Mistral transcription endpoint model=%s with audio=%s (%s),"
132
+ " language=%s, context_bias=%d items",
133
+ model_name,
134
+ Path(audio_path).name,
135
+ Path(audio_path).suffix,
136
+ language or "auto",
137
+ len(self.context_bias),
138
+ )
139
+ with open(audio_path, "rb") as f:
140
+ resp = client.audio.transcriptions.complete(
141
+ model=model_name,
142
+ file={"content": f, "file_name": Path(audio_path).name},
143
+ language=language,
144
+ context_bias=self.context_bias if self.context_bias else None,
156
145
  )
157
- with open(audio_path, "rb") as f:
158
- resp = client.audio.transcriptions.complete(
159
- model=model_name,
160
- file={"content": f, "file_name": Path(audio_path).name},
161
- language=language,
162
- )
163
- text = resp.text
164
- raw = _normalize_raw_response(resp)
165
- else:
166
- audio_b64 = _read_file_as_base64(Path(audio_path))
167
-
168
- # Compose messages (user only)
169
- messages: list[dict[str, Any]] = []
170
- user_content: list[dict[str, Any]] = [{"type": "input_audio", "input_audio": audio_b64}]
171
- if user_prompt:
172
- user_content.append({"type": "text", "text": user_prompt})
173
- messages.append({"role": "user", "content": user_content})
174
-
175
- # Execute request
176
- model_name = model or "voxtral-small-latest"
177
- logging.info(
178
- "Calling Mistral chat-with-audio model=%s with audio=%s (%s)",
179
- model_name,
180
- Path(audio_path).name,
181
- Path(audio_path).suffix,
182
- )
183
- resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
184
-
185
- # Extract normalized text and raw payload
186
- text = _extract_text_from_response(resp)
187
- raw = _normalize_raw_response(resp)
146
+ text = resp.text
147
+ raw = _normalize_raw_response(resp)
188
148
 
189
149
  return TranscriptionResult(text=text, raw=raw)
150
+
151
+ def chat(
152
+ self,
153
+ text: str,
154
+ prompt: str,
155
+ model: str | None = None,
156
+ ) -> TranscriptionResult:
157
+ """
158
+ Transform text using Mistral's chat endpoint with a system prompt.
159
+
160
+ Args:
161
+ text: Input text (e.g., raw transcription) to process.
162
+ prompt: System prompt guiding the transformation.
163
+ model: Model identifier (default: None, caller should provide).
164
+
165
+ Returns:
166
+ TranscriptionResult: {"text": text, "raw": raw_dict}
167
+
168
+ Raises:
169
+ ProviderError: for expected configuration/import errors.
170
+ """
171
+ try:
172
+ from mistralai import Mistral
173
+ except Exception as e:
174
+ raise ProviderError(
175
+ "Failed to import 'mistralai'. Ensure the 'mistralai' package is installed."
176
+ ) from e
177
+
178
+ client = Mistral(api_key=self.api_key)
179
+
180
+ model_name = model or "mistral-small-latest"
181
+ logging.info(
182
+ "Calling Mistral chat endpoint model=%s for text transformation",
183
+ model_name,
184
+ )
185
+
186
+ messages: list[dict[str, Any]] = [
187
+ {"role": "system", "content": prompt},
188
+ {"role": "user", "content": text},
189
+ ]
190
+
191
+ resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
192
+
193
+ result_text = _extract_text_from_response(resp)
194
+ raw = _normalize_raw_response(resp)
195
+
196
+ return TranscriptionResult(text=result_text, raw=raw)
svx/ui/qt_app.py CHANGED
@@ -119,7 +119,7 @@ class WaveformWidget(QWidget):
119
119
  waveform to indicate recording activity. It is lightweight and self-contained.
120
120
  """
121
121
 
122
- def __init__(self, parent=None, height: int = 64) -> None:
122
+ def __init__(self, parent: QWidget | None = None, height: int = 64) -> None:
123
123
  super().__init__(parent)
124
124
  self.setMinimumHeight(height)
125
125
  self.setMaximumHeight(height)
@@ -1,18 +0,0 @@
1
- svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
2
- svx/cli.py,sha256=3AirsBynuq2rcz-4C8hbZ69JztkgA7LTMGmL6ym7nyY,9167
3
- svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
4
- svx/core/audio.py,sha256=r0m5T1uzdsJ1j9YXgQ5clv15dvMwZBp_bk2aLpjnrkc,7684
5
- svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
6
- svx/core/config.py,sha256=e2tTGjjPcUYFctB28Ha90G-W44mF_0eWey1zpSyZkBo,16095
7
- svx/core/pipeline.py,sha256=AjNTzx8eFyWjI4VQYgKGELdo4soWUJZCn3861yIE5i0,10728
8
- svx/core/prompt.py,sha256=VAHBSL0UESrkbalT6s7SaMc9IQ0OJFKBFO1bEmXnRrI,7172
9
- svx/core/storage.py,sha256=5_xKYEpvDhaixRxmSTBlyX_jt8ssjHwHzX9VodcrtJw,3213
10
- svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
11
- svx/providers/base.py,sha256=YoiI8KWVRGISh7dx9XXPr1Q1a7ZDu8vfeJFlPbcKr20,2695
12
- svx/providers/mistral.py,sha256=vrBatNZg0zGNkJ5Qfnfz6ZwP6QtBgIt9sT_w59zkSO0,6636
13
- svx/ui/qt_app.py,sha256=yww_sH0zFI2HEQTRFcR1LW0N4duFqlW-HDo6SWelkWo,19975
14
- supervoxtral-0.1.5.dist-info/METADATA,sha256=Nq09MVEhqC5mjfVQCNaL2LL4HSgaQB-aQdJTp0-7Gk4,753
15
- supervoxtral-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- supervoxtral-0.1.5.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
17
- supervoxtral-0.1.5.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
18
- supervoxtral-0.1.5.dist-info/RECORD,,