supervoxtral-0.1.4-py3-none-any.whl → supervoxtral-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/METADATA +3 -6
- supervoxtral-0.3.0.dist-info/RECORD +18 -0
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/WHEEL +1 -1
- svx/cli.py +9 -4
- svx/core/audio.py +7 -1
- svx/core/config.py +17 -1
- svx/core/pipeline.py +45 -32
- svx/core/prompt.py +37 -12
- svx/core/storage.py +1 -1
- svx/providers/base.py +29 -9
- svx/providers/mistral.py +75 -68
- svx/ui/qt_app.py +35 -4
- supervoxtral-0.1.4.dist-info/RECORD +0 -18
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/entry_points.txt +0 -0
- {supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/licenses/LICENSE +0 -0
{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: supervoxtral
-Version: 0.1.4
-Summary: CLI/GUI audio recorder
+Version: 0.3.0
+Summary: CLI/GUI audio recorder with 2-step pipeline: transcription (Voxtral) then text transformation (LLM).
 License: MIT
 License-File: LICENSE
 Keywords: audio,cli,gui,mistral,transcription,voxtral,whisper
@@ -14,10 +14,7 @@ Requires-Dist: sounddevice
 Requires-Dist: soundfile
 Requires-Dist: typer
 Provides-Extra: dev
-Requires-Dist:
-Requires-Dist: mypy; extra == 'dev'
-Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: basedpyright; extra == 'dev'
 Requires-Dist: ruff; extra == 'dev'
-Requires-Dist: types-python-dotenv; extra == 'dev'
 Provides-Extra: gui
 Requires-Dist: pyside6-essentials; extra == 'gui'
supervoxtral-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
+svx/cli.py,sha256=7fzs85LT85RbZYtI8t-yOXKrRd9r-IzE1hnFJHNgxL4,9436
+svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
+svx/core/audio.py,sha256=svyRWbPaUyYqbmGaLF8oUim-x5mj9zciv0XCqq2VGEU,7828
+svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
+svx/core/config.py,sha256=Ib_lIKUFriW-B9i49zTUZE-YcOQYEkpBE3CF_WDzFlg,17060
+svx/core/pipeline.py,sha256=GhaOJtHGiwwsSv2EkNM-ZKu0DSm25xDeI9sNTLw7YJU,11612
+svx/core/prompt.py,sha256=OpS3XgusRwV4JP9cCzyk0DXcphcLcgHIPV89eoc2vFc,7282
+svx/core/storage.py,sha256=_w_rTOPoqlz0eoD2XHPNvHPQXgs6QxZ7SP4_IBT8Bx4,3223
+svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
+svx/providers/base.py,sha256=D_iDjhJuAGye-JjWbO-Rtl131kD6hgYQaspO53-6spo,3238
+svx/providers/mistral.py,sha256=ZkA02KDU-2ktdBM2tKUmTG8ZVnp8suE4g1TrPBpmqDA,6439
+svx/ui/qt_app.py,sha256=FDdxcgqzHi5HCsbmCzQtVEFTfcVaDLfiVgQe_8YHHoY,19993
+supervoxtral-0.3.0.dist-info/METADATA,sha256=vnKZuM96g1le-da4EVKOsECuSvIVpneE0xKq-DY-2_E,629
+supervoxtral-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+supervoxtral-0.3.0.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
+supervoxtral-0.3.0.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
+supervoxtral-0.3.0.dist-info/RECORD,,
svx/cli.py
CHANGED
@@ -162,10 +162,14 @@ def record(
     ),
 ):
     """
-    Record audio from the microphone and
+    Record audio from the microphone and process it via a 2-step pipeline.
+
+    Pipeline:
+      1. Transcription: audio -> text via dedicated transcription endpoint (always).
+      2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided).
 
     This CLI accepts only a small set of runtime flags. Most defaults (provider, format,
-    model, language, sample rate, channels, device,
+    model, chat_model, language, sample rate, channels, device,
     file retention, copy-to-clipboard)
     must be configured in the user's `config.toml` under [defaults].
 
@@ -178,11 +182,12 @@ def record(
     Flow:
       - Records WAV until you press Enter (CLI mode).
      - Optionally converts to MP3/Opus depending on config.
-      -
+      - Transcribes via dedicated endpoint (step 1).
+      - If a prompt is provided, transforms the transcript via LLM (step 2).
       - Prints and saves the result.
 
     Note: In --transcribe mode, prompts (--user-prompt or --user-prompt-file) are ignored,
-
+    and only step 1 (transcription) is performed.
     """
     cfg = Config.load(log_level=log_level)
 
svx/core/audio.py
CHANGED
@@ -22,6 +22,7 @@ from pathlib import Path
 from threading import Event, Thread
 from typing import Any
 
+import numpy as np
 import sounddevice as sd
 import soundfile as sf
 
@@ -149,7 +150,12 @@ def record_wav(
     writer_stop = Event()
     start_time = time.time()
 
-    def audio_callback(
+    def audio_callback(
+        indata: np.ndarray[Any, np.dtype[np.int16]],
+        frames: int,
+        time_info: sd.CallbackFlags,
+        status: sd.CallbackFlags,
+    ) -> None:
         if status:
             logging.warning("SoundDevice status: %s", status)
         q.put(indata.copy())
svx/core/config.py
CHANGED
@@ -220,10 +220,17 @@ def init_user_config(force: bool = False, prompt_file: Path | None = None) -> Pa
         '# File format sent to the provider: "wav" | "mp3" | "opus"\n'
         '# Recording is always WAV; conversion is applied if "mp3" or "opus"\n'
         'format = "opus"\n\n'
-        "# Model
+        "# Model for audio transcription (dedicated endpoint)\n"
         'model = "voxtral-mini-latest"\n\n'
+        "# Model for text transformation via LLM\n"
+        "# (applied after transcription when a prompt is used)\n"
+        'chat_model = "mistral-small-latest"\n\n'
         "# Language hint (may help the provider)\n"
         'language = "fr"\n\n'
+        "# Context bias: up to 100 words/phrases to help recognize specific vocabulary\n"
+        "# (proper nouns, technical terms, brand names, etc.)\n"
+        '# context_bias = ["SuperVoxtral", "Mistral AI", "Voxtral"]\n'
+        "context_bias = []\n\n"
         "# Audio recording parameters\n"
         "rate = 16000\n"
         "channels = 1\n"
@@ -271,7 +278,9 @@ class DefaultsConfig:
     provider: str = "mistral"
     format: str = "opus"
     model: str = "voxtral-mini-latest"
+    chat_model: str = "mistral-small-latest"
     language: str | None = None
+    context_bias: list[str] = field(default_factory=list)
     rate: int = 16000
     channels: int = 1
     device: str | None = None
@@ -315,7 +324,11 @@ class Config:
             "provider": str(user_defaults_raw.get("provider", "mistral")),
             "format": str(user_defaults_raw.get("format", "opus")),
             "model": str(user_defaults_raw.get("model", "voxtral-mini-latest")),
+            "chat_model": str(user_defaults_raw.get("chat_model", "mistral-small-latest")),
             "language": user_defaults_raw.get("language"),
+            "context_bias": list(user_defaults_raw.get("context_bias", []))
+            if isinstance(user_defaults_raw.get("context_bias"), list)
+            else [],
             "rate": int(user_defaults_raw.get("rate", 16000)),
             "channels": int(user_defaults_raw.get("channels", 1)),
             "device": user_defaults_raw.get("device"),
@@ -335,6 +348,9 @@ class Config:
         format_ = defaults_data["format"]
         if format_ not in {"wav", "mp3", "opus"}:
             raise ValueError("format must be one of wav|mp3|opus")
+        context_bias = defaults_data["context_bias"]
+        if len(context_bias) > 100:
+            raise ValueError("context_bias cannot contain more than 100 items (Mistral API limit)")
         defaults = DefaultsConfig(**defaults_data)
         # Conditional output directories
         if defaults.keep_audio_files:
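
A standalone reproduction of how the new [defaults] keys parse and validate. The sketch uses the stdlib tomllib (Python 3.11+) on a hypothetical config.toml excerpt matching the template above, and applies the same tolerant coercion and 100-item guard the diff adds to Config.load():

import tomllib  # stdlib, Python 3.11+

CONFIG = """
[defaults]
model = "voxtral-mini-latest"
chat_model = "mistral-small-latest"
context_bias = ["SuperVoxtral", "Mistral AI", "Voxtral"]
"""

defaults = tomllib.loads(CONFIG)["defaults"]

# Same tolerant coercion as Config.load(): non-list values collapse to [].
raw_bias = defaults.get("context_bias", [])
context_bias = list(raw_bias) if isinstance(raw_bias, list) else []

# The guard added in this release mirrors the provider-side limit.
if len(context_bias) > 100:
    raise ValueError("context_bias cannot contain more than 100 items (Mistral API limit)")

print(defaults["chat_model"], len(context_bias))  # mistral-small-latest 3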
svx/core/pipeline.py
CHANGED
@@ -12,18 +12,23 @@ import svx.core.config as config
 from svx.core.audio import convert_audio, record_wav, timestamp
 from svx.core.clipboard import copy_to_clipboard
 from svx.core.config import Config
-from svx.core.storage import save_transcript
+from svx.core.storage import save_text_file, save_transcript
 from svx.providers import get_provider
 
 
 class RecordingPipeline:
     """
-    Centralized pipeline for recording audio, transcribing via provider,
-
+    Centralized pipeline for recording audio, transcribing via provider, optionally
+    transforming with a text LLM, saving outputs, and copying to clipboard.
 
+    Pipeline steps:
+      1. Transcription: audio -> text via dedicated transcription endpoint (always)
+      2. Transformation: text + prompt -> text via text-based LLM (when a prompt is provided)
+
+    Handles temporary files when not keeping audio.
     Supports runtime overrides like save_all for keeping all files and adding log handlers.
     Optional progress_callback for status updates (e.g., for GUI).
-    Supports transcribe_mode for pure transcription without prompt
+    Supports transcribe_mode for pure transcription without prompt (step 1 only).
     """
 
     def __init__(
@@ -136,31 +141,26 @@ class RecordingPipeline:
         self, wav_path: Path, duration: float, transcribe_mode: bool, user_prompt: str | None = None
     ) -> dict[str, Any]:
         """
-        Process recorded audio: convert if needed, transcribe, save, copy.
+        Process recorded audio: convert if needed, transcribe, optionally transform, save, copy.
+
+        Pipeline:
+          1. Transcription: audio -> text via dedicated endpoint (always)
+          2. Transformation: text + prompt -> text via LLM (when prompt is provided)
 
         Args:
             wav_path: Path to the recorded WAV file.
             duration: Recording duration in seconds.
-            transcribe_mode: Whether to use pure transcription mode.
-            user_prompt: User prompt to use (None for transcribe_mode).
+            transcribe_mode: Whether to use pure transcription mode (step 1 only).
+            user_prompt: User prompt to use for transformation (None for transcribe_mode).
 
         Returns:
-            Dict with 'text' (str), '
-            'paths' (dict of Path or None).
+            Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
+            'duration' (float), 'paths' (dict of Path or None).
         """
         # Resolve parameters
         provider = self.cfg.defaults.provider
         audio_format = self.cfg.defaults.format
         model = self.cfg.defaults.model
-        original_model = model
-        if transcribe_mode:
-            model = "voxtral-mini-latest"
-            if original_model != "voxtral-mini-latest":
-                logging.warning(
-                    "Transcribe mode: model override from '%s' to 'voxtral-mini-latest'\n"
-                    "(optimized for transcription).",
-                    original_model,
-                )
         language = self.cfg.defaults.language
         if wav_path.stem.endswith(".wav"):
             base = wav_path.stem.replace(".wav", "")
@@ -176,9 +176,11 @@ class RecordingPipeline:
                 final_user_prompt = self.cfg.resolve_prompt(self.user_prompt, self.user_prompt_file)
             else:
                 final_user_prompt = user_prompt
-            self._status("
+            self._status("Prompt mode: transcription then transformation.")
         else:
-            self._status("Transcribe mode
+            self._status("Transcribe mode: transcription only, no prompt.")
+
+        logging.debug(f"Applied prompt: {final_user_prompt or 'None (transcribe mode)'}")
 
         paths: dict[str, Path | None] = {"wav": wav_path}
 
@@ -192,18 +194,22 @@ class RecordingPipeline:
             paths["converted"] = to_send_path
             _converted = True
 
-        #
+        # Step 1: Transcription (always)
         self._status("Transcribing...")
         prov = get_provider(provider, cfg=self.cfg)
-        result = prov.transcribe(
-
-
-
-
-
-
-
-
+        result = prov.transcribe(to_send_path, model=model, language=language)
+        raw_transcript = result["text"]
+
+        # Step 2: Transformation (if prompt)
+        if not transcribe_mode and final_user_prompt:
+            self._status("Applying prompt...")
+            chat_model = self.cfg.defaults.chat_model
+            chat_result = prov.chat(raw_transcript, final_user_prompt, model=chat_model)
+            text = chat_result["text"]
+            raw = {"transcription": result["raw"], "transformation": chat_result["raw"]}
+        else:
+            text = raw_transcript
+            raw = result["raw"]
 
         # Save if keeping transcripts
         if keep_transcript:
@@ -213,6 +219,12 @@ class RecordingPipeline:
             )
             paths["txt"] = txt_path
             paths["json"] = json_path
+
+            # Save raw transcript separately when transformation was applied
+            if not transcribe_mode and final_user_prompt:
+                raw_txt_path = self.cfg.transcripts_dir / f"{base}_{provider}_raw.txt"
+                save_text_file(raw_txt_path, raw_transcript)
+                paths["raw_txt"] = raw_txt_path
         else:
             paths["txt"] = None
             paths["json"] = None
@@ -228,6 +240,7 @@ class RecordingPipeline:
         logging.info("Processing finished (%.2fs)", duration)
         return {
             "text": text,
+            "raw_transcript": raw_transcript,
             "raw": raw,
             "duration": duration,
             "paths": paths,
@@ -261,8 +274,8 @@ class RecordingPipeline:
             stop_event: Optional event to signal recording stop (e.g., for GUI).
 
         Returns:
-            Dict with 'text' (str), '
-            'paths' (dict of Path or None).
+            Dict with 'text' (str), 'raw_transcript' (str), 'raw' (dict),
+            'duration' (float), 'paths' (dict of Path or None).
 
         Raises:
             Exception: On recording, conversion, or transcription errors.
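
Distilled, the control flow of process() after this change reduces to: transcribe always, chat only when a prompt survives transcribe_mode. A sketch with a stubbed provider, assuming only the signatures visible in this diff:

from typing import Any


def process(to_send_path: str, prov: Any, model: str, chat_model: str,
            language: str | None, final_user_prompt: str | None,
            transcribe_mode: bool) -> dict[str, Any]:
    # Step 1: Transcription (always)
    result = prov.transcribe(to_send_path, model=model, language=language)
    raw_transcript = result["text"]

    # Step 2: Transformation (only when a prompt is provided)
    if not transcribe_mode and final_user_prompt:
        chat_result = prov.chat(raw_transcript, final_user_prompt, model=chat_model)
        text = chat_result["text"]
        raw = {"transcription": result["raw"], "transformation": chat_result["raw"]}
    else:
        text = raw_transcript
        raw = result["raw"]

    return {"text": text, "raw_transcript": raw_transcript, "raw": raw}


class FakeProvider:
    """Stub provider used only to exercise the flow above."""

    def transcribe(self, path: str, model: str | None = None, language: str | None = None):
        return {"text": "euh hello world", "raw": {"endpoint": "transcription"}}

    def chat(self, text: str, prompt: str, model: str | None = None):
        return {"text": "Hello world", "raw": {"endpoint": "chat"}}


out = process("clip.opus", FakeProvider(), "voxtral-mini-latest",
              "mistral-small-latest", "en", "Clean up this transcription.", False)
assert out["raw_transcript"] == "euh hello world" and out["text"] == "Hello world"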
svx/core/prompt.py
CHANGED
@@ -12,6 +12,7 @@ Intended to be small and dependency-light so it can be imported broadly.
 from __future__ import annotations
 
 import logging
+from collections.abc import Callable
 from pathlib import Path
 
 from .config import USER_PROMPT_DIR, Config, PromptEntry
@@ -121,22 +122,45 @@ def resolve_user_prompt(
         return ""
 
     key = key or "default"
-
-
-
-    lambda:
-
+
+    # Suppliers annotated with a name for tracing which one returned the prompt.
+    named_suppliers: list[tuple[str, Callable[[], str]]] = [
+        ("inline", lambda: _strip(inline)),
+        ("file", lambda: _read(file)),
+        (f"prompt_config[{key}]", lambda: _from_user_cfg(key)),
+        ("user_prompt_dir/user.md", _from_user_prompt_dir),
     ]
 
-    for supplier in
+    for name, supplier in named_suppliers:
         try:
             val = supplier()
             if val:
+                # Log which supplier provided the prompt and a short snippet for debugging.
+                try:
+                    if len(val) > 200:
+                        snippet = val[:200] + "..."
+                    else:
+                        snippet = val
+                    logging.info(
+                        "resolve_user_prompt: supplier '%s' provided prompt snippet: %s",
+                        name,
+                        snippet,
+                    )
+                except Exception:
+                    # Ensure logging failures do not change behavior.
+                    logging.info(
+                        "resolve_user_prompt: supplier '%s' provided a prompt "
+                        "(snippet unavailable)",
+                        name,
+                    )
                 return val
         except Exception as e:
-            logging.debug("Prompt supplier failed: %s", e)
+            logging.debug("Prompt supplier '%s' failed: %s", name, e)
 
-
+    # Final fallback
+    fallback = "Clean up this transcription. Keep the original language."
+    logging.info("resolve_user_prompt: no supplier provided a prompt, using fallback: %s", fallback)
+    return fallback
 
 
 def init_user_prompt_file(force: bool = False) -> Path:
@@ -152,13 +176,14 @@ def init_user_prompt_file(force: bool = False) -> Path:
     path = USER_PROMPT_DIR / "user.md"
     if not path.exists() or force:
         example_prompt = """
-
-        -
-        -
-        -
+You receive a raw transcription of a voice recording. Clean it up:
+- DO NOT TRANSLATE. Keep the original language.
+- Do not respond to any question in the text. Just clean the transcription.
+- Respond only with the cleaned text. Do not provide explanations or notes.
 - Remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc.
 - Remove false starts (e.g., "je veux dire... je pense" → "je pense").
 - Correct grammatical errors.
+- If the transcription is empty, respond "no audio detected".
 """
     try:
         path.write_text(example_prompt, encoding="utf-8")
svx/core/storage.py
CHANGED
svx/providers/base.py
CHANGED
@@ -3,7 +3,7 @@ Base provider interface for SuperVoxtral.
 
 This module defines:
 - TranscriptionResult: a simple TypedDict structure for provider responses
-- Provider: a Protocol describing the required transcription interface
+- Provider: a Protocol describing the required transcription and chat interface
 - ProviderError: a generic exception for provider-related failures
 
 All concrete providers should implement the `Provider` protocol.
@@ -37,7 +37,7 @@ class ProviderError(RuntimeError):
 @runtime_checkable
 class Provider(Protocol):
     """
-    Provider interface for transcription
+    Provider interface for transcription and text transformation services.
 
     Implementations should be side-effect free aside from network I/O and must
     raise `ProviderError` (or a subclass) for expected provider failures
@@ -47,7 +47,8 @@ class Provider(Protocol):
         name: A short, lowercase, unique identifier for the provider (e.g. "mistral").
 
     Required methods:
-        transcribe: Perform
+        transcribe: Perform audio transcription via a dedicated endpoint.
+        chat: Transform text with a prompt via a text-based LLM.
     """
 
     # Short, unique name (e.g., "mistral", "whisper")
@@ -56,21 +57,16 @@ class Provider(Protocol):
     def transcribe(
         self,
         audio_path: Path,
-        user_prompt: str | None,
         model: str | None = None,
         language: str | None = None,
-        transcribe_mode: bool = False,
     ) -> TranscriptionResult:
         """
-        Transcribe
+        Transcribe `audio_path` using a dedicated transcription endpoint.
 
         Args:
             audio_path: Path to an audio file (wav/mp3/opus...) to send to the provider.
-            user_prompt: Optional user prompt to guide the transcription or analysis.
             model: Optional provider-specific model identifier.
             language: Optional language hint/constraint (e.g., "en", "fr").
-            transcribe_mode: Optional bool to enable specialized modes like pure
-                transcription (default False).
 
         Returns:
             TranscriptionResult including a human-readable `text` and
@@ -81,3 +77,27 @@ class Provider(Protocol):
             Exception: For unexpected failures (network issues, serialization, etc.).
         """
         ...
+
+    def chat(
+        self,
+        text: str,
+        prompt: str,
+        model: str | None = None,
+    ) -> TranscriptionResult:
+        """
+        Transform `text` using a text-based LLM with the given `prompt`.
+
+        Args:
+            text: Input text (e.g., raw transcription) to process.
+            prompt: System prompt guiding the transformation.
+            model: Optional provider-specific model identifier for the chat LLM.
+
+        Returns:
+            TranscriptionResult including the transformed `text` and
+            provider `raw` payload.
+
+        Raises:
+            ProviderError: For known/handled provider errors (e.g., missing API key).
+            Exception: For unexpected failures (network issues, serialization, etc.).
+        """
+        ...
svx/providers/mistral.py
CHANGED
@@ -1,22 +1,18 @@
 """
 Mistral provider implementation for SuperVoxtral.
 
-This module provides a concrete Provider that uses Mistral's
-
+This module provides a concrete Provider that uses Mistral's dedicated
+transcription endpoint (Voxtral) and text-based LLM chat for transformation.
 
 Requirements:
 - User config must define [providers.mistral].api_key in config.toml.
 - Package 'mistralai' installed and importable.
 
-The provider composes messages with:
-- User content including the audio (base64) and optional user prompt text.
-
 It returns a normalized TranscriptionResult: {"text": str, "raw": dict}.
 """
 
 from __future__ import annotations
 
-import base64
 import json
 import logging
 from pathlib import Path
@@ -29,14 +25,6 @@ from .base import Provider, ProviderError, TranscriptionResult
 __all__ = ["MistralProvider"]
 
 
-def _read_file_as_base64(path: Path) -> str:
-    """
-    Read a file and return its base64-encoded string.
-    """
-    data = Path(path).read_bytes()
-    return base64.b64encode(data).decode("utf-8")
-
-
 def _extract_text_from_response(resp: Any) -> str:
     """
     Attempt to robustly extract the textual content from a Mistral response.
@@ -89,9 +77,10 @@ def _normalize_raw_response(resp: Any) -> dict[str, Any]:
 
 class MistralProvider(Provider):
     """
-    Mistral
+    Mistral provider implementation.
 
-    Uses the
+    Uses the dedicated transcription endpoint for audio-to-text
+    and the chat endpoint for text transformation via LLM.
     """
 
     name = "mistral"
@@ -103,27 +92,21 @@ class MistralProvider(Provider):
         self.api_key = mistral_cfg.api_key
         if not self.api_key:
             raise ProviderError("Missing providers.mistral.api_key in user config (config.toml).")
+        self.context_bias = cfg.defaults.context_bias
 
     def transcribe(
         self,
         audio_path: Path,
-
-        model: str | None = "voxtral-small-latest",
+        model: str | None = "voxtral-mini-latest",
         language: str | None = None,
-        transcribe_mode: bool = False,
     ) -> TranscriptionResult:
         """
-        Transcribe
+        Transcribe audio using Mistral's dedicated transcription endpoint.
 
         Args:
             audio_path: Path to wav/mp3/opus file to send.
-
-
-            model: Voxtral model identifier (default: "voxtral-small-latest" for chat,
-                "voxtral-mini-latest" for transcribe).
-            language: Optional language hint for transcription (used only in
-                transcribe_mode).
-            transcribe_mode: If True, use dedicated transcription endpoint without prompt.
+            model: Voxtral model identifier (default: "voxtral-mini-latest").
+            language: Optional language hint for transcription.
 
         Returns:
             TranscriptionResult: {"text": text, "raw": raw_dict}
@@ -143,47 +126,71 @@ class MistralProvider(Provider):
 
         client = Mistral(api_key=self.api_key)
 
-
-
-
-
-
-
-
-
-
-
+        model_name = model or "voxtral-mini-latest"
+        logging.info(
+            "Calling Mistral transcription endpoint model=%s with audio=%s (%s),"
+            " language=%s, context_bias=%d items",
+            model_name,
+            Path(audio_path).name,
+            Path(audio_path).suffix,
+            language or "auto",
+            len(self.context_bias),
+        )
+        with open(audio_path, "rb") as f:
+            resp = client.audio.transcriptions.complete(
+                model=model_name,
+                file={"content": f, "file_name": Path(audio_path).name},
+                language=language,
+                context_bias=self.context_bias if self.context_bias else None,
             )
-
-
-                    model=model_name,
-                    file={"content": f, "file_name": Path(audio_path).name},
-                    language=language,
-                )
-            text = resp.text
-            raw = _normalize_raw_response(resp)
-        else:
-            audio_b64 = _read_file_as_base64(Path(audio_path))
-
-            # Compose messages (user only)
-            messages: list[dict[str, Any]] = []
-            user_content: list[dict[str, Any]] = [{"type": "input_audio", "input_audio": audio_b64}]
-            if user_prompt:
-                user_content.append({"type": "text", "text": user_prompt})
-            messages.append({"role": "user", "content": user_content})
-
-            # Execute request
-            model_name = model or "voxtral-small-latest"
-            logging.info(
-                "Calling Mistral chat-with-audio model=%s with audio=%s (%s)",
-                model_name,
-                Path(audio_path).name,
-                Path(audio_path).suffix,
-            )
-            resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
-
-            # Extract normalized text and raw payload
-            text = _extract_text_from_response(resp)
-            raw = _normalize_raw_response(resp)
+        text = resp.text
+        raw = _normalize_raw_response(resp)
 
         return TranscriptionResult(text=text, raw=raw)
+
+    def chat(
+        self,
+        text: str,
+        prompt: str,
+        model: str | None = None,
+    ) -> TranscriptionResult:
+        """
+        Transform text using Mistral's chat endpoint with a system prompt.
+
+        Args:
+            text: Input text (e.g., raw transcription) to process.
+            prompt: System prompt guiding the transformation.
+            model: Model identifier (default: None, caller should provide).
+
+        Returns:
+            TranscriptionResult: {"text": text, "raw": raw_dict}
+
+        Raises:
+            ProviderError: for expected configuration/import errors.
+        """
+        try:
+            from mistralai import Mistral
+        except Exception as e:
+            raise ProviderError(
+                "Failed to import 'mistralai'. Ensure the 'mistralai' package is installed."
+            ) from e
+
+        client = Mistral(api_key=self.api_key)
+
+        model_name = model or "mistral-small-latest"
+        logging.info(
+            "Calling Mistral chat endpoint model=%s for text transformation",
+            model_name,
+        )
+
+        messages: list[dict[str, Any]] = [
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": text},
+        ]
+
+        resp = client.chat.complete(model=model_name, messages=cast(Any, messages))
+
+        result_text = _extract_text_from_response(resp)
+        raw = _normalize_raw_response(resp)
+
+        return TranscriptionResult(text=result_text, raw=raw)
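
A hypothetical end-to-end usage of the two methods above; it assumes Config.load() works without arguments here and that MistralProvider accepts cfg the way get_provider(..., cfg=...) passes it, requires a configured api_key, and performs real API calls when run:

from pathlib import Path

from svx.core.config import Config
from svx.providers.mistral import MistralProvider

cfg = Config.load()
provider = MistralProvider(cfg=cfg)  # constructor signature assumed from get_provider usage

# Step 1: dedicated transcription endpoint; context_bias comes from cfg.defaults.
transcript = provider.transcribe(Path("clip.opus"), model="voxtral-mini-latest", language="fr")

# Step 2: chat endpoint rewrites the transcript under a system prompt.
cleaned = provider.chat(
    transcript["text"],
    "Clean up this transcription. Keep the original language.",
    model="mistral-small-latest",
)
print(cleaned["text"])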
svx/ui/qt_app.py
CHANGED
@@ -119,7 +119,7 @@ class WaveformWidget(QWidget):
     waveform to indicate recording activity. It is lightweight and self-contained.
     """
 
-    def __init__(self, parent=None, height: int = 64) -> None:
+    def __init__(self, parent: QWidget | None = None, height: int = 64) -> None:
         super().__init__(parent)
         self.setMinimumHeight(height)
         self.setMaximumHeight(height)
@@ -273,10 +273,39 @@ class RecorderWorker(QObject):
             self.canceled.emit()
             return
         self.status.emit("Processing in progress...")
+        # Wait for user to select mode in the GUI
         while self.mode is None:
             time.sleep(0.05)
+
+        # Log the selected mode/key for debugging prompt application
+        try:
+            logging.info("RecorderWorker: selected mode/key: %s", self.mode)
+        except Exception:
+            # ensure failures in logging don't break the worker
+            pass
+
         transcribe_mode = self.mode == "transcribe"
-
+        if transcribe_mode:
+            user_prompt = None
+        else:
+            # Resolve the user prompt for the selected key and log a short snippet
+            user_prompt = self._resolve_user_prompt(self.mode)
+            try:
+                if user_prompt:
+                    snippet = (
+                        user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt
+                    )
+                else:
+                    snippet = "<EMPTY>"
+                logging.info(
+                    "RecorderWorker: resolved prompt snippet for key '%s': %s",
+                    self.mode,
+                    snippet,
+                )
+            except Exception:
+                # avoid breaking the flow on logging errors
+                pass
+
         result = pipeline.process(wav_path, duration, transcribe_mode, user_prompt)
         keep_audio = self.save_all or self.cfg.defaults.keep_audio_files
         pipeline.clean(wav_path, result["paths"], keep_audio)
@@ -383,13 +412,15 @@ class RecorderWindow(QWidget):
         button_layout.addStretch()
         self._transcribe_btn = QPushButton("Transcribe")
         self._transcribe_btn.setToolTip("Stop and transcribe without prompt")
-        self._transcribe_btn.clicked.connect(
+        self._transcribe_btn.clicked.connect(
+            lambda checked=False, m="transcribe": self._on_mode_selected(m)
+        )
         button_layout.addWidget(self._transcribe_btn)
         self._prompt_buttons: dict[str, QPushButton] = {}
         for key in self.prompt_keys:
             btn = QPushButton(key.capitalize())
             btn.setToolTip(f"Stop and transcribe with '{key}' prompt")
-            btn.clicked.connect(lambda k=key: self._on_mode_selected(k))
+            btn.clicked.connect(lambda checked=False, k=key: self._on_mode_selected(k))
             self._prompt_buttons[key] = btn
             button_layout.addWidget(btn)
         self._cancel_btn = QPushButton("Cancel")
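
The connect() changes fix two classic pitfalls at once: QPushButton.clicked emits a checked boolean, which the old bare `lambda k=key:` would swallow into k, and closures created in a loop capture the loop variable late. A Qt-free sketch of the late-binding half:

# Late binding: every closure sees the final value of the loop variable.
callbacks = [lambda: key for key in ("transcribe", "clean", "email")]
print([cb() for cb in callbacks])  # ['email', 'email', 'email']

# Binding the value as a default argument freezes it per iteration, which is
# what `lambda checked=False, k=key: ...` does in the diff (checked absorbs
# the boolean that Qt's clicked signal passes as the first argument).
fixed = [lambda k=key: k for key in ("transcribe", "clean", "email")]
print([cb() for cb in fixed])  # ['transcribe', 'clean', 'email']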
supervoxtral-0.1.4.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
-svx/__init__.py,sha256=qPEe5u3PT8yOQN4MiOLj_Bd18HqcRb6fxnPDfdMUP7w,742
-svx/cli.py,sha256=3AirsBynuq2rcz-4C8hbZ69JztkgA7LTMGmL6ym7nyY,9167
-svx/core/__init__.py,sha256=mhzXuIXo3kUzjWme0Bxhe4TQZQELlyEiG_89LUAPC7M,2856
-svx/core/audio.py,sha256=r0m5T1uzdsJ1j9YXgQ5clv15dvMwZBp_bk2aLpjnrkc,7684
-svx/core/clipboard.py,sha256=IFtiN2SnYKQIu0WXx0hCK8syvDXanBpm1Jr2a8X7y9s,3692
-svx/core/config.py,sha256=e2tTGjjPcUYFctB28Ha90G-W44mF_0eWey1zpSyZkBo,16095
-svx/core/pipeline.py,sha256=nqvCgK5Pbyx18mfACrN_mIDt546Bh7fKA6MF4XG1hxM,10637
-svx/core/prompt.py,sha256=yO8UbpFg7n1IT7wFjSQ7NUTbrqxuwPhdnxkTH4Iu7XU,5967
-svx/core/storage.py,sha256=5_xKYEpvDhaixRxmSTBlyX_jt8ssjHwHzX9VodcrtJw,3213
-svx/providers/__init__.py,sha256=SzlSWpZSUIptbSrAnGfi0d0NX4hYTpT0ObWpYyskDdA,2634
-svx/providers/base.py,sha256=YoiI8KWVRGISh7dx9XXPr1Q1a7ZDu8vfeJFlPbcKr20,2695
-svx/providers/mistral.py,sha256=vrBatNZg0zGNkJ5Qfnfz6ZwP6QtBgIt9sT_w59zkSO0,6636
-svx/ui/qt_app.py,sha256=6LOMeMjkMmYylu6H_prDRmPDsL0s4PVMZqfbflByCMs,18808
-supervoxtral-0.1.4.dist-info/METADATA,sha256=0w_i5geOKu8F9x7eviNboDNt-PTy6FS3WHe3cCx4eHg,753
-supervoxtral-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-supervoxtral-0.1.4.dist-info/entry_points.txt,sha256=phJhRy3VkYHC6AR_tUB5CypHzG0ePRR9sB13HWE1vEg,36
-supervoxtral-0.1.4.dist-info/licenses/LICENSE,sha256=fCEBKmC4i-1WZAwoKjKWegfDd8qNsG8ECB7JyqoswyQ,1064
-supervoxtral-0.1.4.dist-info/RECORD,,
{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/entry_points.txt
File without changes

{supervoxtral-0.1.4.dist-info → supervoxtral-0.3.0.dist-info}/licenses/LICENSE
File without changes