supervoxtral 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supervoxtral-0.1.0.dist-info/METADATA +23 -0
- supervoxtral-0.1.0.dist-info/RECORD +18 -0
- supervoxtral-0.1.0.dist-info/WHEEL +4 -0
- supervoxtral-0.1.0.dist-info/entry_points.txt +2 -0
- supervoxtral-0.1.0.dist-info/licenses/LICENSE +21 -0
- svx/__init__.py +28 -0
- svx/cli.py +264 -0
- svx/core/__init__.py +92 -0
- svx/core/audio.py +256 -0
- svx/core/clipboard.py +122 -0
- svx/core/config.py +400 -0
- svx/core/pipeline.py +260 -0
- svx/core/prompt.py +165 -0
- svx/core/storage.py +118 -0
- svx/providers/__init__.py +88 -0
- svx/providers/base.py +83 -0
- svx/providers/mistral.py +189 -0
- svx/ui/qt_app.py +491 -0
svx/core/prompt.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Prompt utilities for SuperVoxtral.
|
|
3
|
+
|
|
4
|
+
This module provides:
|
|
5
|
+
- Safe reading of UTF-8 text files.
|
|
6
|
+
- Resolution/combination of inline and file-based prompts.
|
|
7
|
+
- Initialization of default prompt files (user.md).
|
|
8
|
+
|
|
9
|
+
Intended to be small and dependency-light so it can be imported broadly.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from .config import USER_PROMPT_DIR, Config
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"read_text_file",
|
|
21
|
+
"resolve_prompt",
|
|
22
|
+
"resolve_user_prompt",
|
|
23
|
+
"init_user_prompt_file",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def read_text_file(path: Path | str) -> str:
    """
    Load the full content of a UTF-8 encoded text file.

    Args:
        path: Location of the file, as a ``Path`` or a plain string.

    Returns:
        The decoded text. When anything goes wrong while reading (missing
        file, permissions, decoding, ...) a warning is logged and an empty
        string is returned instead of raising.
    """
    text = ""
    try:
        text = Path(path).read_text(encoding="utf-8")
    except Exception as exc:
        # Best-effort read: callers treat "" as "no content available".
        logging.warning("Failed to read text file %s: %s", path, exc)
    return text
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def resolve_prompt(inline: str | None, file_path: Path | None) -> str | None:
    """
    Build a single prompt from an optional prompt file and optional inline text.

    The file content (when the file exists and is non-blank) comes first,
    followed by the inline text; the two are separated by one blank line.
    Each piece is stripped of surrounding whitespace before being combined.

    Args:
        inline: Prompt text given directly, or None.
        file_path: Path of a prompt file, or None.

    Returns:
        The combined prompt, or None when there is nothing left after stripping.
    """
    file_text = ""
    if file_path:
        candidate = Path(file_path)
        if candidate.exists():
            file_text = read_text_file(candidate).strip()

    inline_text = inline.strip() if inline else ""

    # Keep only the non-empty pieces, file first.
    pieces = [piece for piece in (file_text, inline_text) if piece]
    merged = "\n\n".join(pieces).strip()
    return merged or None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def resolve_user_prompt(
    cfg: Config,
    inline: str | None = None,
    file: Path | None = None,
    user_prompt_dir: Path | None = None,
) -> str:
    """
    Pick the user prompt to use, trying each source in priority order:

    1) inline text (CLI --user-prompt)
    2) explicit file (CLI --user-prompt-file)
    3) user config inline text (cfg.prompt.text)
    4) user config file path (cfg.prompt.file)
    5) user prompt dir file (user_prompt_dir / 'user.md')
    6) literal fallback: "What's in this audio?"

    Args:
        cfg: Loaded configuration; only `cfg.prompt` and `cfg.user_prompt_dir`
            are read here.
        inline: Prompt text passed directly.
        file: Path to a prompt file passed directly.
        user_prompt_dir: Overrides `cfg.user_prompt_dir` when given.

    Returns:
        The first candidate that is non-empty after stripping. A source that
        fails is skipped, so a string is always returned.
    """

    def _inline_candidate() -> str:
        return inline.strip() if isinstance(inline, str) else ""

    def _file_candidate() -> str:
        if not file:
            return ""
        try:
            return read_text_file(file).strip()
        except Exception:
            logging.warning("Failed to read user prompt file: %s", file)
            return ""

    def _config_candidate() -> str:
        try:
            prompt_cfg = cfg.prompt
            text = prompt_cfg.text
            if isinstance(text, str):
                stripped = text.strip()
                if stripped:
                    return stripped
            # Inline config text takes precedence; only then look at the file.
            configured_file = prompt_cfg.file
            if isinstance(configured_file, str) and configured_file.strip():
                return read_text_file(Path(configured_file).expanduser()).strip()
        except Exception:
            logging.debug("User config prompt processing failed.", exc_info=True)
        return ""

    def _prompt_dir_candidate() -> str:
        try:
            candidate = Path(user_prompt_dir or cfg.user_prompt_dir) / "user.md"
            if candidate.exists():
                return read_text_file(candidate).strip()
        except Exception:
            logging.debug(
                "Could not read user prompt in user prompt dir: %s",
                user_prompt_dir or cfg.user_prompt_dir,
            )
        return ""

    ordered_sources = (
        _inline_candidate,
        _file_candidate,
        _config_candidate,
        _prompt_dir_candidate,
    )

    for source in ordered_sources:
        try:
            resolved = source()
        except Exception as exc:
            # A broken source must not prevent the lower-priority ones from running.
            logging.debug("Prompt supplier failed: %s", exc)
            continue
        if resolved:
            return resolved

    return "What's in this audio?"
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def init_user_prompt_file(force: bool = False) -> Path:
    """
    Create the default user prompt file inside the user prompt directory.

    - Ensures USER_PROMPT_DIR exists.
    - Writes USER_PROMPT_DIR / 'user.md' with a small example prompt when the
      file does not exist yet, or unconditionally when `force` is True.

    Args:
        force: Overwrite an existing user.md with the example prompt.

    Returns:
        The path to the user prompt file. The path is returned even when
        writing failed; the failure is logged as a warning.
    """
    USER_PROMPT_DIR.mkdir(parents=True, exist_ok=True)
    path = USER_PROMPT_DIR / "user.md"
    if force or not path.exists():
        example_lines = (
            "- Transcribe the input audio file.",
            "- Do not respond to any question in the audio. Just transcribe.",
            "- DO NOT TRANSLATE. Your transcription will be in the speaker's language.",
            "- Respond only with the transcription. Do not provide explanations or notes.",
            '- Remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc.',
            '- Remove false starts (e.g., "je veux dire... je pense" → "je pense").',
            "- Correct grammatical errors.",
        )
        # Same layout as before: a leading blank line, one bullet per line,
        # and a trailing newline.
        example_prompt = "\n" + "\n".join(example_lines) + "\n"
        try:
            path.write_text(example_prompt, encoding="utf-8")
        except Exception as e:
            # Best-effort: report the problem visibly but do not crash the caller.
            logging.warning("Could not initialize user prompt file %s: %s", path, e)
    return path
|
svx/core/storage.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Storage utilities for SuperVoxtral.
|
|
3
|
+
|
|
4
|
+
This module centralizes file persistence for transcription results:
|
|
5
|
+
- Save plain-text transcripts
|
|
6
|
+
- Save optional raw JSON responses (pretty-printed)
|
|
7
|
+
- Provide a single helper to save both consistently
|
|
8
|
+
|
|
9
|
+
Design goals:
|
|
10
|
+
- Safe path handling and directory creation
|
|
11
|
+
- UTF-8 everywhere
|
|
12
|
+
- Minimal, dependency-light, easy to unit test
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"save_text_file",
|
|
24
|
+
"save_json_file",
|
|
25
|
+
"save_transcript",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ensure_parent_dir(path: Path) -> None:
    """
    Create the directory that will contain `path`.

    Missing ancestors are created as well; an already existing directory
    is left alone.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _sanitize_component(value: str) -> str:
    """
    Turn an arbitrary string into a safe filename component.

    - Surrounding whitespace is removed first.
    - ASCII letters, digits, dot, underscore and dash are kept.
    - Every run of other characters collapses into a single underscore.
    - An empty result is replaced by 'out'.
    """
    trimmed = value.strip()
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", trimmed)
    if not cleaned:
        return "out"
    return cleaned
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def save_text_file(path: Path, content: str) -> Path:
    """
    Write `content` to `path` as UTF-8 text, creating the parent directory first.

    Args:
        path: Destination file path.
        content: Text to write; a falsy value (e.g. None or "") results in
            an empty file.

    Returns:
        The same `path` for convenience.
    """
    _ensure_parent_dir(path)
    payload = content if content else ""
    path.write_text(payload, encoding="utf-8")
    return path
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def save_json_file(path: Path, data: Any, pretty: bool = True) -> Path:
    """
    Serialize `data` to JSON and write it to `path` as UTF-8.

    Non-ASCII characters are written as-is (no \\u escapes).

    Args:
        path: Destination file path; its parent directory is created if needed.
        data: JSON-serializable object.
        pretty: When True, indent with 2 spaces; otherwise emit the most
            compact form (no whitespace after separators).

    Returns:
        The same `path` for convenience.
    """
    _ensure_parent_dir(path)
    # Only the layout options differ between the two modes.
    layout: dict[str, Any] = {"indent": 2} if pretty else {"separators": (",", ":")}
    path.write_text(json.dumps(data, ensure_ascii=False, **layout), encoding="utf-8")
    return path
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def save_transcript(
    transcripts_dir: Path,
    base_name: str,
    provider: str,
    text: str,
    raw: dict | None = None,
) -> tuple[Path, Path | None]:
    """
    Persist a transcript as text and, when available, the raw response as JSON.

    Both files are named '<base>_<provider>' (each part sanitized) and differ
    only by extension: '.txt' for the transcript, '.json' for the raw payload.

    Args:
        transcripts_dir: Base directory where transcripts are stored (created
            if missing).
        base_name: Base file name (without extension).
        provider: Provider name used as suffix (e.g., 'mistral').
        text: Transcript text to write.
        raw: Optional raw response to serialize as pretty-printed JSON.

    Returns:
        (text_path, json_path_or_None)
    """
    out_dir = Path(transcripts_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    stem = f"{_sanitize_component(base_name)}_{_sanitize_component(provider)}"

    text_path = out_dir / f"{stem}.txt"
    save_text_file(text_path, text or "")

    # The JSON companion file is only produced when a raw payload was given.
    if raw is None:
        return text_path, None

    json_path = out_dir / f"{stem}.json"
    save_json_file(json_path, raw, pretty=True)
    return text_path, json_path
|
svx/providers/__init__.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Provider registry for SuperVoxtral.
|
|
3
|
+
|
|
4
|
+
This module centralizes provider discovery and retrieval so the CLI (and other
|
|
5
|
+
consumers) can instantiate providers by name without importing their concrete modules.
|
|
6
|
+
|
|
7
|
+
Design goals:
|
|
8
|
+
- Simple API: register_provider(name, factory) and get_provider(name)
|
|
9
|
+
- Lazy imports: default providers are registered with factories that import on demand
|
|
10
|
+
- Friendly errors: list available providers on unknown name
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
|
|
17
|
+
from svx.core.config import Config
|
|
18
|
+
|
|
19
|
+
from .base import Provider, ProviderError, TranscriptionResult
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Provider",
|
|
23
|
+
"ProviderError",
|
|
24
|
+
"TranscriptionResult",
|
|
25
|
+
"register_provider",
|
|
26
|
+
"get_provider",
|
|
27
|
+
"available_providers",
|
|
28
|
+
"register_default_providers",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
# Factory callable that returns a Provider instance when called.
|
|
32
|
+
ProviderFactory = Callable[[Config | None], Provider]
|
|
33
|
+
|
|
34
|
+
# Internal registry mapping provider name -> factory
|
|
35
|
+
_registry: dict[str, ProviderFactory] = {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def register_provider(name: str, factory: ProviderFactory) -> None:
    """
    Add (or replace) a provider factory in the registry.

    The name is normalized by stripping whitespace and lowercasing it, so
    lookups are case-insensitive. Registering an existing name overwrites
    the previous factory.

    Raises:
        ValueError: if the name is empty after normalization.
    """
    normalized = name.strip().lower()
    if normalized == "":
        raise ValueError("Provider name cannot be empty.")
    _registry[normalized] = factory
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_provider(name: str, cfg: Config | None = None) -> Provider:
    """
    Build a Provider instance from the factory registered under `name`.

    Built-in providers are registered on demand before the lookup, and the
    name is matched after stripping whitespace and lowercasing.

    Args:
        name: Provider name (case-insensitive).
        cfg: Optional configuration forwarded to the factory.

    Raises:
        KeyError: if no provider is registered under that name; the message
            lists the available provider names.
    """
    register_default_providers()
    lookup = name.strip().lower()
    try:
        build = _registry[lookup]
    except KeyError as missing:
        known = ", ".join(sorted(_registry)) or "(none)"
        raise KeyError(f"Unknown provider '{name}'. Available: {known}") from missing
    return build(cfg)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def available_providers() -> list[str]:
    """
    List the names of all registered providers in sorted order.

    Built-in providers are registered first, so they are always included.
    """
    register_default_providers()
    return sorted(_registry)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def register_default_providers() -> None:
    """
    Make sure the built-in providers are present in the registry.

    Each built-in is registered through a factory that imports the concrete
    provider module only when the factory is called, so importing this
    package does not require the provider's dependencies. A name that is
    already registered is left untouched, which makes repeated calls harmless.
    """
    # Mistral (voxtral) provider
    mistral_missing = "mistral" not in _registry
    if mistral_missing:

        def _build_mistral(cfg: Config | None = None) -> Provider:
            # Deferred import: 'mistralai' is only needed once this provider is used.
            from .mistral import MistralProvider

            return MistralProvider(cfg=cfg)

        register_provider("mistral", _build_mistral)
|
svx/providers/base.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base provider interface for SuperVoxtral.
|
|
3
|
+
|
|
4
|
+
This module defines:
|
|
5
|
+
- TranscriptionResult: a simple TypedDict structure for provider responses
|
|
6
|
+
- Provider: a Protocol describing the required transcription interface
|
|
7
|
+
- ProviderError: a generic exception for provider-related failures
|
|
8
|
+
|
|
9
|
+
All concrete providers should implement the `Provider` protocol.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Protocol, TypedDict, runtime_checkable
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TranscriptionResult(TypedDict):
    """
    Common shape of what every provider returns from a transcription call.

    Keys:
        text: The best-effort, human-readable transcript or model output.
        raw: Provider-specific raw response payload (JSON-like dict).
    """

    # Human-readable output.
    text: str
    # Untouched provider payload, kept for persistence/debugging.
    raw: dict
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ProviderError(RuntimeError):
    """
    Raised by providers for failures that are expected and handled
    (as opposed to unexpected errors, which propagate as-is).
    """
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@runtime_checkable
class Provider(Protocol):
    """
    Structural interface every transcription/chat-with-audio provider follows.

    Contract for implementations: apart from network I/O they should have no
    side effects, and expected failures (misconfiguration, auth, invalid
    arguments) must be reported by raising `ProviderError` or a subclass.
    Unexpected errors may propagate unchanged.

    Members:
        name: A short, lowercase, unique identifier for the provider (e.g. "mistral").
        transcribe: Process an audio file, optionally guided by a user prompt.
    """

    # Short, unique name (e.g., "mistral", "whisper")
    name: str

    def transcribe(
        self,
        audio_path: Path,
        user_prompt: str | None,
        model: str | None = None,
        language: str | None = None,
        transcribe_mode: bool = False,
    ) -> TranscriptionResult:
        """
        Process the audio file at `audio_path` and return a normalized result.

        Args:
            audio_path: Path to an audio file (wav/mp3/opus...) to send to the provider.
            user_prompt: Optional user prompt to guide the transcription or analysis.
            model: Optional provider-specific model identifier.
            language: Optional language hint/constraint (e.g., "en", "fr").
            transcribe_mode: Optional bool to enable specialized modes like pure
                transcription (default False).

        Returns:
            TranscriptionResult including a human-readable `text` and
            provider `raw` payload.

        Raises:
            ProviderError: For known/handled provider errors (e.g., missing API key).
            Exception: For unexpected failures (network issues, serialization, etc.).
        """
        ...
|
svx/providers/mistral.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mistral provider implementation for SuperVoxtral.
|
|
3
|
+
|
|
4
|
+
This module provides a concrete Provider that uses Mistral's
|
|
5
|
+
"chat with audio" capability (Voxtral) to process audio and return text.
|
|
6
|
+
|
|
7
|
+
Requirements:
|
|
8
|
+
- User config must define [providers.mistral].api_key in config.toml.
|
|
9
|
+
- Package 'mistralai' installed and importable.
|
|
10
|
+
|
|
11
|
+
The provider composes messages with:
|
|
12
|
+
- User content including the audio (base64) and optional user prompt text.
|
|
13
|
+
|
|
14
|
+
It returns a normalized TranscriptionResult: {"text": str, "raw": dict}.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import base64
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, cast
|
|
24
|
+
|
|
25
|
+
from svx.core.config import Config, ProviderConfig
|
|
26
|
+
|
|
27
|
+
from .base import Provider, ProviderError, TranscriptionResult
|
|
28
|
+
|
|
29
|
+
__all__ = ["MistralProvider"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _read_file_as_base64(path: Path) -> str:
    """
    Return the entire content of the file at `path`, base64-encoded, as text.
    """
    with Path(path).open("rb") as handle:
        encoded = base64.b64encode(handle.read())
    return encoded.decode("utf-8")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _extract_text_from_response(resp: Any) -> str:
    """
    Extract the textual content of the first choice of a chat response.

    The response, the choice, the message and the individual content
    segments may each be either dict-like or attribute-like (SDK objects).

    - String content is returned unchanged.
    - List content: the `text` of every segment whose `type` is "text" is
      collected and the non-empty ones are joined with newlines. Segments
      may be dicts or objects exposing `type`/`text` attributes.
    - Missing content (None) yields an empty string rather than "None".
    - Any other content is converted with str().

    Returns:
        The extracted text, or str(resp) if the expected structure is not
        found (extraction never raises).
    """

    def _field(container: Any, key: str) -> Any:
        # Uniform access for dict payloads and SDK model objects.
        return container[key] if isinstance(container, dict) else getattr(container, key)

    try:
        first_choice = _field(resp, "choices")[0]
        message = _field(first_choice, "message")
        # Content could be None, a str, or a list of segments.
        content = _field(message, "content")
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: list[str] = []
            for chunk in content:
                if isinstance(chunk, dict):
                    kind, text = chunk.get("type"), chunk.get("text", "")
                else:
                    kind, text = getattr(chunk, "type", None), getattr(chunk, "text", "")
                if kind == "text" and isinstance(text, str) and text:
                    parts.append(text)
            return "\n".join(parts)
        return str(content)
    except Exception:
        return str(resp)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _normalize_raw_response(resp: Any) -> dict[str, Any]:
    """
    Convert the response into a plain dict for persistence.

    Strategies are tried in order; the first one that yields a dict wins:

    - If the response is already a dict, it is returned as-is.
    - If it has 'model_dump_json()', that JSON is parsed.
    - Else, json.loads(str(resp)) is attempted.
    - Else, the response is stored as {"raw": str(resp)}.

    JSON that parses to something other than an object (a list, number,
    string, ...) is also wrapped as {"raw": str(resp)}, so the declared
    return type always holds.
    """
    if isinstance(resp, dict):
        return resp

    try:
        # pydantic-like
        if hasattr(resp, "model_dump_json"):
            parsed = json.loads(resp.model_dump_json())  # type: ignore[call-arg]
            if isinstance(parsed, dict):
                return parsed
    except Exception:
        # Deliberate fall-through to the next strategy.
        pass

    try:
        parsed = json.loads(str(resp))
        if isinstance(parsed, dict):
            return parsed
    except Exception:
        # Not JSON at all: handled by the final wrapper below.
        pass

    return {"raw": str(resp)}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class MistralProvider(Provider):
    """
    Mistral Voxtral provider implementation.

    Uses the Mistral Python SDK: `chat.complete` with an audio content part
    for chat-with-audio, or `audio.transcriptions.complete` when
    `transcribe_mode` is enabled.
    """

    name = "mistral"

    def __init__(self, cfg: Config | None = None):
        """
        Read the Mistral API key from the configuration.

        Args:
            cfg: Configuration to use; loaded via `Config.load()` when None.

        Raises:
            ProviderError: if no API key is configured for the 'mistral' provider.
        """
        if cfg is None:
            cfg = Config.load()
        mistral_cfg = cfg.providers.get("mistral", ProviderConfig())
        self.api_key = mistral_cfg.api_key
        if not self.api_key:
            raise ProviderError("Missing providers.mistral.api_key in user config (config.toml).")

    def transcribe(
        self,
        audio_path: Path,
        user_prompt: str | None,
        model: str | None = None,
        language: str | None = None,
        transcribe_mode: bool = False,
    ) -> TranscriptionResult:
        """
        Transcribe/process audio using Mistral's chat-with-audio or transcription endpoint.

        Args:
            audio_path: Path to wav/mp3/opus file to send.
            user_prompt: Optional user prompt to include with the audio
                (ignored in transcribe_mode, with a warning).
            model: Voxtral model identifier. When omitted (None), defaults to
                "voxtral-small-latest" for chat and "voxtral-mini-latest" for
                transcribe mode.
            language: Optional language hint for transcription (used only in
                transcribe_mode).
            transcribe_mode: If True, use dedicated transcription endpoint without prompt.

        Returns:
            TranscriptionResult: {"text": text, "raw": raw_dict}

        Raises:
            ProviderError: if 'mistralai' cannot be imported or the audio file
                does not exist. Errors raised by the SDK call itself are not
                wrapped and propagate to the caller.
        """
        try:
            from mistralai import Mistral
        except Exception as e:
            raise ProviderError(
                "Failed to import 'mistralai'. Ensure the 'mistralai' package is installed."
            ) from e

        audio_file = Path(audio_path)
        if not audio_file.exists():
            raise ProviderError(f"Audio file not found: {audio_path}")

        client = Mistral(api_key=self.api_key)

        if transcribe_mode:
            if user_prompt:
                logging.warning("Transcribe mode: user_prompt is ignored.")
            # The per-mode default only applies when the caller gave no model.
            model_name = model or "voxtral-mini-latest"
            logging.info(
                "Calling Mistral transcription endpoint model=%s with audio=%s (%s), language=%s",
                model_name,
                audio_file.name,
                audio_file.suffix,
                language or "auto",
            )
            with open(audio_path, "rb") as f:
                resp = client.audio.transcriptions.complete(
                    model=model_name,
                    file={"content": f, "file_name": audio_file.name},
                    language=language,
                )
            text = resp.text
            raw = _normalize_raw_response(resp)
        else:
            audio_b64 = _read_file_as_base64(audio_file)

            # Compose messages (user only): the audio first, then the optional prompt.
            messages: list[dict[str, Any]] = []
            user_content: list[dict[str, Any]] = [{"type": "input_audio", "input_audio": audio_b64}]
            if user_prompt:
                user_content.append({"type": "text", "text": user_prompt})
            messages.append({"role": "user", "content": user_content})

            # Execute request
            model_name = model or "voxtral-small-latest"
            logging.info(
                "Calling Mistral chat-with-audio model=%s with audio=%s (%s)",
                model_name,
                audio_file.name,
                audio_file.suffix,
            )
            resp = client.chat.complete(model=model_name, messages=cast(Any, messages))

            # Extract normalized text and raw payload
            text = _extract_text_from_response(resp)
            raw = _normalize_raw_response(resp)

        return TranscriptionResult(text=text, raw=raw)
|