supervoxtral 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/AGENTS.md +2 -2
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/PKG-INFO +1 -1
- supervoxtral-0.1.2/README.md +236 -0
- supervoxtral-0.1.2/macos-shortcut.png +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/pyproject.toml +1 -1
- supervoxtral-0.1.2/supervoxtral.gif +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/cli.py +5 -1
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/config.py +1 -1
- supervoxtral-0.1.2/svx/core/pipeline.py +286 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/prompt.py +1 -1
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/ui/qt_app.py +95 -45
- supervoxtral-0.1.0/README.md +0 -237
- supervoxtral-0.1.0/svx/core/pipeline.py +0 -260
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/.gitignore +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/LICENSE +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/logs/.gitkeep +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/notes.md +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/prompt/.gitkeep +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/recordings/.gitkeep +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/__init__.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/__init__.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/audio.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/clipboard.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/core/storage.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/providers/__init__.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/providers/base.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/svx/providers/mistral.py +0 -0
- {supervoxtral-0.1.0 → supervoxtral-0.1.2}/transcripts/.gitkeep +0 -0
|
@@ -34,9 +34,9 @@ supervoxtral/
|
|
|
34
34
|
|
|
35
35
|
- **Entry**: `svx/cli.py` Typer `record` command parses args (e.g., --prompt, --save-all, --gui, --transcribe).
|
|
36
36
|
- **Config & Prompt**: Load `Config` via `Config.load()` (`core/config.py`); if transcribe_mode, skip prompt resolution; else resolve prompt with `cfg.resolve_prompt()` (`core/prompt.py`).
|
|
37
|
-
- **Pipeline**: Run `RecordingPipeline` (`core/pipeline.py`): record WAV/stop (`core/audio.py`), optional conversion (ffmpeg), get provider/init (`providers/__init__.py`, e.g., `mistral.py` from `cfg`); if transcribe_mode: no prompt, model override to voxtral-mini-latest (with warning if changed), pass transcribe_mode to provider.transcribe; transcribe, conditional save (`core/storage.py` based on `keep_*`/`save_all`), clipboard copy, logging setup.
|
|
37
|
+
- **Pipeline**: Run `RecordingPipeline` (`core/pipeline.py`): record WAV/stop (`core/audio.py`), optional conversion (ffmpeg), get provider/init (`providers/__init__.py`, e.g., `mistral.py` from `cfg`); if transcribe_mode (CLI only): no prompt, model override to voxtral-mini-latest (with warning if changed), pass transcribe_mode to provider.transcribe; for GUI: --transcribe ignored (warning), recording starts immediately, uses modular record()/process()/clean() with dynamic mode (Transcribe: no prompt, model override; Prompt: resolved prompt); transcribe, conditional save (`core/storage.py` based on `keep_*`/`save_all`), clipboard copy, logging setup.
|
|
38
38
|
- **Cleanup**: Temp files auto-deleted (tempfile) if `keep_*=false`; dirs created only if persistence enabled.
|
|
39
|
-
- **End**: Return `{"text": str, "raw": dict, "duration": float, "paths": dict}`; CLI prints result, GUI emits progress/updates via callback.
|
|
39
|
+
- **End**: Return `{"text": str, "raw": dict, "duration": float, "paths": dict}`; CLI prints result, GUI emits progress/updates via callback (buttons: 'Transcribe' for stop/transcribe without prompt; 'Prompt' for stop/use resolved prompt; default 'Prompt' on Esc/close).
|
|
40
40
|
|
|
41
41
|
## Build & test
|
|
42
42
|
```bash
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# supervoxtral
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
SuperVoxtral is a lightweight Python CLI/GUI utility for recording microphone audio and integrating with Mistral's Voxtral APIs for transcription or audio-enabled chat.
|
|
6
|
+
|
|
7
|
+
Voxtral models, such as `voxtral-mini-latest` and `voxtral-small-latest`, deliver fast inference times, high transcription accuracy across languages and accents, and minimal API costs. In contrast to OpenAI's Whisper, which performs only standalone transcription, Voxtral supports two modes: pure transcription via a dedicated endpoint (no prompts needed) or chat mode, where audio input combines with text prompts for refined outputs—like error correction or contextual summarization—without invoking a separate LLM.
|
|
8
|
+
|
|
9
|
+
For instance, use a prompt like: "_Transcribe this audio precisely and remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc._"
|
|
10
|
+
|
|
11
|
+
The GUI is minimal, launches fast, and can be bound to a system hotkey. Upon stopping recording, it transcribes via the pipeline and copies the result directly to the system clipboard, enabling efficient voice-driven workflows: e.g., dictating code snippets into an IDE or prompting LLMs via audio without typing.
|
|
12
|
+
|
|
13
|
+
## Requirements
|
|
14
|
+
|
|
15
|
+
- Python 3.11+
|
|
16
|
+
- ffmpeg (for MP3/Opus conversions)
|
|
17
|
+
- macOS: `brew install ffmpeg`
|
|
18
|
+
- Ubuntu/Debian: `sudo apt-get install ffmpeg`
|
|
19
|
+
- Windows: https://ffmpeg.org/download.html
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
The package is available on PyPI. We recommend using `uv` (a fast Python package installer) for a simple, global tool installation—no virtual environment setup required.
|
|
24
|
+
|
|
25
|
+
- For core CLI functionality:
|
|
26
|
+
```
|
|
27
|
+
uv tool install supervoxtral
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
- For GUI support (includes PySide6):
|
|
31
|
+
```
|
|
32
|
+
uv tool install "supervoxtral[gui]"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
This installs the `svx` command globally. If you don't have `uv`, install it first via `curl -LsSf https://astral.sh/uv/install.sh | sh` (or from https://docs.astral.sh/uv/getting-started/installation/).
|
|
36
|
+
|
|
37
|
+
**Alternative: Using pip with a virtual environment**
|
|
38
|
+
|
|
39
|
+
If you prefer not to use uv, you can install via pip in a virtual environment:
|
|
40
|
+
|
|
41
|
+
1. Create and activate a virtual environment:
|
|
42
|
+
|
|
43
|
+
- macOS/Linux:
|
|
44
|
+
```
|
|
45
|
+
python -m venv .venv
|
|
46
|
+
source .venv/bin/activate
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
- Windows (PowerShell):
|
|
50
|
+
```
|
|
51
|
+
python -m venv .venv
|
|
52
|
+
.\.venv\Scripts\Activate.ps1
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2. Install the package:
|
|
56
|
+
```
|
|
57
|
+
pip install supervoxtral
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
For GUI support (includes PySide6):
|
|
61
|
+
```
|
|
62
|
+
pip install supervoxtral[gui]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This installs the `svx` command within the virtual environment. Make sure to activate the environment before running `svx`.
|
|
66
|
+
|
|
67
|
+
**For development** (local editing):
|
|
68
|
+
1. Clone the repo and navigate to the project root.
|
|
69
|
+
2. Create/activate a virtual environment:
|
|
70
|
+
- macOS/Linux: `python -m venv .venv && source .venv/bin/activate`
|
|
71
|
+
- Windows: `python -m venv .venv && .\.venv\Scripts\Activate.ps1`
|
|
72
|
+
3. Install in editable mode: `pip install -e .` (or `pip install -e ".[dev]"` for dev tools).
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
To get started quickly with SuperVoxtral:
|
|
77
|
+
|
|
78
|
+
1. Initialize the configuration: `svx config init`
|
|
79
|
+
This creates the default `config.toml` file with zero-footprint settings.
|
|
80
|
+
|
|
81
|
+
2. Open the configuration directory: `svx config open`
|
|
82
|
+
Edit `config.toml` and add your [Mistral API key](https://console.mistral.ai/api-keys) under the `[providers.mistral]` section:
|
|
83
|
+
```
|
|
84
|
+
[providers.mistral]
|
|
85
|
+
api_key = "your_mistral_api_key_here"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
3. Launch the GUI: `svx record --gui`
|
|
89
|
+
This opens the minimal GUI and starts recording immediately. Click 'Transcribe' for pure transcription (no prompt) or 'Prompt' for prompted transcription (using the resolved prompt); the result is copied to the clipboard. The `--transcribe` flag is ignored in GUI mode (a warning is printed).
|
|
90
|
+
|
|
91
|
+
### macOS Shortcuts Integration
|
|
92
|
+
|
|
93
|
+
To enable fast, hotkey-driven access on macOS, integrate SuperVoxtral with the Shortcuts app. Create a new Shortcut that runs `svx record --gui` via a "Run Shell Script" action (ensure `svx` is in your PATH). Assign a global hotkey in Shortcuts settings for instant GUI launch—ideal for quick voice-to-text workflows, with results copied directly to the clipboard.
|
|
94
|
+
|
|
95
|
+
#### Quick Setup Steps
|
|
96
|
+
1. Open the Shortcuts app and create a new shortcut.
|
|
97
|
+
2. Add the "Run Shell Script" action with input: `svx record --gui`.
|
|
98
|
+
3. In shortcut details, set a keyboard shortcut (e.g., Cmd+Shift+V).
|
|
99
|
+
|
|
100
|
+

|
|
101
|
+
|
|
102
|
+
## Configuration (API keys and prompts)
|
|
103
|
+
|
|
104
|
+
API keys and default behavior are configured only in your user configuration file (config.toml), not via environment variables.
|
|
105
|
+
|
|
106
|
+
- Location of the user config:
|
|
107
|
+
- macOS: ~/Library/Application Support/SuperVoxtral/config.toml
|
|
108
|
+
- Linux: ${XDG_CONFIG_HOME:-~/.config}/supervoxtral/config.toml
|
|
109
|
+
- Windows: %APPDATA%/SuperVoxtral/config.toml
|
|
110
|
+
|
|
111
|
+
- Initialize your user config and user prompt file:
|
|
112
|
+
- `svx config init`: Creates config.toml (with sensible defaults, including zero-footprint mode) and a user prompt file at: `~/Library/Application Support/SuperVoxtral/` (macOS), `${XDG_CONFIG_HOME:-~/.config}/supervoxtral/` (Linux), or `%APPDATA%/SuperVoxtral/prompt/` (Windows).
|
|
113
|
+
- `svx config open`: Opens the directory.
|
|
114
|
+
- `svx config show`: Displays the current configuration.
|
|
115
|
+
|
|
116
|
+
Here's an example of the default `config.toml` generated by `svx config init`:
|
|
117
|
+
|
|
118
|
+
```toml
|
|
119
|
+
# SuperVoxtral - User configuration
|
|
120
|
+
#
|
|
121
|
+
# Basics:
|
|
122
|
+
# - This configuration controls the default behavior of `svx record`.
|
|
123
|
+
# - The parameters below override the binary's built-in defaults.
|
|
124
|
+
# - You can override a few options at runtime via the CLI:
|
|
125
|
+
# --prompt / --prompt-file (set a one-off prompt for this run)
|
|
126
|
+
# --log-level (debugging)
|
|
127
|
+
# --outfile-prefix (one-off output naming)
|
|
128
|
+
#
|
|
129
|
+
# Output persistence:
|
|
130
|
+
# - Set keep_* = true to create and save files to project
|
|
131
|
+
# directories (recordings/, transcripts/, logs/).
|
|
132
|
+
# - false (default): use temp files/console only (no disk
|
|
133
|
+
# footprint in project dir).
|
|
134
|
+
#
|
|
135
|
+
# Authentication:
|
|
136
|
+
# - API keys are defined in provider-specific sections in this file.
|
|
137
|
+
[providers.mistral]
|
|
138
|
+
# api_key = ""
|
|
139
|
+
|
|
140
|
+
[defaults]
|
|
141
|
+
# Provider to use (currently supported: "mistral")
|
|
142
|
+
provider = "mistral"
|
|
143
|
+
|
|
144
|
+
# File format sent to the provider: "wav" | "mp3" | "opus"
|
|
145
|
+
# Recording is always WAV; conversion is applied if "mp3" or "opus"
|
|
146
|
+
format = "opus"
|
|
147
|
+
|
|
148
|
+
# Model to use on the provider side (example for Mistral Voxtral)
|
|
149
|
+
model = "voxtral-mini-latest"
|
|
150
|
+
|
|
151
|
+
# Language hint (may help the provider)
|
|
152
|
+
language = "fr"
|
|
153
|
+
|
|
154
|
+
# Audio recording parameters
|
|
155
|
+
rate = 16000
|
|
156
|
+
channels = 1
|
|
157
|
+
device = ""
|
|
158
|
+
|
|
159
|
+
# Output persistence:
|
|
160
|
+
# - keep_audio_files: false uses temp files (no recordings/ dir),
|
|
161
|
+
# true saves to recordings/
|
|
162
|
+
keep_audio_files = false
|
|
163
|
+
# - keep_transcript_files: false prints/copies only (no
|
|
164
|
+
# transcripts/ dir), true saves to transcripts/
|
|
165
|
+
keep_transcript_files = false
|
|
166
|
+
# - keep_log_files: false console only (no logs/ dir), true
|
|
167
|
+
# saves to logs/app.log
|
|
168
|
+
keep_log_files = false
|
|
169
|
+
|
|
170
|
+
# Automatically copy the transcribed text to the system clipboard
|
|
171
|
+
copy = true
|
|
172
|
+
|
|
173
|
+
# Log level: "DEBUG" | "INFO" | "WARNING" | "ERROR"
|
|
174
|
+
log_level = "INFO"
|
|
175
|
+
|
|
176
|
+
[prompt]
|
|
177
|
+
# Default user prompt source:
|
|
178
|
+
# - Option 1: Use a file (recommended)
|
|
179
|
+
file = "~/.config/supervoxtral/prompt/user.md"
|
|
180
|
+
#
|
|
181
|
+
# - Option 2: Inline prompt (less recommended for long text)
|
|
182
|
+
# text = "Please transcribe the audio and provide a concise summary in French."
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**Configuration is centralized via a structured `Config` object loaded from your user configuration file (`config.toml`). CLI arguments override select values (e.g., prompt, log level), but most defaults (provider, model, keep flags) come from `config.toml`. No environment variables are used for API keys or settings.**
|
|
186
|
+
|
|
187
|
+
No `.env` or shell environment variables are used for API keys.
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
## Usage (CLI)
|
|
191
|
+
|
|
192
|
+
The CLI provides config utilities and a unified `record` entrypoint for both CLI and GUI modes, using a centralized pipeline for consistent behavior (recording, conversion, transcription, saving, clipboard copy, logging).
|
|
193
|
+
|
|
194
|
+
**Zero-footprint defaults**: No directories created; outputs to console/clipboard. Use `--save-all` or set `keep_* = true` in config.toml for persistence.
|
|
195
|
+
|
|
196
|
+
Most defaults (provider, format, model, language, rate, channels, device, keep flags, copy) come from config.toml. CLI overrides are limited to specific options.
|
|
197
|
+
|
|
198
|
+
### Record Command
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
svx record [OPTIONS]
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**Options**:
|
|
205
|
+
- `--user-prompt TEXT` (or `--prompt TEXT`): Inline user prompt for this run.
|
|
206
|
+
- `--user-prompt-file PATH` (or `--prompt-file PATH`): Path to a markdown file with the user prompt.
|
|
207
|
+
- `--transcribe`: Enable pure transcription mode (ignores prompts; uses dedicated endpoint).
|
|
208
|
+
- `--outfile-prefix PREFIX`: Custom prefix for output files (default: timestamp).
|
|
209
|
+
- `--gui`: Launch the GUI frontend (interactive: recording starts immediately; buttons 'Transcribe' (pure, no prompt) or 'Prompt' (with resolved prompt); respects config and other CLI options; --transcribe ignored with warning).
|
|
210
|
+
- `--save-all`: Override config to keep audio, transcripts, and logs for this run.
|
|
211
|
+
- `--log-level LEVEL`: Set logging level (DEBUG, INFO, WARNING, ERROR; default: INFO).
|
|
212
|
+
|
|
213
|
+
**Examples**:
|
|
214
|
+
- Record with prompt: `svx record --prompt "What's in this audio?"`
|
|
215
|
+
- Records WAV, converts if needed, sends to provider with prompt, outputs to console/clipboard.
|
|
216
|
+
- Persist outputs: `svx record --save-all --prompt "Summarize this"`
|
|
217
|
+
- Saves to recordings/, transcripts/, logs/.
|
|
218
|
+
- Transcribe only: `svx record --transcribe`
|
|
219
|
+
- No prompt; direct transcription. Add `--save-all` to persist.
|
|
220
|
+
- Launch GUI: `svx record --gui`
|
|
221
|
+
- Interactive mode: recording starts immediately; click 'Transcribe' (pure transcription, no prompt) or 'Prompt' (with resolved prompt); --transcribe ignored with warning. GUI respects config.toml and CLI flags (e.g., `--gui --save-all`).
|
|
222
|
+
|
|
223
|
+
**Prompt Resolution Priority** (for non-transcribe mode):
|
|
224
|
+
1. CLI `--user-prompt` or `--user-prompt-file`
|
|
225
|
+
2. config.toml [prompt] section (text or file)
|
|
226
|
+
3. User prompt file (user.md in config dir)
|
|
227
|
+
4. Fallback: "What's in this audio?"
|
|
228
|
+
|
|
229
|
+
## Changelog
|
|
230
|
+
|
|
231
|
+
- 0.1.2: Interactive mode in GUI (choose transcribe / prompt / cancel while recording)
|
|
232
|
+
- 0.1.1: Minor updates to default config and default prompt
|
|
233
|
+
|
|
234
|
+
## License
|
|
235
|
+
|
|
236
|
+
MIT
|
|
Binary file
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "supervoxtral"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "CLI/GUI audio recorder and transcription client using Mistral Voxtral (chat with audio and transcription)."
|
|
9
9
|
requires-python = ">=3.11"
|
|
10
10
|
license = { text = "MIT" }
|
|
Binary file
|
|
@@ -191,6 +191,11 @@ def record(
|
|
|
191
191
|
user_prompt = None
|
|
192
192
|
user_prompt_file = None
|
|
193
193
|
|
|
194
|
+
if gui and transcribe:
|
|
195
|
+
console.print("[yellow]Warning: --transcribe has no effect in GUI mode.[/yellow]")
|
|
196
|
+
console.print("[yellow]Use the 'Transcribe' or 'Prompt' buttons in the interface.[/yellow]")
|
|
197
|
+
transcribe = False
|
|
198
|
+
|
|
194
199
|
# If GUI requested, launch GUI with the resolved parameters and exit.
|
|
195
200
|
if gui:
|
|
196
201
|
from svx.ui.qt_app import run_gui
|
|
@@ -202,7 +207,6 @@ def record(
|
|
|
202
207
|
user_prompt_file=user_prompt_file,
|
|
203
208
|
save_all=save_all,
|
|
204
209
|
outfile_prefix=outfile_prefix,
|
|
205
|
-
transcribe_mode=transcribe,
|
|
206
210
|
)
|
|
207
211
|
return
|
|
208
212
|
|
|
@@ -227,7 +227,7 @@ def init_user_config(force: bool = False, prompt_file: Path | None = None) -> Pa
|
|
|
227
227
|
"# Audio recording parameters\n"
|
|
228
228
|
"rate = 16000\n"
|
|
229
229
|
"channels = 1\n"
|
|
230
|
-
'device = ""\n\n'
|
|
230
|
+
'#device = ""\n\n'
|
|
231
231
|
"# Output persistence:\n"
|
|
232
232
|
"# - keep_audio_files: false uses temp files (no recordings/ dir),\n"
|
|
233
233
|
"# true saves to recordings/\n"
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
import threading
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from logging import FileHandler
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import svx.core.config as config
|
|
12
|
+
from svx.core.audio import convert_audio, record_wav, timestamp
|
|
13
|
+
from svx.core.clipboard import copy_to_clipboard
|
|
14
|
+
from svx.core.config import Config
|
|
15
|
+
from svx.core.storage import save_transcript
|
|
16
|
+
from svx.providers import get_provider
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RecordingPipeline:
    """
    Centralized pipeline: record audio, transcribe via a provider, save, copy.

    The pipeline can be driven in one shot with ``run()`` or step by step with
    ``record()`` / ``process()`` / ``clean()`` (the modular form lets a caller
    pick the mode after recording has started).

    Runtime options:
        - ``save_all`` keeps audio and transcripts for this run and, through
          ``run()``, also enables file logging.
        - ``progress_callback`` receives every status message (e.g. for a GUI).
        - ``transcribe_mode`` requests pure transcription: no prompt is sent
          and the model is forced to ``voxtral-mini-latest``.
    """

    # Model forced whenever pure transcription mode is requested.
    _TRANSCRIBE_MODEL = "voxtral-mini-latest"
    # Formats accepted in ``cfg.defaults.format``.
    _SUPPORTED_FORMATS = frozenset({"wav", "mp3", "opus"})
    # Formats for which the recorded WAV is converted before being sent.
    _CONVERTED_FORMATS = frozenset({"mp3", "opus"})

    def __init__(
        self,
        cfg: Config,
        user_prompt: str | None = None,
        user_prompt_file: Path | None = None,
        save_all: bool = False,
        outfile_prefix: str | None = None,
        progress_callback: Callable[[str], None] | None = None,
        transcribe_mode: bool = False,
    ) -> None:
        """Store the configuration and per-run options; performs no I/O."""
        self.cfg = cfg
        self.user_prompt = user_prompt
        self.user_prompt_file = user_prompt_file
        self.save_all = save_all
        self.outfile_prefix = outfile_prefix
        self.progress_callback = progress_callback
        self.transcribe_mode = transcribe_mode

    def _status(self, msg: str) -> None:
        """Send ``msg`` to the progress callback (if any) and log it at INFO."""
        if self.progress_callback:
            self.progress_callback(msg)
        logging.info(msg)

    @staticmethod
    def _base_name(wav_path: Path) -> str:
        """Return the output base name for ``wav_path``.

        ``Path.stem`` already drops the final suffix; a remaining trailing
        ``.wav`` (a doubled ``name.wav.wav``) is stripped once. Only the
        trailing occurrence is removed, never ``.wav`` inside the name.
        """
        return wav_path.stem.removesuffix(".wav")

    def _resolve_model(self, transcribe_mode: bool) -> str:
        """Return the model to use, forcing the transcription model if needed.

        Logs a warning when transcribe mode overrides a different configured
        model.
        """
        configured = self.cfg.defaults.model
        if not transcribe_mode:
            return configured
        if configured != self._TRANSCRIBE_MODEL:
            logging.warning(
                "Transcribe mode: model override from '%s' to 'voxtral-mini-latest'\n"
                "(optimized for transcription).",
                configured,
            )
        return self._TRANSCRIBE_MODEL

    def record(self, stop_event: threading.Event | None = None) -> tuple[Path, float]:
        """
        Record audio to a WAV file.

        When audio is kept (``save_all`` or ``keep_audio_files``) the file is
        written to ``cfg.recordings_dir``; otherwise a temporary ``.wav`` file
        is used and the caller is expected to remove it with ``clean()``.

        Args:
            stop_event: Optional event forwarded to ``record_wav`` to signal
                the end of the recording; a fresh event is created if omitted.

        Returns:
            tuple[Path, float]: wav_path, duration (as returned by ``record_wav``).

        Raises:
            ValueError: If channels, rate or format in the config are invalid.
        """
        defaults = self.cfg.defaults
        audio_format = defaults.format
        rate = defaults.rate
        channels = defaults.channels
        device = defaults.device
        keep_audio = self.save_all or defaults.keep_audio_files

        # Validation (fail fast, before touching the filesystem)
        if channels not in (1, 2):
            raise ValueError("channels must be 1 or 2")
        if rate <= 0:
            raise ValueError("rate must be > 0")
        if audio_format not in self._SUPPORTED_FORMATS:
            raise ValueError("format must be one of wav|mp3|opus")

        stop_for_recording = threading.Event() if stop_event is None else stop_event

        self._status("Recording...")
        if keep_audio:
            base = self.outfile_prefix or f"rec_{timestamp()}"
            self.cfg.recordings_dir.mkdir(parents=True, exist_ok=True)
            wav_path = self.cfg.recordings_dir / f"{base}.wav"
        else:
            # tempfile.mktemp() is deprecated and race-prone: reserve the name
            # by actually creating the file, then close it so that record_wav
            # can open the path itself.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                wav_path = Path(tmp.name)

        try:
            duration = record_wav(
                wav_path,
                samplerate=rate,
                channels=channels,
                device=device,
                stop_event=stop_for_recording,
            )
        except BaseException:
            # Do not leave the reserved temp file behind when recording fails.
            if not keep_audio:
                wav_path.unlink(missing_ok=True)
            raise

        self._status("Recording completed.")
        return wav_path, duration

    def _setup_save_all(self) -> None:
        """Apply save_all overrides: set keeps to True, create dirs, add file logging."""
        if not self.save_all:
            return

        # Override config defaults
        defaults = self.cfg.defaults
        defaults.keep_audio_files = True
        defaults.keep_transcript_files = True
        defaults.keep_log_files = True

        # Ensure directories
        # NOTE(review): module-level config.*_DIR constants are used here while
        # record()/process() use self.cfg.recordings_dir / transcripts_dir;
        # confirm both refer to the same locations.
        for directory in (config.RECORDINGS_DIR, config.TRANSCRIPTS_DIR, config.LOGS_DIR):
            directory.mkdir(parents=True, exist_ok=True)

        # Add a file handler only if the root logger does not have one yet
        root_logger = logging.getLogger()
        has_file_handler = any(
            isinstance(h, FileHandler)
            for h in root_logger.handlers  # type: ignore[reportUnknownMemberType]
        )
        if has_file_handler:
            return

        from svx.core.config import _get_log_level

        file_handler = logging.FileHandler(config.LOGS_DIR / "app.log", encoding="utf-8")
        file_handler.setLevel(_get_log_level(self.cfg.defaults.log_level))
        file_handler.setFormatter(
            logging.Formatter("%(asctime)s | %(levelname)s | %(name)s | %(message)s")
        )
        root_logger.addHandler(file_handler)
        logging.info("File logging enabled for this run")

    def process(
        self, wav_path: Path, duration: float, transcribe_mode: bool, user_prompt: str | None = None
    ) -> dict[str, Any]:
        """
        Process recorded audio: convert if needed, transcribe, save, copy.

        Args:
            wav_path: Path to the recorded WAV file.
            duration: Recording duration in seconds (passed through to the result).
            transcribe_mode: Whether to use pure transcription mode.
            user_prompt: Prompt to use; when None (and not in transcribe mode)
                it is resolved via ``cfg.resolve_prompt``. Ignored in
                transcribe mode.

        Returns:
            Dict with 'text' (str), 'raw' (dict), 'duration' (float),
            'paths' (dict of Path or None; keys 'wav', 'txt', 'json' and,
            when a conversion happened, 'converted').
        """
        defaults = self.cfg.defaults
        provider = defaults.provider
        audio_format = defaults.format
        language = defaults.language
        model = self._resolve_model(transcribe_mode)
        base = self._base_name(wav_path)
        keep_transcript = self.save_all or defaults.keep_transcript_files
        copy_to_clip = defaults.copy

        # Resolve the prompt actually sent to the provider
        if transcribe_mode:
            final_user_prompt = None
            self._status("Transcribe mode activated: no prompt used.")
        else:
            if user_prompt is None:
                final_user_prompt = self.cfg.resolve_prompt(
                    self.user_prompt, self.user_prompt_file
                )
            else:
                final_user_prompt = user_prompt
            self._status("Transcribe mode not activated: using prompt.")

        paths: dict[str, Path | None] = {"wav": wav_path}

        # Convert if needed
        to_send_path = wav_path
        if audio_format in self._CONVERTED_FORMATS:
            self._status("Converting...")
            to_send_path = convert_audio(wav_path, audio_format)
            logging.info("Converted %s -> %s", wav_path, to_send_path)
            paths["converted"] = to_send_path

        # Transcribe
        self._status("Transcribing...")
        prov = get_provider(provider, cfg=self.cfg)
        result = prov.transcribe(
            to_send_path,
            user_prompt=final_user_prompt,
            model=model,
            language=language,
            transcribe_mode=transcribe_mode,
        )
        text = result["text"]
        raw = result["raw"]

        # Save if keeping transcripts
        if keep_transcript:
            self.cfg.transcripts_dir.mkdir(parents=True, exist_ok=True)
            txt_path, json_path = save_transcript(
                self.cfg.transcripts_dir, base, provider, text, raw
            )
            paths["txt"] = txt_path
            paths["json"] = json_path
        else:
            paths["txt"] = None
            paths["json"] = None

        # Copy to clipboard (best effort: a clipboard failure must not lose
        # the transcription, so it is only logged)
        if copy_to_clip:
            try:
                copy_to_clipboard(text)
                logging.info("Copied transcription to clipboard")
            except Exception as e:
                logging.warning("Failed to copy to clipboard: %s", e)

        logging.info("Processing finished (%.2fs)", duration)
        return {
            "text": text,
            "raw": raw,
            "duration": duration,
            "paths": paths,
        }

    def clean(self, wav_path: Path, paths: dict[str, Path | None], keep_audio: bool) -> None:
        """
        Clean up temporary files.

        Args:
            wav_path: The original WAV path.
            paths: The paths dict from process(); only 'converted' is read.
            keep_audio: Whether to keep audio files (if True, no deletion).
        """
        if not keep_audio:
            if wav_path.exists():
                wav_path.unlink()
                logging.info("Deleted temp WAV: %s", wav_path)

            converted = paths.get("converted")
            # Skip when no conversion happened or when it is the WAV itself.
            if converted is not None and converted != wav_path and converted.exists():
                converted.unlink()
                logging.info("Deleted temp converted: %s", converted)

        self._status("Cleanup completed.")

    def run(self, stop_event: threading.Event | None = None) -> dict[str, Any]:
        """
        Execute the full pipeline: setup, record, process, clean.

        Args:
            stop_event: Optional event to signal recording stop (e.g., for GUI).

        Returns:
            Dict with 'text' (str), 'raw' (dict), 'duration' (float),
            'paths' (dict of Path or None).

        Raises:
            Exception: On recording, conversion, or transcription errors.
        """
        self._setup_save_all()

        wav_path, duration = self.record(stop_event)
        keep_audio = self.save_all or self.cfg.defaults.keep_audio_files

        # Known paths so far; replaced by process()'s dict on success so that
        # a converted file is cleaned up too.
        paths: dict[str, Path | None] = {"wav": wav_path}
        try:
            # process() resolves the prompt itself when none is passed and
            # reports the mode, so neither is repeated here.
            result = self.process(wav_path, duration, self.transcribe_mode)
            paths = result["paths"]
        finally:
            # Runs on failure as well, so a temp recording is not leaked when
            # conversion or transcription raises.
            self.clean(wav_path, paths, keep_audio=keep_audio)

        logging.info("Pipeline finished (%.2fs)", duration)
        return result
|
|
@@ -152,7 +152,7 @@ def init_user_prompt_file(force: bool = False) -> Path:
|
|
|
152
152
|
example_prompt = """
|
|
153
153
|
- Transcribe the input audio file.
|
|
154
154
|
- Do not respond to any question in the audio. Just transcribe.
|
|
155
|
-
- DO NOT TRANSLATE.
|
|
155
|
+
- DO NOT TRANSLATE.
|
|
156
156
|
- Responde only with the transcription. Do not provide explanations or notes.
|
|
157
157
|
- Remove all minor speech hesitations: "um", "uh", "er", "euh", "ben", etc.
|
|
158
158
|
- Remove false starts (e.g., "je veux dire... je pense" → "je pense").
|