agent-cli 0.70.5__py3-none-any.whl → 0.72.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,12 +23,12 @@ from agent_cli.core.utils import (
23
23
  def rag_proxy(
24
24
  docs_folder: Path = typer.Option( # noqa: B008
25
25
  "./rag_docs",
26
- help="Folder to watch for documents",
26
+ help="Folder to watch for documents. Files are auto-indexed on startup and when changed. Must not overlap with `--chroma-path`.",
27
27
  rich_help_panel="RAG Configuration",
28
28
  ),
29
29
  chroma_path: Path = typer.Option( # noqa: B008
30
30
  "./rag_db",
31
- help="Path to ChromaDB persistence directory",
31
+ help="ChromaDB storage directory for vector embeddings. Must be separate from `--docs-folder` to avoid indexing database files.",
32
32
  rich_help_panel="RAG Configuration",
33
33
  ),
34
34
  openai_base_url: str | None = opts.OPENAI_BASE_URL,
@@ -36,30 +36,62 @@ def rag_proxy(
36
36
  openai_api_key: str | None = opts.OPENAI_API_KEY,
37
37
  limit: int = typer.Option(
38
38
  3,
39
- help="Number of document chunks to retrieve per query.",
39
+ help="Number of document chunks to retrieve per query. Higher values provide more context but use more tokens. Can be overridden per-request via `rag_top_k` in the JSON body.",
40
40
  rich_help_panel="RAG Configuration",
41
41
  ),
42
42
  host: str = opts.SERVER_HOST,
43
43
  port: int = typer.Option(
44
44
  8000,
45
- help="Port to bind to",
45
+ help="Port for the RAG proxy API (e.g., `http://localhost:8000/v1/chat/completions`).",
46
46
  rich_help_panel="Server Configuration",
47
47
  ),
48
- log_level: opts.LogLevel = opts.LOG_LEVEL,
48
+ log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
49
49
  config_file: str | None = opts.CONFIG_FILE,
50
50
  print_args: bool = opts.PRINT_ARGS,
51
51
  enable_rag_tools: bool = typer.Option(
52
52
  True, # noqa: FBT003
53
53
  "--rag-tools/--no-rag-tools",
54
- help="Allow agent to fetch full documents when snippets are insufficient.",
54
+ help="Enable `read_full_document()` tool so the LLM can request full document content when retrieved snippets are insufficient. Can be overridden per-request via `rag_enable_tools` in the JSON body.",
55
55
  rich_help_panel="RAG Configuration",
56
56
  ),
57
57
  ) -> None:
58
- """Start the RAG (Retrieval-Augmented Generation) Proxy Server.
58
+ """Start a RAG proxy server that enables "chat with your documents".
59
59
 
60
- This server watches a folder for documents, indexes them, and provides
61
- an OpenAI-compatible API that proxies requests to a backend LLM (like llama.cpp),
62
- injecting relevant context from the documents.
60
+ Watches a folder for documents, indexes them into a vector store, and provides an
61
+ OpenAI-compatible API at `/v1/chat/completions`. When you send a chat request,
62
+ the server retrieves relevant document chunks and injects them as context before
63
+ forwarding to your LLM backend.
64
+
65
+ **Quick start:**
66
+
67
+ - `agent-cli rag-proxy` — Start with defaults (./rag_docs, OpenAI-compatible API)
68
+ - `agent-cli rag-proxy --docs-folder ~/notes` — Index your notes folder
69
+
70
+ **How it works:**
71
+
72
+ 1. Documents in `--docs-folder` are chunked, embedded, and stored in ChromaDB
73
+ 2. A file watcher auto-reindexes when files change
74
+ 3. Chat requests trigger a semantic search for relevant chunks
75
+ 4. Retrieved context is injected into the prompt before forwarding to the LLM
76
+ 5. Responses include a `rag_sources` field listing which documents were used
77
+
78
+ **Supported file formats:**
79
+
80
+ Text: `.txt`, `.md`, `.json`, `.py`, `.js`, `.ts`, `.yaml`, `.toml`, `.rst`, etc.
81
+ Rich documents (via MarkItDown): `.pdf`, `.docx`, `.pptx`, `.xlsx`, `.html`, `.csv`
82
+
83
+ **API endpoints:**
84
+
85
+ - `POST /v1/chat/completions` — Main chat endpoint (OpenAI-compatible)
86
+ - `GET /health` — Health check with configuration info
87
+ - `GET /files` — List indexed files with chunk counts
88
+ - `POST /reindex` — Trigger manual reindex
89
+ - All other paths are proxied to the LLM backend
90
+
91
+ **Per-request overrides (in JSON body):**
92
+
93
+ - `rag_top_k`: Override `--limit` for this request
94
+ - `rag_enable_tools`: Override `--rag-tools` for this request
63
95
  """
64
96
  if print_args:
65
97
  print_command_line_args(locals())
agent_cli/agents/speak.py CHANGED
@@ -86,7 +86,7 @@ def speak(
86
86
  *,
87
87
  text: str | None = typer.Argument(
88
88
  None,
89
- help="Text to speak. Reads from clipboard if not provided.",
89
+ help="Text to synthesize. If not provided, reads from clipboard.",
90
90
  rich_help_panel="General Options",
91
91
  ),
92
92
  # --- Provider Selection ---
@@ -127,7 +127,27 @@ def speak(
127
127
  config_file: str | None = opts.CONFIG_FILE,
128
128
  print_args: bool = opts.PRINT_ARGS,
129
129
  ) -> None:
130
- """Convert text to speech using Wyoming or OpenAI-compatible TTS server."""
130
+ """Convert text to speech and play audio through speakers.
131
+
132
+ By default, synthesized audio plays immediately. Use `--save-file` to save
133
+ to a WAV file instead (skips playback).
134
+
135
+ Text can be provided as an argument or read from clipboard automatically.
136
+
137
+ **Examples:**
138
+
139
+ Speak text directly:
140
+ `agent-cli speak "Hello, world!"`
141
+
142
+ Speak clipboard contents:
143
+ `agent-cli speak`
144
+
145
+ Save to file instead of playing:
146
+ `agent-cli speak "Hello" --save-file greeting.wav`
147
+
148
+ Use OpenAI-compatible TTS:
149
+ `agent-cli speak "Hello" --tts-provider openai`
150
+ """
131
151
  if print_args:
132
152
  print_command_line_args(locals())
133
153
 
@@ -471,7 +471,7 @@ def transcribe( # noqa: PLR0912
471
471
  extra_instructions: str | None = typer.Option(
472
472
  None,
473
473
  "--extra-instructions",
474
- help="Additional instructions for the LLM to process the transcription.",
474
+ help="Extra instructions appended to the LLM cleanup prompt (requires `--llm`).",
475
475
  rich_help_panel="LLM Configuration",
476
476
  ),
477
477
  from_file: Path | None = opts.FROM_FILE,
@@ -513,7 +513,25 @@ def transcribe( # noqa: PLR0912
513
513
  print_args: bool = opts.PRINT_ARGS,
514
514
  transcription_log: Path | None = opts.TRANSCRIPTION_LOG,
515
515
  ) -> None:
516
- """Wyoming ASR Client for streaming microphone audio to a transcription server."""
516
+ """Record audio from microphone and transcribe to text.
517
+
518
+ Records until you press Ctrl+C (or send SIGINT), then transcribes using your
519
+ configured ASR provider. The transcript is copied to the clipboard by default.
520
+
521
+ **With `--llm`**: Passes the raw transcript through an LLM to clean up speech
522
+ recognition errors, add punctuation, remove filler words, and improve readability.
523
+
524
+ **With `--toggle`**: Bind to a hotkey for push-to-talk. First call starts recording,
525
+ second call stops and transcribes.
526
+
527
+ **Examples**:
528
+
529
+ - Record and transcribe: `agent-cli transcribe`
530
+
531
+ - With LLM cleanup: `agent-cli transcribe --llm`
532
+
533
+ - Re-transcribe last recording: `agent-cli transcribe --last-recording 1`
534
+ """
517
535
  if print_args:
518
536
  print_command_line_args(locals())
519
537
 
@@ -296,45 +296,45 @@ def transcribe_daemon( # noqa: PLR0912
296
296
  "user",
297
297
  "--role",
298
298
  "-r",
299
- help="Role name for logging (e.g., 'meeting', 'notes', 'user').",
299
+ help="Label for log entries. Use to distinguish speakers or contexts in logs.",
300
300
  ),
301
301
  silence_threshold: float = typer.Option(
302
302
  1.0,
303
303
  "--silence-threshold",
304
304
  "-s",
305
- help="Seconds of silence to end a speech segment.",
305
+ help="Seconds of silence after speech to finalize a segment. Increase for slower speakers.",
306
306
  ),
307
307
  min_segment: float = typer.Option(
308
308
  0.25,
309
309
  "--min-segment",
310
310
  "-m",
311
- help="Minimum speech duration in seconds to trigger a segment.",
311
+ help="Minimum seconds of speech required before a segment is processed. Filters brief sounds.",
312
312
  ),
313
313
  vad_threshold: float = typer.Option(
314
314
  0.3,
315
315
  "--vad-threshold",
316
- help="VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering.",
316
+ help="Silero VAD confidence threshold (0.0-1.0). Higher values require clearer speech; lower values are more sensitive to quiet/distant voices.",
317
317
  ),
318
318
  save_audio: bool = typer.Option(
319
319
  True, # noqa: FBT003
320
320
  "--save-audio/--no-save-audio",
321
- help="Save audio segments as MP3 files.",
321
+ help="Save each speech segment as MP3. Requires `ffmpeg` to be installed.",
322
322
  ),
323
323
  audio_dir: Path | None = typer.Option( # noqa: B008
324
324
  None,
325
325
  "--audio-dir",
326
- help="Directory for MP3 files. Default: ~/.config/agent-cli/audio",
326
+ help="Base directory for MP3 files. Files are organized by date: `YYYY/MM/DD/HHMMSS_mmm.mp3`. Default: `~/.config/agent-cli/audio`.",
327
327
  ),
328
328
  transcription_log: Path | None = typer.Option( # noqa: B008
329
329
  None,
330
330
  "--transcription-log",
331
331
  "-t",
332
- help="JSON Lines log file path. Default: ~/.config/agent-cli/transcriptions.jsonl",
332
+ help="JSONL file for transcript logging (one JSON object per line with timestamp, role, raw/processed text, audio path). Default: `~/.config/agent-cli/transcriptions.jsonl`.",
333
333
  ),
334
334
  clipboard: bool = typer.Option(
335
335
  False, # noqa: FBT003
336
336
  "--clipboard/--no-clipboard",
337
- help="Copy each transcription to clipboard.",
337
+ help="Copy each completed transcription to clipboard (overwrites previous). Useful with `--llm` to get cleaned text.",
338
338
  ),
339
339
  # --- Provider Selection ---
340
340
  asr_provider: str = opts.ASR_PROVIDER,
@@ -368,25 +368,37 @@ def transcribe_daemon( # noqa: PLR0912
368
368
  config_file: str | None = opts.CONFIG_FILE,
369
369
  print_args: bool = opts.PRINT_ARGS,
370
370
  ) -> None:
371
- """Run a continuous transcription daemon with voice activity detection.
371
+ """Continuous transcription daemon using Silero VAD for speech detection.
372
372
 
373
- This command runs indefinitely, capturing audio from your microphone,
374
- detecting speech segments using Silero VAD, transcribing them, and
375
- logging results with timestamps.
373
+ Unlike `transcribe` (single recording session), this daemon runs indefinitely
374
+ and automatically detects speech segments using Voice Activity Detection (VAD).
375
+ Each detected segment is transcribed and logged with timestamps.
376
376
 
377
- Examples:
378
- # Basic daemon
379
- agent-cli transcribe-daemon
377
+ **How it works:**
380
378
 
381
- # With role and custom silence threshold
382
- agent-cli transcribe-daemon --role meeting --silence-threshold 1.5
379
+ 1. Listens continuously to microphone input
380
+ 2. Silero VAD detects when you start/stop speaking
381
+ 3. After `--silence-threshold` seconds of silence, the segment is finalized
382
+ 4. Segment is transcribed (and optionally cleaned by LLM with `--llm`)
383
+ 5. Results are appended to the JSONL log file
384
+ 6. Audio is saved as MP3 if `--save-audio` is enabled (requires `ffmpeg`)
385
+
386
+ **Use cases:** Meeting transcription, note-taking, voice journaling, accessibility.
383
387
 
384
- # With LLM cleanup
385
- agent-cli transcribe-daemon --llm --role notes
388
+ **Examples:**
389
+
390
+ agent-cli transcribe-daemon
391
+ agent-cli transcribe-daemon --role meeting --silence-threshold 1.5
392
+ agent-cli transcribe-daemon --llm --clipboard --role notes
393
+ agent-cli transcribe-daemon --transcription-log ~/meeting.jsonl --no-save-audio
394
+ agent-cli transcribe-daemon --asr-provider openai --llm-provider gemini --llm
386
395
 
387
- # Custom log file and audio directory
388
- agent-cli transcribe-daemon --transcription-log ~/meeting.jsonl --audio-dir ~/audio
396
+ **Tips:**
389
397
 
398
+ - Use `--role` to tag entries (e.g., `speaker1`, `meeting`, `personal`)
399
+ - Adjust `--vad-threshold` if detection is too sensitive (increase) or missing speech (decrease)
400
+ - Use `--stop` to cleanly terminate a running daemon
401
+ - With `--llm`, transcripts are cleaned up (punctuation, filler words removed)
390
402
  """
391
403
  if print_args:
392
404
  print_command_line_args(locals())
@@ -229,15 +229,23 @@ def voice_edit(
229
229
  config_file: str | None = opts.CONFIG_FILE,
230
230
  print_args: bool = opts.PRINT_ARGS,
231
231
  ) -> None:
232
- """Interact with clipboard text via a voice command using local or remote services.
233
-
234
- Usage:
235
- - Run in foreground: agent-cli voice-edit --input-device-index 1
236
- - Run in background: agent-cli voice-edit --input-device-index 1 &
237
- - Check status: agent-cli voice-edit --status
238
- - Stop background process: agent-cli voice-edit --stop
239
- - List output devices: agent-cli voice-edit --list-output-devices
240
- - Save TTS to file: agent-cli voice-edit --tts --save-file response.wav
232
+ """Edit or query clipboard text using voice commands.
233
+
234
+ **Workflow:** Captures clipboard text → records your voice command → transcribes
235
+ it → sends both to an LLM → copies result back to clipboard.
236
+
237
+ Use this for hands-free text editing (e.g., "make this more formal") or
238
+ asking questions about clipboard content (e.g., "summarize this").
239
+
240
+ **Typical hotkey integration:** Run `voice-edit &` on keypress to start
241
+ recording, then send SIGINT (via `--stop`) on second keypress to process.
242
+
243
+ **Examples:**
244
+
245
+ - Basic usage: `agent-cli voice-edit`
246
+ - With TTS response: `agent-cli voice-edit --tts`
247
+ - Toggle on/off: `agent-cli voice-edit --toggle`
248
+ - List audio devices: `agent-cli voice-edit --list-devices`
241
249
  """
242
250
  if print_args:
243
251
  print_command_line_args(locals())
agent_cli/cli.py CHANGED
@@ -14,9 +14,32 @@ from .config import load_config, normalize_provider_defaults
14
14
  from .core.process import set_process_title
15
15
  from .core.utils import console
16
16
 
17
+ _HELP = """\
18
+ AI-powered voice, text, and development tools.
19
+
20
+ **Voice & Text:**
21
+
22
+ - **Voice-to-text** - Transcribe speech with optional LLM cleanup
23
+ - **Text-to-speech** - Convert text to natural-sounding audio
24
+ - **Voice chat** - Conversational AI with memory and tool use
25
+ - **Text correction** - Fix grammar, spelling, and punctuation
26
+
27
+ **Development:**
28
+
29
+ - **Parallel development** - Git worktrees with integrated coding agents
30
+ - **Local servers** - ASR/TTS with Wyoming + OpenAI-compatible APIs,
31
+ MLX on macOS ARM, CUDA/CPU Whisper, and automatic model TTL
32
+
33
+ **Provider Flexibility:**
34
+
35
+ Mix local (Ollama, Wyoming) and cloud (OpenAI, Gemini) backends freely.
36
+
37
+ Run `agent-cli <command> --help` for detailed command documentation.
38
+ """
39
+
17
40
  app = typer.Typer(
18
41
  name="agent-cli",
19
- help="A suite of AI-powered command-line tools for text correction, audio transcription, and voice assistance.",
42
+ help=_HELP,
20
43
  context_settings={"help_option_names": ["-h", "--help"]},
21
44
  add_completion=True,
22
45
  rich_markup_mode="markdown",
@@ -56,7 +79,7 @@ def main(
56
79
  ),
57
80
  ] = False,
58
81
  ) -> None:
59
- """A suite of AI-powered tools."""
82
+ """AI-powered voice, text, and development tools."""
60
83
  if ctx.invoked_subcommand is None:
61
84
  console.print("[bold red]No command specified.[/bold red]")
62
85
  console.print("[bold yellow]Running --help for your convenience.[/bold yellow]")
agent_cli/config_cmd.py CHANGED
@@ -20,7 +20,17 @@ from agent_cli.core.utils import console
20
20
 
21
21
  config_app = typer.Typer(
22
22
  name="config",
23
- help="Manage agent-cli configuration files.",
23
+ help="""Manage agent-cli configuration files.
24
+
25
+ Config files are TOML format and searched in order:
26
+
27
+ 1. `./agent-cli-config.toml` (project-local)
28
+ 2. `~/.config/agent-cli/config.toml` (user default)
29
+
30
+ Settings in `[defaults]` apply to all commands. Override per-command
31
+ with sections like `[chat]` or `[transcribe]`. CLI arguments override
32
+ config file settings.
33
+ """,
24
34
  add_completion=True,
25
35
  rich_markup_mode="markdown",
26
36
  no_args_is_help=True,
@@ -40,30 +50,30 @@ CONFIG_PATH_OPTION: Path | None = typer.Option(
40
50
  None,
41
51
  "--path",
42
52
  "-p",
43
- help="Path to config file. Uses auto-detection if not specified.",
53
+ help="Override auto-detection and use this config file path.",
44
54
  )
45
55
  CONFIG_PATH_INIT_OPTION: Path | None = typer.Option(
46
56
  None,
47
57
  "--path",
48
58
  "-p",
49
- help="Custom path for config file. Default: ~/.config/agent-cli/config.toml",
59
+ help="Where to create the config file (default: `~/.config/agent-cli/config.toml`).",
50
60
  )
51
61
  FORCE_OPTION: bool = typer.Option(
52
62
  False, # noqa: FBT003
53
63
  "--force",
54
64
  "-f",
55
- help="Overwrite existing config without confirmation.",
65
+ help="Overwrite existing config without prompting for confirmation.",
56
66
  )
57
67
  RAW_OPTION: bool = typer.Option(
58
68
  False, # noqa: FBT003
59
69
  "--raw",
60
70
  "-r",
61
- help="Output raw file contents (for copy-paste).",
71
+ help="Print plain file contents without syntax highlighting or line numbers.",
62
72
  )
63
73
  JSON_OPTION: bool = typer.Option(
64
74
  False, # noqa: FBT003
65
75
  "--json",
66
- help="Output as JSON for automation.",
76
+ help="Output as JSON with `path`, `exists`, and `content` fields.",
67
77
  )
68
78
 
69
79
 
@@ -149,10 +159,13 @@ def config_init(
149
159
  path: Path | None = CONFIG_PATH_INIT_OPTION,
150
160
  force: bool = FORCE_OPTION,
151
161
  ) -> None:
152
- """Create a new config file with all options commented out.
162
+ """Create a new config file with all options as commented-out examples.
153
163
 
154
- The generated config file serves as a template showing all available
155
- options. Uncomment and modify the options you want to customize.
164
+ Generates a TOML template with `[defaults]` for global settings and
165
+ command-specific sections like `[chat]`, `[transcribe]`, etc. Uncomment
166
+ and edit the options you want to customize.
167
+
168
+ Example: `agent-cli config init && agent-cli config edit`
156
169
  """
157
170
  target_path = _get_config_file(path) or USER_CONFIG_PATH
158
171
 
@@ -182,7 +195,9 @@ def config_edit(
182
195
  ) -> None:
183
196
  """Open the config file in your default editor.
184
197
 
185
- The editor is determined by: $EDITOR > $VISUAL > platform default.
198
+ Editor preference: `$EDITOR` → `$VISUAL` → `nano`/`vim` → `vi` (or
199
+ `notepad` on Windows). If no config exists, run `agent-cli config init`
200
+ first.
186
201
  """
187
202
  config_file = _get_config_file(path)
188
203
 
@@ -234,7 +249,11 @@ def config_show(
234
249
  raw: bool = RAW_OPTION,
235
250
  json_output: bool = JSON_OPTION,
236
251
  ) -> None:
237
- """Display the config file location and contents."""
252
+ """Display the active config file path and contents.
253
+
254
+ By default, shows syntax-highlighted TOML with line numbers. Use `--raw`
255
+ for plain output (useful for piping), or `--json` for programmatic access.
256
+ """
238
257
  config_file = _get_config_file(path)
239
258
 
240
259
  if config_file is None:
agent_cli/core/deps.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import functools
6
+ import importlib
6
7
  import json
7
8
  import os
8
9
  from importlib.util import find_spec
@@ -12,7 +13,7 @@ from typing import TYPE_CHECKING, TypeVar
12
13
  import typer
13
14
 
14
15
  from agent_cli.config import load_config
15
- from agent_cli.core.utils import console, print_error_message
16
+ from agent_cli.core.utils import err_console, print_error_message
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from collections.abc import Callable
@@ -139,7 +140,7 @@ def _try_auto_install(missing: list[str]) -> bool:
139
140
  else:
140
141
  extras_to_install.append(extra)
141
142
 
142
- console.print(
143
+ err_console.print(
143
144
  f"[yellow]Auto-installing missing extras: {', '.join(extras_to_install)}[/]",
144
145
  )
145
146
  return install_extras_programmatic(extras_to_install, quiet=True)
@@ -159,7 +160,9 @@ def _check_and_install_extras(extras: tuple[str, ...]) -> list[str]:
159
160
  print_error_message("Auto-install failed.\n" + get_combined_install_hint(missing))
160
161
  return missing
161
162
 
162
- console.print("[green]Installation complete![/]")
163
+ err_console.print("[green]Installation complete![/]")
164
+ # Invalidate import caches so find_spec() can see newly installed packages
165
+ importlib.invalidate_caches()
163
166
  still_missing = [e for e in extras if not check_extra_installed(e)]
164
167
  if still_missing:
165
168
  print_error_message(
agent_cli/core/vad.py CHANGED
@@ -3,38 +3,22 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import logging
6
- import urllib.request
7
6
  from collections import deque
8
- from pathlib import Path
9
7
 
10
8
  from agent_cli import constants
11
9
 
12
10
  try:
13
11
  import numpy as np
14
- import torch
12
+ from silero_vad_lite import SileroVAD
15
13
  except ImportError as e:
16
14
  msg = (
17
- "silero-vad is required for the transcribe-daemon command. "
15
+ "silero-vad-lite is required for the transcribe-daemon command. "
18
16
  "Install it with: `pip install agent-cli[vad]` or `uv sync --extra vad`."
19
17
  )
20
18
  raise ImportError(msg) from e
21
19
 
22
20
  LOGGER = logging.getLogger(__name__)
23
21
 
24
- _SILERO_VAD_ONNX_URL = (
25
- "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
26
- )
27
-
28
-
29
- def _get_model_path() -> Path:
30
- """Get the path to the Silero VAD ONNX model, downloading if needed."""
31
- cache_dir = Path.home() / ".cache" / "silero-vad"
32
- cache_dir.mkdir(parents=True, exist_ok=True)
33
- model_path = cache_dir / "silero_vad.onnx"
34
- if not model_path.exists():
35
- urllib.request.urlretrieve(_SILERO_VAD_ONNX_URL, model_path) # noqa: S310
36
- return model_path
37
-
38
22
 
39
23
  class VoiceActivityDetector:
40
24
  """Silero VAD-based voice activity detection for audio segmentation.
@@ -56,8 +40,6 @@ class VoiceActivityDetector:
56
40
  msg = f"Sample rate must be 8000 or 16000, got {sample_rate}"
57
41
  raise ValueError(msg)
58
42
 
59
- from silero_vad.utils_vad import OnnxWrapper # noqa: PLC0415
60
-
61
43
  self.sample_rate = sample_rate
62
44
  self.threshold = threshold
63
45
  self.silence_threshold_ms = silence_threshold_ms
@@ -74,7 +56,7 @@ class VoiceActivityDetector:
74
56
  )
75
57
 
76
58
  # Model and state
77
- self._model = OnnxWrapper(str(_get_model_path()))
59
+ self._model = SileroVAD(sample_rate=sample_rate)
78
60
  self._pre_speech_buffer: deque[bytes] = deque(maxlen=pre_speech_windows)
79
61
  self._pending = bytearray()
80
62
  self._audio_buffer = bytearray()
@@ -92,7 +74,7 @@ class VoiceActivityDetector:
92
74
 
93
75
  def reset(self) -> None:
94
76
  """Reset VAD state for a new recording session."""
95
- self._model.reset_states()
77
+ self._model = SileroVAD(sample_rate=self.sample_rate)
96
78
  self._pre_speech_buffer.clear()
97
79
  self._pending.clear()
98
80
  self._audio_buffer.clear()
@@ -103,7 +85,7 @@ class VoiceActivityDetector:
103
85
  def _is_speech(self, window: bytes) -> bool:
104
86
  """Check if audio window contains speech."""
105
87
  audio = np.frombuffer(window, dtype=np.int16).astype(np.float32) / 32768.0
106
- prob = float(self._model(torch.from_numpy(audio), self.sample_rate).item())
88
+ prob = self._model.process(audio)
107
89
  LOGGER.debug("Speech prob: %.3f, threshold: %.2f", prob, self.threshold)
108
90
  return prob >= self.threshold
109
91
 
@@ -154,7 +136,7 @@ class VoiceActivityDetector:
154
136
  self._silence_samples = 0
155
137
  self._speech_samples = 0
156
138
  self._audio_buffer.clear()
157
- self._model.reset_states()
139
+ self._model = SileroVAD(sample_rate=self.sample_rate)
158
140
  else:
159
141
  # Not speaking - maintain rolling pre-speech buffer (auto-limited by deque maxlen)
160
142
  self._pre_speech_buffer.append(window)