agent-cli 0.70.5__py3-none-any.whl → 0.71.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/agents/assistant.py +23 -27
- agent_cli/agents/autocorrect.py +29 -3
- agent_cli/agents/chat.py +44 -14
- agent_cli/agents/memory/__init__.py +19 -1
- agent_cli/agents/memory/add.py +3 -3
- agent_cli/agents/memory/proxy.py +19 -10
- agent_cli/agents/rag_proxy.py +41 -9
- agent_cli/agents/speak.py +22 -2
- agent_cli/agents/transcribe.py +20 -2
- agent_cli/agents/transcribe_daemon.py +33 -21
- agent_cli/agents/voice_edit.py +17 -9
- agent_cli/cli.py +25 -2
- agent_cli/config_cmd.py +30 -11
- agent_cli/dev/cli.py +295 -65
- agent_cli/docs_gen.py +18 -8
- agent_cli/install/extras.py +39 -10
- agent_cli/install/hotkeys.py +22 -11
- agent_cli/install/services.py +54 -14
- agent_cli/opts.py +23 -20
- agent_cli/server/cli.py +118 -44
- {agent_cli-0.70.5.dist-info → agent_cli-0.71.0.dist-info}/METADATA +456 -187
- {agent_cli-0.70.5.dist-info → agent_cli-0.71.0.dist-info}/RECORD +25 -25
- {agent_cli-0.70.5.dist-info → agent_cli-0.71.0.dist-info}/WHEEL +0 -0
- {agent_cli-0.70.5.dist-info → agent_cli-0.71.0.dist-info}/entry_points.txt +0 -0
- {agent_cli-0.70.5.dist-info → agent_cli-0.71.0.dist-info}/licenses/LICENSE +0 -0
agent_cli/agents/assistant.py
CHANGED
|
@@ -1,29 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
This agent uses Wyoming wake word detection to implement a hands-free voice assistant that:
|
|
4
|
-
1. Continuously listens for a wake word
|
|
5
|
-
2. When the wake word is detected, starts recording user speech
|
|
6
|
-
3. When the wake word is detected again, stops recording and processes the speech
|
|
7
|
-
4. Sends the recorded speech to ASR for transcription
|
|
8
|
-
5. Optionally processes the transcript with an LLM and speaks the response
|
|
9
|
-
|
|
10
|
-
WORKFLOW:
|
|
11
|
-
1. Agent starts listening for the specified wake word
|
|
12
|
-
2. First wake word detection -> start recording user speech
|
|
13
|
-
3. Second wake word detection -> stop recording and process the speech
|
|
14
|
-
4. Transcribe the recorded speech using Wyoming ASR
|
|
15
|
-
5. Optionally process with LLM and respond with TTS
|
|
16
|
-
|
|
17
|
-
USAGE:
|
|
18
|
-
- Start the agent: assistant --wake-word "ok_nabu" --input-device-index 1
|
|
19
|
-
- The agent runs continuously until stopped with Ctrl+C or --stop
|
|
20
|
-
- Uses background process management for daemon-like operation
|
|
21
|
-
|
|
22
|
-
REQUIREMENTS:
|
|
23
|
-
- Wyoming wake word server (e.g., wyoming-openwakeword)
|
|
24
|
-
- Wyoming ASR server (e.g., wyoming-whisper)
|
|
25
|
-
- Optional: Wyoming TTS server for responses
|
|
26
|
-
"""
|
|
1
|
+
"""Wake word-based voice assistant using Wyoming protocol services."""
|
|
27
2
|
|
|
28
3
|
from __future__ import annotations
|
|
29
4
|
|
|
@@ -313,7 +288,28 @@ def assistant(
|
|
|
313
288
|
config_file: str | None = opts.CONFIG_FILE,
|
|
314
289
|
print_args: bool = opts.PRINT_ARGS,
|
|
315
290
|
) -> None:
|
|
316
|
-
"""
|
|
291
|
+
"""Hands-free voice assistant using wake word detection.
|
|
292
|
+
|
|
293
|
+
Continuously listens for a wake word, then records your speech until you say
|
|
294
|
+
the wake word again. The recording is transcribed and sent to an LLM for a
|
|
295
|
+
conversational response, optionally spoken back via TTS.
|
|
296
|
+
|
|
297
|
+
**Conversation flow:**
|
|
298
|
+
1. Say wake word → starts recording
|
|
299
|
+
2. Speak your question/command
|
|
300
|
+
3. Say wake word again → stops recording and processes
|
|
301
|
+
|
|
302
|
+
The assistant runs in a loop, ready for the next command after each response.
|
|
303
|
+
Stop with Ctrl+C or `--stop`.
|
|
304
|
+
|
|
305
|
+
**Requirements:**
|
|
306
|
+
- Wyoming wake word server (e.g., wyoming-openwakeword on port 10400)
|
|
307
|
+
- Wyoming ASR server (e.g., wyoming-whisper on port 10300)
|
|
308
|
+
- Optional: TTS server for spoken responses (enable with `--tts`)
|
|
309
|
+
|
|
310
|
+
**Example:**
|
|
311
|
+
`assistant --wake-word ok_nabu --tts --input-device-name USB`
|
|
312
|
+
"""
|
|
317
313
|
if print_args:
|
|
318
314
|
print_command_line_args(locals())
|
|
319
315
|
|
agent_cli/agents/autocorrect.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Fix grammar, spelling, and punctuation in text using an LLM."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
@@ -216,7 +216,7 @@ def autocorrect(
|
|
|
216
216
|
*,
|
|
217
217
|
text: str | None = typer.Argument(
|
|
218
218
|
None,
|
|
219
|
-
help="
|
|
219
|
+
help="Text to correct. If omitted, reads from system clipboard.",
|
|
220
220
|
rich_help_panel="General Options",
|
|
221
221
|
),
|
|
222
222
|
# --- Provider Selection ---
|
|
@@ -240,7 +240,33 @@ def autocorrect(
|
|
|
240
240
|
config_file: str | None = opts.CONFIG_FILE,
|
|
241
241
|
print_args: bool = opts.PRINT_ARGS,
|
|
242
242
|
) -> None:
|
|
243
|
-
"""
|
|
243
|
+
"""Fix grammar, spelling, and punctuation using an LLM.
|
|
244
|
+
|
|
245
|
+
Reads text from clipboard (or argument), sends to LLM for correction,
|
|
246
|
+
and copies the result back to clipboard. Only makes technical corrections
|
|
247
|
+
without changing meaning or tone.
|
|
248
|
+
|
|
249
|
+
**Workflow:**
|
|
250
|
+
1. Read text from clipboard (or `TEXT` argument)
|
|
251
|
+
2. Send to LLM for grammar/spelling/punctuation fixes
|
|
252
|
+
3. Copy corrected text to clipboard (unless `--json`)
|
|
253
|
+
4. Display result
|
|
254
|
+
|
|
255
|
+
**Examples:**
|
|
256
|
+
```bash
|
|
257
|
+
# Correct text from clipboard (default)
|
|
258
|
+
agent-cli autocorrect
|
|
259
|
+
|
|
260
|
+
# Correct specific text
|
|
261
|
+
agent-cli autocorrect "this is incorect"
|
|
262
|
+
|
|
263
|
+
# Use OpenAI instead of local Ollama
|
|
264
|
+
agent-cli autocorrect --llm-provider openai
|
|
265
|
+
|
|
266
|
+
# Get JSON output for scripting (disables clipboard)
|
|
267
|
+
agent-cli autocorrect --json
|
|
268
|
+
```
|
|
269
|
+
"""
|
|
244
270
|
if print_args:
|
|
245
271
|
print_command_line_args(locals())
|
|
246
272
|
|
agent_cli/agents/chat.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
-
|
|
10
|
-
|
|
1
|
+
"""Voice-based conversational chat agent with memory and tools.
|
|
2
|
+
|
|
3
|
+
Runs an interactive voice loop: listens for speech, transcribes it,
|
|
4
|
+
sends to the LLM (with conversation context), and optionally speaks the response.
|
|
5
|
+
|
|
6
|
+
**Available tools** (automatically used by the LLM when relevant):
|
|
7
|
+
- `add_memory`/`search_memory`/`update_memory` - persistent long-term memory
|
|
8
|
+
- `duckduckgo_search` - web search for current information
|
|
9
|
+
- `read_file`/`execute_code` - file access and shell commands
|
|
10
|
+
|
|
11
|
+
**Process management**: Use `--toggle` to start/stop via hotkey, `--stop` to terminate,
|
|
12
|
+
or `--status` to check if running. Useful for binding to a keyboard shortcut.
|
|
11
13
|
"""
|
|
12
14
|
|
|
13
15
|
from __future__ import annotations
|
|
@@ -425,14 +427,15 @@ def chat(
|
|
|
425
427
|
history_dir: Path = typer.Option( # noqa: B008
|
|
426
428
|
"~/.config/agent-cli/history",
|
|
427
429
|
"--history-dir",
|
|
428
|
-
help="Directory
|
|
430
|
+
help="Directory for conversation history and long-term memory. "
|
|
431
|
+
"Both `conversation.json` and `long_term_memory.json` are stored here.",
|
|
429
432
|
rich_help_panel="History Options",
|
|
430
433
|
),
|
|
431
434
|
last_n_messages: int = typer.Option(
|
|
432
435
|
50,
|
|
433
436
|
"--last-n-messages",
|
|
434
|
-
help="Number of messages to include
|
|
435
|
-
"
|
|
437
|
+
help="Number of past messages to include as context for the LLM. "
|
|
438
|
+
"Set to 0 to start fresh each session (memory tools still persist).",
|
|
436
439
|
rich_help_panel="History Options",
|
|
437
440
|
),
|
|
438
441
|
# --- General Options ---
|
|
@@ -444,7 +447,34 @@ def chat(
|
|
|
444
447
|
config_file: str | None = opts.CONFIG_FILE,
|
|
445
448
|
print_args: bool = opts.PRINT_ARGS,
|
|
446
449
|
) -> None:
|
|
447
|
-
"""
|
|
450
|
+
"""Voice-based conversational chat agent with memory and tools.
|
|
451
|
+
|
|
452
|
+
Runs an interactive loop: listen → transcribe → LLM → speak response.
|
|
453
|
+
Conversation history is persisted and included as context for continuity.
|
|
454
|
+
|
|
455
|
+
**Built-in tools** (LLM uses automatically when relevant):
|
|
456
|
+
|
|
457
|
+
- `add_memory`/`search_memory`/`update_memory` - persistent long-term memory
|
|
458
|
+
- `duckduckgo_search` - web search for current information
|
|
459
|
+
- `read_file`/`execute_code` - file access and shell commands
|
|
460
|
+
|
|
461
|
+
**Process management**: Use `--toggle` to start/stop via hotkey (bind to
|
|
462
|
+
a keyboard shortcut), `--stop` to terminate, or `--status` to check state.
|
|
463
|
+
|
|
464
|
+
**Examples**:
|
|
465
|
+
|
|
466
|
+
Use OpenAI-compatible providers for speech and LLM, with TTS enabled:
|
|
467
|
+
|
|
468
|
+
agent-cli chat --asr-provider openai --llm-provider openai --tts
|
|
469
|
+
|
|
470
|
+
Start in background mode (toggle on/off with hotkey):
|
|
471
|
+
|
|
472
|
+
agent-cli chat --toggle
|
|
473
|
+
|
|
474
|
+
Use local Ollama LLM with Wyoming ASR:
|
|
475
|
+
|
|
476
|
+
agent-cli chat --llm-provider ollama
|
|
477
|
+
"""
|
|
448
478
|
if print_args:
|
|
449
479
|
print_command_line_args(locals())
|
|
450
480
|
|
agent_cli/agents/memory/__init__.py
CHANGED
|
@@ -9,7 +9,25 @@ from agent_cli.core.process import set_process_title
|
|
|
9
9
|
|
|
10
10
|
memory_app = typer.Typer(
|
|
11
11
|
name="memory",
|
|
12
|
-
help="
|
|
12
|
+
help="""Long-term memory system for AI chat applications.
|
|
13
|
+
|
|
14
|
+
Provides persistent memory across conversations by storing facts and context
|
|
15
|
+
in Markdown files, with automatic vector indexing for semantic retrieval.
|
|
16
|
+
|
|
17
|
+
**Subcommands:**
|
|
18
|
+
|
|
19
|
+
- `proxy`: Start an OpenAI-compatible proxy that injects relevant memories
|
|
20
|
+
into chat requests and extracts new facts from responses
|
|
21
|
+
- `add`: Manually add facts/memories without going through LLM extraction
|
|
22
|
+
|
|
23
|
+
**Quick Start:**
|
|
24
|
+
|
|
25
|
+
# Start the memory proxy (point your chat client at localhost:8100)
|
|
26
|
+
agent-cli memory proxy --openai-base-url http://localhost:11434/v1
|
|
27
|
+
|
|
28
|
+
# Manually seed some memories
|
|
29
|
+
agent-cli memory add "User prefers dark mode" "User is a Python developer"
|
|
30
|
+
""",
|
|
13
31
|
add_completion=True,
|
|
14
32
|
rich_markup_mode="markdown",
|
|
15
33
|
no_args_is_help=True,
|
agent_cli/agents/memory/add.py
CHANGED
|
@@ -127,17 +127,17 @@ def add(
|
|
|
127
127
|
"default",
|
|
128
128
|
"--conversation-id",
|
|
129
129
|
"-c",
|
|
130
|
-
help="Conversation
|
|
130
|
+
help="Conversation namespace for these memories. Memories are retrieved per-conversation unless shared globally.",
|
|
131
131
|
),
|
|
132
132
|
memory_path: Path = typer.Option( # noqa: B008
|
|
133
133
|
"./memory_db",
|
|
134
134
|
"--memory-path",
|
|
135
|
-
help="
|
|
135
|
+
help="Directory for memory storage (same as `memory proxy --memory-path`).",
|
|
136
136
|
),
|
|
137
137
|
git_versioning: bool = typer.Option(
|
|
138
138
|
True, # noqa: FBT003
|
|
139
139
|
"--git-versioning/--no-git-versioning",
|
|
140
|
-
help="
|
|
140
|
+
help="Auto-commit changes to git for version history.",
|
|
141
141
|
),
|
|
142
142
|
quiet: bool = opts.QUIET,
|
|
143
143
|
config_file: str | None = opts.CONFIG_FILE,
|
agent_cli/agents/memory/proxy.py
CHANGED
|
@@ -19,7 +19,7 @@ from agent_cli.core.utils import console, print_command_line_args
|
|
|
19
19
|
def proxy(
|
|
20
20
|
memory_path: Path = typer.Option( # noqa: B008
|
|
21
21
|
"./memory_db",
|
|
22
|
-
help="
|
|
22
|
+
help="Directory for memory storage. Contains `entries/` (Markdown files) and `chroma/` (vector index). Created automatically if it doesn't exist.",
|
|
23
23
|
rich_help_panel="Memory Configuration",
|
|
24
24
|
),
|
|
25
25
|
openai_base_url: str | None = opts.OPENAI_BASE_URL,
|
|
@@ -27,7 +27,7 @@ def proxy(
|
|
|
27
27
|
openai_api_key: str | None = opts.OPENAI_API_KEY,
|
|
28
28
|
default_top_k: int = typer.Option(
|
|
29
29
|
5,
|
|
30
|
-
help="Number of
|
|
30
|
+
help="Number of relevant memories to inject into each request. Higher values provide more context but increase token usage.",
|
|
31
31
|
rich_help_panel="Memory Configuration",
|
|
32
32
|
),
|
|
33
33
|
host: str = opts.SERVER_HOST,
|
|
@@ -38,7 +38,7 @@ def proxy(
|
|
|
38
38
|
),
|
|
39
39
|
max_entries: int = typer.Option(
|
|
40
40
|
500,
|
|
41
|
-
help="Maximum
|
|
41
|
+
help="Maximum entries per conversation before oldest are evicted. Summaries are preserved separately.",
|
|
42
42
|
rich_help_panel="Memory Configuration",
|
|
43
43
|
),
|
|
44
44
|
mmr_lambda: float = typer.Option(
|
|
@@ -48,7 +48,7 @@ def proxy(
|
|
|
48
48
|
),
|
|
49
49
|
recency_weight: float = typer.Option(
|
|
50
50
|
0.2,
|
|
51
|
-
help="
|
|
51
|
+
help="Weight for recency vs semantic relevance (0.0-1.0). At 0.2: 20% recency, 80% semantic similarity.",
|
|
52
52
|
rich_help_panel="Memory Configuration",
|
|
53
53
|
),
|
|
54
54
|
score_threshold: float = typer.Option(
|
|
@@ -59,13 +59,13 @@ def proxy(
|
|
|
59
59
|
summarization: bool = typer.Option(
|
|
60
60
|
True, # noqa: FBT003
|
|
61
61
|
"--summarization/--no-summarization",
|
|
62
|
-
help="
|
|
62
|
+
help="Extract facts and generate summaries after each turn using the LLM. Disable to only store raw conversation turns.",
|
|
63
63
|
rich_help_panel="Memory Configuration",
|
|
64
64
|
),
|
|
65
65
|
git_versioning: bool = typer.Option(
|
|
66
66
|
True, # noqa: FBT003
|
|
67
67
|
"--git-versioning/--no-git-versioning",
|
|
68
|
-
help="
|
|
68
|
+
help="Auto-commit memory changes to git. Initializes a repo in `--memory-path` if needed. Provides full history of memory evolution.",
|
|
69
69
|
rich_help_panel="Memory Configuration",
|
|
70
70
|
),
|
|
71
71
|
log_level: opts.LogLevel = opts.LOG_LEVEL,
|
|
@@ -78,7 +78,7 @@ def proxy(
|
|
|
78
78
|
CLI, or IDE plugin) and an OpenAI-compatible LLM provider (e.g., OpenAI,
|
|
79
79
|
Ollama, vLLM).
|
|
80
80
|
|
|
81
|
-
Key Features
|
|
81
|
+
**Key Features:**
|
|
82
82
|
|
|
83
83
|
- **Simple Markdown Files:** Memories are stored as human-readable Markdown
|
|
84
84
|
files, serving as the ultimate source of truth.
|
|
@@ -89,7 +89,7 @@ def proxy(
|
|
|
89
89
|
- **Proxy Middleware:** Works transparently with any OpenAI-compatible
|
|
90
90
|
`/chat/completions` endpoint.
|
|
91
91
|
|
|
92
|
-
How it works
|
|
92
|
+
**How it works:**
|
|
93
93
|
|
|
94
94
|
1. Intercepts `POST /v1/chat/completions` requests.
|
|
95
95
|
2. **Retrieves** relevant memories (facts, previous conversations) from a
|
|
@@ -99,8 +99,17 @@ def proxy(
|
|
|
99
99
|
5. **Extracts** new facts from the conversation in the background and
|
|
100
100
|
updates the long-term memory store (including handling contradictions).
|
|
101
101
|
|
|
102
|
-
|
|
103
|
-
|
|
102
|
+
**Example:**
|
|
103
|
+
|
|
104
|
+
# Start proxy pointing to local Ollama
|
|
105
|
+
agent-cli memory proxy --openai-base-url http://localhost:11434/v1
|
|
106
|
+
|
|
107
|
+
# Then configure your chat client to use http://localhost:8100/v1
|
|
108
|
+
# as its OpenAI base URL. All requests flow through the memory proxy.
|
|
109
|
+
|
|
110
|
+
**Per-request overrides:** Clients can include these fields in the request
|
|
111
|
+
body: `memory_id` (conversation ID), `memory_top_k`, `memory_recency_weight`,
|
|
112
|
+
`memory_score_threshold`.
|
|
104
113
|
"""
|
|
105
114
|
if print_args:
|
|
106
115
|
print_command_line_args(locals())
|
agent_cli/agents/rag_proxy.py
CHANGED
|
@@ -23,12 +23,12 @@ from agent_cli.core.utils import (
|
|
|
23
23
|
def rag_proxy(
|
|
24
24
|
docs_folder: Path = typer.Option( # noqa: B008
|
|
25
25
|
"./rag_docs",
|
|
26
|
-
help="Folder to watch for documents",
|
|
26
|
+
help="Folder to watch for documents. Files are auto-indexed on startup and when changed. Must not overlap with `--chroma-path`.",
|
|
27
27
|
rich_help_panel="RAG Configuration",
|
|
28
28
|
),
|
|
29
29
|
chroma_path: Path = typer.Option( # noqa: B008
|
|
30
30
|
"./rag_db",
|
|
31
|
-
help="
|
|
31
|
+
help="ChromaDB storage directory for vector embeddings. Must be separate from `--docs-folder` to avoid indexing database files.",
|
|
32
32
|
rich_help_panel="RAG Configuration",
|
|
33
33
|
),
|
|
34
34
|
openai_base_url: str | None = opts.OPENAI_BASE_URL,
|
|
@@ -36,13 +36,13 @@ def rag_proxy(
|
|
|
36
36
|
openai_api_key: str | None = opts.OPENAI_API_KEY,
|
|
37
37
|
limit: int = typer.Option(
|
|
38
38
|
3,
|
|
39
|
-
help="Number of document chunks to retrieve per query.",
|
|
39
|
+
help="Number of document chunks to retrieve per query. Higher values provide more context but use more tokens. Can be overridden per-request via `rag_top_k` in the JSON body.",
|
|
40
40
|
rich_help_panel="RAG Configuration",
|
|
41
41
|
),
|
|
42
42
|
host: str = opts.SERVER_HOST,
|
|
43
43
|
port: int = typer.Option(
|
|
44
44
|
8000,
|
|
45
|
-
help="Port
|
|
45
|
+
help="Port for the RAG proxy API (e.g., `http://localhost:8000/v1/chat/completions`).",
|
|
46
46
|
rich_help_panel="Server Configuration",
|
|
47
47
|
),
|
|
48
48
|
log_level: opts.LogLevel = opts.LOG_LEVEL,
|
|
@@ -51,15 +51,47 @@ def rag_proxy(
|
|
|
51
51
|
enable_rag_tools: bool = typer.Option(
|
|
52
52
|
True, # noqa: FBT003
|
|
53
53
|
"--rag-tools/--no-rag-tools",
|
|
54
|
-
help="
|
|
54
|
+
help="Enable `read_full_document()` tool so the LLM can request full document content when retrieved snippets are insufficient. Can be overridden per-request via `rag_enable_tools` in the JSON body.",
|
|
55
55
|
rich_help_panel="RAG Configuration",
|
|
56
56
|
),
|
|
57
57
|
) -> None:
|
|
58
|
-
"""Start
|
|
58
|
+
"""Start a RAG proxy server that enables "chat with your documents".
|
|
59
59
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
60
|
+
Watches a folder for documents, indexes them into a vector store, and provides an
|
|
61
|
+
OpenAI-compatible API at `/v1/chat/completions`. When you send a chat request,
|
|
62
|
+
the server retrieves relevant document chunks and injects them as context before
|
|
63
|
+
forwarding to your LLM backend.
|
|
64
|
+
|
|
65
|
+
**Quick start:**
|
|
66
|
+
|
|
67
|
+
- `agent-cli rag-proxy` — Start with defaults (./rag_docs, OpenAI-compatible API)
|
|
68
|
+
- `agent-cli rag-proxy --docs-folder ~/notes` — Index your notes folder
|
|
69
|
+
|
|
70
|
+
**How it works:**
|
|
71
|
+
|
|
72
|
+
1. Documents in `--docs-folder` are chunked, embedded, and stored in ChromaDB
|
|
73
|
+
2. A file watcher auto-reindexes when files change
|
|
74
|
+
3. Chat requests trigger a semantic search for relevant chunks
|
|
75
|
+
4. Retrieved context is injected into the prompt before forwarding to the LLM
|
|
76
|
+
5. Responses include a `rag_sources` field listing which documents were used
|
|
77
|
+
|
|
78
|
+
**Supported file formats:**
|
|
79
|
+
|
|
80
|
+
Text: `.txt`, `.md`, `.json`, `.py`, `.js`, `.ts`, `.yaml`, `.toml`, `.rst`, etc.
|
|
81
|
+
Rich documents (via MarkItDown): `.pdf`, `.docx`, `.pptx`, `.xlsx`, `.html`, `.csv`
|
|
82
|
+
|
|
83
|
+
**API endpoints:**
|
|
84
|
+
|
|
85
|
+
- `POST /v1/chat/completions` — Main chat endpoint (OpenAI-compatible)
|
|
86
|
+
- `GET /health` — Health check with configuration info
|
|
87
|
+
- `GET /files` — List indexed files with chunk counts
|
|
88
|
+
- `POST /reindex` — Trigger manual reindex
|
|
89
|
+
- All other paths are proxied to the LLM backend
|
|
90
|
+
|
|
91
|
+
**Per-request overrides (in JSON body):**
|
|
92
|
+
|
|
93
|
+
- `rag_top_k`: Override `--limit` for this request
|
|
94
|
+
- `rag_enable_tools`: Override `--rag-tools` for this request
|
|
63
95
|
"""
|
|
64
96
|
if print_args:
|
|
65
97
|
print_command_line_args(locals())
|
agent_cli/agents/speak.py
CHANGED
|
@@ -86,7 +86,7 @@ def speak(
|
|
|
86
86
|
*,
|
|
87
87
|
text: str | None = typer.Argument(
|
|
88
88
|
None,
|
|
89
|
-
help="Text to
|
|
89
|
+
help="Text to synthesize. If not provided, reads from clipboard.",
|
|
90
90
|
rich_help_panel="General Options",
|
|
91
91
|
),
|
|
92
92
|
# --- Provider Selection ---
|
|
@@ -127,7 +127,27 @@ def speak(
|
|
|
127
127
|
config_file: str | None = opts.CONFIG_FILE,
|
|
128
128
|
print_args: bool = opts.PRINT_ARGS,
|
|
129
129
|
) -> None:
|
|
130
|
-
"""Convert text to speech
|
|
130
|
+
"""Convert text to speech and play audio through speakers.
|
|
131
|
+
|
|
132
|
+
By default, synthesized audio plays immediately. Use `--save-file` to save
|
|
133
|
+
to a WAV file instead (skips playback).
|
|
134
|
+
|
|
135
|
+
Text can be provided as an argument or read from clipboard automatically.
|
|
136
|
+
|
|
137
|
+
**Examples:**
|
|
138
|
+
|
|
139
|
+
Speak text directly:
|
|
140
|
+
`agent-cli speak "Hello, world!"`
|
|
141
|
+
|
|
142
|
+
Speak clipboard contents:
|
|
143
|
+
`agent-cli speak`
|
|
144
|
+
|
|
145
|
+
Save to file instead of playing:
|
|
146
|
+
`agent-cli speak "Hello" --save-file greeting.wav`
|
|
147
|
+
|
|
148
|
+
Use OpenAI-compatible TTS:
|
|
149
|
+
`agent-cli speak "Hello" --tts-provider openai`
|
|
150
|
+
"""
|
|
131
151
|
if print_args:
|
|
132
152
|
print_command_line_args(locals())
|
|
133
153
|
|
agent_cli/agents/transcribe.py
CHANGED
|
@@ -471,7 +471,7 @@ def transcribe( # noqa: PLR0912
|
|
|
471
471
|
extra_instructions: str | None = typer.Option(
|
|
472
472
|
None,
|
|
473
473
|
"--extra-instructions",
|
|
474
|
-
help="
|
|
474
|
+
help="Extra instructions appended to the LLM cleanup prompt (requires `--llm`).",
|
|
475
475
|
rich_help_panel="LLM Configuration",
|
|
476
476
|
),
|
|
477
477
|
from_file: Path | None = opts.FROM_FILE,
|
|
@@ -513,7 +513,25 @@ def transcribe( # noqa: PLR0912
|
|
|
513
513
|
print_args: bool = opts.PRINT_ARGS,
|
|
514
514
|
transcription_log: Path | None = opts.TRANSCRIPTION_LOG,
|
|
515
515
|
) -> None:
|
|
516
|
-
"""
|
|
516
|
+
"""Record audio from microphone and transcribe to text.
|
|
517
|
+
|
|
518
|
+
Records until you press Ctrl+C (or send SIGINT), then transcribes using your
|
|
519
|
+
configured ASR provider. The transcript is copied to the clipboard by default.
|
|
520
|
+
|
|
521
|
+
**With `--llm`**: Passes the raw transcript through an LLM to clean up speech
|
|
522
|
+
recognition errors, add punctuation, remove filler words, and improve readability.
|
|
523
|
+
|
|
524
|
+
**With `--toggle`**: Bind to a hotkey for push-to-talk. First call starts recording,
|
|
525
|
+
second call stops and transcribes.
|
|
526
|
+
|
|
527
|
+
**Examples**:
|
|
528
|
+
|
|
529
|
+
- Record and transcribe: `agent-cli transcribe`
|
|
530
|
+
|
|
531
|
+
- With LLM cleanup: `agent-cli transcribe --llm`
|
|
532
|
+
|
|
533
|
+
- Re-transcribe last recording: `agent-cli transcribe --last-recording 1`
|
|
534
|
+
"""
|
|
517
535
|
if print_args:
|
|
518
536
|
print_command_line_args(locals())
|
|
519
537
|
|
agent_cli/agents/transcribe_daemon.py
CHANGED
|
@@ -296,45 +296,45 @@ def transcribe_daemon( # noqa: PLR0912
|
|
|
296
296
|
"user",
|
|
297
297
|
"--role",
|
|
298
298
|
"-r",
|
|
299
|
-
help="
|
|
299
|
+
help="Label for log entries. Use to distinguish speakers or contexts in logs.",
|
|
300
300
|
),
|
|
301
301
|
silence_threshold: float = typer.Option(
|
|
302
302
|
1.0,
|
|
303
303
|
"--silence-threshold",
|
|
304
304
|
"-s",
|
|
305
|
-
help="Seconds of silence to
|
|
305
|
+
help="Seconds of silence after speech to finalize a segment. Increase for slower speakers.",
|
|
306
306
|
),
|
|
307
307
|
min_segment: float = typer.Option(
|
|
308
308
|
0.25,
|
|
309
309
|
"--min-segment",
|
|
310
310
|
"-m",
|
|
311
|
-
help="Minimum
|
|
311
|
+
help="Minimum seconds of speech required before a segment is processed. Filters brief sounds.",
|
|
312
312
|
),
|
|
313
313
|
vad_threshold: float = typer.Option(
|
|
314
314
|
0.3,
|
|
315
315
|
"--vad-threshold",
|
|
316
|
-
help="VAD
|
|
316
|
+
help="Silero VAD confidence threshold (0.0-1.0). Higher values require clearer speech; lower values are more sensitive to quiet/distant voices.",
|
|
317
317
|
),
|
|
318
318
|
save_audio: bool = typer.Option(
|
|
319
319
|
True, # noqa: FBT003
|
|
320
320
|
"--save-audio/--no-save-audio",
|
|
321
|
-
help="Save
|
|
321
|
+
help="Save each speech segment as MP3. Requires `ffmpeg` to be installed.",
|
|
322
322
|
),
|
|
323
323
|
audio_dir: Path | None = typer.Option( # noqa: B008
|
|
324
324
|
None,
|
|
325
325
|
"--audio-dir",
|
|
326
|
-
help="
|
|
326
|
+
help="Base directory for MP3 files. Files are organized by date: `YYYY/MM/DD/HHMMSS_mmm.mp3`. Default: `~/.config/agent-cli/audio`.",
|
|
327
327
|
),
|
|
328
328
|
transcription_log: Path | None = typer.Option( # noqa: B008
|
|
329
329
|
None,
|
|
330
330
|
"--transcription-log",
|
|
331
331
|
"-t",
|
|
332
|
-
help="JSON
|
|
332
|
+
help="JSONL file for transcript logging (one JSON object per line with timestamp, role, raw/processed text, audio path). Default: `~/.config/agent-cli/transcriptions.jsonl`.",
|
|
333
333
|
),
|
|
334
334
|
clipboard: bool = typer.Option(
|
|
335
335
|
False, # noqa: FBT003
|
|
336
336
|
"--clipboard/--no-clipboard",
|
|
337
|
-
help="Copy each transcription to clipboard.",
|
|
337
|
+
help="Copy each completed transcription to clipboard (overwrites previous). Useful with `--llm` to get cleaned text.",
|
|
338
338
|
),
|
|
339
339
|
# --- Provider Selection ---
|
|
340
340
|
asr_provider: str = opts.ASR_PROVIDER,
|
|
@@ -368,25 +368,37 @@ def transcribe_daemon( # noqa: PLR0912
|
|
|
368
368
|
config_file: str | None = opts.CONFIG_FILE,
|
|
369
369
|
print_args: bool = opts.PRINT_ARGS,
|
|
370
370
|
) -> None:
|
|
371
|
-
"""
|
|
371
|
+
"""Continuous transcription daemon using Silero VAD for speech detection.
|
|
372
372
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
373
|
+
Unlike `transcribe` (single recording session), this daemon runs indefinitely
|
|
374
|
+
and automatically detects speech segments using Voice Activity Detection (VAD).
|
|
375
|
+
Each detected segment is transcribed and logged with timestamps.
|
|
376
376
|
|
|
377
|
-
|
|
378
|
-
# Basic daemon
|
|
379
|
-
agent-cli transcribe-daemon
|
|
377
|
+
**How it works:**
|
|
380
378
|
|
|
381
|
-
|
|
382
|
-
|
|
379
|
+
1. Listens continuously to microphone input
|
|
380
|
+
2. Silero VAD detects when you start/stop speaking
|
|
381
|
+
3. After `--silence-threshold` seconds of silence, the segment is finalized
|
|
382
|
+
4. Segment is transcribed (and optionally cleaned by LLM with `--llm`)
|
|
383
|
+
5. Results are appended to the JSONL log file
|
|
384
|
+
6. Audio is saved as MP3 if `--save-audio` is enabled (requires `ffmpeg`)
|
|
385
|
+
|
|
386
|
+
**Use cases:** Meeting transcription, note-taking, voice journaling, accessibility.
|
|
383
387
|
|
|
384
|
-
|
|
385
|
-
|
|
388
|
+
**Examples:**
|
|
389
|
+
|
|
390
|
+
agent-cli transcribe-daemon
|
|
391
|
+
agent-cli transcribe-daemon --role meeting --silence-threshold 1.5
|
|
392
|
+
agent-cli transcribe-daemon --llm --clipboard --role notes
|
|
393
|
+
agent-cli transcribe-daemon --transcription-log ~/meeting.jsonl --no-save-audio
|
|
394
|
+
agent-cli transcribe-daemon --asr-provider openai --llm-provider gemini --llm
|
|
386
395
|
|
|
387
|
-
|
|
388
|
-
agent-cli transcribe-daemon --transcription-log ~/meeting.jsonl --audio-dir ~/audio
|
|
396
|
+
**Tips:**
|
|
389
397
|
|
|
398
|
+
- Use `--role` to tag entries (e.g., `speaker1`, `meeting`, `personal`)
|
|
399
|
+
- Adjust `--vad-threshold` if detection is too sensitive (increase) or missing speech (decrease)
|
|
400
|
+
- Use `--stop` to cleanly terminate a running daemon
|
|
401
|
+
- With `--llm`, transcripts are cleaned up (punctuation, filler words removed)
|
|
390
402
|
"""
|
|
391
403
|
if print_args:
|
|
392
404
|
print_command_line_args(locals())
|
agent_cli/agents/voice_edit.py
CHANGED
|
@@ -229,15 +229,23 @@ def voice_edit(
|
|
|
229
229
|
config_file: str | None = opts.CONFIG_FILE,
|
|
230
230
|
print_args: bool = opts.PRINT_ARGS,
|
|
231
231
|
) -> None:
|
|
232
|
-
"""
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
232
|
+
"""Edit or query clipboard text using voice commands.
|
|
233
|
+
|
|
234
|
+
**Workflow:** Captures clipboard text → records your voice command → transcribes
|
|
235
|
+
it → sends both to an LLM → copies result back to clipboard.
|
|
236
|
+
|
|
237
|
+
Use this for hands-free text editing (e.g., "make this more formal") or
|
|
238
|
+
asking questions about clipboard content (e.g., "summarize this").
|
|
239
|
+
|
|
240
|
+
**Typical hotkey integration:** Run `voice-edit &` on keypress to start
|
|
241
|
+
recording, then send SIGINT (via `--stop`) on second keypress to process.
|
|
242
|
+
|
|
243
|
+
**Examples:**
|
|
244
|
+
|
|
245
|
+
- Basic usage: `agent-cli voice-edit`
|
|
246
|
+
- With TTS response: `agent-cli voice-edit --tts`
|
|
247
|
+
- Toggle on/off: `agent-cli voice-edit --toggle`
|
|
248
|
+
- List audio devices: `agent-cli voice-edit --list-devices`
|
|
241
249
|
"""
|
|
242
250
|
if print_args:
|
|
243
251
|
print_command_line_args(locals())
|