agent_cli-0.70.2-py3-none-any.whl → agent_cli-0.72.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/_extras.json +4 -3
- agent_cli/_requirements/memory.txt +14 -1
- agent_cli/_requirements/rag.txt +14 -1
- agent_cli/_requirements/vad.txt +1 -85
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/agents/assistant.py +24 -28
- agent_cli/agents/autocorrect.py +30 -4
- agent_cli/agents/chat.py +45 -15
- agent_cli/agents/memory/__init__.py +19 -1
- agent_cli/agents/memory/add.py +3 -3
- agent_cli/agents/memory/proxy.py +20 -11
- agent_cli/agents/rag_proxy.py +42 -10
- agent_cli/agents/speak.py +23 -3
- agent_cli/agents/transcribe.py +21 -3
- agent_cli/agents/transcribe_daemon.py +34 -22
- agent_cli/agents/voice_edit.py +18 -10
- agent_cli/cli.py +25 -2
- agent_cli/config_cmd.py +30 -11
- agent_cli/core/deps.py +6 -3
- agent_cli/core/transcription_logger.py +1 -1
- agent_cli/core/vad.py +6 -24
- agent_cli/dev/cli.py +295 -65
- agent_cli/docs_gen.py +18 -8
- agent_cli/install/extras.py +44 -13
- agent_cli/install/hotkeys.py +22 -11
- agent_cli/install/services.py +54 -14
- agent_cli/opts.py +43 -22
- agent_cli/server/cli.py +128 -62
- agent_cli/server/proxy/api.py +77 -19
- agent_cli/services/__init__.py +46 -5
- {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/METADATA +627 -246
- {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/RECORD +35 -34
- {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/WHEEL +0 -0
- {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/entry_points.txt +0 -0
- {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/licenses/LICENSE +0 -0
agent_cli/server/cli.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Annotated

 import typer

+from agent_cli import opts
 from agent_cli.cli import app as main_app
 from agent_cli.core.deps import requires_extras
 from agent_cli.core.process import set_process_title
@@ -28,7 +29,30 @@ def _has(package: str) -> bool:

 app = typer.Typer(
     name="server",
-    help="Run ASR/TTS servers
+    help="""Run local ASR/TTS servers with OpenAI-compatible APIs.
+
+    **Available servers:**
+
+    - `whisper` - Local speech-to-text using Whisper models (faster-whisper or MLX)
+    - `tts` - Local text-to-speech using Piper (CPU) or Kokoro (GPU)
+    - `transcribe-proxy` - Proxy to external ASR providers (OpenAI, Gemini, Wyoming)
+
+    **Common workflows:**
+
+    ```bash
+    # Run local Whisper server (lazy loads large-v3 by default)
+    agent-cli server whisper
+
+    # Run local TTS with Kokoro backend (GPU-accelerated)
+    agent-cli server tts --backend kokoro
+
+    # Run transcription proxy using your configured ASR provider
+    agent-cli server transcribe-proxy
+    ```
+
+    All servers support Home Assistant via Wyoming protocol and can be used as
+    drop-in replacements for OpenAI's audio APIs.
+    """,
     add_completion=True,
     rich_markup_mode="markdown",
     no_args_is_help=True,
@@ -169,14 +193,18 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--model",
             "-m",
-            help=
+            help=(
+                "Whisper model(s) to load. Common models: `tiny`, `base`, `small`, "
+                "`medium`, `large-v3`, `distil-large-v3`. Can specify multiple for "
+                "different accuracy/speed tradeoffs. Default: `large-v3`"
+            ),
         ),
     ] = None,
     default_model: Annotated[
         str | None,
         typer.Option(
             "--default-model",
-            help="
+            help=("Model to use when client doesn't specify one. Must be in the `--model` list"),
         ),
     ] = None,
     device: Annotated[
@@ -184,42 +212,54 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--device",
             "-d",
-            help=
+            help=(
+                "Compute device: `auto` (detect GPU), `cuda`, `cuda:0`, `cpu`. "
+                "MLX backend always uses Apple Silicon"
+            ),
         ),
     ] = "auto",
     compute_type: Annotated[
         str,
         typer.Option(
             "--compute-type",
-            help=
+            help=(
+                "Precision for faster-whisper: `auto`, `float16`, `int8`, `int8_float16`. "
+                "Lower precision = faster + less VRAM"
+            ),
         ),
     ] = "auto",
     cache_dir: Annotated[
         Path | None,
         typer.Option(
             "--cache-dir",
-            help="
+            help="Custom directory for downloaded models (default: HuggingFace cache)",
         ),
     ] = None,
     ttl: Annotated[
         int,
         typer.Option(
             "--ttl",
-            help=
+            help=(
+                "Seconds of inactivity before unloading model from memory. "
+                "Set to 0 to keep loaded indefinitely"
+            ),
         ),
     ] = 300,
     preload: Annotated[
         bool,
         typer.Option(
             "--preload",
-            help=
+            help=(
+                "Load model(s) immediately at startup instead of on first request. "
+                "Useful for reducing first-request latency"
+            ),
         ),
     ] = False,
     host: Annotated[
         str,
         typer.Option(
             "--host",
-            help="
+            help="Network interface to bind. Use `0.0.0.0` for all interfaces",
         ),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
@@ -227,44 +267,40 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--port",
             "-p",
-            help="HTTP API
+            help="Port for OpenAI-compatible HTTP API (`/v1/audio/transcriptions`)",
         ),
     ] = 10301,
     wyoming_port: Annotated[
         int,
         typer.Option(
             "--wyoming-port",
-            help="Wyoming protocol
+            help="Port for Wyoming protocol (Home Assistant integration)",
         ),
     ] = 10300,
     no_wyoming: Annotated[
         bool,
         typer.Option(
             "--no-wyoming",
-            help="Disable Wyoming server",
+            help="Disable Wyoming protocol server (only run HTTP API)",
         ),
     ] = False,
     download_only: Annotated[
         bool,
         typer.Option(
             "--download-only",
-            help="Download model(s) and exit
+            help="Download model(s) to cache and exit. Useful for Docker builds",
         ),
     ] = False,
-    log_level: Annotated[
-        str,
-        typer.Option(
-            "--log-level",
-            "-l",
-            help="Logging level: debug, info, warning, error",
-        ),
-    ] = "info",
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
     backend: Annotated[
         str,
         typer.Option(
             "--backend",
             "-b",
-            help=
+            help=(
+                "Inference backend: `auto` (faster-whisper on CUDA/CPU, MLX on Apple Silicon), "
+                "`faster-whisper`, `mlx`"
+            ),
         ),
     ] = "auto",
 ) -> None:
@@ -278,7 +314,8 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
     Models are loaded lazily on first request and unloaded after being
     idle for the TTL duration, freeing VRAM for other applications.

-    Examples
+    **Examples:**
+
     # Run with default large-v3 model
     agent-cli server whisper

@@ -290,7 +327,6 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915

     # Download model without starting server
     agent-cli server whisper --model large-v3 --download-only
-
     """
     # Setup Rich logging for consistent output
     setup_rich_logging(log_level)
@@ -378,6 +414,7 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
     console.print()
     console.print("[dim]Configuration:[/dim]")
     console.print(f"  Backend: [cyan]{actual_backend}[/cyan]")
+    console.print(f"  Log level: [cyan]{log_level}[/cyan]")
     console.print()
     console.print("[dim]Endpoints:[/dim]")
     console.print(f"  HTTP API: [cyan]http://{host}:{port}[/cyan]")
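For reference, the whisper server's HTTP port (10301 by default) speaks the OpenAI transcription API. A minimal client sketch, assuming the standard OpenAI multipart fields (`file`, `model`); the exact fields agent-cli accepts are not shown in this diff:

```python
# Minimal client sketch for the local Whisper server (default --port 10301).
# Assumes the standard OpenAI multipart fields ("file", "model"); the exact
# fields agent-cli accepts are not shown in this diff.
import requests

with open("recording.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:10301/v1/audio/transcriptions",
        files={"file": ("recording.wav", f, "audio/wav")},
        data={"model": "large-v3"},  # should match a loaded --model value
    )
resp.raise_for_status()
print(resp.json()["text"])  # OpenAI-style responses carry the transcript in "text"
```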
@@ -422,45 +459,64 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915


 @app.command("transcribe-proxy")
-@requires_extras("server", "
+@requires_extras("server", "wyoming", "llm")
 def transcribe_proxy_cmd(
     host: Annotated[
         str,
-        typer.Option("--host", help="
+        typer.Option("--host", help="Network interface to bind. Use `0.0.0.0` for all interfaces"),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
         int,
-        typer.Option("--port", "-p", help="Port
+        typer.Option("--port", "-p", help="Port for the HTTP API"),
     ] = 61337,
     reload: Annotated[
         bool,
-        typer.Option("--reload", help="
+        typer.Option("--reload", help="Auto-reload on code changes (development only)"),
     ] = False,
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
 ) -> None:
-    """Run transcription proxy
+    r"""Run transcription proxy that forwards to your configured ASR provider.

-
-
+    Unlike `server whisper` which runs a local Whisper model, this proxy
+    forwards audio to external ASR providers configured in your agent-cli
+    config file or environment variables.

-
-
-    - /health endpoint for health checks
+    **Supported ASR providers:** `wyoming`, `openai`, `gemini`
+    **Supported LLM providers for cleanup:** `ollama`, `openai`, `gemini`

-
+    The server exposes:

-
-
+    - `POST /transcribe` - Accepts audio files, returns `{raw_transcript, cleaned_transcript}`
+    - `GET /health` - Health check endpoint
+
+    **When to use this vs `server whisper`:**
+
+    - Use `transcribe-proxy` when you want to use cloud ASR (OpenAI/Gemini)
+      or connect to a remote Wyoming server
+    - Use `server whisper` when you want to run a local Whisper model
+
+    Configuration is read from `~/.config/agent-cli/config.yaml` or env vars
+    like `ASR_PROVIDER`, `LLM_PROVIDER`, `OPENAI_API_KEY`, etc.
+
+    **Examples:**
+
+    # Run with providers from config file
     agent-cli server transcribe-proxy

-    # Run
-    agent-cli server transcribe-proxy
+    # Run with OpenAI ASR via env vars
+    ASR_PROVIDER=openai OPENAI_API_KEY=sk-... agent-cli server transcribe-proxy

+    # Test with curl
+    curl -X POST http://localhost:61337/transcribe \\
+        -F "audio=@recording.wav" -F "cleanup=true"
     """
     _check_server_deps()
+    setup_rich_logging(log_level)

     console.print(
         f"[bold green]Starting Agent CLI transcription proxy on {host}:{port}[/bold green]",
     )
+    console.print(f"[dim]Log level: {log_level}[/dim]")
     if reload:
         console.print("[yellow]Auto-reload enabled for development[/yellow]")

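The curl example in the new docstring pins down the proxy's form fields (`audio`, `cleanup`) and the response shape. The same request from Python, as a sketch:

```python
# Python equivalent of the docstring's curl example. Field names ("audio",
# "cleanup") and the response keys come from the docstring above.
import requests

with open("recording.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:61337/transcribe",  # default --port 61337
        files={"audio": ("recording.wav", f, "audio/wav")},
        data={"cleanup": "true"},  # ask the configured LLM to clean the transcript
    )
resp.raise_for_status()
body = resp.json()
print(body["raw_transcript"])
print(body["cleaned_transcript"])
```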
@@ -471,7 +527,7 @@ def transcribe_proxy_cmd(
         host=host,
         port=port,
         reload=reload,
-        log_level=
+        log_level=log_level.lower(),
     )

@@ -483,14 +539,18 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--model",
             "-m",
-            help=
+            help=(
+                "Model/voice(s) to load. Piper: `en_US-lessac-medium`, `en_GB-alan-medium`. "
+                "Kokoro: `af_heart`, `af_bella`, `am_adam`. "
+                "Auto-downloads on first use"
+            ),
         ),
     ] = None,
     default_model: Annotated[
         str | None,
         typer.Option(
             "--default-model",
-            help="
+            help=("Voice to use when client doesn't specify one. Must be in the `--model` list"),
         ),
     ] = None,
     device: Annotated[
@@ -498,35 +558,44 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--device",
             "-d",
-            help=
+            help=(
+                "Compute device: `auto`, `cpu`, `cuda`, `mps`. "
+                "Piper is CPU-only; Kokoro supports GPU acceleration"
+            ),
         ),
     ] = "auto",
     cache_dir: Annotated[
         Path | None,
         typer.Option(
             "--cache-dir",
-            help="
+            help="Custom directory for downloaded models (default: ~/.cache/agent-cli/tts/)",
         ),
     ] = None,
     ttl: Annotated[
         int,
         typer.Option(
             "--ttl",
-            help=
+            help=(
+                "Seconds of inactivity before unloading model from memory. "
+                "Set to 0 to keep loaded indefinitely"
+            ),
         ),
     ] = 300,
     preload: Annotated[
         bool,
         typer.Option(
             "--preload",
-            help=
+            help=(
+                "Load model(s) immediately at startup instead of on first request. "
+                "Useful for reducing first-request latency"
+            ),
         ),
     ] = False,
     host: Annotated[
         str,
         typer.Option(
             "--host",
-            help="
+            help="Network interface to bind. Use `0.0.0.0` for all interfaces",
         ),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
@@ -534,44 +603,40 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--port",
             "-p",
-            help="HTTP API
+            help="Port for OpenAI-compatible HTTP API (`/v1/audio/speech`)",
         ),
     ] = 10201,
     wyoming_port: Annotated[
         int,
         typer.Option(
             "--wyoming-port",
-            help="Wyoming protocol
+            help="Port for Wyoming protocol (Home Assistant integration)",
         ),
     ] = 10200,
     no_wyoming: Annotated[
         bool,
         typer.Option(
             "--no-wyoming",
-            help="Disable Wyoming server",
+            help="Disable Wyoming protocol server (only run HTTP API)",
         ),
     ] = False,
     download_only: Annotated[
         bool,
         typer.Option(
             "--download-only",
-            help="Download model(s) and exit
+            help="Download model(s)/voice(s) to cache and exit. Useful for Docker builds",
         ),
     ] = False,
-    log_level: Annotated[
-        str,
-        typer.Option(
-            "--log-level",
-            "-l",
-            help="Logging level: debug, info, warning, error",
-        ),
-    ] = "info",
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
     backend: Annotated[
         str,
         typer.Option(
             "--backend",
             "-b",
-            help=
+            help=(
+                "TTS engine: `auto` (prefer Kokoro if available), "
+                "`piper` (CPU, many languages), `kokoro` (GPU, high quality)"
+            ),
         ),
     ] = "auto",
 ) -> None:
@@ -594,7 +659,8 @@ def tts_cmd(  # noqa: PLR0915
     Voices: af_heart, af_bella, am_adam, bf_emma, bm_george, etc.
     See https://huggingface.co/hexgrad/Kokoro-82M for all voices.

-    Examples
+    **Examples:**
+
     # Run with Kokoro (auto-downloads model and voices)
     agent-cli server tts --backend kokoro

@@ -609,7 +675,6 @@ def tts_cmd(  # noqa: PLR0915

     # Download Piper model without starting server
     agent-cli server tts --backend piper --model en_US-lessac-medium --download-only
-
     """
     # Setup Rich logging for consistent output
     setup_rich_logging(log_level)
@@ -679,6 +744,7 @@ def tts_cmd(  # noqa: PLR0915
     console.print()
     console.print("[dim]Configuration:[/dim]")
     console.print(f"  Backend: [cyan]{resolved_backend}[/cyan]")
+    console.print(f"  Log level: [cyan]{log_level}[/cyan]")
     console.print()
     console.print("[dim]Endpoints:[/dim]")
     console.print(f"  HTTP API: [cyan]http://{host}:{port}[/cyan]")
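The TTS server's HTTP port (10201 by default) mirrors OpenAI's `/v1/audio/speech` endpoint. A hedged client sketch, assuming the standard OpenAI JSON body and a raw-audio response; the exact schema agent-cli accepts is not part of this diff:

```python
# Sketch of a speech request against the local TTS server (default --port 10201).
# Assumes the standard OpenAI /v1/audio/speech JSON body; whether the loaded
# voice belongs in "model" or "voice" is not shown in the diff, so both are set.
from pathlib import Path

import requests

resp = requests.post(
    "http://localhost:10201/v1/audio/speech",
    json={
        "model": "af_heart",  # a loaded Kokoro voice, per the --model help text
        "voice": "af_heart",
        "input": "Hello from the local TTS server.",
    },
)
resp.raise_for_status()
Path("speech.wav").write_bytes(resp.content)  # response body is the rendered audio
```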
agent_cli/server/proxy/api.py
CHANGED
@@ -3,8 +3,9 @@
 from __future__ import annotations

 import logging
+import os
 from pathlib import Path
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any

 from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile
 from pydantic import BaseModel
@@ -26,6 +27,9 @@ from agent_cli.server.common import log_requests_middleware
 from agent_cli.services import asr
 from agent_cli.services.llm import process_and_update_clipboard

+if TYPE_CHECKING:
+    from typer.models import OptionInfo
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 LOGGER = logging.getLogger(__name__)
@@ -37,6 +41,40 @@ app = FastAPI(
 )


+@app.on_event("startup")
+async def log_effective_config() -> None:
+    """Log effective configuration on startup to help debug env var issues."""
+    (
+        provider_cfg,
+        wyoming_cfg,
+        openai_asr_cfg,
+        gemini_asr_cfg,
+        ollama_cfg,
+        openai_llm_cfg,
+        gemini_llm_cfg,
+        _,
+    ) = _load_transcription_configs()
+
+    LOGGER.info("ASR provider: %s", provider_cfg.asr_provider)
+    if provider_cfg.asr_provider == "wyoming":
+        LOGGER.info("  Wyoming: %s:%d", wyoming_cfg.asr_wyoming_ip, wyoming_cfg.asr_wyoming_port)
+    elif provider_cfg.asr_provider == "openai":
+        LOGGER.info("  Model: %s", openai_asr_cfg.asr_openai_model)
+        LOGGER.info("  Base URL: %s", openai_asr_cfg.openai_base_url or "https://api.openai.com/v1")
+    elif provider_cfg.asr_provider == "gemini":
+        LOGGER.info("  Model: %s", gemini_asr_cfg.asr_gemini_model)
+
+    LOGGER.info("LLM provider: %s", provider_cfg.llm_provider)
+    if provider_cfg.llm_provider == "ollama":
+        LOGGER.info("  Model: %s", ollama_cfg.llm_ollama_model)
+        LOGGER.info("  Host: %s", ollama_cfg.llm_ollama_host)
+    elif provider_cfg.llm_provider == "openai":
+        LOGGER.info("  Model: %s", openai_llm_cfg.llm_openai_model)
+        LOGGER.info("  Base URL: %s", openai_llm_cfg.openai_base_url or "https://api.openai.com/v1")
+    elif provider_cfg.llm_provider == "gemini":
+        LOGGER.info("  Model: %s", gemini_llm_cfg.llm_gemini_model)
+
+
 @app.middleware("http")
 async def log_requests(request: Request, call_next) -> Any:  # type: ignore[no-untyped-def]  # noqa: ANN001
     """Log basic request information."""
@@ -83,6 +121,7 @@ async def health_check() -> HealthResponse:

 async def _transcribe_with_provider(
     audio_data: bytes,
+    filename: str,
     provider_cfg: config.ProviderSelection,
     wyoming_asr_cfg: config.WyomingASR,
     openai_asr_cfg: config.OpenAIASR,
@@ -90,6 +129,7 @@ async def _transcribe_with_provider(
 ) -> str:
     """Transcribe audio using the configured provider."""
     transcriber = asr.create_recorded_audio_transcriber(provider_cfg)
+    file_suffix = Path(filename).suffix.lower() or ".wav"

     if provider_cfg.asr_provider == "wyoming":
         return await transcriber(
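The new `file_suffix` line decides which container format the downstream provider is told about. Its behavior in isolation:

```python
# Behavior of the new suffix derivation: extension taken from the uploaded
# filename, lowercased, falling back to ".wav" when there is none.
from pathlib import Path

for name in ("clip.MP3", "recording.wav", "raw-audio"):
    print(name, "->", Path(name).suffix.lower() or ".wav")
# clip.MP3 -> .mp3
# recording.wav -> .wav
# raw-audio -> .wav
```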
@@ -102,12 +142,14 @@ async def _transcribe_with_provider(
             audio_data=audio_data,
             openai_asr_cfg=openai_asr_cfg,
             logger=LOGGER,
+            file_suffix=file_suffix,
         )
     if provider_cfg.asr_provider == "gemini":
         return await transcriber(
             audio_data=audio_data,
             gemini_asr_cfg=gemini_asr_cfg,
             logger=LOGGER,
+            file_suffix=file_suffix,
         )
     msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
     raise NotImplementedError(msg)
@@ -153,6 +195,13 @@ def _validate_audio_file(audio: UploadFile) -> None:
     )


+def _cfg(key: str, defaults: dict[str, Any], opt: OptionInfo) -> Any:
+    """Get config with priority: env var > config file > option default."""
+    if opt.envvar and (env_val := os.environ.get(opt.envvar)):
+        return int(env_val) if isinstance(opt.default, int) else env_val
+    return defaults.get(key, opt.default)
+
+
 def _load_transcription_configs() -> tuple[
     config.ProviderSelection,
     config.WyomingASR,
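The new `_cfg` helper gives every setting the same resolution order: environment variable, then config-file value, then the Typer option default. A standalone sketch of that rule, where `DemoOption` is a stand-in for `typer.models.OptionInfo`:

```python
# Standalone illustration of the precedence implemented by _cfg:
# environment variable > config-file value > option default.
import os
from dataclasses import dataclass
from typing import Any


@dataclass
class DemoOption:
    default: Any
    envvar: str | None = None


def resolve(key: str, file_values: dict[str, Any], opt: DemoOption) -> Any:
    if opt.envvar and (env_val := os.environ.get(opt.envvar)):
        # Coerce to int when the declared default is an int (ports, TTLs).
        return int(env_val) if isinstance(opt.default, int) else env_val
    return file_values.get(key, opt.default)


port_opt = DemoOption(default=10300, envvar="ASR_WYOMING_PORT")
os.environ["ASR_WYOMING_PORT"] = "12345"
assert resolve("asr_wyoming_port", {"asr_wyoming_port": 10400}, port_opt) == 12345
del os.environ["ASR_WYOMING_PORT"]
assert resolve("asr_wyoming_port", {"asr_wyoming_port": 10400}, port_opt) == 10400
assert resolve("asr_wyoming_port", {}, port_opt) == 10300
```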
@@ -163,41 +212,43 @@ def _load_transcription_configs() -> tuple[
     config.GeminiLLM,
     dict[str, Any],
 ]:
-    """Load
+    """Load config objects. Priority: env var > config file > default."""
     loaded_config = config.load_config()
     wildcard_config = loaded_config.get("defaults", {})
     command_config = loaded_config.get("transcribe", {})
     defaults = {**wildcard_config, **command_config}

     provider_cfg = config.ProviderSelection(
-        asr_provider=
-        llm_provider=
-        tts_provider=opts.TTS_PROVIDER
+        asr_provider=_cfg("asr_provider", defaults, opts.ASR_PROVIDER),
+        llm_provider=_cfg("llm_provider", defaults, opts.LLM_PROVIDER),
+        tts_provider=_cfg("tts_provider", defaults, opts.TTS_PROVIDER),
     )
     wyoming_asr_cfg = config.WyomingASR(
-        asr_wyoming_ip=
-        asr_wyoming_port=
+        asr_wyoming_ip=_cfg("asr_wyoming_ip", defaults, opts.ASR_WYOMING_IP),
+        asr_wyoming_port=_cfg("asr_wyoming_port", defaults, opts.ASR_WYOMING_PORT),
     )
     openai_asr_cfg = config.OpenAIASR(
-        asr_openai_model=
-        openai_api_key=
+        asr_openai_model=_cfg("asr_openai_model", defaults, opts.ASR_OPENAI_MODEL),
+        openai_api_key=_cfg("openai_api_key", defaults, opts.OPENAI_API_KEY),
+        openai_base_url=_cfg("asr_openai_base_url", defaults, opts.ASR_OPENAI_BASE_URL),
+        asr_openai_prompt=_cfg("asr_openai_prompt", defaults, opts.ASR_OPENAI_PROMPT),
     )
     gemini_asr_cfg = config.GeminiASR(
-        asr_gemini_model=
-        gemini_api_key=
+        asr_gemini_model=_cfg("asr_gemini_model", defaults, opts.ASR_GEMINI_MODEL),
+        gemini_api_key=_cfg("gemini_api_key", defaults, opts.GEMINI_API_KEY),
     )
     ollama_cfg = config.Ollama(
-        llm_ollama_model=
-        llm_ollama_host=
+        llm_ollama_model=_cfg("llm_ollama_model", defaults, opts.LLM_OLLAMA_MODEL),
+        llm_ollama_host=_cfg("llm_ollama_host", defaults, opts.LLM_OLLAMA_HOST),
     )
     openai_llm_cfg = config.OpenAILLM(
-        llm_openai_model=
-        openai_api_key=
-        openai_base_url=
+        llm_openai_model=_cfg("llm_openai_model", defaults, opts.LLM_OPENAI_MODEL),
+        openai_api_key=_cfg("openai_api_key", defaults, opts.OPENAI_API_KEY),
+        openai_base_url=_cfg("openai_base_url", defaults, opts.OPENAI_BASE_URL),
     )
     gemini_llm_cfg = config.GeminiLLM(
-        llm_gemini_model=
-        gemini_api_key=
+        llm_gemini_model=_cfg("llm_gemini_model", defaults, opts.LLM_GEMINI_MODEL),
+        gemini_api_key=_cfg("gemini_api_key", defaults, opts.GEMINI_API_KEY),
     )

     return (
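Note the merge order above: command-specific `transcribe` values override the shared `defaults` section, key by key. With illustrative values:

```python
# Key-by-key merge of the two config sections; the later dict wins.
wildcard_config = {"asr_provider": "wyoming", "llm_provider": "ollama"}  # "defaults" section
command_config = {"asr_provider": "openai"}                              # "transcribe" section
defaults = {**wildcard_config, **command_config}
assert defaults == {"asr_provider": "openai", "llm_provider": "ollama"}
```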
@@ -309,8 +360,14 @@ async def transcribe_audio(
         defaults,
     ) = _load_transcription_configs()

-    #
+    # Read uploaded file
     audio_data = await audio_file.read()
+    LOGGER.info(
+        "Received audio: filename=%s, size=%d bytes, content_type=%s",
+        audio_file.filename,
+        len(audio_data),
+        audio_file.content_type,
+    )

     # Convert audio to Wyoming format if using local ASR
     if provider_cfg.asr_provider == "wyoming":
@@ -319,6 +376,7 @@ async def transcribe_audio(
     # Transcribe audio using the configured provider
     raw_transcript = await _transcribe_with_provider(
         audio_data,
+        audio_file.filename or "audio.wav",
         provider_cfg,
         wyoming_asr_cfg,
         openai_asr_cfg,