agent-cli 0.70.2__py3-none-any.whl → 0.72.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. agent_cli/_extras.json +4 -3
  2. agent_cli/_requirements/memory.txt +14 -1
  3. agent_cli/_requirements/rag.txt +14 -1
  4. agent_cli/_requirements/vad.txt +1 -85
  5. agent_cli/_requirements/wyoming.txt +71 -0
  6. agent_cli/agents/assistant.py +24 -28
  7. agent_cli/agents/autocorrect.py +30 -4
  8. agent_cli/agents/chat.py +45 -15
  9. agent_cli/agents/memory/__init__.py +19 -1
  10. agent_cli/agents/memory/add.py +3 -3
  11. agent_cli/agents/memory/proxy.py +20 -11
  12. agent_cli/agents/rag_proxy.py +42 -10
  13. agent_cli/agents/speak.py +23 -3
  14. agent_cli/agents/transcribe.py +21 -3
  15. agent_cli/agents/transcribe_daemon.py +34 -22
  16. agent_cli/agents/voice_edit.py +18 -10
  17. agent_cli/cli.py +25 -2
  18. agent_cli/config_cmd.py +30 -11
  19. agent_cli/core/deps.py +6 -3
  20. agent_cli/core/transcription_logger.py +1 -1
  21. agent_cli/core/vad.py +6 -24
  22. agent_cli/dev/cli.py +295 -65
  23. agent_cli/docs_gen.py +18 -8
  24. agent_cli/install/extras.py +44 -13
  25. agent_cli/install/hotkeys.py +22 -11
  26. agent_cli/install/services.py +54 -14
  27. agent_cli/opts.py +43 -22
  28. agent_cli/server/cli.py +128 -62
  29. agent_cli/server/proxy/api.py +77 -19
  30. agent_cli/services/__init__.py +46 -5
  31. {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/METADATA +627 -246
  32. {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/RECORD +35 -34
  33. {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/WHEEL +0 -0
  34. {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/entry_points.txt +0 -0
  35. {agent_cli-0.70.2.dist-info → agent_cli-0.72.1.dist-info}/licenses/LICENSE +0 -0
agent_cli/server/cli.py CHANGED
@@ -10,6 +10,7 @@ from typing import Annotated

 import typer

+from agent_cli import opts
 from agent_cli.cli import app as main_app
 from agent_cli.core.deps import requires_extras
 from agent_cli.core.process import set_process_title
@@ -28,7 +29,30 @@ def _has(package: str) -> bool:

 app = typer.Typer(
     name="server",
-    help="Run ASR/TTS servers (Whisper, TTS, or proxy mode).",
+    help="""Run local ASR/TTS servers with OpenAI-compatible APIs.
+
+    **Available servers:**
+
+    - `whisper` - Local speech-to-text using Whisper models (faster-whisper or MLX)
+    - `tts` - Local text-to-speech using Piper (CPU) or Kokoro (GPU)
+    - `transcribe-proxy` - Proxy to external ASR providers (OpenAI, Gemini, Wyoming)
+
+    **Common workflows:**
+
+    ```bash
+    # Run local Whisper server (lazy loads large-v3 by default)
+    agent-cli server whisper
+
+    # Run local TTS with Kokoro backend (GPU-accelerated)
+    agent-cli server tts --backend kokoro
+
+    # Run transcription proxy using your configured ASR provider
+    agent-cli server transcribe-proxy
+    ```
+
+    All servers support Home Assistant via Wyoming protocol and can be used as
+    drop-in replacements for OpenAI's audio APIs.
+    """,
     add_completion=True,
     rich_markup_mode="markdown",
     no_args_is_help=True,
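
The new top-level help presents these servers as drop-in replacements for OpenAI's audio APIs. A minimal sketch of what that looks like from the client side, assuming the Whisper server is running on its default port 10301 and accepts any API key (both assumptions, not stated in this hunk):

```python
# Hypothetical client: point the official OpenAI SDK at the local server.
# Port 10301 and the "any API key works" behavior are assumptions here.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:10301/v1", api_key="unused")

with open("recording.wav", "rb") as f:
    result = client.audio.transcriptions.create(model="large-v3", file=f)

print(result.text)
```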
@@ -169,14 +193,18 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--model",
             "-m",
-            help="Model name(s) to load (can specify multiple)",
+            help=(
+                "Whisper model(s) to load. Common models: `tiny`, `base`, `small`, "
+                "`medium`, `large-v3`, `distil-large-v3`. Can specify multiple for "
+                "different accuracy/speed tradeoffs. Default: `large-v3`"
+            ),
         ),
     ] = None,
     default_model: Annotated[
         str | None,
         typer.Option(
             "--default-model",
-            help="Default model when not specified in request",
+            help=("Model to use when client doesn't specify one. Must be in the `--model` list"),
         ),
     ] = None,
     device: Annotated[
@@ -184,42 +212,54 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--device",
             "-d",
-            help="Device: auto, cuda, cuda:0, cpu",
+            help=(
+                "Compute device: `auto` (detect GPU), `cuda`, `cuda:0`, `cpu`. "
+                "MLX backend always uses Apple Silicon"
+            ),
         ),
     ] = "auto",
     compute_type: Annotated[
         str,
         typer.Option(
             "--compute-type",
-            help="Compute type: auto, float16, int8, int8_float16",
+            help=(
+                "Precision for faster-whisper: `auto`, `float16`, `int8`, `int8_float16`. "
+                "Lower precision = faster + less VRAM"
+            ),
         ),
     ] = "auto",
     cache_dir: Annotated[
         Path | None,
         typer.Option(
             "--cache-dir",
-            help="Model cache directory",
+            help="Custom directory for downloaded models (default: HuggingFace cache)",
         ),
     ] = None,
     ttl: Annotated[
         int,
         typer.Option(
             "--ttl",
-            help="Seconds before unloading idle model",
+            help=(
+                "Seconds of inactivity before unloading model from memory. "
+                "Set to 0 to keep loaded indefinitely"
+            ),
         ),
     ] = 300,
     preload: Annotated[
         bool,
         typer.Option(
             "--preload",
-            help="Load model(s) at startup and wait for completion",
+            help=(
+                "Load model(s) immediately at startup instead of on first request. "
+                "Useful for reducing first-request latency"
+            ),
         ),
     ] = False,
     host: Annotated[
         str,
         typer.Option(
             "--host",
-            help="Host to bind the server to",
+            help="Network interface to bind. Use `0.0.0.0` for all interfaces",
         ),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
@@ -227,44 +267,40 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
         typer.Option(
             "--port",
             "-p",
-            help="HTTP API port",
+            help="Port for OpenAI-compatible HTTP API (`/v1/audio/transcriptions`)",
         ),
     ] = 10301,
     wyoming_port: Annotated[
         int,
         typer.Option(
             "--wyoming-port",
-            help="Wyoming protocol port",
+            help="Port for Wyoming protocol (Home Assistant integration)",
         ),
     ] = 10300,
     no_wyoming: Annotated[
         bool,
         typer.Option(
             "--no-wyoming",
-            help="Disable Wyoming server",
+            help="Disable Wyoming protocol server (only run HTTP API)",
         ),
     ] = False,
     download_only: Annotated[
         bool,
         typer.Option(
             "--download-only",
-            help="Download model(s) and exit without starting server",
+            help="Download model(s) to cache and exit. Useful for Docker builds",
         ),
     ] = False,
-    log_level: Annotated[
-        str,
-        typer.Option(
-            "--log-level",
-            "-l",
-            help="Logging level: debug, info, warning, error",
-        ),
-    ] = "info",
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
     backend: Annotated[
         str,
         typer.Option(
             "--backend",
             "-b",
-            help="Backend: auto (platform detection), faster-whisper, mlx",
+            help=(
+                "Inference backend: `auto` (faster-whisper on CUDA/CPU, MLX on Apple Silicon), "
+                "`faster-whisper`, `mlx`"
+            ),
         ),
     ] = "auto",
 ) -> None:
@@ -278,7 +314,8 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
     Models are loaded lazily on first request and unloaded after being
     idle for the TTL duration, freeing VRAM for other applications.

-    Examples:
+    **Examples:**
+
         # Run with default large-v3 model
         agent-cli server whisper

@@ -290,7 +327,6 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915

         # Download model without starting server
         agent-cli server whisper --model large-v3 --download-only
-
     """
     # Setup Rich logging for consistent output
     setup_rich_logging(log_level)
@@ -378,6 +414,7 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915
     console.print()
     console.print("[dim]Configuration:[/dim]")
     console.print(f"  Backend: [cyan]{actual_backend}[/cyan]")
+    console.print(f"  Log level: [cyan]{log_level}[/cyan]")
     console.print()
     console.print("[dim]Endpoints:[/dim]")
     console.print(f"  HTTP API: [cyan]http://{host}:{port}[/cyan]")
@@ -422,45 +459,64 @@ def whisper_cmd(  # noqa: PLR0912, PLR0915


 @app.command("transcribe-proxy")
-@requires_extras("server", "audio", "llm")
+@requires_extras("server", "wyoming", "llm")
 def transcribe_proxy_cmd(
     host: Annotated[
         str,
-        typer.Option("--host", help="Host to bind the server to"),
+        typer.Option("--host", help="Network interface to bind. Use `0.0.0.0` for all interfaces"),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
         int,
-        typer.Option("--port", "-p", help="Port to bind the server to"),
+        typer.Option("--port", "-p", help="Port for the HTTP API"),
     ] = 61337,
     reload: Annotated[
         bool,
-        typer.Option("--reload", help="Enable auto-reload for development"),
+        typer.Option("--reload", help="Auto-reload on code changes (development only)"),
     ] = False,
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
 ) -> None:
-    """Run transcription proxy server.
+    r"""Run transcription proxy that forwards to your configured ASR provider.

-    This server proxies transcription requests to configured ASR providers
-    (Wyoming, OpenAI, or Gemini) based on your agent-cli configuration.
+    Unlike `server whisper` which runs a local Whisper model, this proxy
+    forwards audio to external ASR providers configured in your agent-cli
+    config file or environment variables.

-    It exposes:
-    - /transcribe endpoint for audio transcription
-    - /health endpoint for health checks
+    **Supported ASR providers:** `wyoming`, `openai`, `gemini`
+    **Supported LLM providers for cleanup:** `ollama`, `openai`, `gemini`

-    This is the original server command functionality.
+    The server exposes:

-    Examples:
-        # Run on default port
+    - `POST /transcribe` - Accepts audio files, returns `{raw_transcript, cleaned_transcript}`
+    - `GET /health` - Health check endpoint
+
+    **When to use this vs `server whisper`:**
+
+    - Use `transcribe-proxy` when you want to use cloud ASR (OpenAI/Gemini)
+      or connect to a remote Wyoming server
+    - Use `server whisper` when you want to run a local Whisper model
+
+    Configuration is read from `~/.config/agent-cli/config.yaml` or env vars
+    like `ASR_PROVIDER`, `LLM_PROVIDER`, `OPENAI_API_KEY`, etc.
+
+    **Examples:**
+
+        # Run with providers from config file
         agent-cli server transcribe-proxy

-        # Run on custom port
-        agent-cli server transcribe-proxy --port 8080
+        # Run with OpenAI ASR via env vars
+        ASR_PROVIDER=openai OPENAI_API_KEY=sk-... agent-cli server transcribe-proxy

+        # Test with curl
+        curl -X POST http://localhost:61337/transcribe \\
+            -F "audio=@recording.wav" -F "cleanup=true"
     """
     _check_server_deps()
+    setup_rich_logging(log_level)

     console.print(
         f"[bold green]Starting Agent CLI transcription proxy on {host}:{port}[/bold green]",
     )
+    console.print(f"[dim]Log level: {log_level}[/dim]")
     if reload:
         console.print("[yellow]Auto-reload enabled for development[/yellow]")

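
The rewritten docstring documents the `POST /transcribe` endpoint and its `{raw_transcript, cleaned_transcript}` response shape. A Python equivalent of the docstring's curl example, as a rough sketch (assumes the proxy is running on its default port 61337):

```python
# Mirror of the docstring's curl example using requests.
import requests

with open("recording.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:61337/transcribe",
        files={"audio": f},        # multipart field name from the curl example
        data={"cleanup": "true"},  # ask the configured LLM to clean the transcript
    )
resp.raise_for_status()

body = resp.json()
print(body["raw_transcript"])
print(body["cleaned_transcript"])
```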
@@ -471,7 +527,7 @@ def transcribe_proxy_cmd(
         host=host,
         port=port,
         reload=reload,
-        log_level="info",
+        log_level=log_level.lower(),
     )

@@ -483,14 +539,18 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--model",
             "-m",
-            help="Model name(s) to load. Piper: 'en_US-lessac-medium'. Kokoro: 'kokoro' (auto-downloads)",
+            help=(
+                "Model/voice(s) to load. Piper: `en_US-lessac-medium`, `en_GB-alan-medium`. "
+                "Kokoro: `af_heart`, `af_bella`, `am_adam`. "
+                "Auto-downloads on first use"
+            ),
         ),
     ] = None,
     default_model: Annotated[
         str | None,
         typer.Option(
             "--default-model",
-            help="Default model when not specified in request",
+            help=("Voice to use when client doesn't specify one. Must be in the `--model` list"),
         ),
     ] = None,
     device: Annotated[
@@ -498,35 +558,44 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--device",
             "-d",
-            help="Device: auto, cpu, cuda, mps (Piper is CPU-only, Kokoro supports GPU)",
+            help=(
+                "Compute device: `auto`, `cpu`, `cuda`, `mps`. "
+                "Piper is CPU-only; Kokoro supports GPU acceleration"
+            ),
         ),
     ] = "auto",
     cache_dir: Annotated[
         Path | None,
         typer.Option(
             "--cache-dir",
-            help="Model cache directory",
+            help="Custom directory for downloaded models (default: ~/.cache/agent-cli/tts/)",
         ),
     ] = None,
     ttl: Annotated[
         int,
         typer.Option(
             "--ttl",
-            help="Seconds before unloading idle model",
+            help=(
+                "Seconds of inactivity before unloading model from memory. "
+                "Set to 0 to keep loaded indefinitely"
+            ),
         ),
     ] = 300,
     preload: Annotated[
         bool,
         typer.Option(
             "--preload",
-            help="Load model(s) at startup and wait for completion",
+            help=(
+                "Load model(s) immediately at startup instead of on first request. "
+                "Useful for reducing first-request latency"
+            ),
         ),
     ] = False,
     host: Annotated[
         str,
         typer.Option(
             "--host",
-            help="Host to bind the server to",
+            help="Network interface to bind. Use `0.0.0.0` for all interfaces",
         ),
     ] = "0.0.0.0",  # noqa: S104
     port: Annotated[
@@ -534,44 +603,40 @@ def tts_cmd(  # noqa: PLR0915
         typer.Option(
             "--port",
             "-p",
-            help="HTTP API port",
+            help="Port for OpenAI-compatible HTTP API (`/v1/audio/speech`)",
         ),
     ] = 10201,
     wyoming_port: Annotated[
         int,
         typer.Option(
             "--wyoming-port",
-            help="Wyoming protocol port",
+            help="Port for Wyoming protocol (Home Assistant integration)",
         ),
     ] = 10200,
     no_wyoming: Annotated[
         bool,
         typer.Option(
             "--no-wyoming",
-            help="Disable Wyoming server",
+            help="Disable Wyoming protocol server (only run HTTP API)",
         ),
     ] = False,
     download_only: Annotated[
         bool,
         typer.Option(
             "--download-only",
-            help="Download model(s) and exit without starting server",
+            help="Download model(s)/voice(s) to cache and exit. Useful for Docker builds",
         ),
     ] = False,
-    log_level: Annotated[
-        str,
-        typer.Option(
-            "--log-level",
-            "-l",
-            help="Logging level: debug, info, warning, error",
-        ),
-    ] = "info",
+    log_level: opts.LogLevel = opts.SERVER_LOG_LEVEL,
     backend: Annotated[
         str,
         typer.Option(
             "--backend",
             "-b",
-            help="Backend: auto, piper, kokoro",
+            help=(
+                "TTS engine: `auto` (prefer Kokoro if available), "
+                "`piper` (CPU, many languages), `kokoro` (GPU, high quality)"
+            ),
         ),
     ] = "auto",
 ) -> None:
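
Like the Whisper server, the TTS server exposes an OpenAI-compatible `/v1/audio/speech` endpoint (default port 10201). A hedged sketch of a client call via the OpenAI SDK; the exact model/voice strings the server expects (here `af_heart`, taken from the voice list in the `--model` help) and the ignored API key are assumptions:

```python
# Hypothetical TTS client against the local /v1/audio/speech endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:10201/v1", api_key="unused")

# Stream synthesized audio to a file; "af_heart" is a Kokoro voice named in
# the --model help above, but the exact model/voice mapping is assumed.
with client.audio.speech.with_streaming_response.create(
    model="af_heart",
    voice="af_heart",
    input="Hello from agent-cli!",
) as response:
    response.stream_to_file("hello.mp3")
```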
@@ -594,7 +659,8 @@ def tts_cmd(  # noqa: PLR0915
     Voices: af_heart, af_bella, am_adam, bf_emma, bm_george, etc.
     See https://huggingface.co/hexgrad/Kokoro-82M for all voices.

-    Examples:
+    **Examples:**
+
         # Run with Kokoro (auto-downloads model and voices)
         agent-cli server tts --backend kokoro

@@ -609,7 +675,6 @@ def tts_cmd(  # noqa: PLR0915

         # Download Piper model without starting server
         agent-cli server tts --backend piper --model en_US-lessac-medium --download-only
-
     """
     # Setup Rich logging for consistent output
     setup_rich_logging(log_level)
@@ -679,6 +744,7 @@ def tts_cmd(  # noqa: PLR0915
     console.print()
     console.print("[dim]Configuration:[/dim]")
     console.print(f"  Backend: [cyan]{resolved_backend}[/cyan]")
+    console.print(f"  Log level: [cyan]{log_level}[/cyan]")
     console.print()
     console.print("[dim]Endpoints:[/dim]")
     console.print(f"  HTTP API: [cyan]http://{host}:{port}[/cyan]")
agent_cli/server/proxy/api.py CHANGED
@@ -3,8 +3,9 @@
 from __future__ import annotations

 import logging
+import os
 from pathlib import Path
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any

 from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile
 from pydantic import BaseModel
@@ -26,6 +27,9 @@ from agent_cli.server.common import log_requests_middleware
 from agent_cli.services import asr
 from agent_cli.services.llm import process_and_update_clipboard

+if TYPE_CHECKING:
+    from typer.models import OptionInfo
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 LOGGER = logging.getLogger(__name__)
@@ -37,6 +41,40 @@ app = FastAPI(
 )


+@app.on_event("startup")
+async def log_effective_config() -> None:
+    """Log effective configuration on startup to help debug env var issues."""
+    (
+        provider_cfg,
+        wyoming_cfg,
+        openai_asr_cfg,
+        gemini_asr_cfg,
+        ollama_cfg,
+        openai_llm_cfg,
+        gemini_llm_cfg,
+        _,
+    ) = _load_transcription_configs()
+
+    LOGGER.info("ASR provider: %s", provider_cfg.asr_provider)
+    if provider_cfg.asr_provider == "wyoming":
+        LOGGER.info("  Wyoming: %s:%d", wyoming_cfg.asr_wyoming_ip, wyoming_cfg.asr_wyoming_port)
+    elif provider_cfg.asr_provider == "openai":
+        LOGGER.info("  Model: %s", openai_asr_cfg.asr_openai_model)
+        LOGGER.info("  Base URL: %s", openai_asr_cfg.openai_base_url or "https://api.openai.com/v1")
+    elif provider_cfg.asr_provider == "gemini":
+        LOGGER.info("  Model: %s", gemini_asr_cfg.asr_gemini_model)
+
+    LOGGER.info("LLM provider: %s", provider_cfg.llm_provider)
+    if provider_cfg.llm_provider == "ollama":
+        LOGGER.info("  Model: %s", ollama_cfg.llm_ollama_model)
+        LOGGER.info("  Host: %s", ollama_cfg.llm_ollama_host)
+    elif provider_cfg.llm_provider == "openai":
+        LOGGER.info("  Model: %s", openai_llm_cfg.llm_openai_model)
+        LOGGER.info("  Base URL: %s", openai_llm_cfg.openai_base_url or "https://api.openai.com/v1")
+    elif provider_cfg.llm_provider == "gemini":
+        LOGGER.info("  Model: %s", gemini_llm_cfg.llm_gemini_model)
+
+
 @app.middleware("http")
 async def log_requests(request: Request, call_next) -> Any:  # type: ignore[no-untyped-def]  # noqa: ANN001
     """Log basic request information."""
@@ -83,6 +121,7 @@ async def health_check() -> HealthResponse:

 async def _transcribe_with_provider(
     audio_data: bytes,
+    filename: str,
     provider_cfg: config.ProviderSelection,
     wyoming_asr_cfg: config.WyomingASR,
     openai_asr_cfg: config.OpenAIASR,
@@ -90,6 +129,7 @@ async def _transcribe_with_provider(
 ) -> str:
     """Transcribe audio using the configured provider."""
     transcriber = asr.create_recorded_audio_transcriber(provider_cfg)
+    file_suffix = Path(filename).suffix.lower() or ".wav"

     if provider_cfg.asr_provider == "wyoming":
         return await transcriber(
@@ -102,12 +142,14 @@ async def _transcribe_with_provider(
             audio_data=audio_data,
             openai_asr_cfg=openai_asr_cfg,
             logger=LOGGER,
+            file_suffix=file_suffix,
         )
     if provider_cfg.asr_provider == "gemini":
         return await transcriber(
             audio_data=audio_data,
             gemini_asr_cfg=gemini_asr_cfg,
             logger=LOGGER,
+            file_suffix=file_suffix,
         )
     msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
     raise NotImplementedError(msg)
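
The new `file_suffix` value normalizes the uploaded filename's extension and falls back to `.wav` when there is none, so the provider-specific transcribers receive a usable suffix. The expression behaves like this:

```python
from pathlib import Path

def suffix_of(filename: str) -> str:
    # Same expression as file_suffix in the diff above.
    return Path(filename).suffix.lower() or ".wav"

assert suffix_of("clip.MP3") == ".mp3"    # normalized to lowercase
assert suffix_of("recording") == ".wav"   # no extension -> .wav fallback
```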
@@ -153,6 +195,13 @@ def _validate_audio_file(audio: UploadFile) -> None:
     )


+def _cfg(key: str, defaults: dict[str, Any], opt: OptionInfo) -> Any:
+    """Get config with priority: env var > config file > option default."""
+    if opt.envvar and (env_val := os.environ.get(opt.envvar)):
+        return int(env_val) if isinstance(opt.default, int) else env_val
+    return defaults.get(key, opt.default)
+
+
 def _load_transcription_configs() -> tuple[
     config.ProviderSelection,
     config.WyomingASR,
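
The `_cfg` helper added above encodes the precedence named in its docstring: environment variable, then config file, then the Typer option default. A standalone illustration of that order, with a stand-in for `typer.models.OptionInfo` (the real `opts.*` objects carry `envvar` and `default` attributes; everything else here is hypothetical):

```python
import os
from dataclasses import dataclass

@dataclass
class FakeOption:
    # Minimal stand-in for typer.models.OptionInfo.
    default: object
    envvar: str | None = None

def cfg(key, defaults, opt):
    # Same logic as _cfg in the diff above.
    if opt.envvar and (env_val := os.environ.get(opt.envvar)):
        return int(env_val) if isinstance(opt.default, int) else env_val
    return defaults.get(key, opt.default)

opt = FakeOption(default="wyoming", envvar="ASR_PROVIDER")
assert cfg("asr_provider", {}, opt) == "wyoming"                         # option default
assert cfg("asr_provider", {"asr_provider": "gemini"}, opt) == "gemini"  # config file wins
os.environ["ASR_PROVIDER"] = "openai"
assert cfg("asr_provider", {"asr_provider": "gemini"}, opt) == "openai"  # env var wins over both
```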
@@ -163,41 +212,43 @@ def _load_transcription_configs() -> tuple[
     config.GeminiLLM,
     dict[str, Any],
 ]:
-    """Load and create all required configuration objects."""
+    """Load config objects. Priority: env var > config file > default."""
     loaded_config = config.load_config()
     wildcard_config = loaded_config.get("defaults", {})
     command_config = loaded_config.get("transcribe", {})
     defaults = {**wildcard_config, **command_config}

     provider_cfg = config.ProviderSelection(
-        asr_provider=defaults.get("asr_provider", opts.ASR_PROVIDER.default),  # type: ignore[attr-defined]
-        llm_provider=defaults.get("llm_provider", opts.LLM_PROVIDER.default),  # type: ignore[attr-defined]
-        tts_provider=opts.TTS_PROVIDER.default,  # type: ignore[attr-defined]
+        asr_provider=_cfg("asr_provider", defaults, opts.ASR_PROVIDER),
+        llm_provider=_cfg("llm_provider", defaults, opts.LLM_PROVIDER),
+        tts_provider=_cfg("tts_provider", defaults, opts.TTS_PROVIDER),
     )
     wyoming_asr_cfg = config.WyomingASR(
-        asr_wyoming_ip=defaults.get("asr_wyoming_ip", opts.ASR_WYOMING_IP.default),  # type: ignore[attr-defined]
-        asr_wyoming_port=defaults.get("asr_wyoming_port", opts.ASR_WYOMING_PORT.default),  # type: ignore[attr-defined]
+        asr_wyoming_ip=_cfg("asr_wyoming_ip", defaults, opts.ASR_WYOMING_IP),
+        asr_wyoming_port=_cfg("asr_wyoming_port", defaults, opts.ASR_WYOMING_PORT),
     )
     openai_asr_cfg = config.OpenAIASR(
-        asr_openai_model=defaults.get("asr_openai_model", opts.ASR_OPENAI_MODEL.default),  # type: ignore[attr-defined]
-        openai_api_key=defaults.get("openai_api_key", opts.OPENAI_API_KEY.default),  # type: ignore[attr-defined,union-attr]
+        asr_openai_model=_cfg("asr_openai_model", defaults, opts.ASR_OPENAI_MODEL),
+        openai_api_key=_cfg("openai_api_key", defaults, opts.OPENAI_API_KEY),
+        openai_base_url=_cfg("asr_openai_base_url", defaults, opts.ASR_OPENAI_BASE_URL),
+        asr_openai_prompt=_cfg("asr_openai_prompt", defaults, opts.ASR_OPENAI_PROMPT),
     )
     gemini_asr_cfg = config.GeminiASR(
-        asr_gemini_model=defaults.get("asr_gemini_model", opts.ASR_GEMINI_MODEL.default),  # type: ignore[attr-defined]
-        gemini_api_key=defaults.get("gemini_api_key", opts.GEMINI_API_KEY.default),  # type: ignore[attr-defined,union-attr]
+        asr_gemini_model=_cfg("asr_gemini_model", defaults, opts.ASR_GEMINI_MODEL),
+        gemini_api_key=_cfg("gemini_api_key", defaults, opts.GEMINI_API_KEY),
     )
     ollama_cfg = config.Ollama(
-        llm_ollama_model=defaults.get("llm_ollama_model", opts.LLM_OLLAMA_MODEL.default),  # type: ignore[attr-defined]
-        llm_ollama_host=defaults.get("llm_ollama_host", opts.LLM_OLLAMA_HOST.default),  # type: ignore[attr-defined]
+        llm_ollama_model=_cfg("llm_ollama_model", defaults, opts.LLM_OLLAMA_MODEL),
+        llm_ollama_host=_cfg("llm_ollama_host", defaults, opts.LLM_OLLAMA_HOST),
     )
     openai_llm_cfg = config.OpenAILLM(
-        llm_openai_model=defaults.get("llm_openai_model", opts.LLM_OPENAI_MODEL.default),  # type: ignore[attr-defined]
-        openai_api_key=defaults.get("openai_api_key", opts.OPENAI_API_KEY.default),  # type: ignore[attr-defined,union-attr]
-        openai_base_url=defaults.get("openai_base_url", opts.OPENAI_BASE_URL.default),  # type: ignore[attr-defined,union-attr]
+        llm_openai_model=_cfg("llm_openai_model", defaults, opts.LLM_OPENAI_MODEL),
+        openai_api_key=_cfg("openai_api_key", defaults, opts.OPENAI_API_KEY),
+        openai_base_url=_cfg("openai_base_url", defaults, opts.OPENAI_BASE_URL),
     )
     gemini_llm_cfg = config.GeminiLLM(
-        llm_gemini_model=defaults.get("llm_gemini_model", opts.LLM_GEMINI_MODEL.default),  # type: ignore[attr-defined]
-        gemini_api_key=defaults.get("gemini_api_key", opts.GEMINI_API_KEY.default),  # type: ignore[attr-defined,union-attr]
+        llm_gemini_model=_cfg("llm_gemini_model", defaults, opts.LLM_GEMINI_MODEL),
+        gemini_api_key=_cfg("gemini_api_key", defaults, opts.GEMINI_API_KEY),
     )

     return (
@@ -309,8 +360,14 @@ async def transcribe_audio(
         defaults,
     ) = _load_transcription_configs()

-    # Save uploaded file
+    # Read uploaded file
     audio_data = await audio_file.read()
+    LOGGER.info(
+        "Received audio: filename=%s, size=%d bytes, content_type=%s",
+        audio_file.filename,
+        len(audio_data),
+        audio_file.content_type,
+    )

     # Convert audio to Wyoming format if using local ASR
     if provider_cfg.asr_provider == "wyoming":
@@ -319,6 +376,7 @@

     # Transcribe audio using the configured provider
     raw_transcript = await _transcribe_with_provider(
         audio_data,
+        audio_file.filename or "audio.wav",
         provider_cfg,
         wyoming_asr_cfg,
         openai_asr_cfg,