vocal-cli 0.3.7__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vocal-cli
3
- Version: 0.3.7
3
+ Version: 0.3.8
4
4
  Summary: CLI tool for Vocal - Ollama-style Voice Model Management
5
5
  Project-URL: Homepage, https://github.com/niradler/vocal
6
6
  Project-URL: Documentation, https://github.com/niradler/vocal/tree/master/docs
@@ -23,6 +23,6 @@ Requires-Dist: rich>=14.3.3
23
23
  Requires-Dist: sounddevice>=0.5.5
24
24
  Requires-Dist: typer>=0.24.1
25
25
  Requires-Dist: uvicorn>=0.41.0
26
- Requires-Dist: vocal-core>=0.3.7
27
- Requires-Dist: vocal-sdk>=0.3.7
26
+ Requires-Dist: vocal-core>=0.3.8
27
+ Requires-Dist: vocal-sdk>=0.3.8
28
28
  Requires-Dist: websockets>=16.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "vocal-cli"
3
- version = "0.3.7"
3
+ version = "0.3.8"
4
4
  description = "CLI tool for Vocal - Ollama-style Voice Model Management"
5
5
  requires-python = ">=3.11"
6
6
  license = { text = "SSPL-1.0" }
@@ -19,8 +19,8 @@ classifiers = [
19
19
  "Topic :: Utilities",
20
20
  ]
21
21
  dependencies = [
22
- "vocal-core>=0.3.7",
23
- "vocal-sdk>=0.3.7",
22
+ "vocal-core>=0.3.8",
23
+ "vocal-sdk>=0.3.8",
24
24
  "typer>=0.24.1",
25
25
  "rich>=14.3.3",
26
26
  "uvicorn>=0.41.0",
@@ -4,4 +4,4 @@ Vocal CLI - Command-line interface for Vocal Speech AI Platform
4
4
  This CLI provides Ollama-style commands for model management and audio transcription.
5
5
  """
6
6
 
7
- __version__ = "0.3.7"
7
+ __version__ = "0.3.8"
@@ -19,7 +19,7 @@ from rich.table import Table
19
19
 
20
20
  from vocal_core.config import vocal_settings
21
21
  from vocal_sdk import VocalClient
22
- from vocal_sdk.api.audio import voice_clone_v1_audio_clone_post
22
+ from vocal_sdk.api.audio import text_to_speech_v1_audio_speech_post, voice_clone_v1_audio_clone_post
23
23
  from vocal_sdk.api.models import (
24
24
  delete_model_v1_models_model_id_delete,
25
25
  list_models_v1_models_get,
@@ -35,6 +35,8 @@ from vocal_sdk.models import (
35
35
  BodyVoiceCloneV1AudioClonePost,
36
36
  BodyVoiceCloneV1AudioClonePostResponseFormat,
37
37
  TranscriptionFormat,
38
+ TTSRequest,
39
+ TTSRequestResponseFormat,
38
40
  )
39
41
  from vocal_sdk.types import UNSET, File, Unset
40
42
 
@@ -296,6 +298,75 @@ def models_delete(
296
298
  raise typer.Exit(1)
297
299
 
298
300
 
301
+ @app.command()
302
+ def speak(
303
+ text: str = typer.Argument(..., help="Text to synthesize"),
304
+ output: Path | None = typer.Option(None, "--output", "-o", help="Output file path (default: play audio)"),
305
+ model: str = typer.Option(
306
+ vocal_settings.TTS_DEFAULT_MODEL,
307
+ "--model",
308
+ "-m",
309
+ help="TTS model to use (e.g. 'pyttsx3', 'k2-fsa/OmniVoice')",
310
+ ),
311
+ models: bool = typer.Option(False, "--models", help="Interactively select from downloaded TTS models"),
312
+ voice: str | None = typer.Option(None, "--voice", help="Voice ID or instruction (model-specific, e.g. 'female, young adult, american accent')"),
313
+ speed: float = typer.Option(1.0, "--speed", "-s", min=0.25, max=4.0, help="Speech speed multiplier"),
314
+ response_format: str = typer.Option("wav", "--format", "-f", help="Output audio format: wav, mp3, flac, pcm, aac, opus"),
315
+ api_url: str = typer.Option("http://localhost:8000", "--api-url", envvar="VOCAL_API_URL", help="Vocal API URL"),
316
+ verbose: bool = typer.Option(False, "--verbose", "-v", help="Show synthesis timing and output size"),
317
+ ) -> None:
318
+ """Synthesize text to speech"""
319
+ if models:
320
+ selected_model = _model_wizard(api_url, task="tts")
321
+ if selected_model is None:
322
+ raise typer.Exit(1)
323
+ model = selected_model
324
+
325
+ try:
326
+ fmt = TTSRequestResponseFormat(response_format.lower())
327
+ except ValueError:
328
+ valid = ", ".join(f.value for f in TTSRequestResponseFormat)
329
+ console.print(f"[red]Error:[/red] Invalid format '{response_format}'. Valid options: {valid}")
330
+ raise typer.Exit(1)
331
+
332
+ if model != "pyttsx3":
333
+ _check_model_ready(api_url, model)
334
+
335
+ try:
336
+ vc = _make_client(api_url)
337
+ t0 = time.monotonic()
338
+
339
+ body = TTSRequest(
340
+ model=model,
341
+ input_=text,
342
+ voice=voice if voice is not None else UNSET,
343
+ speed=speed,
344
+ response_format=fmt,
345
+ )
346
+ resp = text_to_speech_v1_audio_speech_post.sync_detailed(client=vc, body=body)
347
+
348
+ elapsed = time.monotonic() - t0
349
+
350
+ if resp.status_code != 200:
351
+ _clone_error(resp.content, resp.status_code)
352
+ raise typer.Exit(1)
353
+
354
+ audio_bytes = resp.content
355
+ _speak_output(audio_bytes, response_format, output, elapsed, verbose)
356
+
357
+ except typer.Exit:
358
+ raise
359
+ except UnexpectedStatus as e:
360
+ msg = _api_error_message(e)
361
+ console.print(f"[red]Error:[/red] {msg}")
362
+ if e.status_code == 503:
363
+ console.print("[dim]The model or a required package is not available on the server.[/dim]")
364
+ raise typer.Exit(1)
365
+ except Exception as e:
366
+ console.print(f"[red]Error:[/red] {e}")
367
+ raise typer.Exit(1)
368
+
369
+
299
370
  @app.command()
300
371
  def clone(
301
372
  text: str = typer.Argument(..., help="Text to synthesize in the cloned voice"),
@@ -898,7 +969,9 @@ def _print_output_devices_table() -> None:
898
969
  @app.command()
899
970
  def chat(
900
971
  model: str = typer.Option(vocal_settings.STT_DEFAULT_MODEL, "--model", "-m", help="STT model to use"),
901
- models: bool = typer.Option(False, "--models", help="Interactively select from downloaded STT models"),
972
+ models: bool = typer.Option(False, "--models", help="Interactively select from downloaded STT and TTS models"),
973
+ tts_model: str = typer.Option(vocal_settings.TTS_DEFAULT_MODEL, "--tts-model", help="TTS model to use for voice responses"),
974
+ tts_voice: str | None = typer.Option(None, "--tts-voice", help="TTS voice ID or instruction (model-specific)"),
902
975
  language: str | None = typer.Option(None, "--language", "-l", help="Language code (e.g. 'en'). Auto-detected if omitted."),
903
976
  device: str | None = typer.Option(None, "--device", "-d", help="Input device index or name (use --devices to pick interactively)"),
904
977
  output_device: str | None = typer.Option(None, "--output-device", "-o", help="Output device index or name (run `vocal devices --output` to list)"),
@@ -923,6 +996,10 @@ def chat(
923
996
  if selected_model is None:
924
997
  raise typer.Exit(1)
925
998
  model = selected_model
999
+ selected_tts = _model_wizard(api_url, task="tts")
1000
+ if selected_tts is None:
1001
+ raise typer.Exit(1)
1002
+ tts_model = selected_tts
926
1003
  try:
927
1004
  device_idx = _resolve_device(device)
928
1005
  output_device_idx = _resolve_output_device(output_device)
@@ -936,11 +1013,11 @@ def chat(
936
1013
  device_label = f"[dim]{active_device['name']}[/dim]"
937
1014
  threshold_hint = f" vad=[cyan]{silence_threshold:.0f}[/cyan]" if silence_threshold is not None else ""
938
1015
 
939
- console.print(f"[green]Voice chat started[/green] model=[cyan]{model}[/cyan] device={device_label}{threshold_hint} Ctrl+C to stop\n")
1016
+ console.print(f"[green]Voice chat started[/green] stt=[cyan]{model}[/cyan] tts=[cyan]{tts_model}[/cyan] device={device_label}{threshold_hint} Ctrl+C to stop\n")
940
1017
  console.print("[dim]Speak — I'll transcribe, think, and respond with audio.[/dim]\n")
941
1018
 
942
1019
  try:
943
- asyncio.run(_chat_async(ws_url, device_idx, output_device_idx, model, language, system_prompt, silence_threshold, verbose))
1020
+ asyncio.run(_chat_async(ws_url, device_idx, output_device_idx, model, language, system_prompt, silence_threshold, verbose, tts_model=tts_model, tts_voice=tts_voice))
944
1021
  except KeyboardInterrupt:
945
1022
  console.print("\n[yellow]Stopped.[/yellow]")
946
1023
  except Exception as e:
@@ -948,6 +1025,25 @@ def chat(
948
1025
  raise typer.Exit(1)
949
1026
 
950
1027
 
1028
+ def _speak_output(audio_bytes: bytes, response_format: str, output: Path | None, elapsed: float, verbose: bool) -> None:
1029
+ if output:
1030
+ output.write_bytes(audio_bytes)
1031
+ timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
1032
+ console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes -> [cyan]{output}[/cyan]{timing}")
1033
+ elif sys.stdout.isatty():
1034
+ if response_format == "wav":
1035
+ timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
1036
+ console.print(f"[green]Playing[/green] {len(audio_bytes):,} bytes{timing}")
1037
+ _play_wav_bytes(audio_bytes)
1038
+ else:
1039
+ console.print(f"[yellow]Tip:[/yellow] Use [cyan]--output file.{response_format}[/cyan] to save non-WAV audio, or omit [cyan]--format[/cyan] for auto-play.")
1040
+ sys.stdout.buffer.write(audio_bytes)
1041
+ else:
1042
+ if verbose:
1043
+ sys.stderr.write(f" {len(audio_bytes):,} bytes ({response_format}) in {elapsed:.1f}s\n")
1044
+ sys.stdout.buffer.write(audio_bytes)
1045
+
1046
+
951
1047
  def _clone_error(content: bytes, status_code: int) -> None:
952
1048
  try:
953
1049
  detail = json.loads(content).get("detail", "")
@@ -963,7 +1059,7 @@ def _clone_output(audio_bytes: bytes, fmt: BodyVoiceCloneV1AudioClonePostRespons
963
1059
  if output:
964
1060
  output.write_bytes(audio_bytes)
965
1061
  timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
966
- console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes [cyan]{output}[/cyan]{timing}")
1062
+ console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes -> [cyan]{output}[/cyan]{timing}")
967
1063
  elif sys.stdout.isatty():
968
1064
  if fmt == BodyVoiceCloneV1AudioClonePostResponseFormat.WAV:
969
1065
  timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
@@ -1080,16 +1176,20 @@ async def _chat_receiver(ws, output_device_idx: int | None, verbose: bool, loop:
1080
1176
  console.print(f"\n[red]error:[/red] {event.get('error', {}).get('message', 'unknown')}")
1081
1177
 
1082
1178
 
1083
- def _build_chat_session_cfg(model: str, language: str | None, system_prompt: str, vad_threshold: float | None) -> dict:
1179
+ def _build_chat_session_cfg(model: str, language: str | None, system_prompt: str, vad_threshold: float | None, tts_model: str | None = None, tts_voice: str | None = None) -> dict:
1084
1180
  cfg: dict = {"type": "realtime", "model": model, "input_sample_rate": _SAMPLE_RATE, "system_prompt": system_prompt}
1085
1181
  if language:
1086
1182
  cfg["language"] = language
1087
1183
  if vad_threshold is not None:
1088
1184
  cfg["turn_detection"] = {"threshold": vad_threshold / 32768.0}
1185
+ if tts_model:
1186
+ cfg["tts_model"] = tts_model
1187
+ if tts_voice:
1188
+ cfg["tts_voice"] = tts_voice
1089
1189
  return cfg
1090
1190
 
1091
1191
 
1092
- async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: int | None, model: str, language: str | None, system_prompt: str, vad_threshold: float | None, verbose: bool) -> None:
1192
+ async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: int | None, model: str, language: str | None, system_prompt: str, vad_threshold: float | None, verbose: bool, tts_model: str | None = None, tts_voice: str | None = None) -> None:
1093
1193
  audio_q: queue.SimpleQueue = queue.SimpleQueue()
1094
1194
  loop = asyncio.get_running_loop()
1095
1195
  stop_event = asyncio.Event()
@@ -1116,7 +1216,7 @@ async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: in
1116
1216
  async with websockets.connect(f"{ws_url}/v1/realtime", open_timeout=10) as ws:
1117
1217
  await asyncio.wait_for(ws.recv(), timeout=5.0)
1118
1218
 
1119
- session_cfg = _build_chat_session_cfg(model, language, system_prompt, vad_threshold)
1219
+ session_cfg = _build_chat_session_cfg(model, language, system_prompt, vad_threshold, tts_model=tts_model, tts_voice=tts_voice)
1120
1220
  await ws.send(json.dumps({"type": "session.update", "session": session_cfg}))
1121
1221
  await asyncio.wait_for(ws.recv(), timeout=5.0)
1122
1222
 
File without changes
File without changes