vocal-cli 0.3.7__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/PKG-INFO +3 -3
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/pyproject.toml +3 -3
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/vocal_cli/__init__.py +1 -1
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/vocal_cli/main.py +108 -8
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/.gitignore +0 -0
- {vocal_cli-0.3.7 → vocal_cli-0.3.8}/MANIFEST.in +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vocal-cli
|
|
3
|
-
Version: 0.3.7
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: CLI tool for Vocal - Ollama-style Voice Model Management
|
|
5
5
|
Project-URL: Homepage, https://github.com/niradler/vocal
|
|
6
6
|
Project-URL: Documentation, https://github.com/niradler/vocal/tree/master/docs
|
|
@@ -23,6 +23,6 @@ Requires-Dist: rich>=14.3.3
|
|
|
23
23
|
Requires-Dist: sounddevice>=0.5.5
|
|
24
24
|
Requires-Dist: typer>=0.24.1
|
|
25
25
|
Requires-Dist: uvicorn>=0.41.0
|
|
26
|
-
Requires-Dist: vocal-core>=0.3.
|
|
27
|
-
Requires-Dist: vocal-sdk>=0.3.
|
|
26
|
+
Requires-Dist: vocal-core>=0.3.8
|
|
27
|
+
Requires-Dist: vocal-sdk>=0.3.8
|
|
28
28
|
Requires-Dist: websockets>=16.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "vocal-cli"
|
|
3
|
-
version = "0.3.7"
|
|
3
|
+
version = "0.3.8"
|
|
4
4
|
description = "CLI tool for Vocal - Ollama-style Voice Model Management"
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
6
|
license = { text = "SSPL-1.0" }
|
|
@@ -19,8 +19,8 @@ classifiers = [
|
|
|
19
19
|
"Topic :: Utilities",
|
|
20
20
|
]
|
|
21
21
|
dependencies = [
|
|
22
|
-
"vocal-core>=0.3.
|
|
23
|
-
"vocal-sdk>=0.3.
|
|
22
|
+
"vocal-core>=0.3.8",
|
|
23
|
+
"vocal-sdk>=0.3.8",
|
|
24
24
|
"typer>=0.24.1",
|
|
25
25
|
"rich>=14.3.3",
|
|
26
26
|
"uvicorn>=0.41.0",
|
|
@@ -19,7 +19,7 @@ from rich.table import Table
|
|
|
19
19
|
|
|
20
20
|
from vocal_core.config import vocal_settings
|
|
21
21
|
from vocal_sdk import VocalClient
|
|
22
|
-
from vocal_sdk.api.audio import voice_clone_v1_audio_clone_post
|
|
22
|
+
from vocal_sdk.api.audio import text_to_speech_v1_audio_speech_post, voice_clone_v1_audio_clone_post
|
|
23
23
|
from vocal_sdk.api.models import (
|
|
24
24
|
delete_model_v1_models_model_id_delete,
|
|
25
25
|
list_models_v1_models_get,
|
|
@@ -35,6 +35,8 @@ from vocal_sdk.models import (
|
|
|
35
35
|
BodyVoiceCloneV1AudioClonePost,
|
|
36
36
|
BodyVoiceCloneV1AudioClonePostResponseFormat,
|
|
37
37
|
TranscriptionFormat,
|
|
38
|
+
TTSRequest,
|
|
39
|
+
TTSRequestResponseFormat,
|
|
38
40
|
)
|
|
39
41
|
from vocal_sdk.types import UNSET, File, Unset
|
|
40
42
|
|
|
@@ -296,6 +298,75 @@ def models_delete(
|
|
|
296
298
|
raise typer.Exit(1)
|
|
297
299
|
|
|
298
300
|
|
|
301
|
+
@app.command()
|
|
302
|
+
def speak(
|
|
303
|
+
text: str = typer.Argument(..., help="Text to synthesize"),
|
|
304
|
+
output: Path | None = typer.Option(None, "--output", "-o", help="Output file path (default: play audio)"),
|
|
305
|
+
model: str = typer.Option(
|
|
306
|
+
vocal_settings.TTS_DEFAULT_MODEL,
|
|
307
|
+
"--model",
|
|
308
|
+
"-m",
|
|
309
|
+
help="TTS model to use (e.g. 'pyttsx3', 'k2-fsa/OmniVoice')",
|
|
310
|
+
),
|
|
311
|
+
models: bool = typer.Option(False, "--models", help="Interactively select from downloaded TTS models"),
|
|
312
|
+
voice: str | None = typer.Option(None, "--voice", help="Voice ID or instruction (model-specific, e.g. 'female, young adult, american accent')"),
|
|
313
|
+
speed: float = typer.Option(1.0, "--speed", "-s", min=0.25, max=4.0, help="Speech speed multiplier"),
|
|
314
|
+
response_format: str = typer.Option("wav", "--format", "-f", help="Output audio format: wav, mp3, flac, pcm, aac, opus"),
|
|
315
|
+
api_url: str = typer.Option("http://localhost:8000", "--api-url", envvar="VOCAL_API_URL", help="Vocal API URL"),
|
|
316
|
+
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show synthesis timing and output size"),
|
|
317
|
+
) -> None:
|
|
318
|
+
"""Synthesize text to speech"""
|
|
319
|
+
if models:
|
|
320
|
+
selected_model = _model_wizard(api_url, task="tts")
|
|
321
|
+
if selected_model is None:
|
|
322
|
+
raise typer.Exit(1)
|
|
323
|
+
model = selected_model
|
|
324
|
+
|
|
325
|
+
try:
|
|
326
|
+
fmt = TTSRequestResponseFormat(response_format.lower())
|
|
327
|
+
except ValueError:
|
|
328
|
+
valid = ", ".join(f.value for f in TTSRequestResponseFormat)
|
|
329
|
+
console.print(f"[red]Error:[/red] Invalid format '{response_format}'. Valid options: {valid}")
|
|
330
|
+
raise typer.Exit(1)
|
|
331
|
+
|
|
332
|
+
if model != "pyttsx3":
|
|
333
|
+
_check_model_ready(api_url, model)
|
|
334
|
+
|
|
335
|
+
try:
|
|
336
|
+
vc = _make_client(api_url)
|
|
337
|
+
t0 = time.monotonic()
|
|
338
|
+
|
|
339
|
+
body = TTSRequest(
|
|
340
|
+
model=model,
|
|
341
|
+
input_=text,
|
|
342
|
+
voice=voice if voice is not None else UNSET,
|
|
343
|
+
speed=speed,
|
|
344
|
+
response_format=fmt,
|
|
345
|
+
)
|
|
346
|
+
resp = text_to_speech_v1_audio_speech_post.sync_detailed(client=vc, body=body)
|
|
347
|
+
|
|
348
|
+
elapsed = time.monotonic() - t0
|
|
349
|
+
|
|
350
|
+
if resp.status_code != 200:
|
|
351
|
+
_clone_error(resp.content, resp.status_code)
|
|
352
|
+
raise typer.Exit(1)
|
|
353
|
+
|
|
354
|
+
audio_bytes = resp.content
|
|
355
|
+
_speak_output(audio_bytes, response_format, output, elapsed, verbose)
|
|
356
|
+
|
|
357
|
+
except typer.Exit:
|
|
358
|
+
raise
|
|
359
|
+
except UnexpectedStatus as e:
|
|
360
|
+
msg = _api_error_message(e)
|
|
361
|
+
console.print(f"[red]Error:[/red] {msg}")
|
|
362
|
+
if e.status_code == 503:
|
|
363
|
+
console.print("[dim]The model or a required package is not available on the server.[/dim]")
|
|
364
|
+
raise typer.Exit(1)
|
|
365
|
+
except Exception as e:
|
|
366
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
367
|
+
raise typer.Exit(1)
|
|
368
|
+
|
|
369
|
+
|
|
299
370
|
@app.command()
|
|
300
371
|
def clone(
|
|
301
372
|
text: str = typer.Argument(..., help="Text to synthesize in the cloned voice"),
|
|
@@ -898,7 +969,9 @@ def _print_output_devices_table() -> None:
|
|
|
898
969
|
@app.command()
|
|
899
970
|
def chat(
|
|
900
971
|
model: str = typer.Option(vocal_settings.STT_DEFAULT_MODEL, "--model", "-m", help="STT model to use"),
|
|
901
|
-
models: bool = typer.Option(False, "--models", help="Interactively select from downloaded STT models"),
|
|
972
|
+
models: bool = typer.Option(False, "--models", help="Interactively select from downloaded STT and TTS models"),
|
|
973
|
+
tts_model: str = typer.Option(vocal_settings.TTS_DEFAULT_MODEL, "--tts-model", help="TTS model to use for voice responses"),
|
|
974
|
+
tts_voice: str | None = typer.Option(None, "--tts-voice", help="TTS voice ID or instruction (model-specific)"),
|
|
902
975
|
language: str | None = typer.Option(None, "--language", "-l", help="Language code (e.g. 'en'). Auto-detected if omitted."),
|
|
903
976
|
device: str | None = typer.Option(None, "--device", "-d", help="Input device index or name (use --devices to pick interactively)"),
|
|
904
977
|
output_device: str | None = typer.Option(None, "--output-device", "-o", help="Output device index or name (run `vocal devices --output` to list)"),
|
|
@@ -923,6 +996,10 @@ def chat(
|
|
|
923
996
|
if selected_model is None:
|
|
924
997
|
raise typer.Exit(1)
|
|
925
998
|
model = selected_model
|
|
999
|
+
selected_tts = _model_wizard(api_url, task="tts")
|
|
1000
|
+
if selected_tts is None:
|
|
1001
|
+
raise typer.Exit(1)
|
|
1002
|
+
tts_model = selected_tts
|
|
926
1003
|
try:
|
|
927
1004
|
device_idx = _resolve_device(device)
|
|
928
1005
|
output_device_idx = _resolve_output_device(output_device)
|
|
@@ -936,11 +1013,11 @@ def chat(
|
|
|
936
1013
|
device_label = f"[dim]{active_device['name']}[/dim]"
|
|
937
1014
|
threshold_hint = f" vad=[cyan]{silence_threshold:.0f}[/cyan]" if silence_threshold is not None else ""
|
|
938
1015
|
|
|
939
|
-
console.print(f"[green]Voice chat started[/green]
|
|
1016
|
+
console.print(f"[green]Voice chat started[/green] stt=[cyan]{model}[/cyan] tts=[cyan]{tts_model}[/cyan] device={device_label}{threshold_hint} Ctrl+C to stop\n")
|
|
940
1017
|
console.print("[dim]Speak — I'll transcribe, think, and respond with audio.[/dim]\n")
|
|
941
1018
|
|
|
942
1019
|
try:
|
|
943
|
-
asyncio.run(_chat_async(ws_url, device_idx, output_device_idx, model, language, system_prompt, silence_threshold, verbose))
|
|
1020
|
+
asyncio.run(_chat_async(ws_url, device_idx, output_device_idx, model, language, system_prompt, silence_threshold, verbose, tts_model=tts_model, tts_voice=tts_voice))
|
|
944
1021
|
except KeyboardInterrupt:
|
|
945
1022
|
console.print("\n[yellow]Stopped.[/yellow]")
|
|
946
1023
|
except Exception as e:
|
|
@@ -948,6 +1025,25 @@ def chat(
|
|
|
948
1025
|
raise typer.Exit(1)
|
|
949
1026
|
|
|
950
1027
|
|
|
1028
|
+
def _speak_output(audio_bytes: bytes, response_format: str, output: Path | None, elapsed: float, verbose: bool) -> None:
|
|
1029
|
+
if output:
|
|
1030
|
+
output.write_bytes(audio_bytes)
|
|
1031
|
+
timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
|
|
1032
|
+
console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes -> [cyan]{output}[/cyan]{timing}")
|
|
1033
|
+
elif sys.stdout.isatty():
|
|
1034
|
+
if response_format == "wav":
|
|
1035
|
+
timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
|
|
1036
|
+
console.print(f"[green]Playing[/green] {len(audio_bytes):,} bytes{timing}")
|
|
1037
|
+
_play_wav_bytes(audio_bytes)
|
|
1038
|
+
else:
|
|
1039
|
+
console.print(f"[yellow]Tip:[/yellow] Use [cyan]--output file.{response_format}[/cyan] to save non-WAV audio, or omit [cyan]--format[/cyan] for auto-play.")
|
|
1040
|
+
sys.stdout.buffer.write(audio_bytes)
|
|
1041
|
+
else:
|
|
1042
|
+
if verbose:
|
|
1043
|
+
sys.stderr.write(f" {len(audio_bytes):,} bytes ({response_format}) in {elapsed:.1f}s\n")
|
|
1044
|
+
sys.stdout.buffer.write(audio_bytes)
|
|
1045
|
+
|
|
1046
|
+
|
|
951
1047
|
def _clone_error(content: bytes, status_code: int) -> None:
|
|
952
1048
|
try:
|
|
953
1049
|
detail = json.loads(content).get("detail", "")
|
|
@@ -963,7 +1059,7 @@ def _clone_output(audio_bytes: bytes, fmt: BodyVoiceCloneV1AudioClonePostRespons
|
|
|
963
1059
|
if output:
|
|
964
1060
|
output.write_bytes(audio_bytes)
|
|
965
1061
|
timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
|
|
966
|
-
console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes
|
|
1062
|
+
console.print(f"[green]Saved[/green] {len(audio_bytes):,} bytes -> [cyan]{output}[/cyan]{timing}")
|
|
967
1063
|
elif sys.stdout.isatty():
|
|
968
1064
|
if fmt == BodyVoiceCloneV1AudioClonePostResponseFormat.WAV:
|
|
969
1065
|
timing = f" [dim]{elapsed:.1f}s[/dim]" if verbose else ""
|
|
@@ -1080,16 +1176,20 @@ async def _chat_receiver(ws, output_device_idx: int | None, verbose: bool, loop:
|
|
|
1080
1176
|
console.print(f"\n[red]error:[/red] {event.get('error', {}).get('message', 'unknown')}")
|
|
1081
1177
|
|
|
1082
1178
|
|
|
1083
|
-
def _build_chat_session_cfg(model: str, language: str | None, system_prompt: str, vad_threshold: float | None) -> dict:
|
|
1179
|
+
def _build_chat_session_cfg(model: str, language: str | None, system_prompt: str, vad_threshold: float | None, tts_model: str | None = None, tts_voice: str | None = None) -> dict:
|
|
1084
1180
|
cfg: dict = {"type": "realtime", "model": model, "input_sample_rate": _SAMPLE_RATE, "system_prompt": system_prompt}
|
|
1085
1181
|
if language:
|
|
1086
1182
|
cfg["language"] = language
|
|
1087
1183
|
if vad_threshold is not None:
|
|
1088
1184
|
cfg["turn_detection"] = {"threshold": vad_threshold / 32768.0}
|
|
1185
|
+
if tts_model:
|
|
1186
|
+
cfg["tts_model"] = tts_model
|
|
1187
|
+
if tts_voice:
|
|
1188
|
+
cfg["tts_voice"] = tts_voice
|
|
1089
1189
|
return cfg
|
|
1090
1190
|
|
|
1091
1191
|
|
|
1092
|
-
async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: int | None, model: str, language: str | None, system_prompt: str, vad_threshold: float | None, verbose: bool) -> None:
|
|
1192
|
+
async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: int | None, model: str, language: str | None, system_prompt: str, vad_threshold: float | None, verbose: bool, tts_model: str | None = None, tts_voice: str | None = None) -> None:
|
|
1093
1193
|
audio_q: queue.SimpleQueue = queue.SimpleQueue()
|
|
1094
1194
|
loop = asyncio.get_running_loop()
|
|
1095
1195
|
stop_event = asyncio.Event()
|
|
@@ -1116,7 +1216,7 @@ async def _chat_async(ws_url: str, device_idx: int | None, output_device_idx: in
|
|
|
1116
1216
|
async with websockets.connect(f"{ws_url}/v1/realtime", open_timeout=10) as ws:
|
|
1117
1217
|
await asyncio.wait_for(ws.recv(), timeout=5.0)
|
|
1118
1218
|
|
|
1119
|
-
session_cfg = _build_chat_session_cfg(model, language, system_prompt, vad_threshold)
|
|
1219
|
+
session_cfg = _build_chat_session_cfg(model, language, system_prompt, vad_threshold, tts_model=tts_model, tts_voice=tts_voice)
|
|
1120
1220
|
await ws.send(json.dumps({"type": "session.update", "session": session_cfg}))
|
|
1121
1221
|
await asyncio.wait_for(ws.recv(), timeout=5.0)
|
|
1122
1222
|
|
|
File without changes
|
|
File without changes
|