abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2408 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/text_sanitize.py +33 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.2.dist-info/METADATA +213 -0
- abstractvoice-0.6.2.dist-info/RECORD +53 -0
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.2.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.2.dist-info/METADATA +0 -1458
- abstractvoice-0.5.2.dist-info/RECORD +0 -23
- abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/top_level.txt +0 -0
|
@@ -8,11 +8,18 @@ that interacts with an LLM API for text generation.
|
|
|
8
8
|
|
|
9
9
|
import argparse
|
|
10
10
|
import cmd
|
|
11
|
+
import atexit
|
|
11
12
|
import json
|
|
12
13
|
import re
|
|
14
|
+
import shlex
|
|
15
|
+
import shutil
|
|
13
16
|
import sys
|
|
17
|
+
import importlib.util
|
|
18
|
+
import threading
|
|
19
|
+
import time
|
|
14
20
|
import requests
|
|
15
21
|
from abstractvoice import VoiceManager
|
|
22
|
+
from abstractvoice.text_sanitize import sanitize_markdown_for_speech
|
|
16
23
|
|
|
17
24
|
|
|
18
25
|
# ANSI color codes
|
|
@@ -31,18 +38,34 @@ class VoiceREPL(cmd.Cmd):
|
|
|
31
38
|
"""Voice-enabled REPL for LLM interaction."""
|
|
32
39
|
|
|
33
40
|
intro = "" # Will be set in __init__ to include help
|
|
34
|
-
prompt =
|
|
41
|
+
prompt = "> "
|
|
35
42
|
|
|
36
43
|
# Override cmd module settings
|
|
37
44
|
ruler = "" # No horizontal rule line
|
|
38
45
|
use_rawinput = True
|
|
39
46
|
|
|
40
|
-
def __init__(
|
|
41
|
-
|
|
47
|
+
def __init__(
|
|
48
|
+
self,
|
|
49
|
+
api_url="http://localhost:11434/api/chat",
|
|
50
|
+
model="cogito:3b",
|
|
51
|
+
debug_mode=False,
|
|
52
|
+
verbose_mode: bool = False,
|
|
53
|
+
language="en",
|
|
54
|
+
tts_model=None,
|
|
55
|
+
voice_mode: str = "off",
|
|
56
|
+
disable_tts=False,
|
|
57
|
+
cloning_engine: str = "f5_tts",
|
|
58
|
+
):
|
|
42
59
|
super().__init__()
|
|
43
60
|
|
|
61
|
+
# Best-effort: enable proper line editing + history (Up/Down arrows).
|
|
62
|
+
# Some Python builds (notably when built without readline/libedit) will
|
|
63
|
+
# otherwise treat arrow keys as escape sequences and corrupt the prompt.
|
|
64
|
+
self._init_readline()
|
|
65
|
+
|
|
44
66
|
# Debug mode
|
|
45
67
|
self.debug_mode = debug_mode
|
|
68
|
+
self.verbose_mode = bool(verbose_mode)
|
|
46
69
|
|
|
47
70
|
# API settings
|
|
48
71
|
self.api_url = api_url
|
|
@@ -52,6 +75,8 @@ class VoiceREPL(cmd.Cmd):
|
|
|
52
75
|
|
|
53
76
|
# Language settings
|
|
54
77
|
self.current_language = language
|
|
78
|
+
self._initial_tts_model = tts_model
|
|
79
|
+
self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
|
|
55
80
|
|
|
56
81
|
# Initialize voice manager with language support
|
|
57
82
|
if disable_tts:
|
|
@@ -61,19 +86,36 @@ class VoiceREPL(cmd.Cmd):
|
|
|
61
86
|
self.voice_manager = VoiceManager(
|
|
62
87
|
language=language,
|
|
63
88
|
tts_model=tts_model,
|
|
64
|
-
debug_mode=debug_mode
|
|
89
|
+
debug_mode=debug_mode,
|
|
90
|
+
allow_downloads=False,
|
|
91
|
+
cloned_tts_streaming=False,
|
|
92
|
+
cloning_engine=self.cloning_engine,
|
|
65
93
|
)
|
|
94
|
+
|
|
95
|
+
# Current speaking voice:
|
|
96
|
+
# - None => Piper (default, language-driven)
|
|
97
|
+
# - str => cloned voice_id
|
|
98
|
+
self.current_tts_voice: str | None = None
|
|
99
|
+
|
|
100
|
+
# When reference_text is auto-generated via ASR ("asr" source), print a
|
|
101
|
+
# ready-to-copy `/clone_set_ref_text ...` hint once per voice for easy correction.
|
|
102
|
+
self._printed_asr_ref_text_hint: set[str] = set()
|
|
103
|
+
|
|
104
|
+
# Seed a default cloned voice (HAL9000) if samples are present.
|
|
105
|
+
self._seed_hal9000_voice()
|
|
66
106
|
|
|
67
107
|
# Settings
|
|
68
108
|
self.use_tts = True
|
|
69
|
-
|
|
109
|
+
# Voice input mode (mic). Default: OFF for fast startup + offline-first.
|
|
110
|
+
# Use `--voice-mode stop` (or `/voice stop`) to enable hands-free.
|
|
111
|
+
self.voice_mode = (voice_mode or "off").strip().lower() # off, full, wait, stop, ptt
|
|
70
112
|
self.voice_mode_active = False # Is voice recognition running?
|
|
113
|
+
self._ptt_session_active = False
|
|
114
|
+
self._ptt_recording = False
|
|
115
|
+
self._ptt_busy = False
|
|
71
116
|
|
|
72
117
|
# System prompt
|
|
73
|
-
self.system_prompt = ""
|
|
74
|
-
You are a Helpful Voice Assistant. By design, your answers are short and more conversational, unless specifically asked to detail something.
|
|
75
|
-
You only speak, so never use any text formatting or markdown. Write for a speaker.
|
|
76
|
-
"""
|
|
118
|
+
self.system_prompt = "You are a Helpful Voice Assistant. By design, your answers are short and conversational, unless specifically asked to detail something. You only speak, so never use any text formatting, hinting, *emotions*, emojis or markdown. Incarnate the speaker, never comment your instructions."
|
|
77
119
|
|
|
78
120
|
# Message history
|
|
79
121
|
self.messages = [{"role": "system", "content": self.system_prompt}]
|
|
@@ -82,27 +124,136 @@ class VoiceREPL(cmd.Cmd):
|
|
|
82
124
|
self.system_tokens = 0
|
|
83
125
|
self.user_tokens = 0
|
|
84
126
|
self.assistant_tokens = 0
|
|
127
|
+
# LLM token totals (best-effort, Ollama API `eval_count`).
|
|
128
|
+
self.total_llm_out_tokens = 0
|
|
129
|
+
# Word counting
|
|
130
|
+
self.system_words = 0
|
|
131
|
+
self.user_words = 0
|
|
132
|
+
self.assistant_words = 0
|
|
133
|
+
# Best-effort tokenizer cache (tiktoken optional).
|
|
134
|
+
self._tiktoken_encoding = None
|
|
135
|
+
self._tiktoken_unavailable = False
|
|
85
136
|
self._count_system_tokens()
|
|
86
|
-
|
|
137
|
+
self._count_system_words()
|
|
138
|
+
|
|
139
|
+
# Best-effort metrics captured from voice input paths.
|
|
140
|
+
self._pending_stt_metrics: dict | None = None
|
|
141
|
+
|
|
87
142
|
if self.debug_mode:
|
|
88
143
|
print(f"Initialized with API URL: {api_url}")
|
|
89
144
|
print(f"Using model: {model}")
|
|
90
|
-
|
|
145
|
+
|
|
146
|
+
# Optionally auto-start voice input (mic). Keep OFF by default to avoid
|
|
147
|
+
# loading STT models (slow) unless the user explicitly opts in.
|
|
148
|
+
if self.voice_manager and self.voice_mode and self.voice_mode != "off":
|
|
149
|
+
try:
|
|
150
|
+
self.do_voice(self.voice_mode)
|
|
151
|
+
except Exception:
|
|
152
|
+
# Never block REPL start.
|
|
153
|
+
self.voice_mode = "off"
|
|
154
|
+
self.voice_mode_active = False
|
|
155
|
+
|
|
91
156
|
# Set intro with help information
|
|
92
157
|
self.intro = self._get_intro()
|
|
158
|
+
|
|
159
|
+
def _init_readline(self) -> None:
|
|
160
|
+
"""Initialize readline history + make ANSI prompts safe (best-effort)."""
|
|
161
|
+
rl = None
|
|
162
|
+
try:
|
|
163
|
+
import readline as _readline # type: ignore
|
|
164
|
+
|
|
165
|
+
rl = _readline
|
|
166
|
+
except Exception:
|
|
167
|
+
# Windows users may have pyreadline3 installed.
|
|
168
|
+
try:
|
|
169
|
+
import pyreadline3 as _readline # type: ignore
|
|
170
|
+
|
|
171
|
+
rl = _readline
|
|
172
|
+
except Exception:
|
|
173
|
+
rl = None
|
|
174
|
+
|
|
175
|
+
if rl is None:
|
|
176
|
+
# Keep prompt simple and avoid ANSI; prevents strange cursor behavior
|
|
177
|
+
# when arrow keys emit escape codes in cooked terminals.
|
|
178
|
+
self.prompt = "> "
|
|
179
|
+
return
|
|
180
|
+
|
|
181
|
+
# Keep prompt plain when readline is enabled. ANSI prompts are fragile
|
|
182
|
+
# across readline/libedit builds and can corrupt redraw/history behavior.
|
|
183
|
+
self.prompt = "> "
|
|
184
|
+
|
|
185
|
+
# Persist history across sessions (best-effort).
|
|
186
|
+
try:
|
|
187
|
+
from pathlib import Path
|
|
188
|
+
|
|
189
|
+
try:
|
|
190
|
+
import appdirs
|
|
191
|
+
|
|
192
|
+
hist_dir = Path(appdirs.user_data_dir("abstractvoice"))
|
|
193
|
+
except Exception:
|
|
194
|
+
hist_dir = Path.home() / ".abstractvoice"
|
|
195
|
+
|
|
196
|
+
hist_dir.mkdir(parents=True, exist_ok=True)
|
|
197
|
+
hist_path = hist_dir / "repl_history"
|
|
198
|
+
|
|
199
|
+
try:
|
|
200
|
+
rl.read_history_file(str(hist_path))
|
|
201
|
+
except FileNotFoundError:
|
|
202
|
+
pass
|
|
203
|
+
except Exception:
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
try:
|
|
207
|
+
rl.set_history_length(2000)
|
|
208
|
+
except Exception:
|
|
209
|
+
pass
|
|
210
|
+
|
|
211
|
+
def _save_history():
|
|
212
|
+
try:
|
|
213
|
+
rl.write_history_file(str(hist_path))
|
|
214
|
+
except Exception:
|
|
215
|
+
pass
|
|
216
|
+
|
|
217
|
+
atexit.register(_save_history)
|
|
218
|
+
except Exception:
|
|
219
|
+
pass
|
|
220
|
+
|
|
221
|
+
# Ensure Up/Down arrows traverse history reliably across GNU readline and
|
|
222
|
+
# macOS libedit-backed readline. Some libedit defaults perform prefix
|
|
223
|
+
# search/completion, which can look like text is being appended.
|
|
224
|
+
try:
|
|
225
|
+
doc = getattr(rl, "__doc__", "") or ""
|
|
226
|
+
is_libedit = "libedit" in doc.lower()
|
|
227
|
+
if is_libedit:
|
|
228
|
+
# libedit syntax
|
|
229
|
+
rl.parse_and_bind("bind ^[[A ed-prev-history")
|
|
230
|
+
rl.parse_and_bind("bind ^[[B ed-next-history")
|
|
231
|
+
rl.parse_and_bind("bind ^[[OA ed-prev-history")
|
|
232
|
+
rl.parse_and_bind("bind ^[[OB ed-next-history")
|
|
233
|
+
else:
|
|
234
|
+
# GNU readline syntax
|
|
235
|
+
rl.parse_and_bind('"\\e[A": previous-history')
|
|
236
|
+
rl.parse_and_bind('"\\e[B": next-history')
|
|
237
|
+
rl.parse_and_bind('"\\eOA": previous-history')
|
|
238
|
+
rl.parse_and_bind('"\\eOB": next-history')
|
|
239
|
+
except Exception:
|
|
240
|
+
pass
|
|
93
241
|
|
|
94
242
|
def _get_intro(self):
|
|
95
243
|
"""Generate intro message with help."""
|
|
96
244
|
intro = f"\n{Colors.BOLD}Welcome to AbstractVoice CLI REPL{Colors.END}\n"
|
|
97
245
|
if self.voice_manager:
|
|
98
246
|
lang_name = self.voice_manager.get_language_name()
|
|
99
|
-
|
|
247
|
+
mic = (self.voice_mode or "off").upper()
|
|
248
|
+
intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name} | Mic: {mic} | Cloning: {self.cloning_engine}\n"
|
|
100
249
|
else:
|
|
101
250
|
intro += f"API: {self.api_url} | Model: {self.model} | Voice: Disabled\n"
|
|
102
251
|
intro += f"\n{Colors.CYAN}Quick Start:{Colors.END}\n"
|
|
103
252
|
intro += " • Type messages to chat with the LLM\n"
|
|
104
|
-
intro += " •
|
|
253
|
+
intro += " • Voice input (mic): off by default. Enable: /voice stop (or start with --voice-mode stop)\n"
|
|
254
|
+
intro += " • PTT: /voice ptt then SPACE to capture (ESC exits)\n"
|
|
105
255
|
intro += " • Use /language <lang> to switch voice language\n"
|
|
256
|
+
intro += " • Use /clones and /tts_voice to use cloned voices\n"
|
|
106
257
|
intro += " • Type /help for full command list\n"
|
|
107
258
|
intro += " • Type /exit or /q to quit\n"
|
|
108
259
|
return intro
|
|
@@ -110,6 +261,236 @@ class VoiceREPL(cmd.Cmd):
|
|
|
110
261
|
def _count_system_tokens(self):
|
|
111
262
|
"""Count tokens in the system prompt."""
|
|
112
263
|
self._count_tokens(self.system_prompt, "system")
|
|
264
|
+
|
|
265
|
+
def _count_system_words(self):
|
|
266
|
+
self.system_words = self._count_words(self.system_prompt)
|
|
267
|
+
|
|
268
|
+
def _count_words(self, text: str) -> int:
|
|
269
|
+
s = str(text or "").strip()
|
|
270
|
+
if not s:
|
|
271
|
+
return 0
|
|
272
|
+
# A "word" here is whitespace-delimited for simplicity across languages.
|
|
273
|
+
return len([w for w in re.split(r"\s+", s) if w])
|
|
274
|
+
|
|
275
|
+
def _get_tiktoken_encoding(self):
|
|
276
|
+
if getattr(self, "_tiktoken_unavailable", False):
|
|
277
|
+
return None
|
|
278
|
+
enc = getattr(self, "_tiktoken_encoding", None)
|
|
279
|
+
if enc is not None:
|
|
280
|
+
return enc
|
|
281
|
+
try:
|
|
282
|
+
import tiktoken
|
|
283
|
+
except ImportError:
|
|
284
|
+
self._tiktoken_unavailable = True
|
|
285
|
+
return None
|
|
286
|
+
|
|
287
|
+
try:
|
|
288
|
+
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
|
289
|
+
except Exception:
|
|
290
|
+
try:
|
|
291
|
+
enc = tiktoken.get_encoding("cl100k_base")
|
|
292
|
+
except Exception:
|
|
293
|
+
self._tiktoken_unavailable = True
|
|
294
|
+
return None
|
|
295
|
+
|
|
296
|
+
self._tiktoken_encoding = enc
|
|
297
|
+
return enc
|
|
298
|
+
|
|
299
|
+
def _fmt_s(self, seconds: float | None) -> str:
|
|
300
|
+
try:
|
|
301
|
+
if seconds is None:
|
|
302
|
+
return "--"
|
|
303
|
+
s = float(seconds)
|
|
304
|
+
if s < 0:
|
|
305
|
+
return "--"
|
|
306
|
+
# Keep it compact but readable.
|
|
307
|
+
if s < 10:
|
|
308
|
+
return f"{s:.2f}s"
|
|
309
|
+
return f"{s:.1f}s"
|
|
310
|
+
except Exception:
|
|
311
|
+
return "--"
|
|
312
|
+
|
|
313
|
+
def _fmt_num(self, x: float | None, *, digits: int = 2) -> str:
|
|
314
|
+
try:
|
|
315
|
+
if x is None:
|
|
316
|
+
return "--"
|
|
317
|
+
return f"{float(x):.{int(digits)}f}"
|
|
318
|
+
except Exception:
|
|
319
|
+
return "--"
|
|
320
|
+
|
|
321
|
+
def _fmt_wtok(self, words: int | None, tokens: int | None) -> str:
|
|
322
|
+
w = int(words) if isinstance(words, int) else (int(words) if words is not None else 0)
|
|
323
|
+
if isinstance(tokens, int):
|
|
324
|
+
return f"{w}w/{int(tokens)}tok"
|
|
325
|
+
return f"{w}w/--tok"
|
|
326
|
+
|
|
327
|
+
def _summarize_audio_source(self, source: str) -> tuple[int | None, float | None]:
|
|
328
|
+
"""Best-effort: return (file_count, total_seconds) for an audio source path."""
|
|
329
|
+
try:
|
|
330
|
+
from pathlib import Path
|
|
331
|
+
|
|
332
|
+
p = Path(str(source)).expanduser()
|
|
333
|
+
except Exception:
|
|
334
|
+
return None, None
|
|
335
|
+
|
|
336
|
+
try:
|
|
337
|
+
import soundfile as sf
|
|
338
|
+
except Exception:
|
|
339
|
+
return None, None
|
|
340
|
+
|
|
341
|
+
supported = {".wav", ".flac", ".ogg"}
|
|
342
|
+
files = []
|
|
343
|
+
try:
|
|
344
|
+
if p.is_file():
|
|
345
|
+
files = [p]
|
|
346
|
+
elif p.is_dir():
|
|
347
|
+
files = sorted([x for x in p.iterdir() if x.is_file() and x.suffix.lower() in supported])
|
|
348
|
+
else:
|
|
349
|
+
return None, None
|
|
350
|
+
except Exception:
|
|
351
|
+
return None, None
|
|
352
|
+
|
|
353
|
+
total_s = 0.0
|
|
354
|
+
max_files = 25
|
|
355
|
+
for fp in files[:max_files]:
|
|
356
|
+
try:
|
|
357
|
+
info = sf.info(str(fp))
|
|
358
|
+
d = float(getattr(info, "duration", 0.0) or 0.0)
|
|
359
|
+
if d > 0:
|
|
360
|
+
total_s += d
|
|
361
|
+
except Exception:
|
|
362
|
+
continue
|
|
363
|
+
|
|
364
|
+
# If there are too many files, the displayed duration is a lower bound.
|
|
365
|
+
return (int(len(files)) if files else 0), (float(total_s) if total_s > 0 else None)
|
|
366
|
+
|
|
367
|
+
def _print_verbose_turn_stats(self, turn: dict) -> None:
|
|
368
|
+
if not bool(getattr(self, "verbose_mode", False)):
|
|
369
|
+
return
|
|
370
|
+
if not isinstance(turn, dict):
|
|
371
|
+
return
|
|
372
|
+
|
|
373
|
+
stt = turn.get("stt") if isinstance(turn.get("stt"), dict) else None
|
|
374
|
+
llm = turn.get("llm") if isinstance(turn.get("llm"), dict) else {}
|
|
375
|
+
counts = turn.get("counts") if isinstance(turn.get("counts"), dict) else {}
|
|
376
|
+
tts = turn.get("tts") if isinstance(turn.get("tts"), dict) else None
|
|
377
|
+
|
|
378
|
+
in_w = counts.get("in_words")
|
|
379
|
+
out_w = counts.get("out_words")
|
|
380
|
+
in_t = counts.get("in_tokens")
|
|
381
|
+
out_t = counts.get("out_tokens")
|
|
382
|
+
|
|
383
|
+
llm_s = llm.get("s")
|
|
384
|
+
api = llm.get("api") if isinstance(llm.get("api"), dict) else {}
|
|
385
|
+
api_prompt_tok = api.get("prompt_eval_count") if isinstance(api.get("prompt_eval_count"), int) else None
|
|
386
|
+
api_out_tok = api.get("eval_count") if isinstance(api.get("eval_count"), int) else None
|
|
387
|
+
|
|
388
|
+
# Line 1: STT (if any) + LLM + in/out counts and written speed.
|
|
389
|
+
parts1 = []
|
|
390
|
+
if stt:
|
|
391
|
+
stt_s = stt.get("stt_s")
|
|
392
|
+
stt_a = stt.get("audio_s")
|
|
393
|
+
stt_rtf = stt.get("rtf")
|
|
394
|
+
stt_txt = f"STT {self._fmt_s(stt_s)}"
|
|
395
|
+
if stt_a:
|
|
396
|
+
stt_txt += f"(a{self._fmt_s(stt_a)})"
|
|
397
|
+
if stt_rtf is not None:
|
|
398
|
+
stt_txt += f" rtf{self._fmt_num(stt_rtf, digits=2)}"
|
|
399
|
+
parts1.append(stt_txt)
|
|
400
|
+
|
|
401
|
+
if llm_s is not None or api_prompt_tok is not None or api_out_tok is not None:
|
|
402
|
+
llm_txt = f"LLM {self._fmt_s(llm_s)}"
|
|
403
|
+
if api_prompt_tok is not None or api_out_tok is not None:
|
|
404
|
+
p = str(api_prompt_tok) if api_prompt_tok is not None else "--"
|
|
405
|
+
o = str(api_out_tok) if api_out_tok is not None else "--"
|
|
406
|
+
llm_txt += f" (api p{p} o{o})"
|
|
407
|
+
parts1.append(llm_txt)
|
|
408
|
+
|
|
409
|
+
in_txt = f"in {self._fmt_wtok(in_w, in_t)}"
|
|
410
|
+
out_txt = f"out {self._fmt_wtok(out_w, out_t)}"
|
|
411
|
+
|
|
412
|
+
wps_written = None
|
|
413
|
+
try:
|
|
414
|
+
if isinstance(out_w, int) and out_w > 0 and llm_s and float(llm_s) > 0:
|
|
415
|
+
wps_written = float(out_w) / float(llm_s)
|
|
416
|
+
except Exception:
|
|
417
|
+
wps_written = None
|
|
418
|
+
|
|
419
|
+
if wps_written is not None:
|
|
420
|
+
out_txt += f" ({self._fmt_num(wps_written, digits=1)}w/s)"
|
|
421
|
+
|
|
422
|
+
parts1.append(in_txt)
|
|
423
|
+
parts1.append(out_txt)
|
|
424
|
+
|
|
425
|
+
line1 = " | ".join(parts1)
|
|
426
|
+
|
|
427
|
+
# Line 2: TTS (if any) + spoken speed + totals.
|
|
428
|
+
parts2 = []
|
|
429
|
+
if self.voice_manager and self.use_tts:
|
|
430
|
+
if not tts:
|
|
431
|
+
parts2.append("TTS --")
|
|
432
|
+
else:
|
|
433
|
+
eng = str(tts.get("engine") or "").strip().lower()
|
|
434
|
+
if eng == "clone":
|
|
435
|
+
ce = tts.get("clone_engine")
|
|
436
|
+
label = f"clone[{ce}]" if ce else "clone"
|
|
437
|
+
elif eng:
|
|
438
|
+
label = eng
|
|
439
|
+
else:
|
|
440
|
+
label = "tts"
|
|
441
|
+
|
|
442
|
+
err = (tts.get("error") or "").strip()
|
|
443
|
+
if err:
|
|
444
|
+
# Keep single-line and short.
|
|
445
|
+
msg = " ".join(err.split())
|
|
446
|
+
if len(msg) > 120:
|
|
447
|
+
msg = msg[:120].rstrip() + "…"
|
|
448
|
+
parts2.append(f"TTS {label} ERR {msg}")
|
|
449
|
+
else:
|
|
450
|
+
synth_s = tts.get("synth_s")
|
|
451
|
+
audio_s = tts.get("audio_s")
|
|
452
|
+
rtf = tts.get("rtf")
|
|
453
|
+
tts_txt = f"TTS {label} {self._fmt_s(synth_s)}→{self._fmt_s(audio_s)}"
|
|
454
|
+
if rtf is not None:
|
|
455
|
+
tts_txt += f" rtf{self._fmt_num(rtf, digits=2)}"
|
|
456
|
+
|
|
457
|
+
# Extra clone streaming details when available.
|
|
458
|
+
if eng == "clone" and bool(tts.get("streaming")):
|
|
459
|
+
ttfb_s = tts.get("ttfb_s")
|
|
460
|
+
if ttfb_s is not None:
|
|
461
|
+
tts_txt += f" ttfb{self._fmt_s(ttfb_s)}"
|
|
462
|
+
ch = tts.get("chunks")
|
|
463
|
+
if isinstance(ch, int):
|
|
464
|
+
tts_txt += f" ch{ch}"
|
|
465
|
+
|
|
466
|
+
wps_spoken = None
|
|
467
|
+
try:
|
|
468
|
+
if isinstance(out_w, int) and out_w > 0 and audio_s and float(audio_s) > 0:
|
|
469
|
+
wps_spoken = float(out_w) / float(audio_s)
|
|
470
|
+
except Exception:
|
|
471
|
+
wps_spoken = None
|
|
472
|
+
if wps_spoken is not None:
|
|
473
|
+
tts_txt += f" ({self._fmt_num(wps_spoken, digits=1)}w/s)"
|
|
474
|
+
|
|
475
|
+
parts2.append(tts_txt)
|
|
476
|
+
else:
|
|
477
|
+
parts2.append("TTS off")
|
|
478
|
+
|
|
479
|
+
total_words = int(getattr(self, "system_words", 0) + getattr(self, "user_words", 0) + getattr(self, "assistant_words", 0))
|
|
480
|
+
total_tokens = None
|
|
481
|
+
if self._get_tiktoken_encoding() is not None:
|
|
482
|
+
total_tokens = int(getattr(self, "system_tokens", 0) + getattr(self, "user_tokens", 0) + getattr(self, "assistant_tokens", 0))
|
|
483
|
+
|
|
484
|
+
tot_txt = f"tot {self._fmt_wtok(total_words, total_tokens)}"
|
|
485
|
+
if isinstance(getattr(self, "total_llm_out_tokens", None), int) and getattr(self, "total_llm_out_tokens") > 0:
|
|
486
|
+
tot_txt += f" (api out {int(getattr(self, 'total_llm_out_tokens'))}tok)"
|
|
487
|
+
parts2.append(tot_txt)
|
|
488
|
+
|
|
489
|
+
line2 = " | ".join(parts2)
|
|
490
|
+
|
|
491
|
+
# Keep it readable; two lines max.
|
|
492
|
+
print(f"{Colors.YELLOW}{line1}{Colors.END}")
|
|
493
|
+
print(f"{Colors.YELLOW}{line2}{Colors.END}")
|
|
113
494
|
|
|
114
495
|
def parseline(self, line):
|
|
115
496
|
"""Parse the line to extract command and arguments.
|
|
@@ -117,14 +498,11 @@ class VoiceREPL(cmd.Cmd):
|
|
|
117
498
|
Override to handle / prefix for commands. This ensures /voice, /help, etc.
|
|
118
499
|
are recognized as commands by stripping the leading / before parsing.
|
|
119
500
|
"""
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
# Call parent parseline to do the actual parsing
|
|
127
|
-
return super().parseline(line)
|
|
501
|
+
# Commands still use leading "/". In PTT mode we don't accept typed input.
|
|
502
|
+
s = line.strip()
|
|
503
|
+
if s.startswith("/"):
|
|
504
|
+
return super().parseline(s[1:].strip())
|
|
505
|
+
return super().parseline(line.strip())
|
|
128
506
|
|
|
129
507
|
def default(self, line):
|
|
130
508
|
"""Handle regular text input.
|
|
@@ -133,29 +511,123 @@ class VoiceREPL(cmd.Cmd):
|
|
|
133
511
|
All other commands MUST use / prefix.
|
|
134
512
|
"""
|
|
135
513
|
# Skip empty lines
|
|
136
|
-
|
|
514
|
+
text = line.strip()
|
|
515
|
+
if not text:
|
|
137
516
|
return
|
|
138
517
|
|
|
139
|
-
#
|
|
140
|
-
if
|
|
141
|
-
|
|
142
|
-
|
|
518
|
+
# In PTT mode we do not accept typed input.
|
|
519
|
+
if self.voice_mode == "ptt":
|
|
520
|
+
print("PTT mode: press SPACE to speak, ESC to exit.")
|
|
521
|
+
return
|
|
522
|
+
|
|
143
523
|
# Check if in voice mode - don't send to LLM
|
|
144
524
|
if self.voice_mode_active:
|
|
145
525
|
if self.debug_mode:
|
|
146
|
-
print(f"Voice mode active ({self.voice_mode}). Use /voice off
|
|
526
|
+
print(f"Voice mode active ({self.voice_mode}). Use /voice off to disable.")
|
|
527
|
+
return
|
|
528
|
+
|
|
529
|
+
# Interrupt any ongoing TTS playback immediately when the user types.
|
|
530
|
+
# This is the expected “barge-in by typing” UX for a REPL.
|
|
531
|
+
try:
|
|
532
|
+
if self.voice_manager:
|
|
533
|
+
self.voice_manager.stop_speaking()
|
|
534
|
+
except Exception:
|
|
535
|
+
pass
|
|
536
|
+
|
|
537
|
+
# Shortcut: paste a reference audio path to clone+use a voice.
|
|
538
|
+
# Examples:
|
|
539
|
+
# audio_samples/hal9000/hal9000_hello.wav
|
|
540
|
+
# audio_samples/hal9000/hal9000_hello.wav | Hello, Dave.
|
|
541
|
+
if self._maybe_handle_clone_shortcut(text):
|
|
147
542
|
return
|
|
148
543
|
|
|
149
544
|
# Everything else goes to LLM
|
|
150
|
-
self.
|
|
545
|
+
self._pending_stt_metrics = None
|
|
546
|
+
self.process_query(text)
|
|
547
|
+
|
|
548
|
+
# NOTE: PTT is implemented as a dedicated key-loop session (no typing).
|
|
549
|
+
|
|
550
|
+
def _maybe_handle_clone_shortcut(self, text: str) -> bool:
|
|
551
|
+
"""Best-effort: treat a pasted WAV/FLAC/OGG path as `/clone_use`."""
|
|
552
|
+
if not self.voice_manager:
|
|
553
|
+
return False
|
|
554
|
+
|
|
555
|
+
raw = (text or "").strip()
|
|
556
|
+
if not raw:
|
|
557
|
+
return False
|
|
558
|
+
if raw.startswith("/"):
|
|
559
|
+
return False
|
|
560
|
+
|
|
561
|
+
# Optional transcript with a simple pipe syntax:
|
|
562
|
+
# path.wav | Hello.
|
|
563
|
+
left, sep, right = raw.partition("|")
|
|
564
|
+
path_str = left.strip()
|
|
565
|
+
ref_text = right.strip() if sep else ""
|
|
566
|
+
reference_text = ref_text or None
|
|
567
|
+
|
|
568
|
+
# Strip naive wrapping quotes.
|
|
569
|
+
if (path_str.startswith('"') and path_str.endswith('"')) or (path_str.startswith("'") and path_str.endswith("'")):
|
|
570
|
+
path_str = path_str[1:-1].strip()
|
|
571
|
+
|
|
572
|
+
try:
|
|
573
|
+
from pathlib import Path
|
|
574
|
+
|
|
575
|
+
p = Path(path_str).expanduser()
|
|
576
|
+
except Exception:
|
|
577
|
+
return False
|
|
578
|
+
|
|
579
|
+
if not p.exists():
|
|
580
|
+
return False
|
|
581
|
+
|
|
582
|
+
exts = {".wav", ".flac", ".ogg"}
|
|
583
|
+
if p.is_file() and p.suffix.lower() not in exts:
|
|
584
|
+
return False
|
|
585
|
+
if p.is_dir():
|
|
586
|
+
try:
|
|
587
|
+
has_audio = any(x.is_file() and x.suffix.lower() in exts for x in p.iterdir())
|
|
588
|
+
except Exception:
|
|
589
|
+
has_audio = False
|
|
590
|
+
if not has_audio:
|
|
591
|
+
return False
|
|
592
|
+
|
|
593
|
+
# Build a `/clone_use` call with a stable name.
|
|
594
|
+
import shlex as _shlex
|
|
595
|
+
|
|
596
|
+
default_name = p.stem if p.is_file() else p.name
|
|
597
|
+
args = f"{_shlex.quote(str(p))} {_shlex.quote(default_name)}"
|
|
598
|
+
if reference_text:
|
|
599
|
+
args += f" --text {_shlex.quote(reference_text)}"
|
|
600
|
+
try:
|
|
601
|
+
self.do_clone_use(args)
|
|
602
|
+
except Exception as e:
|
|
603
|
+
print(f"❌ Clone shortcut failed: {e}")
|
|
604
|
+
if self.debug_mode:
|
|
605
|
+
import traceback
|
|
606
|
+
|
|
607
|
+
traceback.print_exc()
|
|
608
|
+
return True
|
|
151
609
|
|
|
152
610
|
def process_query(self, query):
|
|
153
611
|
"""Process a query and get a response from the LLM."""
|
|
154
612
|
if not query:
|
|
155
613
|
return
|
|
614
|
+
|
|
615
|
+
# Consume any pending STT metrics for this turn (voice/PTT input).
|
|
616
|
+
stt_metrics = getattr(self, "_pending_stt_metrics", None)
|
|
617
|
+
self._pending_stt_metrics = None
|
|
618
|
+
|
|
619
|
+
# If audio is currently playing, stop it so the new request can be handled
|
|
620
|
+
# without overlapping speech.
|
|
621
|
+
try:
|
|
622
|
+
if self.voice_manager:
|
|
623
|
+
self.voice_manager.stop_speaking()
|
|
624
|
+
except Exception:
|
|
625
|
+
pass
|
|
156
626
|
|
|
157
|
-
#
|
|
158
|
-
self.
|
|
627
|
+
# Per-turn counts
|
|
628
|
+
user_words = self._count_words(query)
|
|
629
|
+
self.user_words += int(user_words)
|
|
630
|
+
user_tokens = self._count_tokens(query, "user")
|
|
159
631
|
|
|
160
632
|
# Create the message
|
|
161
633
|
user_message = {"role": "user", "content": query}
|
|
@@ -175,6 +647,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
175
647
|
}
|
|
176
648
|
|
|
177
649
|
# Make API request
|
|
650
|
+
llm_t0 = time.monotonic()
|
|
178
651
|
response = requests.post(self.api_url, json=payload)
|
|
179
652
|
response.raise_for_status()
|
|
180
653
|
|
|
@@ -182,6 +655,22 @@ class VoiceREPL(cmd.Cmd):
|
|
|
182
655
|
try:
|
|
183
656
|
# First, try to parse as JSON
|
|
184
657
|
response_data = response.json()
|
|
658
|
+
api_llm_metrics = {}
|
|
659
|
+
try:
|
|
660
|
+
# Ollama exposes timing + token counts (nanoseconds).
|
|
661
|
+
# Keep best-effort: if fields are missing, we just omit them.
|
|
662
|
+
for k in (
|
|
663
|
+
"total_duration",
|
|
664
|
+
"load_duration",
|
|
665
|
+
"prompt_eval_count",
|
|
666
|
+
"prompt_eval_duration",
|
|
667
|
+
"eval_count",
|
|
668
|
+
"eval_duration",
|
|
669
|
+
):
|
|
670
|
+
if k in response_data:
|
|
671
|
+
api_llm_metrics[k] = response_data.get(k)
|
|
672
|
+
except Exception:
|
|
673
|
+
api_llm_metrics = {}
|
|
185
674
|
|
|
186
675
|
# Check for different API formats
|
|
187
676
|
if "message" in response_data and "content" in response_data["message"]:
|
|
@@ -200,6 +689,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
200
689
|
|
|
201
690
|
# Handle streaming or non-JSON response
|
|
202
691
|
response_text = response.text.strip()
|
|
692
|
+
api_llm_metrics = {}
|
|
203
693
|
|
|
204
694
|
# Try to extract content from streaming format if possible
|
|
205
695
|
if response_text.startswith("{") and "content" in response_text:
|
|
@@ -228,9 +718,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
228
718
|
except Exception as e:
|
|
229
719
|
if self.debug_mode:
|
|
230
720
|
print(f"Error extracting content from streaming response: {e}")
|
|
721
|
+
llm_t1 = time.monotonic()
|
|
722
|
+
llm_s = float(llm_t1 - llm_t0)
|
|
231
723
|
|
|
232
|
-
#
|
|
233
|
-
self.
|
|
724
|
+
# Per-turn counts
|
|
725
|
+
assistant_words = self._count_words(response_text)
|
|
726
|
+
self.assistant_words += int(assistant_words)
|
|
727
|
+
assistant_tokens = self._count_tokens(response_text, "assistant")
|
|
234
728
|
|
|
235
729
|
# Add to message history
|
|
236
730
|
self.messages.append({"role": "assistant", "content": response_text})
|
|
@@ -238,9 +732,61 @@ class VoiceREPL(cmd.Cmd):
|
|
|
238
732
|
# Display the response with color
|
|
239
733
|
print(f"{Colors.CYAN}{response_text}{Colors.END}")
|
|
240
734
|
|
|
735
|
+
# Record last-turn stats (best-effort; printed only in verbose mode).
|
|
736
|
+
self._last_turn_metrics = {
|
|
737
|
+
"stt": stt_metrics,
|
|
738
|
+
"llm": {
|
|
739
|
+
"s": llm_s,
|
|
740
|
+
"api": api_llm_metrics,
|
|
741
|
+
},
|
|
742
|
+
"counts": {
|
|
743
|
+
"in_words": int(user_words),
|
|
744
|
+
"out_words": int(assistant_words),
|
|
745
|
+
"in_tokens": int(user_tokens) if isinstance(user_tokens, int) else None,
|
|
746
|
+
"out_tokens": int(assistant_tokens) if isinstance(assistant_tokens, int) else None,
|
|
747
|
+
},
|
|
748
|
+
}
|
|
749
|
+
try:
|
|
750
|
+
out_tok = api_llm_metrics.get("eval_count") if isinstance(api_llm_metrics, dict) else None
|
|
751
|
+
if isinstance(out_tok, int) and out_tok >= 0:
|
|
752
|
+
self.total_llm_out_tokens += int(out_tok)
|
|
753
|
+
except Exception:
|
|
754
|
+
pass
|
|
755
|
+
|
|
241
756
|
# Speak the response if voice manager is available
|
|
242
757
|
if self.voice_manager and self.use_tts:
|
|
243
|
-
|
|
758
|
+
try:
|
|
759
|
+
# UX guard: never trigger big cloning downloads during normal chat.
|
|
760
|
+
if self.current_tts_voice and not self._is_cloning_runtime_ready(voice_id=self.current_tts_voice):
|
|
761
|
+
print(
|
|
762
|
+
"ℹ️ Cloned voice selected but cloning runtime is not ready.\n"
|
|
763
|
+
" Run /cloning_status then /cloning_download, or switch back with /tts_voice piper."
|
|
764
|
+
)
|
|
765
|
+
else:
|
|
766
|
+
self._speak_with_spinner_until_audio_starts(response_text)
|
|
767
|
+
except Exception as e:
|
|
768
|
+
print(f"❌ TTS failed: {e}")
|
|
769
|
+
|
|
770
|
+
# Capture best-effort TTS metrics (Piper or cloned).
|
|
771
|
+
tts_metrics = None
|
|
772
|
+
try:
|
|
773
|
+
if self.voice_manager and hasattr(self.voice_manager, "pop_last_tts_metrics"):
|
|
774
|
+
tts_metrics = self.voice_manager.pop_last_tts_metrics()
|
|
775
|
+
except Exception:
|
|
776
|
+
tts_metrics = None
|
|
777
|
+
|
|
778
|
+
try:
|
|
779
|
+
if isinstance(getattr(self, "_last_turn_metrics", None), dict):
|
|
780
|
+
self._last_turn_metrics["tts"] = tts_metrics
|
|
781
|
+
except Exception:
|
|
782
|
+
pass
|
|
783
|
+
|
|
784
|
+
# Verbose stats (max 2 lines).
|
|
785
|
+
try:
|
|
786
|
+
if self.verbose_mode and isinstance(getattr(self, "_last_turn_metrics", None), dict):
|
|
787
|
+
self._print_verbose_turn_stats(self._last_turn_metrics)
|
|
788
|
+
except Exception:
|
|
789
|
+
pass
|
|
244
790
|
|
|
245
791
|
except requests.exceptions.ConnectionError as e:
|
|
246
792
|
print(f"❌ Cannot connect to Ollama API at {self.api_url}")
|
|
@@ -274,37 +820,29 @@ class VoiceREPL(cmd.Cmd):
|
|
|
274
820
|
|
|
275
821
|
def _count_tokens(self, text, role):
|
|
276
822
|
"""Count tokens in text."""
|
|
823
|
+
encoding = self._get_tiktoken_encoding()
|
|
824
|
+
if encoding is None:
|
|
825
|
+
return None
|
|
277
826
|
try:
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
# Initialize the tokenizer
|
|
281
|
-
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
|
282
|
-
|
|
283
|
-
# Count tokens
|
|
284
|
-
token_count = len(encoding.encode(text))
|
|
285
|
-
|
|
286
|
-
# Update the token counts based on role
|
|
287
|
-
if role == "system":
|
|
288
|
-
self.system_tokens = token_count
|
|
289
|
-
elif role == "user":
|
|
290
|
-
self.user_tokens += token_count
|
|
291
|
-
elif role == "assistant":
|
|
292
|
-
self.assistant_tokens += token_count
|
|
293
|
-
|
|
294
|
-
# Calculate total tokens
|
|
295
|
-
total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
|
|
296
|
-
|
|
297
|
-
if self.debug_mode:
|
|
298
|
-
print(f"{role.capitalize()} tokens: {token_count}")
|
|
299
|
-
print(f"Total tokens: {total_tokens}")
|
|
300
|
-
|
|
301
|
-
except ImportError:
|
|
302
|
-
# If tiktoken is not available, just don't count tokens
|
|
303
|
-
pass
|
|
827
|
+
token_count = len(encoding.encode(str(text or "")))
|
|
304
828
|
except Exception as e:
|
|
305
829
|
if self.debug_mode:
|
|
306
830
|
print(f"Error counting tokens: {e}")
|
|
307
|
-
|
|
831
|
+
return None
|
|
832
|
+
|
|
833
|
+
# Update the token counts based on role
|
|
834
|
+
if role == "system":
|
|
835
|
+
self.system_tokens = int(token_count)
|
|
836
|
+
elif role == "user":
|
|
837
|
+
self.user_tokens += int(token_count)
|
|
838
|
+
elif role == "assistant":
|
|
839
|
+
self.assistant_tokens += int(token_count)
|
|
840
|
+
|
|
841
|
+
if self.debug_mode:
|
|
842
|
+
total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
|
|
843
|
+
print(f"{role.capitalize()} tokens: {token_count}")
|
|
844
|
+
print(f"Total tokens: {total_tokens}")
|
|
845
|
+
return int(token_count)
|
|
308
846
|
|
|
309
847
|
def _clean_response(self, text):
|
|
310
848
|
"""Clean LLM response text."""
|
|
@@ -323,8 +861,12 @@ class VoiceREPL(cmd.Cmd):
|
|
|
323
861
|
"""Switch voice language.
|
|
324
862
|
|
|
325
863
|
Usage: /language <lang>
|
|
326
|
-
Available languages: en, fr, es, de,
|
|
864
|
+
Available languages: en, fr, es, de, ru, zh
|
|
327
865
|
"""
|
|
866
|
+
if not self.voice_manager:
|
|
867
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
868
|
+
return
|
|
869
|
+
|
|
328
870
|
if not args:
|
|
329
871
|
current_name = self.voice_manager.get_language_name()
|
|
330
872
|
current_code = self.voice_manager.get_language()
|
|
@@ -359,10 +901,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
359
901
|
'fr': "Langue changée en français.",
|
|
360
902
|
'es': "Idioma cambiado a español.",
|
|
361
903
|
'de': "Sprache auf Deutsch umgestellt.",
|
|
362
|
-
'
|
|
904
|
+
'ru': "Язык переключен на русский.",
|
|
905
|
+
'zh': "语言已切换到中文。"
|
|
363
906
|
}
|
|
364
907
|
test_msg = test_messages.get(language, "Language switched.")
|
|
365
|
-
|
|
908
|
+
# Respect TTS toggle: if the user disabled TTS, don't speak test messages.
|
|
909
|
+
if getattr(self, "use_tts", True):
|
|
910
|
+
self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
|
|
366
911
|
|
|
367
912
|
# Restart voice mode if it was active
|
|
368
913
|
if was_active:
|
|
@@ -383,10 +928,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
383
928
|
/setvoice <voice_id> # Set voice (format: language.voice_id)
|
|
384
929
|
|
|
385
930
|
Examples:
|
|
386
|
-
/setvoice # List all voices
|
|
387
|
-
/setvoice fr.
|
|
388
|
-
/setvoice it.mai_male_vits # Set Italian male VITS voice
|
|
931
|
+
/setvoice # List all Piper voices
|
|
932
|
+
/setvoice fr.siwis # Switch to French (voice id is best-effort)
|
|
389
933
|
"""
|
|
934
|
+
if not self.voice_manager:
|
|
935
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
936
|
+
return
|
|
937
|
+
|
|
390
938
|
if not args:
|
|
391
939
|
# Show all available voices with metadata
|
|
392
940
|
print(f"\n{Colors.CYAN}Available Voice Models:{Colors.END}")
|
|
@@ -398,7 +946,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
398
946
|
# Get language name
|
|
399
947
|
lang_names = {
|
|
400
948
|
'en': 'English', 'fr': 'French', 'es': 'Spanish',
|
|
401
|
-
'de': 'German', '
|
|
949
|
+
'de': 'German', 'ru': 'Russian', 'zh': 'Chinese'
|
|
402
950
|
}
|
|
403
951
|
lang_name = lang_names.get(language, language.upper())
|
|
404
952
|
|
|
@@ -406,24 +954,22 @@ class VoiceREPL(cmd.Cmd):
|
|
|
406
954
|
|
|
407
955
|
for voice_id, voice_info in voices.items():
|
|
408
956
|
cached_icon = "✅" if voice_info.get('cached', False) else "📥"
|
|
409
|
-
quality_icon = "
|
|
410
|
-
size_text = f"{voice_info
|
|
957
|
+
quality_icon = "🔧"
|
|
958
|
+
size_text = f"{voice_info.get('size_mb', 0)}MB"
|
|
411
959
|
|
|
412
960
|
print(f" {cached_icon} {quality_icon} {language}.{voice_id}")
|
|
413
961
|
print(f" {voice_info['name']} ({size_text})")
|
|
414
962
|
print(f" {voice_info['description']}")
|
|
415
|
-
|
|
416
|
-
print(f" ⚠️ Requires espeak-ng")
|
|
963
|
+
# Piper has no system deps.
|
|
417
964
|
|
|
418
965
|
print(f"\n{Colors.YELLOW}Usage:{Colors.END}")
|
|
419
966
|
print(" /setvoice <language>.<voice_id>")
|
|
420
|
-
print(" Example: /setvoice fr.
|
|
421
|
-
print("\n📥 = Download needed ✅ = Ready
|
|
967
|
+
print(" Example: /setvoice fr.siwis")
|
|
968
|
+
print("\n📥 = Download needed ✅ = Ready")
|
|
422
969
|
|
|
423
970
|
except Exception as e:
|
|
424
971
|
print(f"❌ Error listing models: {e}")
|
|
425
|
-
|
|
426
|
-
self.voice_manager.list_voices()
|
|
972
|
+
print(" (No fallback available)")
|
|
427
973
|
return
|
|
428
974
|
|
|
429
975
|
voice_spec = args.strip()
|
|
@@ -451,39 +997,28 @@ class VoiceREPL(cmd.Cmd):
|
|
|
451
997
|
# Download and set the specific voice using programmatic API
|
|
452
998
|
try:
|
|
453
999
|
print(f"🔄 Setting voice {voice_spec}...")
|
|
454
|
-
|
|
455
|
-
# Use the programmatic download API
|
|
456
|
-
success = self.voice_manager.download_model(voice_spec)
|
|
1000
|
+
success = self.voice_manager.set_voice(language, voice_id)
|
|
457
1001
|
|
|
458
1002
|
if success:
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
if success:
|
|
463
|
-
# Update current language
|
|
464
|
-
self.current_language = language
|
|
465
|
-
|
|
466
|
-
print(f"✅ Voice set to {voice_spec}")
|
|
467
|
-
|
|
468
|
-
# Test the voice
|
|
469
|
-
test_messages = {
|
|
470
|
-
'en': 'Voice changed to English.',
|
|
471
|
-
'fr': 'Voix changée en français.',
|
|
472
|
-
'es': 'Voz cambiada al español.',
|
|
473
|
-
'de': 'Stimme auf Deutsch geändert.',
|
|
474
|
-
'it': 'Voce cambiata in italiano.'
|
|
475
|
-
}
|
|
476
|
-
test_msg = test_messages.get(language, f'Voice changed to {language}.')
|
|
477
|
-
self.voice_manager.speak(test_msg)
|
|
1003
|
+
self.current_language = language
|
|
1004
|
+
print(f"✅ Voice set to {voice_spec}")
|
|
478
1005
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
1006
|
+
test_messages = {
|
|
1007
|
+
'en': 'Voice changed to English.',
|
|
1008
|
+
'fr': 'Voix changée en français.',
|
|
1009
|
+
'es': 'Voz cambiada al español.',
|
|
1010
|
+
'de': 'Stimme auf Deutsch geändert.',
|
|
1011
|
+
'ru': 'Голос изменён на русский.',
|
|
1012
|
+
'zh': '语音已切换到中文。'
|
|
1013
|
+
}
|
|
1014
|
+
test_msg = test_messages.get(language, f'Voice changed to {language}.')
|
|
1015
|
+
if getattr(self, "use_tts", True):
|
|
1016
|
+
self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
|
|
1017
|
+
|
|
1018
|
+
if was_active:
|
|
1019
|
+
self.do_voice(self.voice_mode)
|
|
484
1020
|
else:
|
|
485
|
-
print(f"❌ Failed to
|
|
486
|
-
print(" Check your internet connection or try a different voice")
|
|
1021
|
+
print(f"❌ Failed to set voice: {voice_spec}")
|
|
487
1022
|
|
|
488
1023
|
except Exception as e:
|
|
489
1024
|
print(f"❌ Error setting voice: {e}")
|
|
@@ -521,185 +1056,1736 @@ class VoiceREPL(cmd.Cmd):
|
|
|
521
1056
|
off - Disable voice input
|
|
522
1057
|
full - Continuous listening, interrupts TTS on speech detection
|
|
523
1058
|
wait - Pause listening while TTS is speaking (recommended)
|
|
524
|
-
stop -
|
|
525
|
-
ptt - Push-to-talk
|
|
1059
|
+
stop - Keep listening while speaking, but only stop TTS on stop phrase
|
|
1060
|
+
ptt - Push-to-talk (use /ptt to record one utterance)
|
|
526
1061
|
"""
|
|
527
|
-
arg = arg.lower().strip()
|
|
1062
|
+
arg = (arg or "").lower().strip()
|
|
528
1063
|
|
|
529
1064
|
# Handle legacy "on" argument
|
|
530
1065
|
if arg == "on":
|
|
531
1066
|
arg = "wait"
|
|
532
1067
|
|
|
533
1068
|
if arg in ["off", "full", "wait", "stop", "ptt"]:
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
1069
|
+
if not self.voice_manager:
|
|
1070
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
1071
|
+
return
|
|
1072
|
+
|
|
1073
|
+
# Exit PTT session if running.
|
|
1074
|
+
if self._ptt_session_active:
|
|
1075
|
+
self._ptt_session_active = False
|
|
1076
|
+
self._ptt_recording = False
|
|
1077
|
+
self._ptt_busy = False
|
|
1078
|
+
|
|
1079
|
+
# Stop any ongoing mic session.
|
|
1080
|
+
try:
|
|
1081
|
+
self.voice_manager.stop_listening()
|
|
1082
|
+
except Exception:
|
|
1083
|
+
pass
|
|
1084
|
+
self.voice_mode_active = False
|
|
1085
|
+
|
|
538
1086
|
self.voice_mode = arg
|
|
539
1087
|
self.voice_manager.set_voice_mode(arg)
|
|
540
|
-
|
|
1088
|
+
|
|
541
1089
|
if arg == "off":
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
1090
|
+
print("Voice mode disabled.")
|
|
1091
|
+
return
|
|
1092
|
+
|
|
1093
|
+
if arg == "ptt":
|
|
1094
|
+
# PTT is a dedicated session: no text entry.
|
|
1095
|
+
print("Voice mode: PTT - Push-to-talk (no typing).")
|
|
1096
|
+
print("SPACE: start/stop recording (transcribe on stop)")
|
|
1097
|
+
print("ESC: exit PTT mode")
|
|
1098
|
+
self._run_ptt_session()
|
|
1099
|
+
return
|
|
1100
|
+
|
|
1101
|
+
# Continuous listening modes.
|
|
1102
|
+
try:
|
|
549
1103
|
self.voice_manager.listen(
|
|
550
1104
|
on_transcription=self._voice_callback,
|
|
551
|
-
|
|
1105
|
+
# Stop phrase interrupts TTS; keep listening.
|
|
1106
|
+
on_stop=lambda: (
|
|
1107
|
+
print("\n⏹️ Stopped speaking.\n") if (self.voice_manager and self.voice_manager.is_speaking()) else None
|
|
1108
|
+
),
|
|
552
1109
|
)
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
1110
|
+
self.voice_mode_active = True
|
|
1111
|
+
except Exception as e:
|
|
1112
|
+
self.voice_mode_active = False
|
|
1113
|
+
self.voice_mode = "off"
|
|
1114
|
+
print(f"❌ Failed to start microphone listening: {e}")
|
|
1115
|
+
print(" Tip: check microphone permissions/device availability.")
|
|
1116
|
+
return
|
|
1117
|
+
|
|
1118
|
+
if arg == "wait":
|
|
1119
|
+
print("Voice mode: WAIT - Listens continuously except while speaking.")
|
|
1120
|
+
print("Use /voice off to disable.")
|
|
1121
|
+
elif arg == "stop":
|
|
1122
|
+
print("Voice mode: STOP - Always listens; stop phrase stops TTS.")
|
|
1123
|
+
print("Use /voice off to disable.")
|
|
1124
|
+
elif arg == "full":
|
|
1125
|
+
print("Voice mode: FULL - Interrupts TTS on any speech (best with AEC/headset).")
|
|
1126
|
+
print("Use /voice off to disable.")
|
|
567
1127
|
else:
|
|
568
1128
|
print("Usage: /voice off | full | wait | stop | ptt")
|
|
569
1129
|
print(" off - Disable voice input")
|
|
570
1130
|
print(" full - Continuous listening, interrupts TTS on speech")
|
|
571
|
-
print(" wait -
|
|
572
|
-
print(" stop -
|
|
573
|
-
print(" ptt - Push-to-talk
|
|
574
|
-
|
|
575
|
-
def
|
|
576
|
-
"""
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
if
|
|
582
|
-
|
|
583
|
-
# Don't process "stop" as a query
|
|
1131
|
+
print(" wait - Listen except while speaking")
|
|
1132
|
+
print(" stop - Always listen; stop phrase stops TTS")
|
|
1133
|
+
print(" ptt - Push-to-talk (no typing; SPACE triggers capture)")
|
|
1134
|
+
|
|
1135
|
+
def do_ptt(self, arg):
|
|
1136
|
+
"""Push-to-talk: record a single utterance, then process it.
|
|
1137
|
+
|
|
1138
|
+
Usage:
|
|
1139
|
+
/ptt
|
|
1140
|
+
"""
|
|
1141
|
+
if not self.voice_manager:
|
|
1142
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
584
1143
|
return
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
# In PTT mode, process immediately
|
|
1144
|
+
print("❌ /ptt is deprecated. Use: /voice ptt (then SPACE)")
|
|
1145
|
+
return
|
|
1146
|
+
|
|
1147
|
+
# Ensure we are not already listening.
|
|
1148
|
+
try:
|
|
1149
|
+
self.voice_manager.stop_listening()
|
|
1150
|
+
except Exception:
|
|
593
1151
|
pass
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
"""
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
if arg == "on":
|
|
611
|
-
self.use_tts = True
|
|
612
|
-
print("TTS enabled" if self.debug_mode else "")
|
|
613
|
-
elif arg == "off":
|
|
614
|
-
self.use_tts = False
|
|
615
|
-
print("TTS disabled" if self.debug_mode else "")
|
|
616
|
-
else:
|
|
617
|
-
print("Usage: /tts on | off")
|
|
618
|
-
|
|
619
|
-
def do_speed(self, arg):
|
|
620
|
-
"""Set the TTS speed multiplier."""
|
|
621
|
-
if not arg.strip():
|
|
622
|
-
print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
|
|
1152
|
+
|
|
1153
|
+
return
|
|
1154
|
+
|
|
1155
|
+
def _run_ptt_session(self) -> None:
|
|
1156
|
+
"""PTT mode key loop (no typing).
|
|
1157
|
+
|
|
1158
|
+
Clean semantics:
|
|
1159
|
+
- SPACE toggles recording (start/stop)
|
|
1160
|
+
- on stop: transcribe immediately and send to the LLM
|
|
1161
|
+
- ESC exits PTT mode (returns to STOP mode)
|
|
1162
|
+
|
|
1163
|
+
This avoids relying on VAD end-of-utterance, which is fragile when speaker
|
|
1164
|
+
echo is present (common on laptop speakers).
|
|
1165
|
+
"""
|
|
1166
|
+
if not self.voice_manager:
|
|
623
1167
|
return
|
|
624
|
-
|
|
1168
|
+
self._ptt_session_active = True
|
|
1169
|
+
self._ptt_recording = False
|
|
1170
|
+
self._ptt_busy = False
|
|
1171
|
+
|
|
1172
|
+
# Lazy imports: keep REPL startup snappy.
|
|
1173
|
+
import io
|
|
1174
|
+
import wave
|
|
1175
|
+
|
|
625
1176
|
try:
|
|
626
|
-
|
|
1177
|
+
import sounddevice as sd
|
|
1178
|
+
except Exception as e:
|
|
1179
|
+
print(f"❌ PTT requires sounddevice: {e}")
|
|
1180
|
+
self._ptt_session_active = False
|
|
1181
|
+
return
|
|
1182
|
+
|
|
1183
|
+
sr = 16000
|
|
1184
|
+
frames: list[bytes] = []
|
|
1185
|
+
stream = {"obj": None}
|
|
1186
|
+
cols = 80
|
|
1187
|
+
try:
|
|
1188
|
+
cols = int(shutil.get_terminal_size((80, 20)).columns)
|
|
1189
|
+
except Exception:
|
|
1190
|
+
cols = 80
|
|
1191
|
+
|
|
1192
|
+
def _clear_status() -> None:
|
|
1193
|
+
try:
|
|
1194
|
+
sys.stdout.write("\r" + (" " * max(10, cols - 1)) + "\r")
|
|
1195
|
+
sys.stdout.flush()
|
|
1196
|
+
except Exception:
|
|
1197
|
+
pass
|
|
1198
|
+
|
|
1199
|
+
def _status_line(msg: str) -> None:
|
|
1200
|
+
# Render on a single line (no newline) so SPACE can be pressed repeatedly.
|
|
1201
|
+
try:
|
|
1202
|
+
_clear_status()
|
|
1203
|
+
sys.stdout.write(str(msg)[: max(0, cols - 1)])
|
|
1204
|
+
sys.stdout.flush()
|
|
1205
|
+
except Exception:
|
|
1206
|
+
pass
|
|
1207
|
+
|
|
1208
|
+
def _println(msg: str = "") -> None:
|
|
1209
|
+
# When in raw terminal mode, '\n' does NOT reliably return to column 0.
|
|
1210
|
+
# Use CRLF explicitly to prevent "diagonal drifting" rendering.
|
|
1211
|
+
try:
|
|
1212
|
+
_clear_status()
|
|
1213
|
+
sys.stdout.write("\r\n" + str(msg) + "\r\n")
|
|
1214
|
+
sys.stdout.flush()
|
|
1215
|
+
except Exception:
|
|
1216
|
+
pass
|
|
1217
|
+
|
|
1218
|
+
def _start_recording() -> None:
|
|
1219
|
+
nonlocal frames
|
|
1220
|
+
if self._ptt_recording:
|
|
1221
|
+
return
|
|
1222
|
+
if self._ptt_busy:
|
|
1223
|
+
return
|
|
1224
|
+
frames = []
|
|
1225
|
+
|
|
1226
|
+
# Interrupt any speech immediately.
|
|
1227
|
+
try:
|
|
1228
|
+
self.voice_manager.stop_speaking()
|
|
1229
|
+
except Exception:
|
|
1230
|
+
pass
|
|
1231
|
+
|
|
1232
|
+
def _cb(indata, _frames, _time, status):
|
|
1233
|
+
if status and self.debug_mode:
|
|
1234
|
+
pass
|
|
1235
|
+
try:
|
|
1236
|
+
frames.append(indata.copy().tobytes())
|
|
1237
|
+
except Exception:
|
|
1238
|
+
pass
|
|
1239
|
+
|
|
1240
|
+
try:
|
|
1241
|
+
stream["obj"] = sd.InputStream(
|
|
1242
|
+
samplerate=sr,
|
|
1243
|
+
channels=1,
|
|
1244
|
+
dtype="int16",
|
|
1245
|
+
callback=_cb,
|
|
1246
|
+
blocksize=int(sr * 0.03),
|
|
1247
|
+
)
|
|
1248
|
+
stream["obj"].start()
|
|
1249
|
+
self._ptt_recording = True
|
|
1250
|
+
_status_line("🎙️ Recording… (SPACE to send, ESC to exit)")
|
|
1251
|
+
except Exception as e:
|
|
1252
|
+
self._ptt_recording = False
|
|
1253
|
+
stream["obj"] = None
|
|
1254
|
+
_clear_status()
|
|
1255
|
+
_println(f"❌ Failed to start microphone stream: {e}")
|
|
1256
|
+
|
|
1257
|
+
def _stop_recording_and_send() -> None:
|
|
1258
|
+
if not self._ptt_recording:
|
|
1259
|
+
return
|
|
1260
|
+
self._ptt_recording = False
|
|
1261
|
+
_clear_status()
|
|
1262
|
+
|
|
1263
|
+
try:
|
|
1264
|
+
if stream["obj"] is not None:
|
|
1265
|
+
try:
|
|
1266
|
+
stream["obj"].stop()
|
|
1267
|
+
except Exception:
|
|
1268
|
+
pass
|
|
1269
|
+
try:
|
|
1270
|
+
stream["obj"].close()
|
|
1271
|
+
except Exception:
|
|
1272
|
+
pass
|
|
1273
|
+
finally:
|
|
1274
|
+
stream["obj"] = None
|
|
1275
|
+
|
|
1276
|
+
pcm = b"".join(frames)
|
|
1277
|
+
if len(pcm) < int(sr * 0.25) * 2:
|
|
1278
|
+
_println("…(too short, try again)")
|
|
1279
|
+
return
|
|
1280
|
+
|
|
1281
|
+
buf = io.BytesIO()
|
|
1282
|
+
with wave.open(buf, "wb") as w:
|
|
1283
|
+
w.setnchannels(1)
|
|
1284
|
+
w.setsampwidth(2)
|
|
1285
|
+
w.setframerate(sr)
|
|
1286
|
+
w.writeframes(pcm)
|
|
1287
|
+
wav_bytes = buf.getvalue()
|
|
1288
|
+
|
|
1289
|
+
self._ptt_busy = True
|
|
1290
|
+
try:
|
|
1291
|
+
audio_s = 0.0
|
|
1292
|
+
try:
|
|
1293
|
+
if sr and sr > 0:
|
|
1294
|
+
audio_s = float(len(pcm)) / float(int(sr) * 2)
|
|
1295
|
+
except Exception:
|
|
1296
|
+
audio_s = 0.0
|
|
1297
|
+
|
|
1298
|
+
t0 = time.monotonic()
|
|
1299
|
+
text = (self.voice_manager.transcribe_from_bytes(wav_bytes, language=self.current_language) or "").strip()
|
|
1300
|
+
t1 = time.monotonic()
|
|
1301
|
+
stt_s = float(t1 - t0)
|
|
1302
|
+
self._pending_stt_metrics = {
|
|
1303
|
+
"stt_s": stt_s,
|
|
1304
|
+
"audio_s": float(audio_s),
|
|
1305
|
+
"rtf": (stt_s / float(audio_s)) if audio_s else None,
|
|
1306
|
+
"sample_rate": int(sr),
|
|
1307
|
+
"chunks": None,
|
|
1308
|
+
"chunk_ms": None,
|
|
1309
|
+
"profile": "ptt",
|
|
1310
|
+
"ts": time.time(),
|
|
1311
|
+
}
|
|
1312
|
+
except Exception as e:
|
|
1313
|
+
self._ptt_busy = False
|
|
1314
|
+
_println(f"❌ Transcription failed: {e}")
|
|
1315
|
+
return
|
|
1316
|
+
self._ptt_busy = False
|
|
1317
|
+
|
|
1318
|
+
if not text:
|
|
1319
|
+
_println("…(no transcription)")
|
|
1320
|
+
return
|
|
1321
|
+
|
|
1322
|
+
_println(f"> {text}")
|
|
1323
|
+
self.process_query(text)
|
|
1324
|
+
|
|
1325
|
+
# Platform key read.
|
|
1326
|
+
import sys
|
|
1327
|
+
if sys.platform == "win32":
|
|
1328
|
+
import msvcrt
|
|
1329
|
+
|
|
1330
|
+
while self._ptt_session_active:
|
|
1331
|
+
ch = msvcrt.getwch()
|
|
1332
|
+
if ch == "\x1b": # ESC
|
|
1333
|
+
break
|
|
1334
|
+
if self._ptt_busy:
|
|
1335
|
+
continue
|
|
1336
|
+
if ch == " ":
|
|
1337
|
+
if not self._ptt_recording:
|
|
1338
|
+
_start_recording()
|
|
1339
|
+
else:
|
|
1340
|
+
_stop_recording_and_send()
|
|
1341
|
+
else:
|
|
1342
|
+
import termios
|
|
1343
|
+
import tty
|
|
1344
|
+
|
|
1345
|
+
fd = sys.stdin.fileno()
|
|
1346
|
+
old = termios.tcgetattr(fd)
|
|
1347
|
+
try:
|
|
1348
|
+
tty.setraw(fd)
|
|
1349
|
+
|
|
1350
|
+
def _run_in_cooked(block):
|
|
1351
|
+
"""Run a block with normal tty settings.
|
|
1352
|
+
|
|
1353
|
+
In raw mode, many terminals treat '\n' as LF without CR, so prints from
|
|
1354
|
+
deeper code paths (LLM responses) can drift/indent. We temporarily
|
|
1355
|
+
restore the terminal mode to keep output rendering stable.
|
|
1356
|
+
"""
|
|
1357
|
+
try:
|
|
1358
|
+
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
|
1359
|
+
except Exception:
|
|
1360
|
+
pass
|
|
1361
|
+
try:
|
|
1362
|
+
block()
|
|
1363
|
+
finally:
|
|
1364
|
+
try:
|
|
1365
|
+
tty.setraw(fd)
|
|
1366
|
+
except Exception:
|
|
1367
|
+
pass
|
|
1368
|
+
|
|
1369
|
+
while self._ptt_session_active:
|
|
1370
|
+
ch = sys.stdin.read(1)
|
|
1371
|
+
if ch == "\x1b": # ESC
|
|
1372
|
+
break
|
|
1373
|
+
if self._ptt_busy:
|
|
1374
|
+
continue
|
|
1375
|
+
if ch == " ":
|
|
1376
|
+
if not self._ptt_recording:
|
|
1377
|
+
_start_recording()
|
|
1378
|
+
else:
|
|
1379
|
+
_run_in_cooked(_stop_recording_and_send)
|
|
1380
|
+
finally:
|
|
1381
|
+
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
|
1382
|
+
|
|
1383
|
+
self._ptt_session_active = False
|
|
1384
|
+
self._ptt_recording = False
|
|
1385
|
+
self._ptt_busy = False
|
|
1386
|
+
try:
|
|
1387
|
+
if stream["obj"] is not None:
|
|
1388
|
+
stream["obj"].stop()
|
|
1389
|
+
stream["obj"].close()
|
|
1390
|
+
except Exception:
|
|
1391
|
+
pass
|
|
1392
|
+
_clear_status()
|
|
1393
|
+
# Ensure we end on a clean line before restoring other modes.
|
|
1394
|
+
try:
|
|
1395
|
+
sys.stdout.write("\r\n")
|
|
1396
|
+
sys.stdout.flush()
|
|
1397
|
+
except Exception:
|
|
1398
|
+
pass
|
|
1399
|
+
# Restore to STOP after exiting PTT.
|
|
1400
|
+
try:
|
|
1401
|
+
self.do_voice("stop")
|
|
1402
|
+
except Exception:
|
|
1403
|
+
pass
|
|
1404
|
+
|
|
1405
|
+
def _voice_callback(self, text):
|
|
1406
|
+
"""Callback for voice recognition."""
|
|
1407
|
+
# Capture best-effort STT metrics from the recognizer (for verbose stats).
|
|
1408
|
+
stt_metrics = None
|
|
1409
|
+
try:
|
|
1410
|
+
vm = self.voice_manager
|
|
1411
|
+
rec = getattr(vm, "voice_recognizer", None) if vm else None
|
|
1412
|
+
if rec is not None and hasattr(rec, "pop_last_stt_metrics"):
|
|
1413
|
+
stt_metrics = rec.pop_last_stt_metrics()
|
|
1414
|
+
except Exception:
|
|
1415
|
+
stt_metrics = None
|
|
1416
|
+
self._pending_stt_metrics = stt_metrics
|
|
1417
|
+
|
|
1418
|
+
# Print what the user said
|
|
1419
|
+
print(f"\n> {text}")
|
|
1420
|
+
# NOTE: stop phrases are handled by the stop_callback path (interrupt TTS).
|
|
1421
|
+
# We do not use "stop" to exit voice mode; use /voice off explicitly.
|
|
1422
|
+
|
|
1423
|
+
# Mode-specific handling
|
|
1424
|
+
if self.voice_mode == "stop":
|
|
1425
|
+
# In 'stop' mode, don't interrupt TTS - just queue the message
|
|
1426
|
+
# But since we're in callback, TTS interrupt is already paused
|
|
1427
|
+
pass
|
|
1428
|
+
elif self.voice_mode == "ptt":
|
|
1429
|
+
# In PTT mode, process immediately
|
|
1430
|
+
pass
|
|
1431
|
+
# 'full' mode has default behavior
|
|
1432
|
+
|
|
1433
|
+
# Process the user's query
|
|
1434
|
+
self.process_query(text)
|
|
1435
|
+
|
|
1436
|
+
def _voice_stop_callback(self):
|
|
1437
|
+
"""Callback when voice mode is stopped."""
|
|
1438
|
+
self.voice_mode = "off"
|
|
1439
|
+
self.voice_mode_active = False
|
|
1440
|
+
self.voice_manager.stop_listening()
|
|
1441
|
+
print("Voice mode disabled.")
|
|
1442
|
+
|
|
1443
|
+
def do_tts(self, arg):
|
|
1444
|
+
"""Toggle text-to-speech."""
|
|
1445
|
+
arg = arg.lower().strip()
|
|
1446
|
+
|
|
1447
|
+
if arg == "on":
|
|
1448
|
+
self.use_tts = True
|
|
1449
|
+
if self.voice_manager is None:
|
|
1450
|
+
# Re-enable voice features (TTS/STT) by creating a VoiceManager.
|
|
1451
|
+
self.voice_manager = VoiceManager(
|
|
1452
|
+
language=self.current_language,
|
|
1453
|
+
tts_model=self._initial_tts_model,
|
|
1454
|
+
debug_mode=self.debug_mode,
|
|
1455
|
+
allow_downloads=False,
|
|
1456
|
+
cloned_tts_streaming=False,
|
|
1457
|
+
cloning_engine=self.cloning_engine,
|
|
1458
|
+
)
|
|
1459
|
+
print("TTS enabled" if self.debug_mode else "")
|
|
1460
|
+
elif arg == "off":
|
|
1461
|
+
self.use_tts = False
|
|
1462
|
+
print("TTS disabled" if self.debug_mode else "")
|
|
1463
|
+
else:
|
|
1464
|
+
print("Usage: /tts on | off")
|
|
1465
|
+
|
|
1466
|
+
def do_speed(self, arg):
|
|
1467
|
+
"""Set the TTS speed multiplier."""
|
|
1468
|
+
if not self.voice_manager:
|
|
1469
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
1470
|
+
return
|
|
1471
|
+
if not arg.strip():
|
|
1472
|
+
print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
|
|
1473
|
+
return
|
|
1474
|
+
|
|
1475
|
+
try:
|
|
1476
|
+
speed = float(arg.strip())
|
|
627
1477
|
if 0.5 <= speed <= 2.0:
|
|
628
1478
|
self.voice_manager.set_speed(speed)
|
|
629
1479
|
print(f"TTS speed set to {speed}x")
|
|
630
1480
|
else:
|
|
631
|
-
print("Speed should be between 0.5 and 2.0")
|
|
632
|
-
except ValueError:
|
|
633
|
-
print("Usage: /speed <number> (e.g., /speed 1.5)")
|
|
634
|
-
|
|
635
|
-
def do_tts_model(self, arg):
|
|
636
|
-
"""
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
1481
|
+
print("Speed should be between 0.5 and 2.0")
|
|
1482
|
+
except ValueError:
|
|
1483
|
+
print("Usage: /speed <number> (e.g., /speed 1.5)")
|
|
1484
|
+
|
|
1485
|
+
def do_tts_model(self, arg):
|
|
1486
|
+
"""Deprecated: legacy TTS model switching.
|
|
1487
|
+
|
|
1488
|
+
AbstractVoice core is Piper-first; use `/setvoice` (Piper voices) or cloned voices.
|
|
1489
|
+
"""
|
|
1490
|
+
print("❌ /tts_model is not supported (Piper-first core).")
|
|
1491
|
+
print(" Use /setvoice for Piper voices, or /tts_voice clone <id> for cloned voices.")
|
|
1492
|
+
|
|
1493
|
+
def do_whisper(self, arg):
|
|
1494
|
+
"""Change Whisper model."""
|
|
1495
|
+
if not self.voice_manager:
|
|
1496
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
1497
|
+
return
|
|
1498
|
+
model = arg.strip()
|
|
1499
|
+
if not model:
|
|
1500
|
+
print(f"Current Whisper model: {self.voice_manager.get_whisper()}")
|
|
1501
|
+
return
|
|
1502
|
+
|
|
1503
|
+
self.voice_manager.set_whisper(model)
|
|
1504
|
+
|
|
1505
|
+
def do_speak(self, arg):
|
|
1506
|
+
"""Speak a text immediately (without calling the LLM).
|
|
1507
|
+
|
|
1508
|
+
Usage:
|
|
1509
|
+
/speak Hello world
|
|
1510
|
+
"""
|
|
1511
|
+
if not self.voice_manager:
|
|
1512
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
1513
|
+
return
|
|
1514
|
+
|
|
1515
|
+
text = arg.strip()
|
|
1516
|
+
if not text:
|
|
1517
|
+
print("Usage: /speak <text>")
|
|
1518
|
+
return
|
|
1519
|
+
|
|
1520
|
+
try:
|
|
1521
|
+
self._speak_with_spinner_until_audio_starts(text)
|
|
1522
|
+
if self.verbose_mode:
|
|
1523
|
+
out_words = self._count_words(text)
|
|
1524
|
+
out_tokens = None
|
|
1525
|
+
try:
|
|
1526
|
+
enc = self._get_tiktoken_encoding()
|
|
1527
|
+
if enc is not None:
|
|
1528
|
+
out_tokens = int(len(enc.encode(str(text or ""))))
|
|
1529
|
+
except Exception:
|
|
1530
|
+
out_tokens = None
|
|
1531
|
+
|
|
1532
|
+
tts_metrics = None
|
|
1533
|
+
try:
|
|
1534
|
+
if hasattr(self.voice_manager, "pop_last_tts_metrics"):
|
|
1535
|
+
tts_metrics = self.voice_manager.pop_last_tts_metrics()
|
|
1536
|
+
except Exception:
|
|
1537
|
+
tts_metrics = None
|
|
1538
|
+
|
|
1539
|
+
turn = {
|
|
1540
|
+
"stt": None,
|
|
1541
|
+
"llm": {},
|
|
1542
|
+
"counts": {
|
|
1543
|
+
"in_words": 0,
|
|
1544
|
+
"out_words": int(out_words),
|
|
1545
|
+
"in_tokens": None,
|
|
1546
|
+
"out_tokens": out_tokens,
|
|
1547
|
+
},
|
|
1548
|
+
"tts": tts_metrics,
|
|
1549
|
+
}
|
|
1550
|
+
self._print_verbose_turn_stats(turn)
|
|
1551
|
+
except Exception as e:
|
|
1552
|
+
print(f"❌ Speak failed: {e}")
|
|
1553
|
+
if self.debug_mode:
|
|
1554
|
+
import traceback
|
|
1555
|
+
traceback.print_exc()
|
|
1556
|
+
|
|
1557
|
+
def _speak_with_spinner_until_audio_starts(self, text: str) -> None:
|
|
1558
|
+
"""REPL UX: show spinner while waiting for first audio, then stop.
|
|
1559
|
+
|
|
1560
|
+
This avoids corrupting the `cmd` prompt while still giving feedback during
|
|
1561
|
+
long cloned-TTS synthesis. Once playback starts, the prompt is displayed
|
|
1562
|
+
normally so the user can interrupt anytime by typing.
|
|
1563
|
+
"""
|
|
1564
|
+
if not self.voice_manager:
|
|
1565
|
+
return
|
|
1566
|
+
|
|
1567
|
+
# LLM output often contains Markdown. Strip the most common formatting
|
|
1568
|
+
# tokens so TTS stays natural (do not change what is printed).
|
|
1569
|
+
speak_text = sanitize_markdown_for_speech(text)
|
|
1570
|
+
|
|
1571
|
+
is_clone = bool(self.current_tts_voice)
|
|
1572
|
+
if not is_clone:
|
|
1573
|
+
# Offline-first: Piper voices must be explicitly cached. Provide a clear
|
|
1574
|
+
# message instead of hanging on implicit downloads.
|
|
1575
|
+
try:
|
|
1576
|
+
a = getattr(self.voice_manager, "tts_adapter", None)
|
|
1577
|
+
if a is not None and hasattr(a, "is_available") and not bool(a.is_available()):
|
|
1578
|
+
lang = str(getattr(self, "current_language", "en") or "en").strip().lower()
|
|
1579
|
+
raise RuntimeError(
|
|
1580
|
+
f"Piper voice model for '{lang}' is not available locally.\n"
|
|
1581
|
+
f"Run: python -m abstractvoice download --piper {lang}"
|
|
1582
|
+
)
|
|
1583
|
+
except RuntimeError:
|
|
1584
|
+
raise
|
|
1585
|
+
except Exception:
|
|
1586
|
+
pass
|
|
1587
|
+
ind = self._busy_indicator(enabled=is_clone)
|
|
1588
|
+
try:
|
|
1589
|
+
if is_clone:
|
|
1590
|
+
ind.start()
|
|
1591
|
+
self.voice_manager.speak(speak_text, voice=self.current_tts_voice)
|
|
1592
|
+
|
|
1593
|
+
if not is_clone:
|
|
1594
|
+
return
|
|
1595
|
+
|
|
1596
|
+
# Wait until audio playback actually starts (or synthesis ends without audio).
|
|
1597
|
+
vm = self.voice_manager
|
|
1598
|
+
while True:
|
|
1599
|
+
try:
|
|
1600
|
+
playing = bool(vm.is_speaking())
|
|
1601
|
+
synth_active = bool(
|
|
1602
|
+
getattr(vm, "_cloned_synthesis_active", None) and vm._cloned_synthesis_active.is_set()
|
|
1603
|
+
)
|
|
1604
|
+
except Exception:
|
|
1605
|
+
playing, synth_active = False, False
|
|
1606
|
+
|
|
1607
|
+
if playing:
|
|
1608
|
+
break
|
|
1609
|
+
|
|
1610
|
+
# If synthesis is no longer active and we aren't playing, stop the spinner
|
|
1611
|
+
# (either done very quickly or failed).
|
|
1612
|
+
if not synth_active:
|
|
1613
|
+
break
|
|
1614
|
+
|
|
1615
|
+
time.sleep(0.05)
|
|
1616
|
+
finally:
|
|
1617
|
+
try:
|
|
1618
|
+
ind.stop()
|
|
1619
|
+
except Exception:
|
|
1620
|
+
pass
|
|
1621
|
+
# If ASR auto-generated the clone's reference_text, print an easy override command
|
|
1622
|
+
# (once). We do this after stopping the spinner to avoid corrupting the prompt line.
|
|
1623
|
+
try:
|
|
1624
|
+
if is_clone and self.current_tts_voice:
|
|
1625
|
+
self._maybe_print_asr_ref_text_override(self.current_tts_voice)
|
|
1626
|
+
except Exception:
|
|
1627
|
+
pass
|
|
1628
|
+
# Do not print the prompt manually: `cmd` will render it on return,
|
|
1629
|
+
# and printing here can result in duplicate prompts (`> >`).
|
|
1630
|
+
|
|
1631
|
+
def _maybe_print_asr_ref_text_override(self, voice_id: str) -> None:
    """If `reference_text` was auto-generated via ASR, print a paste-ready override hint.

    Important: `/clone_set_ref_text` uses a simple `split(maxsplit=1)`, so quoting is not
    interpreted. We therefore print the command *without* quotes to avoid storing them.

    The hint is printed at most once per voice id (tracked in
    `self._printed_asr_ref_text_hint`). Nothing is printed when the voice has no
    reference text, when its `meta["reference_text_source"]` is not "asr", or when
    the lookup raises.
    """
    if not self.voice_manager:
        return
    vid = str(voice_id or "").strip()
    if not vid or vid in self._printed_asr_ref_text_hint:
        return
    try:
        info = self.voice_manager.get_cloned_voice(vid) or {}
    except Exception:
        # Best-effort hint only: a failed lookup must never break speaking.
        return

    meta = info.get("meta") or {}
    src = str(meta.get("reference_text_source") or "").strip().lower()
    # Collapse all whitespace (including newlines) so the printed command stays on
    # ONE line. A multi-line transcript would otherwise be pasted as several
    # separate REPL inputs instead of a single /clone_set_ref_text command.
    ref_text = " ".join(str(info.get("reference_text") or "").split())
    if not ref_text or src != "asr":
        return

    # Mark first so any printing errors won't cause repeated spam.
    self._printed_asr_ref_text_hint.add(vid)

    prefix = vid[:8]  # slicing already handles ids shorter than 8 characters
    name = str(info.get("name") or "").strip()
    label = f"{name} ({prefix})" if name else prefix
    print("ℹ️ Auto-generated reference transcript (ASR).")
    print(f" Voice: {label}")
    print(" If you want to correct it, copy/paste and edit the text after the id:")
    print(f" /clone_set_ref_text {prefix} {ref_text}")
|
|
1666
|
+
|
|
1667
|
+
class _busy_indicator:
    """A minimal, discreet spinner (no extra lines).

    The spinner runs on a daemon thread and rewrites a single terminal line with
    carriage returns. It is a no-op when constructed with ``enabled=False``.
    Usable directly via ``start()`` / ``stop()`` or as a context manager.
    """

    _FRAMES = ("⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏")

    def __init__(self, enabled: bool = False):
        self.enabled = bool(enabled)
        self._stop = threading.Event()
        self._thread = None
        # True once the worker has written anything to the terminal; stop()
        # only needs to clean up the line / cursor in that case.
        self._drew = False

    def start(self):
        """Start the spinner thread (no-op if disabled or already running)."""
        if not self.enabled:
            return
        if self._thread and self._thread.is_alive():
            return

        # Reset state so the same indicator can be started again after stop().
        self._stop.clear()
        self._drew = False

        def _run():
            t0 = time.time()
            # Small delay so fast operations don't flash. Waiting on the event
            # (instead of sleeping) lets stop() return immediately.
            if self._stop.wait(0.25):
                return
            self._drew = True
            # Hide cursor for a cleaner look.
            try:
                sys.stdout.write("\033[?25l")
                sys.stdout.flush()
            except Exception:
                pass
            i = 0
            while not self._stop.is_set():
                elapsed = time.time() - t0
                try:
                    sys.stdout.write(
                        f"\r(synthesizing {elapsed:0.1f}s) {self._FRAMES[i % len(self._FRAMES)]}"
                    )
                    sys.stdout.flush()
                except Exception:
                    # stdout is gone/closed: give up quietly rather than raising
                    # inside a background thread.
                    return
                i += 1
                self._stop.wait(0.1)

        self._thread = threading.Thread(target=_run, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop the spinner and restore the terminal line and cursor if needed."""
        if not self.enabled:
            return
        self._stop.set()
        try:
            if self._thread:
                self._thread.join(timeout=0.5)
        except Exception:
            pass
        if not self._drew:
            # Nothing was ever written, so there is no line to clear and no
            # hidden cursor to restore.
            return
        # Clear spinner line.
        try:
            # `\033[2K` clears the entire line (more robust than fixed spaces).
            sys.stdout.write("\r\033[2K\r")
            # Restore cursor.
            sys.stdout.write("\033[?25h")
            sys.stdout.flush()
        except Exception:
            pass

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.stop()
        # Never suppress exceptions raised inside the `with` body.
        return False
|
|
1731
|
+
|
|
1732
|
+
# NOTE: We intentionally do not keep a background spinner running while the REPL
|
|
1733
|
+
# is waiting for user input (it corrupts the prompt line). Instead, we show a
|
|
1734
|
+
# spinner only until the first audio actually starts, then stop it so the prompt
|
|
1735
|
+
# stays usable for interruption-by-typing.
|
|
1736
|
+
|
|
1737
|
+
def do_clones(self, arg):
    """List cloned voices in the local store."""
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    try:
        entries = self.voice_manager.list_cloned_voices()
        if not entries:
            print("No cloned voices yet. Use /clone <path> or /clone-my-voice.")
            return
        print(f"\n{Colors.CYAN}Cloned voices:{Colors.END}")
        for entry in entries:
            voice_id = entry.get("voice_id") or entry.get("voice", "")
            display_name = entry.get("name", "")
            engine = (entry.get("engine") or "").strip()
            ref_source = (entry.get("meta") or {}).get("reference_text_source", "")
            # Optional decorations, appended only when they carry information.
            suffix = ""
            if engine:
                suffix += f" [{engine}]"
            if ref_source:
                suffix += f" [{ref_source}]"
            if self.current_tts_voice == voice_id:
                suffix += " (current)"
            print(f" - {display_name}: {voice_id}{suffix}")
        print("Tip: /clone_rm <id-or-name> deletes one; /clone_rm_all --yes deletes all.")
    except Exception as e:
        # Keep the REPL alive whatever the store does.
        print(f"❌ Error listing cloned voices: {e}")
|
|
1760
|
+
|
|
1761
|
+
def _resolve_clone_id(self, wanted: str) -> str | None:
|
|
1762
|
+
voices = self.voice_manager.list_cloned_voices()
|
|
1763
|
+
for v in voices:
|
|
1764
|
+
vid = v.get("voice_id") or ""
|
|
1765
|
+
name = v.get("name") or ""
|
|
1766
|
+
if wanted == vid or vid.startswith(wanted) or wanted == name:
|
|
1767
|
+
return vid
|
|
1768
|
+
return None
|
|
1769
|
+
|
|
1770
|
+
def _resolve_clone_id_by_source(self, source: str, *, engine: str | None = None) -> str | None:
|
|
1771
|
+
"""Find a cloned voice by its stored meta.source (best-effort)."""
|
|
1772
|
+
if not self.voice_manager:
|
|
1773
|
+
return None
|
|
1774
|
+
|
|
1775
|
+
try:
|
|
1776
|
+
from pathlib import Path
|
|
1777
|
+
|
|
1778
|
+
target = Path(str(source)).expanduser()
|
|
1779
|
+
try:
|
|
1780
|
+
target_norm = str(target.resolve())
|
|
1781
|
+
except Exception:
|
|
1782
|
+
target_norm = str(target)
|
|
1783
|
+
except Exception:
|
|
1784
|
+
target_norm = str(source)
|
|
1785
|
+
|
|
1786
|
+
try:
|
|
1787
|
+
voices = self.voice_manager.list_cloned_voices()
|
|
1788
|
+
except Exception:
|
|
1789
|
+
return None
|
|
1790
|
+
|
|
1791
|
+
wanted_engine = (str(engine).strip().lower() if engine else None) or None
|
|
1792
|
+
for v in voices:
|
|
1793
|
+
meta = v.get("meta") or {}
|
|
1794
|
+
src = meta.get("source")
|
|
1795
|
+
if not src:
|
|
1796
|
+
continue
|
|
1797
|
+
try:
|
|
1798
|
+
from pathlib import Path
|
|
1799
|
+
|
|
1800
|
+
p = Path(str(src)).expanduser()
|
|
1801
|
+
try:
|
|
1802
|
+
src_norm = str(p.resolve())
|
|
1803
|
+
except Exception:
|
|
1804
|
+
src_norm = str(p)
|
|
1805
|
+
except Exception:
|
|
1806
|
+
src_norm = str(src)
|
|
1807
|
+
|
|
1808
|
+
if src_norm != target_norm:
|
|
1809
|
+
continue
|
|
1810
|
+
if wanted_engine and (str(v.get("engine") or "").strip().lower() != wanted_engine):
|
|
1811
|
+
continue
|
|
1812
|
+
return str(v.get("voice_id") or "").strip() or None
|
|
1813
|
+
return None
|
|
1814
|
+
|
|
1815
|
+
def do_clone_info(self, arg):
    """Show details for a cloned voice.

    Usage:
    /clone_info <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    query = arg.strip()
    if not query:
        print("Usage: /clone_info <id-or-name>")
        return

    voice_id = self._resolve_clone_id(query)
    if not voice_id:
        print(f"❌ Unknown cloned voice: {query}. Use /clones to list.")
        return

    try:
        record = self.voice_manager.get_cloned_voice(voice_id)
        meta = record.get("meta") or {}
        print(f"\n{Colors.CYAN}Cloned voice info:{Colors.END}")
        print(f" id: {record.get('voice_id')}")
        print(f" name: {record.get('name')}")
        print(f" engine: {record.get('engine')}")
        print(f" refs: {len(record.get('reference_files') or [])}")
        print(f" ref_text_source: {meta.get('reference_text_source','')}")
        transcript = (record.get("reference_text") or "").strip()
        if not transcript:
            print(" reference_text: (missing)")
        else:
            # Long transcripts are cut to 200 characters plus an ellipsis.
            preview = transcript if len(transcript) <= 200 else transcript[:200] + "…"
            print(f" reference_text: {preview}")
    except Exception as e:
        print(f"❌ Error: {e}")
|
|
1849
|
+
|
|
1850
|
+
def do_clone_ref(self, arg):
    """Print the full reference_text for a cloned voice.

    Usage:
    /clone_ref <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_ref <id-or-name>")
        return
    # Store errors are reported instead of propagating out of the command
    # handler (same style as /clone_info).
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        # `or {}` guards against a store that returns None for a missing record.
        info = self.voice_manager.get_cloned_voice(vid) or {}
        print((info.get("reference_text") or "").strip())
    except Exception as e:
        print(f"❌ Error: {e}")
|
|
1869
|
+
|
|
1870
|
+
def do_clone_rename(self, arg):
    """Rename a cloned voice.

    Usage:
    /clone_rename <id-or-name> <new_name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    # First token selects the voice; everything after it is the new name
    # (so the new name may contain spaces).
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_rename <id-or-name> <new_name>")
        return
    wanted, new_name = parts
    # Report store failures rather than letting them escape the command handler.
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        self.voice_manager.rename_cloned_voice(vid, new_name)
        print("✅ Renamed.")
    except Exception as e:
        print(f"❌ Rename failed: {e}")
|
|
1889
|
+
|
|
1890
|
+
def do_clone_rm(self, arg):
    """Remove a cloned voice from the store.

    Usage:
    /clone_rm <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_rm <id-or-name>")
        return
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        self.voice_manager.delete_cloned_voice(vid)
    except Exception as e:
        # Deletion failed: the voice still exists, so keep the current selection.
        print(f"❌ Delete failed: {e}")
        return
    # Only after a successful delete: if it was the selected voice, switch back to Piper.
    if self.current_tts_voice == vid:
        self.current_tts_voice = None
    print("✅ Deleted.")
|
|
1912
|
+
|
|
1913
|
+
def do_clone_rm_all(self, arg):
    """Remove ALL cloned voices from the local store.

    Usage:
    /clone_rm_all --yes
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    confirm = (arg or "").strip().lower()
    if confirm not in ("--yes", "-y", "yes"):
        # Dry run: only report what would be deleted and how to confirm.
        try:
            n = len(self.voice_manager.list_cloned_voices() or [])
        except Exception:
            n = 0
        if n <= 0:
            print("No cloned voices to delete.")
            return
        print(f"⚠️ This will permanently delete {n} cloned voice(s).")
        print("Re-run with: /clone_rm_all --yes")
        return

    try:
        voices = list(self.voice_manager.list_cloned_voices() or [])
    except Exception as e:
        # Nothing was deleted, so the current selection is left untouched.
        print(f"❌ Error listing cloned voices: {e}")
        return

    deleted = 0
    failed = 0
    failed_ids = set()
    for v in voices:
        vid = str(v.get("voice_id") or v.get("voice") or "").strip()
        if not vid:
            continue
        try:
            self.voice_manager.delete_cloned_voice(vid)
            deleted += 1
        except Exception:
            failed += 1
            failed_ids.add(vid)

    # Switch back to Piper unless the selected clone is one that could not be
    # deleted (it still exists, so keeping it selected is correct).
    if self.current_tts_voice not in failed_ids:
        self.current_tts_voice = None

    if failed:
        print(f"✅ Deleted {deleted} cloned voice(s). ⚠️ Failed: {failed}")
    else:
        print(f"✅ Deleted {deleted} cloned voice(s).")
|
|
1961
|
+
|
|
1962
|
+
def do_clone_export(self, arg):
    """Export a cloned voice bundle (.zip).

    Usage:
    /clone_export <id-or-name> <path.zip>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    # First token selects the voice; the remainder is the destination path
    # (kept verbatim, so it may contain spaces).
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_export <id-or-name> <path.zip>")
        return
    wanted, destination = parts
    # Export touches the filesystem; report failures instead of letting them
    # escape the command handler.
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        out = self.voice_manager.export_voice(vid, destination)
        print(f"✅ Exported: {out}")
    except Exception as e:
        print(f"❌ Export failed: {e}")
|
|
1981
|
+
|
|
1982
|
+
def do_clone_import(self, arg):
    """Import a cloned voice bundle (.zip).

    Usage:
    /clone_import <path.zip>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    path = arg.strip()
    if not path:
        print("Usage: /clone_import <path.zip>")
        return
    # A missing or corrupt bundle is a normal user error: report it instead of
    # letting the exception escape the command handler.
    try:
        vid = self.voice_manager.import_voice(path)
    except Exception as e:
        print(f"❌ Import failed: {e}")
        return
    print(f"✅ Imported as: {vid}")
|
|
1997
|
+
|
|
1998
|
+
def do_clone(self, arg):
    """Clone a voice from a reference file or folder.

    Usage:
    /clone <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = 'Usage: /clone <path> [name] [--engine f5_tts|chroma] [--text "..."]'

    # shlex honours quoting, so a multi-word --text value arrives as one token.
    try:
        tokens = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    engine = None
    reference_text = None
    positional = []
    idx = 0
    while idx < len(tokens):
        tok = tokens[idx]
        if tok in ("--engine", "--text", "--reference-text", "--reference_text"):
            # Every recognised flag takes exactly one value.
            if idx + 1 >= len(tokens):
                print(usage)
                return
            if tok == "--engine":
                engine = tokens[idx + 1]
            else:
                reference_text = tokens[idx + 1]
            idx += 2
            continue
        positional.append(tok)
        idx += 1

    if not positional:
        print(usage)
        return

    path = positional[0]
    name = positional[1] if len(positional) > 1 else None
    has_ref_text = bool((reference_text or "").strip())
    try:
        t0 = time.monotonic()
        voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine)
        t1 = time.monotonic()

        # Read back what the store recorded; purely informational, so any
        # failure here just leaves the fields blank.
        eng = ""
        ref_src = ""
        try:
            info = self.voice_manager.get_cloned_voice(voice_id) or {}
            eng = str(info.get("engine") or "").strip()
            ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
        except Exception:
            eng = ""
            ref_src = ""

        eng_txt = f" (engine: {eng})" if eng else ""
        print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
        # Only suggest setting a transcript when none was supplied (the same
        # behaviour as /clone_use).
        if has_ref_text:
            print(" (Reference text provided)")
        else:
            print(" Tip: set reference text for best quality:")
            print(" /clone_set_ref_text <id-or-name> \"...\"")
        if not self._is_cloning_runtime_ready(voice_id=voice_id):
            print(" (Cloning runtime not ready yet; run /cloning_status and /cloning_download first.)")
        if str(eng or (engine or self.cloning_engine) or "").strip().lower() == "chroma" and not has_ref_text:
            print("ℹ️ No reference transcript provided.")
            print(" We will auto-generate it via STT on first speak (offline-first: requires cached STT model).")
            print(" Optional (often best quality): /clone_set_ref_text <id-or-name> \"...\" (or re-run /clone ... --text \"...\")")

        if self.verbose_mode:
            n_files, ref_audio_s = self._summarize_audio_source(path)
            n_txt = str(n_files) if isinstance(n_files, int) else "--"
            src_txt = ref_src or ("manual" if has_ref_text else "--")
            msg = f"CLONE {eng or (engine or self.cloning_engine)} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
            print(f"{Colors.YELLOW}{msg}{Colors.END}")
    except Exception as e:
        print(f"❌ Clone failed: {e}")
|
|
2082
|
+
|
|
2083
|
+
def do_clone_use(self, arg):
    """Clone a voice (or reuse an existing one) and immediately select it.

    Usage:
    /clone_use <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    Shortcut:
    - Paste a WAV/FLAC/OGG path directly (optionally: `path.wav | transcript`).
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = 'Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text "..."]'

    # shlex honours quoting, so a multi-word --text value arrives as one token.
    try:
        tokens = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    engine = None
    reference_text = None
    positional = []
    idx = 0
    while idx < len(tokens):
        tok = tokens[idx]
        if tok in ("--engine", "--text", "--reference-text", "--reference_text"):
            # Every recognised flag takes exactly one value.
            if idx + 1 >= len(tokens):
                print(usage)
                return
            if tok == "--engine":
                engine = tokens[idx + 1]
            else:
                reference_text = tokens[idx + 1]
            idx += 2
            continue
        positional.append(tok)
        idx += 1

    if not positional:
        print(usage)
        return

    path = positional[0]
    name = positional[1] if len(positional) > 1 else None

    engine_name = str(engine or self.cloning_engine or "f5_tts").strip().lower()

    # If name isn't provided, use something stable for UX.
    if not name:
        try:
            from pathlib import Path

            p = Path(path)
            name = p.stem if p.is_file() else p.name
        except Exception:
            name = None

    # Reuse a prior clone created from the same source path + engine.
    voice_id = self._resolve_clone_id_by_source(path, engine=engine_name)
    if voice_id:
        if reference_text:
            try:
                self.voice_manager.set_cloned_voice_reference_text(voice_id, reference_text)
                print("✅ Reusing cloned voice and updating reference text.")
            except Exception as e:
                # The voice is still usable, but the transcript the user asked
                # for was NOT stored; say so instead of hiding it.
                print("✅ Reusing cloned voice.")
                print(f"⚠️ Could not update reference text: {e}")
        else:
            print("✅ Reusing cloned voice.")
    else:
        try:
            t0 = time.monotonic()
            voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine_name)
            t1 = time.monotonic()

            # Read back what the store recorded; purely informational.
            eng = ""
            ref_src = ""
            try:
                info = self.voice_manager.get_cloned_voice(voice_id) or {}
                eng = str(info.get("engine") or "").strip()
                ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
            except Exception:
                eng = ""
                ref_src = ""

            eng_txt = f" (engine: {eng})" if eng else ""
            print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
            if reference_text:
                print(" (Reference text provided)")
            else:
                print(" Tip: set reference text for best quality:")
                print(" /clone_set_ref_text <id-or-name> \"...\"")
                if str(eng or engine_name or "").strip().lower() == "chroma":
                    print(" ℹ️ No transcript provided; STT auto-fallback runs on first speak (requires cached STT model).")

            if self.verbose_mode:
                n_files, ref_audio_s = self._summarize_audio_source(path)
                n_txt = str(n_files) if isinstance(n_files, int) else "--"
                src_txt = ref_src or ("manual" if (reference_text or "").strip() else "--")
                msg = f"CLONE {eng or engine_name} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
                print(f"{Colors.YELLOW}{msg}{Colors.END}")
        except Exception as e:
            print(f"❌ Clone failed: {e}")
            return

    # Select if runtime is ready (no surprise downloads).
    if not self._is_cloning_runtime_ready(voice_id=voice_id):
        print("ℹ️ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    self.current_tts_voice = voice_id
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(voice_id) or {}
        eng = str(info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {voice_id}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (important for large backends like Chroma).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
|
|
2224
|
+
|
|
2225
|
+
def do_clone_set_ref_text(self, arg):
    """Set the reference transcript for a cloned voice (quality fix).

    Usage:
    /clone_set_ref_text <id-or-name> <text...>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    # Deliberately a plain split (not shlex): everything after the first token
    # is stored verbatim, so quotes typed by the user become part of the text.
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_set_ref_text <id-or-name> <text...>")
        return

    wanted, text = parts[0], parts[1]
    # Use the shared resolver so id/prefix/name matching agrees with the other
    # /clone_* commands; store errors are reported rather than propagated.
    try:
        match = self._resolve_clone_id(wanted)
    except Exception as e:
        print(f"❌ Failed to update reference text: {e}")
        return
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    try:
        self.voice_manager.set_cloned_voice_reference_text(match, text)
        print("✅ Updated reference text.")
    except Exception as e:
        print(f"❌ Failed to update reference text: {e}")
|
|
2258
|
+
|
|
2259
|
+
def do_tts_voice(self, arg):
    """Select which voice is used for speaking.

    Usage:
    /tts_voice piper
    /tts_voice clone <voice_id_or_name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = "Usage: /tts_voice piper | /tts_voice clone <id-or-name>"

    # Split off only the sub-command; the remainder is kept intact so that
    # voice names containing spaces can be selected.
    parts = arg.strip().split(maxsplit=1)
    if not parts:
        if self.current_tts_voice:
            vid = self.current_tts_voice
            try:
                info = self.voice_manager.get_cloned_voice(vid) or {}
                name = (info.get("name") or "").strip()
                eng = (info.get("engine") or "").strip()
                label = name or vid
                suffix = f" (engine: {eng})" if eng else ""
                print(f"Current TTS voice: {label}{suffix}")
            except Exception:
                print(f"Current TTS voice: {vid}")
        else:
            print("Current TTS voice: piper")
        print(usage)
        return

    if parts[0] == "piper":
        self.current_tts_voice = None
        # Free any heavy cloning engines when switching back to Piper.
        try:
            if hasattr(self.voice_manager, "unload_cloning_engines"):
                self.voice_manager.unload_cloning_engines()
        except Exception:
            pass
        # If Piper was previously unloaded to save memory, reload it now (offline-first).
        try:
            if self.voice_manager and getattr(self.voice_manager, "tts_adapter", None):
                a = getattr(self.voice_manager, "tts_adapter", None)
                if hasattr(a, "is_available") and not bool(a.is_available()):
                    self.voice_manager.set_language(self.current_language)
        except Exception:
            pass
        print("✅ Using Piper (default) voice")
        return

    if parts[0] != "clone" or len(parts) < 2:
        print(usage)
        return

    wanted = parts[1]
    match = self._resolve_clone_id(wanted)
    if not match:
        # Backward compatibility: previously only the first word after `clone`
        # was looked up and trailing words were ignored.
        words = wanted.split()
        if len(words) > 1:
            match = self._resolve_clone_id(words[0])
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    # Do not allow selecting a cloned voice unless the runtime is ready.
    if not self._is_cloning_runtime_ready(voice_id=match):
        print("❌ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    # Allow selecting voices without reference_text; we will auto-fallback at speak-time
    # if the STT model is already cached locally (no downloads in REPL).

    self.current_tts_voice = match
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(match) or {}
        eng = (info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {match}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (e.g. unloading Chroma when switching to F5, or vice-versa).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
|
|
2349
|
+
|
|
2350
|
+
def do_clone_my_voice(self, arg):
    """Interactive voice cloning from microphone.

    This records a short prompt to WAV and adds it to the voice store.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    prompt = "Good evening, Dave."
    seconds = 6.0
    print("You will record a short reference sample for voice cloning.")
    print(f"Please read this aloud (once): {prompt}")
    try:
        input("Press Enter to start recording...")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt means "never mind"; abort cleanly
        # instead of letting the exception escape the command handler.
        print("\nCancelled.")
        return
    try:
        import appdirs
        from pathlib import Path
        from abstractvoice.audio import record_wav

        out_dir = Path(appdirs.user_data_dir("abstractvoice")) / "recordings"
        # Make sure the target directory exists before recording into it.
        # NOTE(review): record_wav may already create parents — harmless either way.
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "my_voice.wav"
        record_wav(out_path, seconds=seconds, sample_rate=24000, channels=1)
        # The prompt the user just read doubles as the reference transcript.
        voice_id = self.voice_manager.clone_voice(str(out_path), name="my_voice", reference_text=prompt)
        print(f"✅ Recorded and cloned: {voice_id}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
    except Exception as e:
        print(f"❌ /clone-my-voice failed: {e}")
|
|
2377
|
+
|
|
2378
|
+
def do_cloning_status(self, arg):
    """Print a local readiness report for the voice-cloning backends.

    Only inspects what is importable or cached on this machine; nothing is
    downloaded. Each section is best-effort and fails silently.
    """
    # --- torch / accelerator summary (skipped entirely if torch is missing) ---
    try:
        import torch

        try:
            has_mps = bool(torch.backends.mps.is_available())
        except Exception:
            has_mps = False
        print(f"torch: {getattr(torch, '__version__', '?')}")
        print(f"cuda_available: {bool(torch.cuda.is_available())}")
        print(f"mps_available: {has_mps}")
    except Exception:
        pass

    print(f"default_cloning_engine: {self.cloning_engine}")

    def missing(module_name):
        # True when the module cannot be located in this environment.
        return importlib.util.find_spec(module_name) is None

    # --- OpenF5 ---
    if missing("f5_tts"):
        print("ℹ️ OpenF5 runtime: not installed (missing: f5_tts)")
        print(' Install: pip install "abstractvoice[cloning]"')
    elif self._is_openf5_cached():
        print("✅ OpenF5 artifacts: present (cached)")
    else:
        print("ℹ️ OpenF5 artifacts: not present (will require ~5.4GB download)")
        print(" Run: /cloning_download f5_tts")

    # --- Chroma ---
    if missing("transformers") or missing("torch"):
        print("ℹ️ Chroma runtime: not installed (missing: transformers/torch)")
        print(' Install: pip install "abstractvoice[chroma]"')
    elif self._is_chroma_cached():
        print("✅ Chroma artifacts: present (cached)")
    else:
        print("ℹ️ Chroma artifacts: not present (will require a large download + HF access)")
        print(" Run: /cloning_download chroma")

    # --- runtime details reported by the voice manager, when available ---
    try:
        if self.voice_manager:
            runtime = self.voice_manager.get_cloning_runtime_info()
            if runtime:
                print(f"cloning_resolved_device: {runtime.get('resolved_device')}")
                print(f"cloning_model_param_device: {runtime.get('model_param_device','?')}")
                print(f"cloning_quality_preset: {runtime.get('quality_preset')}")
    except Exception:
        pass
|
|
2424
|
+
|
|
2425
|
+
def do_clone_quality(self, arg):
    """Choose the speed/quality preset used for cloned-voice synthesis.

    Usage:
      /clone_quality fast|balanced|high
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    choice = (arg or "").strip().lower()
    if choice not in {"fast", "balanced", "high"}:
        print("Usage: /clone_quality fast|balanced|high")
        return

    try:
        manager.set_cloned_tts_quality(choice)
    except Exception as exc:
        print(f"❌ Failed to set preset: {exc}")
    else:
        print(f"✅ Cloned TTS quality preset: {choice}")
|
|
2443
|
+
|
|
2444
|
+
def do_cloning_download(self, arg):
    """Download the artifacts for one cloning engine on explicit request.

    With no argument the session's default cloning engine is used. The
    aliases ``openf5`` and ``f5`` map to ``f5_tts``. This may take a long time.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    requested = (arg or "").strip().lower() or self.cloning_engine
    engine_name = "f5_tts" if requested in ("openf5", "f5", "f5_tts") else requested

    if engine_name not in ("f5_tts", "chroma"):
        print("Usage: /cloning_download [f5_tts|chroma]")
        return

    wants_f5 = engine_name == "f5_tts"
    if wants_f5:
        if importlib.util.find_spec("f5_tts") is None:
            print("❌ OpenF5 runtime not installed in this environment (missing: f5_tts).")
            print(' Install: pip install "abstractvoice[cloning]"')
            return
    elif importlib.util.find_spec("huggingface_hub") is None:
        # Fetching Chroma artifacts goes through huggingface_hub; the model
        # itself does not need to be loadable for the download.
        print("❌ huggingface_hub is required to download Chroma artifacts.")
        print(" Install: pip install huggingface_hub")
        return

    try:
        # Reaches into private helpers as a REPL convenience; the download
        # itself is implemented by the engine object.
        cloner = self.voice_manager._get_voice_cloner()
        backend = cloner._get_engine(engine_name)
        if wants_f5:
            print("Downloading OpenF5 artifacts (~5.4GB). This is a one-time cache per machine.")
            backend.ensure_openf5_artifacts_downloaded()
        else:
            print("Downloading Chroma artifacts (very large; requires HF access). This is a one-time cache per machine.")
            backend.ensure_chroma_artifacts_downloaded()
        print("✅ Download complete.")
    except Exception as exc:
        print(f"❌ Download failed: {exc}")
|
|
2479
|
+
|
|
2480
|
+
def _is_openf5_cached(self) -> bool:
|
|
2481
|
+
"""Heuristic local check that avoids importing huggingface_hub."""
|
|
2482
|
+
from pathlib import Path
|
|
2483
|
+
import os
|
|
2484
|
+
|
|
2485
|
+
root = Path(os.path.expanduser("~/.cache/abstractvoice/openf5"))
|
|
2486
|
+
if not root.exists():
|
|
2487
|
+
return False
|
|
2488
|
+
cfg = next(iter(root.rglob("*.yaml")), None) or next(iter(root.rglob("*.yml")), None)
|
|
2489
|
+
ckpt = next(iter(root.rglob("*.pt")), None)
|
|
2490
|
+
vocab = next(iter(root.rglob("vocab*.txt")), None) or next(iter(root.rglob("*.txt")), None)
|
|
2491
|
+
return bool(cfg and ckpt and vocab)
|
|
2492
|
+
|
|
2493
|
+
def _is_chroma_cached(self) -> bool:
|
|
2494
|
+
"""Heuristic local check that avoids importing huggingface_hub."""
|
|
2495
|
+
from pathlib import Path
|
|
2496
|
+
import os
|
|
2497
|
+
|
|
2498
|
+
root = Path(os.path.expanduser("~/.cache/abstractvoice/chroma"))
|
|
2499
|
+
if not root.exists():
|
|
2500
|
+
return False
|
|
2501
|
+
required = [
|
|
2502
|
+
"config.json",
|
|
2503
|
+
"processor_config.json",
|
|
2504
|
+
"model.safetensors.index.json",
|
|
2505
|
+
"modeling_chroma.py",
|
|
2506
|
+
"processing_chroma.py",
|
|
2507
|
+
"configuration_chroma.py",
|
|
2508
|
+
]
|
|
2509
|
+
return all((root / name).exists() for name in required)
|
|
2510
|
+
|
|
2511
|
+
def _is_cloning_runtime_ready(self, *, voice_id: str | None = None, engine: str | None = None) -> bool:
|
|
2512
|
+
"""Return whether the selected cloning engine is ready locally (no downloads)."""
|
|
2513
|
+
eng = str(engine or "").strip().lower()
|
|
2514
|
+
if not eng and voice_id and self.voice_manager:
|
|
2515
|
+
try:
|
|
2516
|
+
info = self.voice_manager.get_cloned_voice(voice_id)
|
|
2517
|
+
eng = str((info or {}).get("engine") or "").strip().lower()
|
|
2518
|
+
except Exception:
|
|
2519
|
+
eng = ""
|
|
2520
|
+
if not eng:
|
|
2521
|
+
eng = str(getattr(self, "cloning_engine", "f5_tts") or "f5_tts").strip().lower()
|
|
2522
|
+
|
|
2523
|
+
if eng == "chroma":
|
|
2524
|
+
return (
|
|
2525
|
+
importlib.util.find_spec("torch") is not None
|
|
2526
|
+
and importlib.util.find_spec("transformers") is not None
|
|
2527
|
+
and self._is_chroma_cached()
|
|
2528
|
+
)
|
|
2529
|
+
return importlib.util.find_spec("f5_tts") is not None and self._is_openf5_cached()
|
|
2530
|
+
|
|
2531
|
+
def _seed_hal9000_voice(self):
|
|
2532
|
+
"""Seed a default 'hal9000' cloned voice if sample WAVs are present."""
|
|
2533
|
+
if not self.voice_manager:
|
|
2534
|
+
return
|
|
2535
|
+
try:
|
|
2536
|
+
from pathlib import Path
|
|
2537
|
+
|
|
2538
|
+
sample_dir = Path("audio_samples") / "hal9000"
|
|
2539
|
+
if not sample_dir.exists():
|
|
2540
|
+
return
|
|
2541
|
+
|
|
2542
|
+
# If already present, do nothing.
|
|
2543
|
+
existing_hal = None
|
|
2544
|
+
for v in self.voice_manager.list_cloned_voices():
|
|
2545
|
+
if (v.get("name") or "").lower() == "hal9000":
|
|
2546
|
+
existing_hal = v.get("voice_id")
|
|
2547
|
+
break
|
|
2548
|
+
|
|
2549
|
+
# Seed from the clean short WAV sample to avoid noisy auto-transcriptions.
|
|
2550
|
+
# This avoids repeated artifacts like "how are you hal" bleeding into outputs.
|
|
2551
|
+
if existing_hal is None:
|
|
2552
|
+
ref = sample_dir / "hal9000_hello.wav"
|
|
2553
|
+
if ref.exists():
|
|
2554
|
+
existing_hal = self.voice_manager.clone_voice(
|
|
2555
|
+
str(ref),
|
|
2556
|
+
name="hal9000",
|
|
2557
|
+
reference_text="Hello, Dave.",
|
|
2558
|
+
)
|
|
2559
|
+
else:
|
|
2560
|
+
existing_hal = self.voice_manager.clone_voice(str(sample_dir), name="hal9000")
|
|
2561
|
+
if self.debug_mode:
|
|
2562
|
+
print(f"Seeded cloned voice 'hal9000': {existing_hal}")
|
|
2563
|
+
|
|
2564
|
+
# Do NOT auto-select here; selecting a clone without explicit user action
|
|
2565
|
+
# can cause surprise multi-GB downloads. Users can opt in via /tts_voice.
|
|
2566
|
+
except Exception:
|
|
2567
|
+
# Best-effort only; never block REPL start.
|
|
2568
|
+
return
|
|
2569
|
+
|
|
2570
|
+
def do_tts_engine(self, arg):
    """Switch the TTS engine (auto|piper).

    The current VoiceManager, if any, is cleaned up and a new one is built
    with the requested engine.
    """
    choice = arg.strip().lower()
    if choice != "auto" and choice != "piper":
        print("Usage: /tts_engine auto|piper")
        return

    previous = self.voice_manager
    if previous:
        try:
            previous.cleanup()
        except Exception:
            # Cleanup is best-effort; a failure must not block the switch.
            pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=choice,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ TTS engine set to: {choice}")
|
|
2596
|
+
|
|
2597
|
+
def do_aec(self, arg):
    """Enable/disable optional AEC (echo cancellation) for true barge-in.

    Usage:
      /aec on [delay_ms]
      /aec off
      /aec              (show current state)

    The ``on``/``off`` keyword is matched case-insensitively, like the other
    REPL commands. ``delay_ms`` must be an integer and defaults to 0.
    """
    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    usage = "Usage: /aec on [delay_ms] | /aec off"
    parts = (arg or "").strip().split()

    if not parts:
        # No argument: report the current state read from the manager.
        enabled = bool(getattr(self.voice_manager, "_aec_enabled", False))
        delay = int(getattr(self.voice_manager, "_aec_stream_delay_ms", 0))
        print(f"AEC: {'on' if enabled else 'off'} (delay_ms={delay})")
        print(usage)
        return

    action = parts[0].lower()

    if action == "off":
        try:
            self.voice_manager.enable_aec(False)
            print("✅ AEC disabled")
        except Exception as e:
            print(f"❌ AEC disable failed: {e}")
        return

    if action != "on":
        print(usage)
        return

    delay_ms = 0
    if len(parts) > 1:
        try:
            delay_ms = int(parts[1])
        except ValueError:
            # Non-numeric delay: show usage rather than guessing.
            print(usage)
            return

    try:
        self.voice_manager.enable_aec(True, stream_delay_ms=delay_ms)
        print(f"✅ AEC enabled (delay_ms={delay_ms}).")
        print("Tip: use /voice full for barge-in behavior when AEC is enabled.")
    except Exception as e:
        print(f"❌ AEC enable failed: {e}")
|
|
2642
|
+
|
|
2643
|
+
def do_stt_engine(self, arg):
    """Switch the STT engine (auto|faster_whisper|whisper).

    The current VoiceManager is cleaned up and replaced by a new one that
    keeps the existing TTS engine preference.
    """
    choice = arg.strip().lower()
    if choice not in ("auto", "faster_whisper", "whisper"):
        print("Usage: /stt_engine auto|faster_whisper|whisper")
        return

    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    # Carry the TTS engine preference over to the new manager; when the old
    # manager does not expose one, fall back to auto-selection.
    tts_choice = getattr(self.voice_manager, "_tts_engine_preference", "auto")

    try:
        self.voice_manager.cleanup()
    except Exception:
        pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=tts_choice,
        stt_engine=choice,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ STT engine set to: {choice}")
|
|
2677
|
+
|
|
2678
|
+
def do_transcribe(self, arg):
    """Transcribe an audio file through the library STT path and print the text.

    Usage:
      /transcribe path/to/audio.wav

    Notes:
      - Lets you validate STT without capturing from the microphone.
      - faster-whisper is the default engine; legacy openai-whisper remains optional.
    """
    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    audio_path = arg.strip()
    if not audio_path:
        print("Usage: /transcribe <path/to/audio.wav>")
        return

    try:
        transcript = self.voice_manager.transcribe_file(audio_path)
        print(f"{Colors.CYAN}{transcript}{Colors.END}")
    except Exception as exc:
        print(f"❌ Transcription failed: {exc}")
        if self.debug_mode:
            # Full traceback only when debugging.
            import traceback

            traceback.print_exc()
|
|
679
2705
|
|
|
680
2706
|
def do_clear(self, arg):
    """Wipe the chat history via ``_clear_history`` and confirm on stdout."""
    self._clear_history()
    print("History cleared")
|
|
2710
|
+
|
|
2711
|
+
def do_reset(self, arg):
    """Reset the session: stop speech, drop the voice selection, clear history.

    Every voice-manager step is best-effort; failures are ignored so the
    history is always cleared.
    """
    # 1) Stop any ongoing playback.
    try:
        manager = self.voice_manager
        if manager:
            manager.stop_speaking()
    except Exception:
        pass

    # 2) Back to the default (Piper) voice.
    self.current_tts_voice = None

    # 3) Release heavy cloning engines, when the manager supports it.
    try:
        manager = self.voice_manager
        if manager and hasattr(manager, "unload_cloning_engines"):
            manager.unload_cloning_engines()
    except Exception:
        pass

    # 4) Piper may have been unloaded to save memory; re-apply the current
    #    language when the adapter reports itself unavailable.
    try:
        manager = self.voice_manager
        adapter = getattr(manager, "tts_adapter", None) if manager else None
        if adapter and hasattr(adapter, "is_available") and not bool(adapter.is_available()):
            manager.set_language(self.current_language)
    except Exception:
        pass

    # 5) Clear chat history.
    self._clear_history()
    print("✅ Reset.")
|
|
2739
|
+
|
|
2740
|
+
def _clear_history(self) -> None:
|
|
682
2741
|
self.messages = [{"role": "system", "content": self.system_prompt}]
|
|
683
2742
|
# Reset token counters
|
|
684
2743
|
self.system_tokens = 0
|
|
685
2744
|
self.user_tokens = 0
|
|
686
2745
|
self.assistant_tokens = 0
|
|
2746
|
+
# Reset word counters
|
|
2747
|
+
self.system_words = 0
|
|
2748
|
+
self.user_words = 0
|
|
2749
|
+
self.assistant_words = 0
|
|
687
2750
|
# Recalculate system tokens
|
|
688
2751
|
self._count_system_tokens()
|
|
689
|
-
|
|
2752
|
+
self._count_system_words()
|
|
690
2753
|
|
|
691
2754
|
def do_system(self, arg):
    """Show the system prompt, or replace it when an argument is given.

    Setting a new prompt also clears the chat history.
    """
    new_prompt = arg.strip()
    if not new_prompt:
        print(f"Current system prompt: {self.system_prompt}")
        return

    self.system_prompt = new_prompt
    self._clear_history()
    print(f"System prompt set to: {self.system_prompt}")
|
|
699
2762
|
|
|
700
2763
|
def do_exit(self, arg):
    """Shut down voice activity and leave the REPL.

    Returns:
        True, which tells ``cmd.Cmd`` to end the command loop.
    """
    # Mark any push-to-talk session as finished.
    self._ptt_session_active = False
    self._ptt_recording = False
    self._ptt_busy = False

    # Stop listening, stop speaking, then release resources. Each step is
    # best-effort and independent of the others.
    for step in ("stop_listening", "stop_speaking", "cleanup"):
        try:
            if self.voice_manager:
                getattr(self.voice_manager, step)()
        except Exception:
            pass

    if self.debug_mode:
        print("Goodbye!")
    return True
|
|
@@ -781,37 +2867,81 @@ class VoiceREPL(cmd.Cmd):
|
|
|
781
2867
|
|
|
782
2868
|
# If neither voice mode nor TTS is active - don't show any message
|
|
783
2869
|
pass
|
|
2870
|
+
|
|
2871
|
+
def do_verbose(self, arg):
    """Switch verbose per-turn performance stats on or off.

    Usage:
      /verbose          (toggle)
      /verbose on|off
    """
    token = (arg or "").strip().lower()

    if token in ("on", "1", "true", "yes", "y"):
        enabled = True
    elif token in ("off", "0", "false", "no", "n"):
        enabled = False
    elif token in ("", "toggle"):
        # Flip the current state; an unset flag counts as off.
        enabled = not bool(getattr(self, "verbose_mode", False))
    else:
        print("Usage: /verbose [on|off]")
        return

    self.verbose_mode = enabled
    print(f"Verbose mode: {'on' if self.verbose_mode else 'off'}")
|
|
784
2889
|
|
|
785
2890
|
def do_help(self, arg):
    """Print the list of REPL commands and usage notes."""
    help_lines = (
        "Commands:",
        " /exit, /q, /quit Exit REPL",
        " /clear Clear history",
        " /reset Reset (history + voice)",
        " /tts on|off Toggle TTS",
        " /voice <mode> Voice input: off|full|wait|stop|ptt",
        " /voice ptt Push-to-talk session (SPACE captures, ESC exits)",
        " /language <lang> Switch voice language (en, fr, es, de, ru, zh)",
        " /setvoice [id] List Piper voices or set one (lang.voice_id)",
        " /lang_info Show current language information",
        " /list_languages List all supported languages",
        " /speed <number> Set TTS speed (0.5-2.0, default: 1.0, pitch preserved)",
        " /tts_voice ... Select Piper vs cloned voice (see below)",
        " /tts_engine <e> Switch TTS engine: auto|piper",
        " /whisper <model> Switch Whisper model: tiny|base|small|medium|large",
        " /stt_engine <e> Switch STT engine: auto|faster_whisper|whisper (whisper is optional extra)",
        " /speak <text> Speak text (no LLM call)",
        " /transcribe <path> Transcribe an audio file (faster-whisper by default)",
        " /system <prompt> Set system prompt",
        " /stop Stop voice mode or TTS playback",
        " /pause Pause current TTS playback",
        " /resume Resume paused TTS playback",
        " /aec on|off Optional echo cancellation for true barge-in (requires [aec])",
        " /tokens Display token usage stats",
        " /verbose [on|off] Toggle verbose per-turn stats",
        " /help Show this help",
        " /clones List cloned voices",
        " /clone_info <id> Show cloned voice details",
        " /clone_ref <id> Show cloned voice reference text",
        " /clone_rename ... Rename a cloned voice",
        " /clone_rm <id> Delete a cloned voice",
        " /clone_rm_all --yes Delete ALL cloned voices",
        " /clone_export ... Export a cloned voice (.zip)",
        " /clone_import ... Import a cloned voice (.zip)",
        " /clone <path> [nm] Add a cloned voice from WAV/FLAC/OGG",
        " /clone_use <path> Clone+select voice (or reuse)",
        " /clone-my-voice Record a short prompt and clone it",
        " /tts_voice piper Speak with Piper (default)",
        " /tts_voice clone X Speak with a cloned voice (requires cloning runtime + cache)",
        " /cloning_status Show cloning readiness (no downloads)",
        " /cloning_download Explicitly download OpenF5 artifacts (~5.4GB)",
        " /clone_quality Set cloned TTS speed/quality: fast|balanced|high",
        " /save <filename> Save chat history to file",
        " /load <filename> Load chat history from file",
        " /model <name> Change the LLM model",
        " /temperature <val> Set temperature (0.0-2.0, default: 0.7)",
        " /max_tokens <num> Set max tokens (default: 4096)",
        " stop (deprecated) use /voice off or say 'stop' during STOP mode",
        " <message> Send to LLM (text mode)",
        "",  # blank separator line before the notes
        "Note: ALL commands must start with / except 'stop'",
        "In STOP mode, say 'stop' / 'ok stop' to stop speaking (does not exit voice mode).",
        "Shortcut: paste a WAV/FLAC/OGG path to clone+select (optionally: `path | transcript`).",
    )
    for line in help_lines:
        print(line)
|
|
815
2945
|
|
|
816
2946
|
def emptyline(self):
|
|
817
2947
|
"""Handle empty line input."""
|
|
@@ -821,6 +2951,10 @@ class VoiceREPL(cmd.Cmd):
|
|
|
821
2951
|
def do_tokens(self, arg):
|
|
822
2952
|
"""Display token usage information."""
|
|
823
2953
|
try:
|
|
2954
|
+
if self._get_tiktoken_encoding() is None:
|
|
2955
|
+
print("Token counting is not available (install: pip install tiktoken).")
|
|
2956
|
+
return
|
|
2957
|
+
|
|
824
2958
|
# Always recalculate tokens to ensure accuracy
|
|
825
2959
|
self._reset_and_recalculate_tokens()
|
|
826
2960
|
|
|
@@ -998,15 +3132,26 @@ class VoiceREPL(cmd.Cmd):
|
|
|
998
3132
|
print(f"Failed to load chat history from {filename}")
|
|
999
3133
|
|
|
1000
3134
|
def _reset_and_recalculate_tokens(self):
|
|
1001
|
-
"""Reset token counts and recalculate for all messages."""
|
|
3135
|
+
"""Reset token/word counts and recalculate for all messages."""
|
|
1002
3136
|
self.system_tokens = 0
|
|
1003
3137
|
self.user_tokens = 0
|
|
1004
3138
|
self.assistant_tokens = 0
|
|
3139
|
+
self.system_words = 0
|
|
3140
|
+
self.user_words = 0
|
|
3141
|
+
self.assistant_words = 0
|
|
1005
3142
|
|
|
1006
3143
|
# Count tokens for all messages
|
|
1007
3144
|
for msg in self.messages:
|
|
1008
3145
|
if isinstance(msg, dict) and "content" in msg and "role" in msg:
|
|
1009
3146
|
self._count_tokens(msg["content"], msg["role"])
|
|
3147
|
+
w = self._count_words(msg["content"])
|
|
3148
|
+
r = msg.get("role")
|
|
3149
|
+
if r == "system":
|
|
3150
|
+
self.system_words = int(w)
|
|
3151
|
+
elif r == "user":
|
|
3152
|
+
self.user_words += int(w)
|
|
3153
|
+
elif r == "assistant":
|
|
3154
|
+
self.assistant_words += int(w)
|
|
1010
3155
|
|
|
1011
3156
|
def _ensure_system_message(self):
|
|
1012
3157
|
"""Ensure there's a system message at the start of messages."""
|
|
@@ -1070,13 +3215,30 @@ def parse_args():
|
|
|
1070
3215
|
"""Parse command line arguments."""
|
|
1071
3216
|
parser = argparse.ArgumentParser(description="AbstractVoice CLI Example")
|
|
1072
3217
|
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
3218
|
+
parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
|
|
1073
3219
|
parser.add_argument("--api", default="http://localhost:11434/api/chat",
|
|
1074
3220
|
help="LLM API URL")
|
|
1075
|
-
parser.add_argument("--model", default="
|
|
3221
|
+
parser.add_argument("--model", default="cogito:3b",
|
|
1076
3222
|
help="LLM model name")
|
|
1077
|
-
parser.add_argument(
|
|
1078
|
-
|
|
1079
|
-
|
|
3223
|
+
parser.add_argument(
|
|
3224
|
+
"--cloning-engine",
|
|
3225
|
+
default="f5_tts",
|
|
3226
|
+
choices=["f5_tts", "chroma"],
|
|
3227
|
+
help="Default cloning backend for new voices (f5_tts|chroma)",
|
|
3228
|
+
)
|
|
3229
|
+
parser.add_argument(
|
|
3230
|
+
"--voice-mode",
|
|
3231
|
+
default="off",
|
|
3232
|
+
choices=["off", "wait", "stop", "full", "ptt"],
|
|
3233
|
+
help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
|
|
3234
|
+
)
|
|
3235
|
+
parser.add_argument(
|
|
3236
|
+
"--language",
|
|
3237
|
+
"--lang",
|
|
3238
|
+
default="en",
|
|
3239
|
+
choices=["en", "fr", "de", "es", "ru", "zh"],
|
|
3240
|
+
help="Voice language for default Piper TTS (en|fr|de|es|ru|zh).",
|
|
3241
|
+
)
|
|
1080
3242
|
parser.add_argument("--tts-model",
|
|
1081
3243
|
help="Specific TTS model to use (overrides language default)")
|
|
1082
3244
|
return parser.parse_args()
|
|
@@ -1093,8 +3255,11 @@ def main():
|
|
|
1093
3255
|
api_url=args.api,
|
|
1094
3256
|
model=args.model,
|
|
1095
3257
|
debug_mode=args.debug,
|
|
3258
|
+
verbose_mode=args.verbose,
|
|
1096
3259
|
language=args.language,
|
|
1097
|
-
tts_model=args.tts_model
|
|
3260
|
+
tts_model=args.tts_model,
|
|
3261
|
+
voice_mode=args.voice_mode,
|
|
3262
|
+
cloning_engine=args.cloning_engine,
|
|
1098
3263
|
)
|
|
1099
3264
|
repl.cmdloop()
|
|
1100
3265
|
except KeyboardInterrupt:
|
|
@@ -1104,4 +3269,4 @@ def main():
|
|
|
1104
3269
|
|
|
1105
3270
|
|
|
1106
3271
|
if __name__ == "__main__":
|
|
1107
|
-
main()
|
|
3272
|
+
main()
|