abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
|
@@ -8,9 +8,15 @@ that interacts with an LLM API for text generation.
|
|
|
8
8
|
|
|
9
9
|
import argparse
|
|
10
10
|
import cmd
|
|
11
|
+
import atexit
|
|
11
12
|
import json
|
|
12
13
|
import re
|
|
14
|
+
import shlex
|
|
15
|
+
import shutil
|
|
13
16
|
import sys
|
|
17
|
+
import importlib.util
|
|
18
|
+
import threading
|
|
19
|
+
import time
|
|
14
20
|
import requests
|
|
15
21
|
from abstractvoice import VoiceManager
|
|
16
22
|
|
|
@@ -31,18 +37,34 @@ class VoiceREPL(cmd.Cmd):
|
|
|
31
37
|
"""Voice-enabled REPL for LLM interaction."""
|
|
32
38
|
|
|
33
39
|
intro = "" # Will be set in __init__ to include help
|
|
34
|
-
prompt =
|
|
40
|
+
prompt = "> "
|
|
35
41
|
|
|
36
42
|
# Override cmd module settings
|
|
37
43
|
ruler = "" # No horizontal rule line
|
|
38
44
|
use_rawinput = True
|
|
39
45
|
|
|
40
|
-
def __init__(
|
|
41
|
-
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
api_url="http://localhost:11434/api/chat",
|
|
49
|
+
model="cogito:3b",
|
|
50
|
+
debug_mode=False,
|
|
51
|
+
verbose_mode: bool = False,
|
|
52
|
+
language="en",
|
|
53
|
+
tts_model=None,
|
|
54
|
+
voice_mode: str = "off",
|
|
55
|
+
disable_tts=False,
|
|
56
|
+
cloning_engine: str = "f5_tts",
|
|
57
|
+
):
|
|
42
58
|
super().__init__()
|
|
43
59
|
|
|
60
|
+
# Best-effort: enable proper line editing + history (Up/Down arrows).
|
|
61
|
+
# Some Python builds (notably when built without readline/libedit) will
|
|
62
|
+
# otherwise treat arrow keys as escape sequences and corrupt the prompt.
|
|
63
|
+
self._init_readline()
|
|
64
|
+
|
|
44
65
|
# Debug mode
|
|
45
66
|
self.debug_mode = debug_mode
|
|
67
|
+
self.verbose_mode = bool(verbose_mode)
|
|
46
68
|
|
|
47
69
|
# API settings
|
|
48
70
|
self.api_url = api_url
|
|
@@ -52,6 +74,8 @@ class VoiceREPL(cmd.Cmd):
|
|
|
52
74
|
|
|
53
75
|
# Language settings
|
|
54
76
|
self.current_language = language
|
|
77
|
+
self._initial_tts_model = tts_model
|
|
78
|
+
self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
|
|
55
79
|
|
|
56
80
|
# Initialize voice manager with language support
|
|
57
81
|
if disable_tts:
|
|
@@ -61,19 +85,36 @@ class VoiceREPL(cmd.Cmd):
|
|
|
61
85
|
self.voice_manager = VoiceManager(
|
|
62
86
|
language=language,
|
|
63
87
|
tts_model=tts_model,
|
|
64
|
-
debug_mode=debug_mode
|
|
88
|
+
debug_mode=debug_mode,
|
|
89
|
+
allow_downloads=False,
|
|
90
|
+
cloned_tts_streaming=False,
|
|
91
|
+
cloning_engine=self.cloning_engine,
|
|
65
92
|
)
|
|
93
|
+
|
|
94
|
+
# Current speaking voice:
|
|
95
|
+
# - None => Piper (default, language-driven)
|
|
96
|
+
# - str => cloned voice_id
|
|
97
|
+
self.current_tts_voice: str | None = None
|
|
98
|
+
|
|
99
|
+
# When reference_text is auto-generated via ASR ("asr" source), print a
|
|
100
|
+
# ready-to-copy `/clone_set_ref_text ...` hint once per voice for easy correction.
|
|
101
|
+
self._printed_asr_ref_text_hint: set[str] = set()
|
|
102
|
+
|
|
103
|
+
# Seed a default cloned voice (HAL9000) if samples are present.
|
|
104
|
+
self._seed_hal9000_voice()
|
|
66
105
|
|
|
67
106
|
# Settings
|
|
68
107
|
self.use_tts = True
|
|
69
|
-
|
|
108
|
+
# Voice input mode (mic). Default: OFF for fast startup + offline-first.
|
|
109
|
+
# Use `--voice-mode stop` (or `/voice stop`) to enable hands-free.
|
|
110
|
+
self.voice_mode = (voice_mode or "off").strip().lower() # off, full, wait, stop, ptt
|
|
70
111
|
self.voice_mode_active = False # Is voice recognition running?
|
|
112
|
+
self._ptt_session_active = False
|
|
113
|
+
self._ptt_recording = False
|
|
114
|
+
self._ptt_busy = False
|
|
71
115
|
|
|
72
116
|
# System prompt
|
|
73
|
-
self.system_prompt = ""
|
|
74
|
-
You are a Helpful Voice Assistant. By design, your answers are short and more conversational, unless specifically asked to detail something.
|
|
75
|
-
You only speak, so never use any text formatting or markdown. Write for a speaker.
|
|
76
|
-
"""
|
|
117
|
+
self.system_prompt = "You are a Helpful Voice Assistant. By design, your answers are short and conversational, unless specifically asked to detail something. You only speak, so never use any text formatting, hinting, *emotions*, emojis or markdown. Incarnate the speaker, never comment your instructions."
|
|
77
118
|
|
|
78
119
|
# Message history
|
|
79
120
|
self.messages = [{"role": "system", "content": self.system_prompt}]
|
|
@@ -82,27 +123,136 @@ class VoiceREPL(cmd.Cmd):
|
|
|
82
123
|
self.system_tokens = 0
|
|
83
124
|
self.user_tokens = 0
|
|
84
125
|
self.assistant_tokens = 0
|
|
126
|
+
# LLM token totals (best-effort, Ollama API `eval_count`).
|
|
127
|
+
self.total_llm_out_tokens = 0
|
|
128
|
+
# Word counting
|
|
129
|
+
self.system_words = 0
|
|
130
|
+
self.user_words = 0
|
|
131
|
+
self.assistant_words = 0
|
|
132
|
+
# Best-effort tokenizer cache (tiktoken optional).
|
|
133
|
+
self._tiktoken_encoding = None
|
|
134
|
+
self._tiktoken_unavailable = False
|
|
85
135
|
self._count_system_tokens()
|
|
86
|
-
|
|
136
|
+
self._count_system_words()
|
|
137
|
+
|
|
138
|
+
# Best-effort metrics captured from voice input paths.
|
|
139
|
+
self._pending_stt_metrics: dict | None = None
|
|
140
|
+
|
|
87
141
|
if self.debug_mode:
|
|
88
142
|
print(f"Initialized with API URL: {api_url}")
|
|
89
143
|
print(f"Using model: {model}")
|
|
90
|
-
|
|
144
|
+
|
|
145
|
+
# Optionally auto-start voice input (mic). Keep OFF by default to avoid
|
|
146
|
+
# loading STT models (slow) unless the user explicitly opts in.
|
|
147
|
+
if self.voice_manager and self.voice_mode and self.voice_mode != "off":
|
|
148
|
+
try:
|
|
149
|
+
self.do_voice(self.voice_mode)
|
|
150
|
+
except Exception:
|
|
151
|
+
# Never block REPL start.
|
|
152
|
+
self.voice_mode = "off"
|
|
153
|
+
self.voice_mode_active = False
|
|
154
|
+
|
|
91
155
|
# Set intro with help information
|
|
92
156
|
self.intro = self._get_intro()
|
|
157
|
+
|
|
158
|
+
def _init_readline(self) -> None:
|
|
159
|
+
"""Initialize readline history + make ANSI prompts safe (best-effort)."""
|
|
160
|
+
rl = None
|
|
161
|
+
try:
|
|
162
|
+
import readline as _readline # type: ignore
|
|
163
|
+
|
|
164
|
+
rl = _readline
|
|
165
|
+
except Exception:
|
|
166
|
+
# Windows users may have pyreadline3 installed.
|
|
167
|
+
try:
|
|
168
|
+
import pyreadline3 as _readline # type: ignore
|
|
169
|
+
|
|
170
|
+
rl = _readline
|
|
171
|
+
except Exception:
|
|
172
|
+
rl = None
|
|
173
|
+
|
|
174
|
+
if rl is None:
|
|
175
|
+
# Keep prompt simple and avoid ANSI; prevents strange cursor behavior
|
|
176
|
+
# when arrow keys emit escape codes in cooked terminals.
|
|
177
|
+
self.prompt = "> "
|
|
178
|
+
return
|
|
179
|
+
|
|
180
|
+
# Keep prompt plain when readline is enabled. ANSI prompts are fragile
|
|
181
|
+
# across readline/libedit builds and can corrupt redraw/history behavior.
|
|
182
|
+
self.prompt = "> "
|
|
183
|
+
|
|
184
|
+
# Persist history across sessions (best-effort).
|
|
185
|
+
try:
|
|
186
|
+
from pathlib import Path
|
|
187
|
+
|
|
188
|
+
try:
|
|
189
|
+
import appdirs
|
|
190
|
+
|
|
191
|
+
hist_dir = Path(appdirs.user_data_dir("abstractvoice"))
|
|
192
|
+
except Exception:
|
|
193
|
+
hist_dir = Path.home() / ".abstractvoice"
|
|
194
|
+
|
|
195
|
+
hist_dir.mkdir(parents=True, exist_ok=True)
|
|
196
|
+
hist_path = hist_dir / "repl_history"
|
|
197
|
+
|
|
198
|
+
try:
|
|
199
|
+
rl.read_history_file(str(hist_path))
|
|
200
|
+
except FileNotFoundError:
|
|
201
|
+
pass
|
|
202
|
+
except Exception:
|
|
203
|
+
pass
|
|
204
|
+
|
|
205
|
+
try:
|
|
206
|
+
rl.set_history_length(2000)
|
|
207
|
+
except Exception:
|
|
208
|
+
pass
|
|
209
|
+
|
|
210
|
+
def _save_history():
|
|
211
|
+
try:
|
|
212
|
+
rl.write_history_file(str(hist_path))
|
|
213
|
+
except Exception:
|
|
214
|
+
pass
|
|
215
|
+
|
|
216
|
+
atexit.register(_save_history)
|
|
217
|
+
except Exception:
|
|
218
|
+
pass
|
|
219
|
+
|
|
220
|
+
# Ensure Up/Down arrows traverse history reliably across GNU readline and
|
|
221
|
+
# macOS libedit-backed readline. Some libedit defaults perform prefix
|
|
222
|
+
# search/completion, which can look like text is being appended.
|
|
223
|
+
try:
|
|
224
|
+
doc = getattr(rl, "__doc__", "") or ""
|
|
225
|
+
is_libedit = "libedit" in doc.lower()
|
|
226
|
+
if is_libedit:
|
|
227
|
+
# libedit syntax
|
|
228
|
+
rl.parse_and_bind("bind ^[[A ed-prev-history")
|
|
229
|
+
rl.parse_and_bind("bind ^[[B ed-next-history")
|
|
230
|
+
rl.parse_and_bind("bind ^[[OA ed-prev-history")
|
|
231
|
+
rl.parse_and_bind("bind ^[[OB ed-next-history")
|
|
232
|
+
else:
|
|
233
|
+
# GNU readline syntax
|
|
234
|
+
rl.parse_and_bind('"\\e[A": previous-history')
|
|
235
|
+
rl.parse_and_bind('"\\e[B": next-history')
|
|
236
|
+
rl.parse_and_bind('"\\eOA": previous-history')
|
|
237
|
+
rl.parse_and_bind('"\\eOB": next-history')
|
|
238
|
+
except Exception:
|
|
239
|
+
pass
|
|
93
240
|
|
|
94
241
|
def _get_intro(self):
|
|
95
242
|
"""Generate intro message with help."""
|
|
96
243
|
intro = f"\n{Colors.BOLD}Welcome to AbstractVoice CLI REPL{Colors.END}\n"
|
|
97
244
|
if self.voice_manager:
|
|
98
245
|
lang_name = self.voice_manager.get_language_name()
|
|
99
|
-
|
|
246
|
+
mic = (self.voice_mode or "off").upper()
|
|
247
|
+
intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name} | Mic: {mic} | Cloning: {self.cloning_engine}\n"
|
|
100
248
|
else:
|
|
101
249
|
intro += f"API: {self.api_url} | Model: {self.model} | Voice: Disabled\n"
|
|
102
250
|
intro += f"\n{Colors.CYAN}Quick Start:{Colors.END}\n"
|
|
103
251
|
intro += " • Type messages to chat with the LLM\n"
|
|
104
|
-
intro += " •
|
|
252
|
+
intro += " • Voice input (mic): off by default. Enable: /voice stop (or start with --voice-mode stop)\n"
|
|
253
|
+
intro += " • PTT: /voice ptt then SPACE to capture (ESC exits)\n"
|
|
105
254
|
intro += " • Use /language <lang> to switch voice language\n"
|
|
255
|
+
intro += " • Use /clones and /tts_voice to use cloned voices\n"
|
|
106
256
|
intro += " • Type /help for full command list\n"
|
|
107
257
|
intro += " • Type /exit or /q to quit\n"
|
|
108
258
|
return intro
|
|
@@ -110,6 +260,236 @@ class VoiceREPL(cmd.Cmd):
|
|
|
110
260
|
def _count_system_tokens(self):
|
|
111
261
|
"""Count tokens in the system prompt."""
|
|
112
262
|
self._count_tokens(self.system_prompt, "system")
|
|
263
|
+
|
|
264
|
+
def _count_system_words(self):
|
|
265
|
+
self.system_words = self._count_words(self.system_prompt)
|
|
266
|
+
|
|
267
|
+
def _count_words(self, text: str) -> int:
|
|
268
|
+
s = str(text or "").strip()
|
|
269
|
+
if not s:
|
|
270
|
+
return 0
|
|
271
|
+
# A "word" here is whitespace-delimited for simplicity across languages.
|
|
272
|
+
return len([w for w in re.split(r"\s+", s) if w])
|
|
273
|
+
|
|
274
|
+
def _get_tiktoken_encoding(self):
|
|
275
|
+
if getattr(self, "_tiktoken_unavailable", False):
|
|
276
|
+
return None
|
|
277
|
+
enc = getattr(self, "_tiktoken_encoding", None)
|
|
278
|
+
if enc is not None:
|
|
279
|
+
return enc
|
|
280
|
+
try:
|
|
281
|
+
import tiktoken
|
|
282
|
+
except ImportError:
|
|
283
|
+
self._tiktoken_unavailable = True
|
|
284
|
+
return None
|
|
285
|
+
|
|
286
|
+
try:
|
|
287
|
+
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
|
288
|
+
except Exception:
|
|
289
|
+
try:
|
|
290
|
+
enc = tiktoken.get_encoding("cl100k_base")
|
|
291
|
+
except Exception:
|
|
292
|
+
self._tiktoken_unavailable = True
|
|
293
|
+
return None
|
|
294
|
+
|
|
295
|
+
self._tiktoken_encoding = enc
|
|
296
|
+
return enc
|
|
297
|
+
|
|
298
|
+
def _fmt_s(self, seconds: float | None) -> str:
|
|
299
|
+
try:
|
|
300
|
+
if seconds is None:
|
|
301
|
+
return "--"
|
|
302
|
+
s = float(seconds)
|
|
303
|
+
if s < 0:
|
|
304
|
+
return "--"
|
|
305
|
+
# Keep it compact but readable.
|
|
306
|
+
if s < 10:
|
|
307
|
+
return f"{s:.2f}s"
|
|
308
|
+
return f"{s:.1f}s"
|
|
309
|
+
except Exception:
|
|
310
|
+
return "--"
|
|
311
|
+
|
|
312
|
+
def _fmt_num(self, x: float | None, *, digits: int = 2) -> str:
|
|
313
|
+
try:
|
|
314
|
+
if x is None:
|
|
315
|
+
return "--"
|
|
316
|
+
return f"{float(x):.{int(digits)}f}"
|
|
317
|
+
except Exception:
|
|
318
|
+
return "--"
|
|
319
|
+
|
|
320
|
+
def _fmt_wtok(self, words: int | None, tokens: int | None) -> str:
|
|
321
|
+
w = int(words) if isinstance(words, int) else (int(words) if words is not None else 0)
|
|
322
|
+
if isinstance(tokens, int):
|
|
323
|
+
return f"{w}w/{int(tokens)}tok"
|
|
324
|
+
return f"{w}w/--tok"
|
|
325
|
+
|
|
326
|
+
def _summarize_audio_source(self, source: str) -> tuple[int | None, float | None]:
|
|
327
|
+
"""Best-effort: return (file_count, total_seconds) for an audio source path."""
|
|
328
|
+
try:
|
|
329
|
+
from pathlib import Path
|
|
330
|
+
|
|
331
|
+
p = Path(str(source)).expanduser()
|
|
332
|
+
except Exception:
|
|
333
|
+
return None, None
|
|
334
|
+
|
|
335
|
+
try:
|
|
336
|
+
import soundfile as sf
|
|
337
|
+
except Exception:
|
|
338
|
+
return None, None
|
|
339
|
+
|
|
340
|
+
supported = {".wav", ".flac", ".ogg"}
|
|
341
|
+
files = []
|
|
342
|
+
try:
|
|
343
|
+
if p.is_file():
|
|
344
|
+
files = [p]
|
|
345
|
+
elif p.is_dir():
|
|
346
|
+
files = sorted([x for x in p.iterdir() if x.is_file() and x.suffix.lower() in supported])
|
|
347
|
+
else:
|
|
348
|
+
return None, None
|
|
349
|
+
except Exception:
|
|
350
|
+
return None, None
|
|
351
|
+
|
|
352
|
+
total_s = 0.0
|
|
353
|
+
max_files = 25
|
|
354
|
+
for fp in files[:max_files]:
|
|
355
|
+
try:
|
|
356
|
+
info = sf.info(str(fp))
|
|
357
|
+
d = float(getattr(info, "duration", 0.0) or 0.0)
|
|
358
|
+
if d > 0:
|
|
359
|
+
total_s += d
|
|
360
|
+
except Exception:
|
|
361
|
+
continue
|
|
362
|
+
|
|
363
|
+
# If there are too many files, the displayed duration is a lower bound.
|
|
364
|
+
return (int(len(files)) if files else 0), (float(total_s) if total_s > 0 else None)
|
|
365
|
+
|
|
366
|
+
def _print_verbose_turn_stats(self, turn: dict) -> None:
|
|
367
|
+
if not bool(getattr(self, "verbose_mode", False)):
|
|
368
|
+
return
|
|
369
|
+
if not isinstance(turn, dict):
|
|
370
|
+
return
|
|
371
|
+
|
|
372
|
+
stt = turn.get("stt") if isinstance(turn.get("stt"), dict) else None
|
|
373
|
+
llm = turn.get("llm") if isinstance(turn.get("llm"), dict) else {}
|
|
374
|
+
counts = turn.get("counts") if isinstance(turn.get("counts"), dict) else {}
|
|
375
|
+
tts = turn.get("tts") if isinstance(turn.get("tts"), dict) else None
|
|
376
|
+
|
|
377
|
+
in_w = counts.get("in_words")
|
|
378
|
+
out_w = counts.get("out_words")
|
|
379
|
+
in_t = counts.get("in_tokens")
|
|
380
|
+
out_t = counts.get("out_tokens")
|
|
381
|
+
|
|
382
|
+
llm_s = llm.get("s")
|
|
383
|
+
api = llm.get("api") if isinstance(llm.get("api"), dict) else {}
|
|
384
|
+
api_prompt_tok = api.get("prompt_eval_count") if isinstance(api.get("prompt_eval_count"), int) else None
|
|
385
|
+
api_out_tok = api.get("eval_count") if isinstance(api.get("eval_count"), int) else None
|
|
386
|
+
|
|
387
|
+
# Line 1: STT (if any) + LLM + in/out counts and written speed.
|
|
388
|
+
parts1 = []
|
|
389
|
+
if stt:
|
|
390
|
+
stt_s = stt.get("stt_s")
|
|
391
|
+
stt_a = stt.get("audio_s")
|
|
392
|
+
stt_rtf = stt.get("rtf")
|
|
393
|
+
stt_txt = f"STT {self._fmt_s(stt_s)}"
|
|
394
|
+
if stt_a:
|
|
395
|
+
stt_txt += f"(a{self._fmt_s(stt_a)})"
|
|
396
|
+
if stt_rtf is not None:
|
|
397
|
+
stt_txt += f" rtf{self._fmt_num(stt_rtf, digits=2)}"
|
|
398
|
+
parts1.append(stt_txt)
|
|
399
|
+
|
|
400
|
+
if llm_s is not None or api_prompt_tok is not None or api_out_tok is not None:
|
|
401
|
+
llm_txt = f"LLM {self._fmt_s(llm_s)}"
|
|
402
|
+
if api_prompt_tok is not None or api_out_tok is not None:
|
|
403
|
+
p = str(api_prompt_tok) if api_prompt_tok is not None else "--"
|
|
404
|
+
o = str(api_out_tok) if api_out_tok is not None else "--"
|
|
405
|
+
llm_txt += f" (api p{p} o{o})"
|
|
406
|
+
parts1.append(llm_txt)
|
|
407
|
+
|
|
408
|
+
in_txt = f"in {self._fmt_wtok(in_w, in_t)}"
|
|
409
|
+
out_txt = f"out {self._fmt_wtok(out_w, out_t)}"
|
|
410
|
+
|
|
411
|
+
wps_written = None
|
|
412
|
+
try:
|
|
413
|
+
if isinstance(out_w, int) and out_w > 0 and llm_s and float(llm_s) > 0:
|
|
414
|
+
wps_written = float(out_w) / float(llm_s)
|
|
415
|
+
except Exception:
|
|
416
|
+
wps_written = None
|
|
417
|
+
|
|
418
|
+
if wps_written is not None:
|
|
419
|
+
out_txt += f" ({self._fmt_num(wps_written, digits=1)}w/s)"
|
|
420
|
+
|
|
421
|
+
parts1.append(in_txt)
|
|
422
|
+
parts1.append(out_txt)
|
|
423
|
+
|
|
424
|
+
line1 = " | ".join(parts1)
|
|
425
|
+
|
|
426
|
+
# Line 2: TTS (if any) + spoken speed + totals.
|
|
427
|
+
parts2 = []
|
|
428
|
+
if self.voice_manager and self.use_tts:
|
|
429
|
+
if not tts:
|
|
430
|
+
parts2.append("TTS --")
|
|
431
|
+
else:
|
|
432
|
+
eng = str(tts.get("engine") or "").strip().lower()
|
|
433
|
+
if eng == "clone":
|
|
434
|
+
ce = tts.get("clone_engine")
|
|
435
|
+
label = f"clone[{ce}]" if ce else "clone"
|
|
436
|
+
elif eng:
|
|
437
|
+
label = eng
|
|
438
|
+
else:
|
|
439
|
+
label = "tts"
|
|
440
|
+
|
|
441
|
+
err = (tts.get("error") or "").strip()
|
|
442
|
+
if err:
|
|
443
|
+
# Keep single-line and short.
|
|
444
|
+
msg = " ".join(err.split())
|
|
445
|
+
if len(msg) > 120:
|
|
446
|
+
msg = msg[:120].rstrip() + "…"
|
|
447
|
+
parts2.append(f"TTS {label} ERR {msg}")
|
|
448
|
+
else:
|
|
449
|
+
synth_s = tts.get("synth_s")
|
|
450
|
+
audio_s = tts.get("audio_s")
|
|
451
|
+
rtf = tts.get("rtf")
|
|
452
|
+
tts_txt = f"TTS {label} {self._fmt_s(synth_s)}→{self._fmt_s(audio_s)}"
|
|
453
|
+
if rtf is not None:
|
|
454
|
+
tts_txt += f" rtf{self._fmt_num(rtf, digits=2)}"
|
|
455
|
+
|
|
456
|
+
# Extra clone streaming details when available.
|
|
457
|
+
if eng == "clone" and bool(tts.get("streaming")):
|
|
458
|
+
ttfb_s = tts.get("ttfb_s")
|
|
459
|
+
if ttfb_s is not None:
|
|
460
|
+
tts_txt += f" ttfb{self._fmt_s(ttfb_s)}"
|
|
461
|
+
ch = tts.get("chunks")
|
|
462
|
+
if isinstance(ch, int):
|
|
463
|
+
tts_txt += f" ch{ch}"
|
|
464
|
+
|
|
465
|
+
wps_spoken = None
|
|
466
|
+
try:
|
|
467
|
+
if isinstance(out_w, int) and out_w > 0 and audio_s and float(audio_s) > 0:
|
|
468
|
+
wps_spoken = float(out_w) / float(audio_s)
|
|
469
|
+
except Exception:
|
|
470
|
+
wps_spoken = None
|
|
471
|
+
if wps_spoken is not None:
|
|
472
|
+
tts_txt += f" ({self._fmt_num(wps_spoken, digits=1)}w/s)"
|
|
473
|
+
|
|
474
|
+
parts2.append(tts_txt)
|
|
475
|
+
else:
|
|
476
|
+
parts2.append("TTS off")
|
|
477
|
+
|
|
478
|
+
total_words = int(getattr(self, "system_words", 0) + getattr(self, "user_words", 0) + getattr(self, "assistant_words", 0))
|
|
479
|
+
total_tokens = None
|
|
480
|
+
if self._get_tiktoken_encoding() is not None:
|
|
481
|
+
total_tokens = int(getattr(self, "system_tokens", 0) + getattr(self, "user_tokens", 0) + getattr(self, "assistant_tokens", 0))
|
|
482
|
+
|
|
483
|
+
tot_txt = f"tot {self._fmt_wtok(total_words, total_tokens)}"
|
|
484
|
+
if isinstance(getattr(self, "total_llm_out_tokens", None), int) and getattr(self, "total_llm_out_tokens") > 0:
|
|
485
|
+
tot_txt += f" (api out {int(getattr(self, 'total_llm_out_tokens'))}tok)"
|
|
486
|
+
parts2.append(tot_txt)
|
|
487
|
+
|
|
488
|
+
line2 = " | ".join(parts2)
|
|
489
|
+
|
|
490
|
+
# Keep it readable; two lines max.
|
|
491
|
+
print(f"{Colors.YELLOW}{line1}{Colors.END}")
|
|
492
|
+
print(f"{Colors.YELLOW}{line2}{Colors.END}")
|
|
113
493
|
|
|
114
494
|
def parseline(self, line):
|
|
115
495
|
"""Parse the line to extract command and arguments.
|
|
@@ -117,14 +497,11 @@ class VoiceREPL(cmd.Cmd):
|
|
|
117
497
|
Override to handle / prefix for commands. This ensures /voice, /help, etc.
|
|
118
498
|
are recognized as commands by stripping the leading / before parsing.
|
|
119
499
|
"""
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
# Call parent parseline to do the actual parsing
|
|
127
|
-
return super().parseline(line)
|
|
500
|
+
# Commands still use leading "/". In PTT mode we don't accept typed input.
|
|
501
|
+
s = line.strip()
|
|
502
|
+
if s.startswith("/"):
|
|
503
|
+
return super().parseline(s[1:].strip())
|
|
504
|
+
return super().parseline(line.strip())
|
|
128
505
|
|
|
129
506
|
def default(self, line):
|
|
130
507
|
"""Handle regular text input.
|
|
@@ -133,29 +510,123 @@ class VoiceREPL(cmd.Cmd):
|
|
|
133
510
|
All other commands MUST use / prefix.
|
|
134
511
|
"""
|
|
135
512
|
# Skip empty lines
|
|
136
|
-
|
|
513
|
+
text = line.strip()
|
|
514
|
+
if not text:
|
|
137
515
|
return
|
|
138
516
|
|
|
139
|
-
#
|
|
140
|
-
if
|
|
141
|
-
|
|
142
|
-
|
|
517
|
+
# In PTT mode we do not accept typed input.
|
|
518
|
+
if self.voice_mode == "ptt":
|
|
519
|
+
print("PTT mode: press SPACE to speak, ESC to exit.")
|
|
520
|
+
return
|
|
521
|
+
|
|
143
522
|
# Check if in voice mode - don't send to LLM
|
|
144
523
|
if self.voice_mode_active:
|
|
145
524
|
if self.debug_mode:
|
|
146
|
-
print(f"Voice mode active ({self.voice_mode}). Use /voice off
|
|
525
|
+
print(f"Voice mode active ({self.voice_mode}). Use /voice off to disable.")
|
|
526
|
+
return
|
|
527
|
+
|
|
528
|
+
# Interrupt any ongoing TTS playback immediately when the user types.
|
|
529
|
+
# This is the expected “barge-in by typing” UX for a REPL.
|
|
530
|
+
try:
|
|
531
|
+
if self.voice_manager:
|
|
532
|
+
self.voice_manager.stop_speaking()
|
|
533
|
+
except Exception:
|
|
534
|
+
pass
|
|
535
|
+
|
|
536
|
+
# Shortcut: paste a reference audio path to clone+use a voice.
|
|
537
|
+
# Examples:
|
|
538
|
+
# audio_samples/hal9000/hal9000_hello.wav
|
|
539
|
+
# audio_samples/hal9000/hal9000_hello.wav | Hello, Dave.
|
|
540
|
+
if self._maybe_handle_clone_shortcut(text):
|
|
147
541
|
return
|
|
148
542
|
|
|
149
543
|
# Everything else goes to LLM
|
|
150
|
-
self.
|
|
544
|
+
self._pending_stt_metrics = None
|
|
545
|
+
self.process_query(text)
|
|
546
|
+
|
|
547
|
+
# NOTE: PTT is implemented as a dedicated key-loop session (no typing).
|
|
548
|
+
|
|
549
|
+
def _maybe_handle_clone_shortcut(self, text: str) -> bool:
|
|
550
|
+
"""Best-effort: treat a pasted WAV/FLAC/OGG path as `/clone_use`."""
|
|
551
|
+
if not self.voice_manager:
|
|
552
|
+
return False
|
|
553
|
+
|
|
554
|
+
raw = (text or "").strip()
|
|
555
|
+
if not raw:
|
|
556
|
+
return False
|
|
557
|
+
if raw.startswith("/"):
|
|
558
|
+
return False
|
|
559
|
+
|
|
560
|
+
# Optional transcript with a simple pipe syntax:
|
|
561
|
+
# path.wav | Hello.
|
|
562
|
+
left, sep, right = raw.partition("|")
|
|
563
|
+
path_str = left.strip()
|
|
564
|
+
ref_text = right.strip() if sep else ""
|
|
565
|
+
reference_text = ref_text or None
|
|
566
|
+
|
|
567
|
+
# Strip naive wrapping quotes.
|
|
568
|
+
if (path_str.startswith('"') and path_str.endswith('"')) or (path_str.startswith("'") and path_str.endswith("'")):
|
|
569
|
+
path_str = path_str[1:-1].strip()
|
|
570
|
+
|
|
571
|
+
try:
|
|
572
|
+
from pathlib import Path
|
|
573
|
+
|
|
574
|
+
p = Path(path_str).expanduser()
|
|
575
|
+
except Exception:
|
|
576
|
+
return False
|
|
577
|
+
|
|
578
|
+
if not p.exists():
|
|
579
|
+
return False
|
|
580
|
+
|
|
581
|
+
exts = {".wav", ".flac", ".ogg"}
|
|
582
|
+
if p.is_file() and p.suffix.lower() not in exts:
|
|
583
|
+
return False
|
|
584
|
+
if p.is_dir():
|
|
585
|
+
try:
|
|
586
|
+
has_audio = any(x.is_file() and x.suffix.lower() in exts for x in p.iterdir())
|
|
587
|
+
except Exception:
|
|
588
|
+
has_audio = False
|
|
589
|
+
if not has_audio:
|
|
590
|
+
return False
|
|
591
|
+
|
|
592
|
+
# Build a `/clone_use` call with a stable name.
|
|
593
|
+
import shlex as _shlex
|
|
594
|
+
|
|
595
|
+
default_name = p.stem if p.is_file() else p.name
|
|
596
|
+
args = f"{_shlex.quote(str(p))} {_shlex.quote(default_name)}"
|
|
597
|
+
if reference_text:
|
|
598
|
+
args += f" --text {_shlex.quote(reference_text)}"
|
|
599
|
+
try:
|
|
600
|
+
self.do_clone_use(args)
|
|
601
|
+
except Exception as e:
|
|
602
|
+
print(f"❌ Clone shortcut failed: {e}")
|
|
603
|
+
if self.debug_mode:
|
|
604
|
+
import traceback
|
|
605
|
+
|
|
606
|
+
traceback.print_exc()
|
|
607
|
+
return True
|
|
151
608
|
|
|
152
609
|
def process_query(self, query):
|
|
153
610
|
"""Process a query and get a response from the LLM."""
|
|
154
611
|
if not query:
|
|
155
612
|
return
|
|
613
|
+
|
|
614
|
+
# Consume any pending STT metrics for this turn (voice/PTT input).
|
|
615
|
+
stt_metrics = getattr(self, "_pending_stt_metrics", None)
|
|
616
|
+
self._pending_stt_metrics = None
|
|
617
|
+
|
|
618
|
+
# If audio is currently playing, stop it so the new request can be handled
|
|
619
|
+
# without overlapping speech.
|
|
620
|
+
try:
|
|
621
|
+
if self.voice_manager:
|
|
622
|
+
self.voice_manager.stop_speaking()
|
|
623
|
+
except Exception:
|
|
624
|
+
pass
|
|
156
625
|
|
|
157
|
-
#
|
|
158
|
-
self.
|
|
626
|
+
# Per-turn counts
|
|
627
|
+
user_words = self._count_words(query)
|
|
628
|
+
self.user_words += int(user_words)
|
|
629
|
+
user_tokens = self._count_tokens(query, "user")
|
|
159
630
|
|
|
160
631
|
# Create the message
|
|
161
632
|
user_message = {"role": "user", "content": query}
|
|
@@ -175,6 +646,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
175
646
|
}
|
|
176
647
|
|
|
177
648
|
# Make API request
|
|
649
|
+
llm_t0 = time.monotonic()
|
|
178
650
|
response = requests.post(self.api_url, json=payload)
|
|
179
651
|
response.raise_for_status()
|
|
180
652
|
|
|
@@ -182,6 +654,22 @@ class VoiceREPL(cmd.Cmd):
|
|
|
182
654
|
try:
|
|
183
655
|
# First, try to parse as JSON
|
|
184
656
|
response_data = response.json()
|
|
657
|
+
api_llm_metrics = {}
|
|
658
|
+
try:
|
|
659
|
+
# Ollama exposes timing + token counts (nanoseconds).
|
|
660
|
+
# Keep best-effort: if fields are missing, we just omit them.
|
|
661
|
+
for k in (
|
|
662
|
+
"total_duration",
|
|
663
|
+
"load_duration",
|
|
664
|
+
"prompt_eval_count",
|
|
665
|
+
"prompt_eval_duration",
|
|
666
|
+
"eval_count",
|
|
667
|
+
"eval_duration",
|
|
668
|
+
):
|
|
669
|
+
if k in response_data:
|
|
670
|
+
api_llm_metrics[k] = response_data.get(k)
|
|
671
|
+
except Exception:
|
|
672
|
+
api_llm_metrics = {}
|
|
185
673
|
|
|
186
674
|
# Check for different API formats
|
|
187
675
|
if "message" in response_data and "content" in response_data["message"]:
|
|
@@ -200,6 +688,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
200
688
|
|
|
201
689
|
# Handle streaming or non-JSON response
|
|
202
690
|
response_text = response.text.strip()
|
|
691
|
+
api_llm_metrics = {}
|
|
203
692
|
|
|
204
693
|
# Try to extract content from streaming format if possible
|
|
205
694
|
if response_text.startswith("{") and "content" in response_text:
|
|
@@ -228,9 +717,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
228
717
|
except Exception as e:
|
|
229
718
|
if self.debug_mode:
|
|
230
719
|
print(f"Error extracting content from streaming response: {e}")
|
|
720
|
+
llm_t1 = time.monotonic()
|
|
721
|
+
llm_s = float(llm_t1 - llm_t0)
|
|
231
722
|
|
|
232
|
-
#
|
|
233
|
-
self.
|
|
723
|
+
# Per-turn counts
|
|
724
|
+
assistant_words = self._count_words(response_text)
|
|
725
|
+
self.assistant_words += int(assistant_words)
|
|
726
|
+
assistant_tokens = self._count_tokens(response_text, "assistant")
|
|
234
727
|
|
|
235
728
|
# Add to message history
|
|
236
729
|
self.messages.append({"role": "assistant", "content": response_text})
|
|
@@ -238,9 +731,61 @@ class VoiceREPL(cmd.Cmd):
|
|
|
238
731
|
# Display the response with color
|
|
239
732
|
print(f"{Colors.CYAN}{response_text}{Colors.END}")
|
|
240
733
|
|
|
734
|
+
# Record last-turn stats (best-effort; printed only in verbose mode).
|
|
735
|
+
self._last_turn_metrics = {
|
|
736
|
+
"stt": stt_metrics,
|
|
737
|
+
"llm": {
|
|
738
|
+
"s": llm_s,
|
|
739
|
+
"api": api_llm_metrics,
|
|
740
|
+
},
|
|
741
|
+
"counts": {
|
|
742
|
+
"in_words": int(user_words),
|
|
743
|
+
"out_words": int(assistant_words),
|
|
744
|
+
"in_tokens": int(user_tokens) if isinstance(user_tokens, int) else None,
|
|
745
|
+
"out_tokens": int(assistant_tokens) if isinstance(assistant_tokens, int) else None,
|
|
746
|
+
},
|
|
747
|
+
}
|
|
748
|
+
try:
|
|
749
|
+
out_tok = api_llm_metrics.get("eval_count") if isinstance(api_llm_metrics, dict) else None
|
|
750
|
+
if isinstance(out_tok, int) and out_tok >= 0:
|
|
751
|
+
self.total_llm_out_tokens += int(out_tok)
|
|
752
|
+
except Exception:
|
|
753
|
+
pass
|
|
754
|
+
|
|
241
755
|
# Speak the response if voice manager is available
|
|
242
756
|
if self.voice_manager and self.use_tts:
|
|
243
|
-
|
|
757
|
+
try:
|
|
758
|
+
# UX guard: never trigger big cloning downloads during normal chat.
|
|
759
|
+
if self.current_tts_voice and not self._is_cloning_runtime_ready(voice_id=self.current_tts_voice):
|
|
760
|
+
print(
|
|
761
|
+
"ℹ️ Cloned voice selected but cloning runtime is not ready.\n"
|
|
762
|
+
" Run /cloning_status then /cloning_download, or switch back with /tts_voice piper."
|
|
763
|
+
)
|
|
764
|
+
else:
|
|
765
|
+
self._speak_with_spinner_until_audio_starts(response_text)
|
|
766
|
+
except Exception as e:
|
|
767
|
+
print(f"❌ TTS failed: {e}")
|
|
768
|
+
|
|
769
|
+
# Capture best-effort TTS metrics (Piper or cloned).
|
|
770
|
+
tts_metrics = None
|
|
771
|
+
try:
|
|
772
|
+
if self.voice_manager and hasattr(self.voice_manager, "pop_last_tts_metrics"):
|
|
773
|
+
tts_metrics = self.voice_manager.pop_last_tts_metrics()
|
|
774
|
+
except Exception:
|
|
775
|
+
tts_metrics = None
|
|
776
|
+
|
|
777
|
+
try:
|
|
778
|
+
if isinstance(getattr(self, "_last_turn_metrics", None), dict):
|
|
779
|
+
self._last_turn_metrics["tts"] = tts_metrics
|
|
780
|
+
except Exception:
|
|
781
|
+
pass
|
|
782
|
+
|
|
783
|
+
# Verbose stats (max 2 lines).
|
|
784
|
+
try:
|
|
785
|
+
if self.verbose_mode and isinstance(getattr(self, "_last_turn_metrics", None), dict):
|
|
786
|
+
self._print_verbose_turn_stats(self._last_turn_metrics)
|
|
787
|
+
except Exception:
|
|
788
|
+
pass
|
|
244
789
|
|
|
245
790
|
except requests.exceptions.ConnectionError as e:
|
|
246
791
|
print(f"❌ Cannot connect to Ollama API at {self.api_url}")
|
|
@@ -274,37 +819,29 @@ class VoiceREPL(cmd.Cmd):
|
|
|
274
819
|
|
|
275
820
|
def _count_tokens(self, text, role):
|
|
276
821
|
"""Count tokens in text."""
|
|
822
|
+
encoding = self._get_tiktoken_encoding()
|
|
823
|
+
if encoding is None:
|
|
824
|
+
return None
|
|
277
825
|
try:
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
# Initialize the tokenizer
|
|
281
|
-
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
|
282
|
-
|
|
283
|
-
# Count tokens
|
|
284
|
-
token_count = len(encoding.encode(text))
|
|
285
|
-
|
|
286
|
-
# Update the token counts based on role
|
|
287
|
-
if role == "system":
|
|
288
|
-
self.system_tokens = token_count
|
|
289
|
-
elif role == "user":
|
|
290
|
-
self.user_tokens += token_count
|
|
291
|
-
elif role == "assistant":
|
|
292
|
-
self.assistant_tokens += token_count
|
|
293
|
-
|
|
294
|
-
# Calculate total tokens
|
|
295
|
-
total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
|
|
296
|
-
|
|
297
|
-
if self.debug_mode:
|
|
298
|
-
print(f"{role.capitalize()} tokens: {token_count}")
|
|
299
|
-
print(f"Total tokens: {total_tokens}")
|
|
300
|
-
|
|
301
|
-
except ImportError:
|
|
302
|
-
# If tiktoken is not available, just don't count tokens
|
|
303
|
-
pass
|
|
826
|
+
token_count = len(encoding.encode(str(text or "")))
|
|
304
827
|
except Exception as e:
|
|
305
828
|
if self.debug_mode:
|
|
306
829
|
print(f"Error counting tokens: {e}")
|
|
307
|
-
|
|
830
|
+
return None
|
|
831
|
+
|
|
832
|
+
# Update the token counts based on role
|
|
833
|
+
if role == "system":
|
|
834
|
+
self.system_tokens = int(token_count)
|
|
835
|
+
elif role == "user":
|
|
836
|
+
self.user_tokens += int(token_count)
|
|
837
|
+
elif role == "assistant":
|
|
838
|
+
self.assistant_tokens += int(token_count)
|
|
839
|
+
|
|
840
|
+
if self.debug_mode:
|
|
841
|
+
total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
|
|
842
|
+
print(f"{role.capitalize()} tokens: {token_count}")
|
|
843
|
+
print(f"Total tokens: {total_tokens}")
|
|
844
|
+
return int(token_count)
|
|
308
845
|
|
|
309
846
|
def _clean_response(self, text):
|
|
310
847
|
"""Clean LLM response text."""
|
|
@@ -323,8 +860,12 @@ class VoiceREPL(cmd.Cmd):
|
|
|
323
860
|
"""Switch voice language.
|
|
324
861
|
|
|
325
862
|
Usage: /language <lang>
|
|
326
|
-
Available languages: en, fr, es, de,
|
|
863
|
+
Available languages: en, fr, es, de, ru, zh
|
|
327
864
|
"""
|
|
865
|
+
if not self.voice_manager:
|
|
866
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
867
|
+
return
|
|
868
|
+
|
|
328
869
|
if not args:
|
|
329
870
|
current_name = self.voice_manager.get_language_name()
|
|
330
871
|
current_code = self.voice_manager.get_language()
|
|
@@ -359,10 +900,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
359
900
|
'fr': "Langue changée en français.",
|
|
360
901
|
'es': "Idioma cambiado a español.",
|
|
361
902
|
'de': "Sprache auf Deutsch umgestellt.",
|
|
362
|
-
'
|
|
903
|
+
'ru': "Язык переключен на русский.",
|
|
904
|
+
'zh': "语言已切换到中文。"
|
|
363
905
|
}
|
|
364
906
|
test_msg = test_messages.get(language, "Language switched.")
|
|
365
|
-
|
|
907
|
+
# Respect TTS toggle: if the user disabled TTS, don't speak test messages.
|
|
908
|
+
if getattr(self, "use_tts", True):
|
|
909
|
+
self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
|
|
366
910
|
|
|
367
911
|
# Restart voice mode if it was active
|
|
368
912
|
if was_active:
|
|
@@ -383,10 +927,13 @@ class VoiceREPL(cmd.Cmd):
|
|
|
383
927
|
/setvoice <voice_id> # Set voice (format: language.voice_id)
|
|
384
928
|
|
|
385
929
|
Examples:
|
|
386
|
-
/setvoice # List all voices
|
|
387
|
-
/setvoice fr.
|
|
388
|
-
/setvoice it.mai_male_vits # Set Italian male VITS voice
|
|
930
|
+
/setvoice # List all Piper voices
|
|
931
|
+
/setvoice fr.siwis # Switch to French (voice id is best-effort)
|
|
389
932
|
"""
|
|
933
|
+
if not self.voice_manager:
|
|
934
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
935
|
+
return
|
|
936
|
+
|
|
390
937
|
if not args:
|
|
391
938
|
# Show all available voices with metadata
|
|
392
939
|
print(f"\n{Colors.CYAN}Available Voice Models:{Colors.END}")
|
|
@@ -398,7 +945,7 @@ class VoiceREPL(cmd.Cmd):
|
|
|
398
945
|
# Get language name
|
|
399
946
|
lang_names = {
|
|
400
947
|
'en': 'English', 'fr': 'French', 'es': 'Spanish',
|
|
401
|
-
'de': 'German', '
|
|
948
|
+
'de': 'German', 'ru': 'Russian', 'zh': 'Chinese'
|
|
402
949
|
}
|
|
403
950
|
lang_name = lang_names.get(language, language.upper())
|
|
404
951
|
|
|
@@ -406,24 +953,22 @@ class VoiceREPL(cmd.Cmd):
|
|
|
406
953
|
|
|
407
954
|
for voice_id, voice_info in voices.items():
|
|
408
955
|
cached_icon = "✅" if voice_info.get('cached', False) else "📥"
|
|
409
|
-
quality_icon = "
|
|
410
|
-
size_text = f"{voice_info
|
|
956
|
+
quality_icon = "🔧"
|
|
957
|
+
size_text = f"{voice_info.get('size_mb', 0)}MB"
|
|
411
958
|
|
|
412
959
|
print(f" {cached_icon} {quality_icon} {language}.{voice_id}")
|
|
413
960
|
print(f" {voice_info['name']} ({size_text})")
|
|
414
961
|
print(f" {voice_info['description']}")
|
|
415
|
-
|
|
416
|
-
print(f" ⚠️ Requires espeak-ng")
|
|
962
|
+
# Piper has no system deps.
|
|
417
963
|
|
|
418
964
|
print(f"\n{Colors.YELLOW}Usage:{Colors.END}")
|
|
419
965
|
print(" /setvoice <language>.<voice_id>")
|
|
420
|
-
print(" Example: /setvoice fr.
|
|
421
|
-
print("\n📥 = Download needed ✅ = Ready
|
|
966
|
+
print(" Example: /setvoice fr.siwis")
|
|
967
|
+
print("\n📥 = Download needed ✅ = Ready")
|
|
422
968
|
|
|
423
969
|
except Exception as e:
|
|
424
970
|
print(f"❌ Error listing models: {e}")
|
|
425
|
-
|
|
426
|
-
self.voice_manager.list_voices()
|
|
971
|
+
print(" (No fallback available)")
|
|
427
972
|
return
|
|
428
973
|
|
|
429
974
|
voice_spec = args.strip()
|
|
@@ -451,39 +996,28 @@ class VoiceREPL(cmd.Cmd):
|
|
|
451
996
|
# Download and set the specific voice using programmatic API
|
|
452
997
|
try:
|
|
453
998
|
print(f"🔄 Setting voice {voice_spec}...")
|
|
454
|
-
|
|
455
|
-
# Use the programmatic download API
|
|
456
|
-
success = self.voice_manager.download_model(voice_spec)
|
|
999
|
+
success = self.voice_manager.set_voice(language, voice_id)
|
|
457
1000
|
|
|
458
1001
|
if success:
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
if success:
|
|
463
|
-
# Update current language
|
|
464
|
-
self.current_language = language
|
|
465
|
-
|
|
466
|
-
print(f"✅ Voice set to {voice_spec}")
|
|
467
|
-
|
|
468
|
-
# Test the voice
|
|
469
|
-
test_messages = {
|
|
470
|
-
'en': 'Voice changed to English.',
|
|
471
|
-
'fr': 'Voix changée en français.',
|
|
472
|
-
'es': 'Voz cambiada al español.',
|
|
473
|
-
'de': 'Stimme auf Deutsch geändert.',
|
|
474
|
-
'it': 'Voce cambiata in italiano.'
|
|
475
|
-
}
|
|
476
|
-
test_msg = test_messages.get(language, f'Voice changed to {language}.')
|
|
477
|
-
self.voice_manager.speak(test_msg)
|
|
1002
|
+
self.current_language = language
|
|
1003
|
+
print(f"✅ Voice set to {voice_spec}")
|
|
478
1004
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
1005
|
+
test_messages = {
|
|
1006
|
+
'en': 'Voice changed to English.',
|
|
1007
|
+
'fr': 'Voix changée en français.',
|
|
1008
|
+
'es': 'Voz cambiada al español.',
|
|
1009
|
+
'de': 'Stimme auf Deutsch geändert.',
|
|
1010
|
+
'ru': 'Голос изменён на русский.',
|
|
1011
|
+
'zh': '语音已切换到中文。'
|
|
1012
|
+
}
|
|
1013
|
+
test_msg = test_messages.get(language, f'Voice changed to {language}.')
|
|
1014
|
+
if getattr(self, "use_tts", True):
|
|
1015
|
+
self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
|
|
1016
|
+
|
|
1017
|
+
if was_active:
|
|
1018
|
+
self.do_voice(self.voice_mode)
|
|
484
1019
|
else:
|
|
485
|
-
print(f"❌ Failed to
|
|
486
|
-
print(" Check your internet connection or try a different voice")
|
|
1020
|
+
print(f"❌ Failed to set voice: {voice_spec}")
|
|
487
1021
|
|
|
488
1022
|
except Exception as e:
|
|
489
1023
|
print(f"❌ Error setting voice: {e}")
|
|
@@ -521,185 +1055,1732 @@ class VoiceREPL(cmd.Cmd):
|
|
|
521
1055
|
off - Disable voice input
|
|
522
1056
|
full - Continuous listening, interrupts TTS on speech detection
|
|
523
1057
|
wait - Pause listening while TTS is speaking (recommended)
|
|
524
|
-
stop -
|
|
525
|
-
ptt - Push-to-talk
|
|
1058
|
+
stop - Keep listening while speaking, but only stop TTS on stop phrase
|
|
1059
|
+
ptt - Push-to-talk (use /ptt to record one utterance)
|
|
526
1060
|
"""
|
|
527
|
-
arg = arg.lower().strip()
|
|
1061
|
+
arg = (arg or "").lower().strip()
|
|
528
1062
|
|
|
529
1063
|
# Handle legacy "on" argument
|
|
530
1064
|
if arg == "on":
|
|
531
1065
|
arg = "wait"
|
|
532
1066
|
|
|
533
1067
|
if arg in ["off", "full", "wait", "stop", "ptt"]:
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
1068
|
+
if not self.voice_manager:
|
|
1069
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
1070
|
+
return
|
|
1071
|
+
|
|
1072
|
+
# Exit PTT session if running.
|
|
1073
|
+
if self._ptt_session_active:
|
|
1074
|
+
self._ptt_session_active = False
|
|
1075
|
+
self._ptt_recording = False
|
|
1076
|
+
self._ptt_busy = False
|
|
1077
|
+
|
|
1078
|
+
# Stop any ongoing mic session.
|
|
1079
|
+
try:
|
|
1080
|
+
self.voice_manager.stop_listening()
|
|
1081
|
+
except Exception:
|
|
1082
|
+
pass
|
|
1083
|
+
self.voice_mode_active = False
|
|
1084
|
+
|
|
538
1085
|
self.voice_mode = arg
|
|
539
1086
|
self.voice_manager.set_voice_mode(arg)
|
|
540
|
-
|
|
1087
|
+
|
|
541
1088
|
if arg == "off":
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
1089
|
+
print("Voice mode disabled.")
|
|
1090
|
+
return
|
|
1091
|
+
|
|
1092
|
+
if arg == "ptt":
|
|
1093
|
+
# PTT is a dedicated session: no text entry.
|
|
1094
|
+
print("Voice mode: PTT - Push-to-talk (no typing).")
|
|
1095
|
+
print("SPACE: start/stop recording (transcribe on stop)")
|
|
1096
|
+
print("ESC: exit PTT mode")
|
|
1097
|
+
self._run_ptt_session()
|
|
1098
|
+
return
|
|
1099
|
+
|
|
1100
|
+
# Continuous listening modes.
|
|
1101
|
+
try:
|
|
549
1102
|
self.voice_manager.listen(
|
|
550
1103
|
on_transcription=self._voice_callback,
|
|
551
|
-
|
|
1104
|
+
# Stop phrase interrupts TTS; keep listening.
|
|
1105
|
+
on_stop=lambda: (
|
|
1106
|
+
print("\n⏹️ Stopped speaking.\n") if (self.voice_manager and self.voice_manager.is_speaking()) else None
|
|
1107
|
+
),
|
|
552
1108
|
)
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
1109
|
+
self.voice_mode_active = True
|
|
1110
|
+
except Exception as e:
|
|
1111
|
+
self.voice_mode_active = False
|
|
1112
|
+
self.voice_mode = "off"
|
|
1113
|
+
print(f"❌ Failed to start microphone listening: {e}")
|
|
1114
|
+
print(" Tip: check microphone permissions/device availability.")
|
|
1115
|
+
return
|
|
1116
|
+
|
|
1117
|
+
if arg == "wait":
|
|
1118
|
+
print("Voice mode: WAIT - Listens continuously except while speaking.")
|
|
1119
|
+
print("Use /voice off to disable.")
|
|
1120
|
+
elif arg == "stop":
|
|
1121
|
+
print("Voice mode: STOP - Always listens; stop phrase stops TTS.")
|
|
1122
|
+
print("Use /voice off to disable.")
|
|
1123
|
+
elif arg == "full":
|
|
1124
|
+
print("Voice mode: FULL - Interrupts TTS on any speech (best with AEC/headset).")
|
|
1125
|
+
print("Use /voice off to disable.")
|
|
567
1126
|
else:
|
|
568
1127
|
print("Usage: /voice off | full | wait | stop | ptt")
|
|
569
1128
|
print(" off - Disable voice input")
|
|
570
1129
|
print(" full - Continuous listening, interrupts TTS on speech")
|
|
571
|
-
print(" wait -
|
|
572
|
-
print(" stop -
|
|
573
|
-
print(" ptt - Push-to-talk
|
|
574
|
-
|
|
575
|
-
def
|
|
576
|
-
"""
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
if
|
|
582
|
-
|
|
583
|
-
# Don't process "stop" as a query
|
|
1130
|
+
print(" wait - Listen except while speaking")
|
|
1131
|
+
print(" stop - Always listen; stop phrase stops TTS")
|
|
1132
|
+
print(" ptt - Push-to-talk (no typing; SPACE triggers capture)")
|
|
1133
|
+
|
|
1134
|
+
def do_ptt(self, arg):
|
|
1135
|
+
"""Push-to-talk: record a single utterance, then process it.
|
|
1136
|
+
|
|
1137
|
+
Usage:
|
|
1138
|
+
/ptt
|
|
1139
|
+
"""
|
|
1140
|
+
if not self.voice_manager:
|
|
1141
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
584
1142
|
return
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
# In PTT mode, process immediately
|
|
1143
|
+
print("❌ /ptt is deprecated. Use: /voice ptt (then SPACE)")
|
|
1144
|
+
return
|
|
1145
|
+
|
|
1146
|
+
# Ensure we are not already listening.
|
|
1147
|
+
try:
|
|
1148
|
+
self.voice_manager.stop_listening()
|
|
1149
|
+
except Exception:
|
|
593
1150
|
pass
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
"""
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
if arg == "on":
|
|
611
|
-
self.use_tts = True
|
|
612
|
-
print("TTS enabled" if self.debug_mode else "")
|
|
613
|
-
elif arg == "off":
|
|
614
|
-
self.use_tts = False
|
|
615
|
-
print("TTS disabled" if self.debug_mode else "")
|
|
616
|
-
else:
|
|
617
|
-
print("Usage: /tts on | off")
|
|
618
|
-
|
|
619
|
-
def do_speed(self, arg):
|
|
620
|
-
"""Set the TTS speed multiplier."""
|
|
621
|
-
if not arg.strip():
|
|
622
|
-
print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
|
|
1151
|
+
|
|
1152
|
+
return
|
|
1153
|
+
|
|
1154
|
+
def _run_ptt_session(self) -> None:
|
|
1155
|
+
"""PTT mode key loop (no typing).
|
|
1156
|
+
|
|
1157
|
+
Clean semantics:
|
|
1158
|
+
- SPACE toggles recording (start/stop)
|
|
1159
|
+
- on stop: transcribe immediately and send to the LLM
|
|
1160
|
+
- ESC exits PTT mode (returns to STOP mode)
|
|
1161
|
+
|
|
1162
|
+
This avoids relying on VAD end-of-utterance, which is fragile when speaker
|
|
1163
|
+
echo is present (common on laptop speakers).
|
|
1164
|
+
"""
|
|
1165
|
+
if not self.voice_manager:
|
|
623
1166
|
return
|
|
624
|
-
|
|
1167
|
+
self._ptt_session_active = True
|
|
1168
|
+
self._ptt_recording = False
|
|
1169
|
+
self._ptt_busy = False
|
|
1170
|
+
|
|
1171
|
+
# Lazy imports: keep REPL startup snappy.
|
|
1172
|
+
import io
|
|
1173
|
+
import wave
|
|
1174
|
+
|
|
625
1175
|
try:
|
|
626
|
-
|
|
1176
|
+
import sounddevice as sd
|
|
1177
|
+
except Exception as e:
|
|
1178
|
+
print(f"❌ PTT requires sounddevice: {e}")
|
|
1179
|
+
self._ptt_session_active = False
|
|
1180
|
+
return
|
|
1181
|
+
|
|
1182
|
+
sr = 16000
|
|
1183
|
+
frames: list[bytes] = []
|
|
1184
|
+
stream = {"obj": None}
|
|
1185
|
+
cols = 80
|
|
1186
|
+
try:
|
|
1187
|
+
cols = int(shutil.get_terminal_size((80, 20)).columns)
|
|
1188
|
+
except Exception:
|
|
1189
|
+
cols = 80
|
|
1190
|
+
|
|
1191
|
+
def _clear_status() -> None:
|
|
1192
|
+
try:
|
|
1193
|
+
sys.stdout.write("\r" + (" " * max(10, cols - 1)) + "\r")
|
|
1194
|
+
sys.stdout.flush()
|
|
1195
|
+
except Exception:
|
|
1196
|
+
pass
|
|
1197
|
+
|
|
1198
|
+
def _status_line(msg: str) -> None:
|
|
1199
|
+
# Render on a single line (no newline) so SPACE can be pressed repeatedly.
|
|
1200
|
+
try:
|
|
1201
|
+
_clear_status()
|
|
1202
|
+
sys.stdout.write(str(msg)[: max(0, cols - 1)])
|
|
1203
|
+
sys.stdout.flush()
|
|
1204
|
+
except Exception:
|
|
1205
|
+
pass
|
|
1206
|
+
|
|
1207
|
+
def _println(msg: str = "") -> None:
|
|
1208
|
+
# When in raw terminal mode, '\n' does NOT reliably return to column 0.
|
|
1209
|
+
# Use CRLF explicitly to prevent "diagonal drifting" rendering.
|
|
1210
|
+
try:
|
|
1211
|
+
_clear_status()
|
|
1212
|
+
sys.stdout.write("\r\n" + str(msg) + "\r\n")
|
|
1213
|
+
sys.stdout.flush()
|
|
1214
|
+
except Exception:
|
|
1215
|
+
pass
|
|
1216
|
+
|
|
1217
|
+
def _start_recording() -> None:
|
|
1218
|
+
nonlocal frames
|
|
1219
|
+
if self._ptt_recording:
|
|
1220
|
+
return
|
|
1221
|
+
if self._ptt_busy:
|
|
1222
|
+
return
|
|
1223
|
+
frames = []
|
|
1224
|
+
|
|
1225
|
+
# Interrupt any speech immediately.
|
|
1226
|
+
try:
|
|
1227
|
+
self.voice_manager.stop_speaking()
|
|
1228
|
+
except Exception:
|
|
1229
|
+
pass
|
|
1230
|
+
|
|
1231
|
+
def _cb(indata, _frames, _time, status):
|
|
1232
|
+
if status and self.debug_mode:
|
|
1233
|
+
pass
|
|
1234
|
+
try:
|
|
1235
|
+
frames.append(indata.copy().tobytes())
|
|
1236
|
+
except Exception:
|
|
1237
|
+
pass
|
|
1238
|
+
|
|
1239
|
+
try:
|
|
1240
|
+
stream["obj"] = sd.InputStream(
|
|
1241
|
+
samplerate=sr,
|
|
1242
|
+
channels=1,
|
|
1243
|
+
dtype="int16",
|
|
1244
|
+
callback=_cb,
|
|
1245
|
+
blocksize=int(sr * 0.03),
|
|
1246
|
+
)
|
|
1247
|
+
stream["obj"].start()
|
|
1248
|
+
self._ptt_recording = True
|
|
1249
|
+
_status_line("🎙️ Recording… (SPACE to send, ESC to exit)")
|
|
1250
|
+
except Exception as e:
|
|
1251
|
+
self._ptt_recording = False
|
|
1252
|
+
stream["obj"] = None
|
|
1253
|
+
_clear_status()
|
|
1254
|
+
_println(f"❌ Failed to start microphone stream: {e}")
|
|
1255
|
+
|
|
1256
|
+
def _stop_recording_and_send() -> None:
|
|
1257
|
+
if not self._ptt_recording:
|
|
1258
|
+
return
|
|
1259
|
+
self._ptt_recording = False
|
|
1260
|
+
_clear_status()
|
|
1261
|
+
|
|
1262
|
+
try:
|
|
1263
|
+
if stream["obj"] is not None:
|
|
1264
|
+
try:
|
|
1265
|
+
stream["obj"].stop()
|
|
1266
|
+
except Exception:
|
|
1267
|
+
pass
|
|
1268
|
+
try:
|
|
1269
|
+
stream["obj"].close()
|
|
1270
|
+
except Exception:
|
|
1271
|
+
pass
|
|
1272
|
+
finally:
|
|
1273
|
+
stream["obj"] = None
|
|
1274
|
+
|
|
1275
|
+
pcm = b"".join(frames)
|
|
1276
|
+
if len(pcm) < int(sr * 0.25) * 2:
|
|
1277
|
+
_println("…(too short, try again)")
|
|
1278
|
+
return
|
|
1279
|
+
|
|
1280
|
+
buf = io.BytesIO()
|
|
1281
|
+
with wave.open(buf, "wb") as w:
|
|
1282
|
+
w.setnchannels(1)
|
|
1283
|
+
w.setsampwidth(2)
|
|
1284
|
+
w.setframerate(sr)
|
|
1285
|
+
w.writeframes(pcm)
|
|
1286
|
+
wav_bytes = buf.getvalue()
|
|
1287
|
+
|
|
1288
|
+
self._ptt_busy = True
|
|
1289
|
+
try:
|
|
1290
|
+
audio_s = 0.0
|
|
1291
|
+
try:
|
|
1292
|
+
if sr and sr > 0:
|
|
1293
|
+
audio_s = float(len(pcm)) / float(int(sr) * 2)
|
|
1294
|
+
except Exception:
|
|
1295
|
+
audio_s = 0.0
|
|
1296
|
+
|
|
1297
|
+
t0 = time.monotonic()
|
|
1298
|
+
text = (self.voice_manager.transcribe_from_bytes(wav_bytes, language=self.current_language) or "").strip()
|
|
1299
|
+
t1 = time.monotonic()
|
|
1300
|
+
stt_s = float(t1 - t0)
|
|
1301
|
+
self._pending_stt_metrics = {
|
|
1302
|
+
"stt_s": stt_s,
|
|
1303
|
+
"audio_s": float(audio_s),
|
|
1304
|
+
"rtf": (stt_s / float(audio_s)) if audio_s else None,
|
|
1305
|
+
"sample_rate": int(sr),
|
|
1306
|
+
"chunks": None,
|
|
1307
|
+
"chunk_ms": None,
|
|
1308
|
+
"profile": "ptt",
|
|
1309
|
+
"ts": time.time(),
|
|
1310
|
+
}
|
|
1311
|
+
except Exception as e:
|
|
1312
|
+
self._ptt_busy = False
|
|
1313
|
+
_println(f"❌ Transcription failed: {e}")
|
|
1314
|
+
return
|
|
1315
|
+
self._ptt_busy = False
|
|
1316
|
+
|
|
1317
|
+
if not text:
|
|
1318
|
+
_println("…(no transcription)")
|
|
1319
|
+
return
|
|
1320
|
+
|
|
1321
|
+
_println(f"> {text}")
|
|
1322
|
+
self.process_query(text)
|
|
1323
|
+
|
|
1324
|
+
# Platform key read.
|
|
1325
|
+
import sys
|
|
1326
|
+
if sys.platform == "win32":
|
|
1327
|
+
import msvcrt
|
|
1328
|
+
|
|
1329
|
+
while self._ptt_session_active:
|
|
1330
|
+
ch = msvcrt.getwch()
|
|
1331
|
+
if ch == "\x1b": # ESC
|
|
1332
|
+
break
|
|
1333
|
+
if self._ptt_busy:
|
|
1334
|
+
continue
|
|
1335
|
+
if ch == " ":
|
|
1336
|
+
if not self._ptt_recording:
|
|
1337
|
+
_start_recording()
|
|
1338
|
+
else:
|
|
1339
|
+
_stop_recording_and_send()
|
|
1340
|
+
else:
|
|
1341
|
+
import termios
|
|
1342
|
+
import tty
|
|
1343
|
+
|
|
1344
|
+
fd = sys.stdin.fileno()
|
|
1345
|
+
old = termios.tcgetattr(fd)
|
|
1346
|
+
try:
|
|
1347
|
+
tty.setraw(fd)
|
|
1348
|
+
|
|
1349
|
+
def _run_in_cooked(block):
|
|
1350
|
+
"""Run a block with normal tty settings.
|
|
1351
|
+
|
|
1352
|
+
In raw mode, many terminals treat '\n' as LF without CR, so prints from
|
|
1353
|
+
deeper code paths (LLM responses) can drift/indent. We temporarily
|
|
1354
|
+
restore the terminal mode to keep output rendering stable.
|
|
1355
|
+
"""
|
|
1356
|
+
try:
|
|
1357
|
+
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
|
1358
|
+
except Exception:
|
|
1359
|
+
pass
|
|
1360
|
+
try:
|
|
1361
|
+
block()
|
|
1362
|
+
finally:
|
|
1363
|
+
try:
|
|
1364
|
+
tty.setraw(fd)
|
|
1365
|
+
except Exception:
|
|
1366
|
+
pass
|
|
1367
|
+
|
|
1368
|
+
while self._ptt_session_active:
|
|
1369
|
+
ch = sys.stdin.read(1)
|
|
1370
|
+
if ch == "\x1b": # ESC
|
|
1371
|
+
break
|
|
1372
|
+
if self._ptt_busy:
|
|
1373
|
+
continue
|
|
1374
|
+
if ch == " ":
|
|
1375
|
+
if not self._ptt_recording:
|
|
1376
|
+
_start_recording()
|
|
1377
|
+
else:
|
|
1378
|
+
_run_in_cooked(_stop_recording_and_send)
|
|
1379
|
+
finally:
|
|
1380
|
+
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
|
1381
|
+
|
|
1382
|
+
self._ptt_session_active = False
|
|
1383
|
+
self._ptt_recording = False
|
|
1384
|
+
self._ptt_busy = False
|
|
1385
|
+
try:
|
|
1386
|
+
if stream["obj"] is not None:
|
|
1387
|
+
stream["obj"].stop()
|
|
1388
|
+
stream["obj"].close()
|
|
1389
|
+
except Exception:
|
|
1390
|
+
pass
|
|
1391
|
+
_clear_status()
|
|
1392
|
+
# Ensure we end on a clean line before restoring other modes.
|
|
1393
|
+
try:
|
|
1394
|
+
sys.stdout.write("\r\n")
|
|
1395
|
+
sys.stdout.flush()
|
|
1396
|
+
except Exception:
|
|
1397
|
+
pass
|
|
1398
|
+
# Restore to STOP after exiting PTT.
|
|
1399
|
+
try:
|
|
1400
|
+
self.do_voice("stop")
|
|
1401
|
+
except Exception:
|
|
1402
|
+
pass
|
|
1403
|
+
|
|
1404
|
+
def _voice_callback(self, text):
|
|
1405
|
+
"""Callback for voice recognition."""
|
|
1406
|
+
# Capture best-effort STT metrics from the recognizer (for verbose stats).
|
|
1407
|
+
stt_metrics = None
|
|
1408
|
+
try:
|
|
1409
|
+
vm = self.voice_manager
|
|
1410
|
+
rec = getattr(vm, "voice_recognizer", None) if vm else None
|
|
1411
|
+
if rec is not None and hasattr(rec, "pop_last_stt_metrics"):
|
|
1412
|
+
stt_metrics = rec.pop_last_stt_metrics()
|
|
1413
|
+
except Exception:
|
|
1414
|
+
stt_metrics = None
|
|
1415
|
+
self._pending_stt_metrics = stt_metrics
|
|
1416
|
+
|
|
1417
|
+
# Print what the user said
|
|
1418
|
+
print(f"\n> {text}")
|
|
1419
|
+
# NOTE: stop phrases are handled by the stop_callback path (interrupt TTS).
|
|
1420
|
+
# We do not use "stop" to exit voice mode; use /voice off explicitly.
|
|
1421
|
+
|
|
1422
|
+
# Mode-specific handling
|
|
1423
|
+
if self.voice_mode == "stop":
|
|
1424
|
+
# In 'stop' mode, don't interrupt TTS - just queue the message
|
|
1425
|
+
# But since we're in callback, TTS interrupt is already paused
|
|
1426
|
+
pass
|
|
1427
|
+
elif self.voice_mode == "ptt":
|
|
1428
|
+
# In PTT mode, process immediately
|
|
1429
|
+
pass
|
|
1430
|
+
# 'full' mode has default behavior
|
|
1431
|
+
|
|
1432
|
+
# Process the user's query
|
|
1433
|
+
self.process_query(text)
|
|
1434
|
+
|
|
1435
|
+
def _voice_stop_callback(self):
|
|
1436
|
+
"""Callback when voice mode is stopped."""
|
|
1437
|
+
self.voice_mode = "off"
|
|
1438
|
+
self.voice_mode_active = False
|
|
1439
|
+
self.voice_manager.stop_listening()
|
|
1440
|
+
print("Voice mode disabled.")
|
|
1441
|
+
|
|
1442
|
+
def do_tts(self, arg):
|
|
1443
|
+
"""Toggle text-to-speech."""
|
|
1444
|
+
arg = arg.lower().strip()
|
|
1445
|
+
|
|
1446
|
+
if arg == "on":
|
|
1447
|
+
self.use_tts = True
|
|
1448
|
+
if self.voice_manager is None:
|
|
1449
|
+
# Re-enable voice features (TTS/STT) by creating a VoiceManager.
|
|
1450
|
+
self.voice_manager = VoiceManager(
|
|
1451
|
+
language=self.current_language,
|
|
1452
|
+
tts_model=self._initial_tts_model,
|
|
1453
|
+
debug_mode=self.debug_mode,
|
|
1454
|
+
allow_downloads=False,
|
|
1455
|
+
cloned_tts_streaming=False,
|
|
1456
|
+
cloning_engine=self.cloning_engine,
|
|
1457
|
+
)
|
|
1458
|
+
print("TTS enabled" if self.debug_mode else "")
|
|
1459
|
+
elif arg == "off":
|
|
1460
|
+
self.use_tts = False
|
|
1461
|
+
print("TTS disabled" if self.debug_mode else "")
|
|
1462
|
+
else:
|
|
1463
|
+
print("Usage: /tts on | off")
|
|
1464
|
+
|
|
1465
|
+
def do_speed(self, arg):
|
|
1466
|
+
"""Set the TTS speed multiplier."""
|
|
1467
|
+
if not self.voice_manager:
|
|
1468
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
1469
|
+
return
|
|
1470
|
+
if not arg.strip():
|
|
1471
|
+
print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
|
|
1472
|
+
return
|
|
1473
|
+
|
|
1474
|
+
try:
|
|
1475
|
+
speed = float(arg.strip())
|
|
627
1476
|
if 0.5 <= speed <= 2.0:
|
|
628
1477
|
self.voice_manager.set_speed(speed)
|
|
629
1478
|
print(f"TTS speed set to {speed}x")
|
|
630
1479
|
else:
|
|
631
|
-
print("Speed should be between 0.5 and 2.0")
|
|
632
|
-
except ValueError:
|
|
633
|
-
print("Usage: /speed <number> (e.g., /speed 1.5)")
|
|
634
|
-
|
|
635
|
-
def do_tts_model(self, arg):
|
|
636
|
-
"""
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
1480
|
+
print("Speed should be between 0.5 and 2.0")
|
|
1481
|
+
except ValueError:
|
|
1482
|
+
print("Usage: /speed <number> (e.g., /speed 1.5)")
|
|
1483
|
+
|
|
1484
|
+
def do_tts_model(self, arg):
|
|
1485
|
+
"""Deprecated: legacy TTS model switching.
|
|
1486
|
+
|
|
1487
|
+
AbstractVoice core is Piper-first; use `/setvoice` (Piper voices) or cloned voices.
|
|
1488
|
+
"""
|
|
1489
|
+
print("❌ /tts_model is not supported (Piper-first core).")
|
|
1490
|
+
print(" Use /setvoice for Piper voices, or /tts_voice clone <id> for cloned voices.")
|
|
1491
|
+
|
|
1492
|
+
def do_whisper(self, arg):
|
|
1493
|
+
"""Change Whisper model."""
|
|
1494
|
+
if not self.voice_manager:
|
|
1495
|
+
print("🔇 Voice features are disabled. Use '/tts on' to enable.")
|
|
1496
|
+
return
|
|
1497
|
+
model = arg.strip()
|
|
1498
|
+
if not model:
|
|
1499
|
+
print(f"Current Whisper model: {self.voice_manager.get_whisper()}")
|
|
1500
|
+
return
|
|
1501
|
+
|
|
1502
|
+
self.voice_manager.set_whisper(model)
|
|
1503
|
+
|
|
1504
|
+
def do_speak(self, arg):
|
|
1505
|
+
"""Speak a text immediately (without calling the LLM).
|
|
1506
|
+
|
|
1507
|
+
Usage:
|
|
1508
|
+
/speak Hello world
|
|
1509
|
+
"""
|
|
1510
|
+
if not self.voice_manager:
|
|
1511
|
+
print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
|
|
1512
|
+
return
|
|
1513
|
+
|
|
1514
|
+
text = arg.strip()
|
|
1515
|
+
if not text:
|
|
1516
|
+
print("Usage: /speak <text>")
|
|
1517
|
+
return
|
|
1518
|
+
|
|
1519
|
+
try:
|
|
1520
|
+
self._speak_with_spinner_until_audio_starts(text)
|
|
1521
|
+
if self.verbose_mode:
|
|
1522
|
+
out_words = self._count_words(text)
|
|
1523
|
+
out_tokens = None
|
|
1524
|
+
try:
|
|
1525
|
+
enc = self._get_tiktoken_encoding()
|
|
1526
|
+
if enc is not None:
|
|
1527
|
+
out_tokens = int(len(enc.encode(str(text or ""))))
|
|
1528
|
+
except Exception:
|
|
1529
|
+
out_tokens = None
|
|
1530
|
+
|
|
1531
|
+
tts_metrics = None
|
|
1532
|
+
try:
|
|
1533
|
+
if hasattr(self.voice_manager, "pop_last_tts_metrics"):
|
|
1534
|
+
tts_metrics = self.voice_manager.pop_last_tts_metrics()
|
|
1535
|
+
except Exception:
|
|
1536
|
+
tts_metrics = None
|
|
1537
|
+
|
|
1538
|
+
turn = {
|
|
1539
|
+
"stt": None,
|
|
1540
|
+
"llm": {},
|
|
1541
|
+
"counts": {
|
|
1542
|
+
"in_words": 0,
|
|
1543
|
+
"out_words": int(out_words),
|
|
1544
|
+
"in_tokens": None,
|
|
1545
|
+
"out_tokens": out_tokens,
|
|
1546
|
+
},
|
|
1547
|
+
"tts": tts_metrics,
|
|
1548
|
+
}
|
|
1549
|
+
self._print_verbose_turn_stats(turn)
|
|
1550
|
+
except Exception as e:
|
|
1551
|
+
print(f"❌ Speak failed: {e}")
|
|
1552
|
+
if self.debug_mode:
|
|
1553
|
+
import traceback
|
|
1554
|
+
traceback.print_exc()
|
|
1555
|
+
|
|
1556
|
+
def _speak_with_spinner_until_audio_starts(self, text: str) -> None:
|
|
1557
|
+
"""REPL UX: show spinner while waiting for first audio, then stop.
|
|
1558
|
+
|
|
1559
|
+
This avoids corrupting the `cmd` prompt while still giving feedback during
|
|
1560
|
+
long cloned-TTS synthesis. Once playback starts, the prompt is displayed
|
|
1561
|
+
normally so the user can interrupt anytime by typing.
|
|
1562
|
+
"""
|
|
1563
|
+
if not self.voice_manager:
|
|
1564
|
+
return
|
|
1565
|
+
|
|
1566
|
+
is_clone = bool(self.current_tts_voice)
|
|
1567
|
+
if not is_clone:
|
|
1568
|
+
# Offline-first: Piper voices must be explicitly cached. Provide a clear
|
|
1569
|
+
# message instead of hanging on implicit downloads.
|
|
1570
|
+
try:
|
|
1571
|
+
a = getattr(self.voice_manager, "tts_adapter", None)
|
|
1572
|
+
if a is not None and hasattr(a, "is_available") and not bool(a.is_available()):
|
|
1573
|
+
lang = str(getattr(self, "current_language", "en") or "en").strip().lower()
|
|
1574
|
+
raise RuntimeError(
|
|
1575
|
+
f"Piper voice model for '{lang}' is not available locally.\n"
|
|
1576
|
+
f"Run: python -m abstractvoice download --piper {lang}"
|
|
1577
|
+
)
|
|
1578
|
+
except RuntimeError:
|
|
1579
|
+
raise
|
|
1580
|
+
except Exception:
|
|
1581
|
+
pass
|
|
1582
|
+
ind = self._busy_indicator(enabled=is_clone)
|
|
1583
|
+
try:
|
|
1584
|
+
if is_clone:
|
|
1585
|
+
ind.start()
|
|
1586
|
+
self.voice_manager.speak(text, voice=self.current_tts_voice)
|
|
1587
|
+
|
|
1588
|
+
if not is_clone:
|
|
1589
|
+
return
|
|
1590
|
+
|
|
1591
|
+
# Wait until audio playback actually starts (or synthesis ends without audio).
|
|
1592
|
+
vm = self.voice_manager
|
|
1593
|
+
while True:
|
|
1594
|
+
try:
|
|
1595
|
+
playing = bool(vm.is_speaking())
|
|
1596
|
+
synth_active = bool(
|
|
1597
|
+
getattr(vm, "_cloned_synthesis_active", None) and vm._cloned_synthesis_active.is_set()
|
|
1598
|
+
)
|
|
1599
|
+
except Exception:
|
|
1600
|
+
playing, synth_active = False, False
|
|
1601
|
+
|
|
1602
|
+
if playing:
|
|
1603
|
+
break
|
|
1604
|
+
|
|
1605
|
+
# If synthesis is no longer active and we aren't playing, stop the spinner
|
|
1606
|
+
# (either done very quickly or failed).
|
|
1607
|
+
if not synth_active:
|
|
1608
|
+
break
|
|
1609
|
+
|
|
1610
|
+
time.sleep(0.05)
|
|
1611
|
+
finally:
|
|
1612
|
+
try:
|
|
1613
|
+
ind.stop()
|
|
1614
|
+
except Exception:
|
|
1615
|
+
pass
|
|
1616
|
+
# If ASR auto-generated the clone's reference_text, print an easy override command
|
|
1617
|
+
# (once). We do this after stopping the spinner to avoid corrupting the prompt line.
|
|
1618
|
+
try:
|
|
1619
|
+
if is_clone and self.current_tts_voice:
|
|
1620
|
+
self._maybe_print_asr_ref_text_override(self.current_tts_voice)
|
|
1621
|
+
except Exception:
|
|
1622
|
+
pass
|
|
1623
|
+
# Do not print the prompt manually: `cmd` will render it on return,
|
|
1624
|
+
# and printing here can result in duplicate prompts (`> >`).
|
|
1625
|
+
|
|
1626
|
+
def _maybe_print_asr_ref_text_override(self, voice_id: str) -> None:
|
|
1627
|
+
"""If `reference_text` was auto-generated via ASR, print a paste-ready override hint.
|
|
1628
|
+
|
|
1629
|
+
Important: `/clone_set_ref_text` uses a simple `split(maxsplit=1)`, so quoting is not
|
|
1630
|
+
interpreted. We therefore print the command *without* quotes to avoid storing them.
|
|
1631
|
+
"""
|
|
1632
|
+
if not self.voice_manager:
|
|
1633
|
+
return
|
|
1634
|
+
vid = str(voice_id or "").strip()
|
|
1635
|
+
if not vid:
|
|
1636
|
+
return
|
|
1637
|
+
if vid in self._printed_asr_ref_text_hint:
|
|
1638
|
+
return
|
|
1639
|
+
try:
|
|
1640
|
+
info = self.voice_manager.get_cloned_voice(vid) or {}
|
|
1641
|
+
except Exception:
|
|
1642
|
+
return
|
|
1643
|
+
meta = info.get("meta") or {}
|
|
1644
|
+
src = str(meta.get("reference_text_source") or "").strip().lower()
|
|
1645
|
+
ref_text = str(info.get("reference_text") or "").strip()
|
|
1646
|
+
if not ref_text:
|
|
1647
|
+
return
|
|
1648
|
+
if src != "asr":
|
|
1649
|
+
return
|
|
1650
|
+
|
|
1651
|
+
# Mark first so any printing errors won't cause repeated spam.
|
|
1652
|
+
self._printed_asr_ref_text_hint.add(vid)
|
|
1653
|
+
|
|
1654
|
+
prefix = vid[:8] if len(vid) >= 8 else vid
|
|
1655
|
+
name = str(info.get("name") or "").strip()
|
|
1656
|
+
label = f"{name} ({prefix})" if name else prefix
|
|
1657
|
+
print("ℹ️ Auto-generated reference transcript (ASR).")
|
|
1658
|
+
print(f" Voice: {label}")
|
|
1659
|
+
print(" If you want to correct it, copy/paste and edit the text after the id:")
|
|
1660
|
+
print(f" /clone_set_ref_text {prefix} {ref_text}")
|
|
1661
|
+
|
|
1662
|
+
class _busy_indicator:
    """A minimal, discreet spinner (no extra lines).

    Use via `start()` / `stop()` or as a context manager. When `enabled` is
    false every method is a no-op. The spinner is drawn by a daemon thread that
    rewrites the current terminal line on `sys.stdout`.
    """

    def __init__(self, enabled: bool = False):
        self.enabled = bool(enabled)
        self._stop = threading.Event()
        self._thread = None

    def start(self):
        """Start the spinner thread (no-op if disabled or already running)."""
        if not self.enabled:
            return
        if self._thread and self._thread.is_alive():
            return
        # Re-arm the stop flag so an instance that was stopped before can be
        # started again; otherwise the new thread would exit immediately.
        self._stop.clear()

        def _run():
            frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
            i = 0
            t0 = time.time()
            # Small delay so fast operations don't flash. Waiting on the event
            # (rather than sleeping) lets stop() cut the delay short.
            if self._stop.wait(0.25):
                return
            # Hide cursor for a cleaner look.
            try:
                sys.stdout.write("\033[?25l")
                sys.stdout.flush()
            except Exception:
                pass
            while not self._stop.is_set():
                elapsed = time.time() - t0
                sys.stdout.write(f"\r(synthesizing {elapsed:0.1f}s) {frames[i % len(frames)]}")
                sys.stdout.flush()
                i += 1
                self._stop.wait(0.1)

        self._thread = threading.Thread(target=_run, daemon=True)
        self._thread.start()

    def stop(self):
        """Signal the spinner thread to finish, then clear the line and restore the cursor."""
        if not self.enabled:
            return
        self._stop.set()
        try:
            if self._thread:
                self._thread.join(timeout=0.5)
        except Exception:
            pass
        # Clear spinner line.
        try:
            # `\033[2K` clears the entire line (more robust than fixed spaces).
            sys.stdout.write("\r\033[2K\r")
            # Restore cursor.
            sys.stdout.write("\033[?25h")
            sys.stdout.flush()
        except Exception:
            pass

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.stop()
        # Never suppress exceptions raised inside the `with` body.
        return False
|
|
1726
|
+
|
|
1727
|
+
# NOTE: We intentionally do not keep a background spinner running while the REPL
|
|
1728
|
+
# is waiting for user input (it corrupts the prompt line). Instead, we show a
|
|
1729
|
+
# spinner only until the first audio actually starts, then stop it so the prompt
|
|
1730
|
+
# stays usable for interruption-by-typing.
|
|
1731
|
+
|
|
1732
|
+
def do_clones(self, arg):
    """List cloned voices in the local store."""
    # One line per voice: name, id, optional engine / transcript-source tags,
    # and a "(current)" marker for the voice that is selected right now.
    manager = self.voice_manager
    if not manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    def _tag(value):
        # Bracketed suffix, or nothing at all for an empty value.
        return f" [{value}]" if value else ""

    try:
        entries = manager.list_cloned_voices()
        if not entries:
            print("No cloned voices yet. Use /clone <path> or /clone-my-voice.")
            return
        print(f"\n{Colors.CYAN}Cloned voices:{Colors.END}")
        for entry in entries:
            ident = entry.get("voice_id") or entry.get("voice", "")
            title = entry.get("name", "")
            engine_tag = _tag((entry.get("engine") or "").strip())
            source_tag = _tag((entry.get("meta") or {}).get("reference_text_source", ""))
            marker = " (current)" if self.current_tts_voice == ident else ""
            print(f" - {title}: {ident}{engine_tag}{source_tag}{marker}")
        print("Tip: /clone_rm <id-or-name> deletes one; /clone_rm_all --yes deletes all.")
    except Exception as exc:
        print(f"❌ Error listing cloned voices: {exc}")
|
|
1755
|
+
|
|
1756
|
+
def _resolve_clone_id(self, wanted: str) -> str | None:
    """Resolve user input (id, id prefix, or name) to a cloned voice id.

    Match priority is: exact voice id, then exact name, then the first voice
    whose id starts with `wanted`. Exact matches win over prefix matches so a
    short name that happens to be the prefix of another voice's id cannot
    select the wrong voice; this matters because the result feeds destructive
    commands such as `/clone_rm`.

    Args:
        wanted: Full voice id, id prefix, or voice name.

    Returns:
        The matching voice id, or None if `wanted` is blank or nothing matches.
    """
    wanted = str(wanted or "")
    if not wanted.strip():
        # "".startswith-style matching would otherwise select the first voice.
        return None

    by_name = None
    by_prefix = None
    for v in self.voice_manager.list_cloned_voices() or []:
        vid = str(v.get("voice_id") or "")
        if not vid:
            continue
        if vid == wanted:
            return vid
        if by_name is None and wanted == (v.get("name") or ""):
            by_name = vid
        elif by_prefix is None and vid.startswith(wanted):
            by_prefix = vid
    return by_name or by_prefix
|
|
1764
|
+
|
|
1765
|
+
def _resolve_clone_id_by_source(self, source: str, *, engine: str | None = None) -> str | None:
    """Find a cloned voice by its stored meta.source (best-effort).

    The requested path and every stored source are compared after `~`
    expansion and `resolve()`; if normalisation fails the raw string is used.

    Args:
        source: Path that was originally passed when cloning.
        engine: Optional engine name; when given it is compared
            case-insensitively with the voice's `engine` field.

    Returns:
        The voice id of the first entry whose source (and engine) match, or None.
    """
    if not self.voice_manager:
        return None

    def _canonical(raw):
        # Normalise a path-like value, degrading step by step on failure.
        try:
            from pathlib import Path

            expanded = Path(str(raw)).expanduser()
            try:
                return str(expanded.resolve())
            except Exception:
                return str(expanded)
        except Exception:
            return str(raw)

    wanted_path = _canonical(source)

    try:
        candidates = self.voice_manager.list_cloned_voices()
    except Exception:
        return None

    engine_key = (str(engine).strip().lower() if engine else None) or None
    for cand in candidates:
        stored = (cand.get("meta") or {}).get("source")
        if not stored:
            continue
        if _canonical(stored) != wanted_path:
            continue
        if engine_key and str(cand.get("engine") or "").strip().lower() != engine_key:
            continue
        # First match wins, even when its id turns out to be blank.
        return str(cand.get("voice_id") or "").strip() or None
    return None
|
|
1809
|
+
|
|
1810
|
+
def do_clone_info(self, arg):
    """Show details for a cloned voice.

    Usage:
      /clone_info <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    query = arg.strip()
    if not query:
        print("Usage: /clone_info <id-or-name>")
        return
    voice_id = self._resolve_clone_id(query)
    if not voice_id:
        print(f"❌ Unknown cloned voice: {query}. Use /clones to list.")
        return
    try:
        record = self.voice_manager.get_cloned_voice(voice_id)
        details = record.get("meta") or {}
        print(f"\n{Colors.CYAN}Cloned voice info:{Colors.END}")
        print(f" id: {record.get('voice_id')}")
        print(f" name: {record.get('name')}")
        print(f" engine: {record.get('engine')}")
        print(f" refs: {len(record.get('reference_files') or [])}")
        print(f" ref_text_source: {details.get('reference_text_source','')}")
        transcript = (record.get('reference_text') or '').strip()
        if not transcript:
            print(" reference_text: (missing)")
        else:
            # Keep the overview compact: long transcripts are cut at 200 chars.
            preview = transcript if len(transcript) <= 200 else transcript[:200] + "…"
            print(f" reference_text: {preview}")
    except Exception as exc:
        print(f"❌ Error: {exc}")
|
|
1844
|
+
|
|
1845
|
+
def do_clone_ref(self, arg):
    """Print the full reference_text for a cloned voice.

    Usage:
      /clone_ref <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_ref <id-or-name>")
        return
    # Store access can fail; report it like /clone_info does instead of
    # letting the exception escape the command handler.
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        # `or {}` guards against a lookup that yields None.
        info = self.voice_manager.get_cloned_voice(vid) or {}
        text = (info.get("reference_text") or "").strip()
    except Exception as e:
        print(f"❌ Error: {e}")
        return
    print(text)
|
|
1864
|
+
|
|
1865
|
+
def do_clone_rename(self, arg):
    """Rename a cloned voice.

    Usage:
      /clone_rename <id-or-name> <new_name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    # Only the first token identifies the voice; the rest (spaces included)
    # becomes the new name.
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_rename <id-or-name> <new_name>")
        return
    # Report store failures instead of letting them escape the command handler.
    try:
        vid = self._resolve_clone_id(parts[0])
        if not vid:
            print(f"❌ Unknown cloned voice: {parts[0]}. Use /clones to list.")
            return
        self.voice_manager.rename_cloned_voice(vid, parts[1])
    except Exception as e:
        print(f"❌ Rename failed: {e}")
        return
    print("✅ Renamed.")
|
|
1884
|
+
|
|
1885
|
+
def do_clone_rm(self, arg):
    """Remove a cloned voice from the store.

    Usage:
      /clone_rm <id-or-name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_rm <id-or-name>")
        return
    # Report store failures instead of letting them escape the command handler.
    try:
        vid = self._resolve_clone_id(wanted)
        if not vid:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        self.voice_manager.delete_cloned_voice(vid)
    except Exception as e:
        print(f"❌ Failed to delete cloned voice: {e}")
        return
    # If currently selected, switch back to Piper, but only once the delete
    # has succeeded, so a failed delete leaves the selection untouched.
    if self.current_tts_voice == vid:
        self.current_tts_voice = None
    print("✅ Deleted.")
|
|
1907
|
+
|
|
1908
|
+
def do_clone_rm_all(self, arg):
    """Remove ALL cloned voices from the local store.

    Usage:
      /clone_rm_all --yes
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    if (arg or "").strip().lower() not in ("--yes", "-y", "yes"):
        # Unconfirmed: only report what would be removed.
        try:
            count = len(manager.list_cloned_voices() or [])
        except Exception:
            count = 0
        if count <= 0:
            print("No cloned voices to delete.")
        else:
            print(f"⚠️ This will permanently delete {count} cloned voice(s).")
            print("Re-run with: /clone_rm_all --yes")
        return

    # If currently selected, switch back to Piper.
    self.current_tts_voice = None

    try:
        entries = list(manager.list_cloned_voices() or [])
    except Exception as exc:
        print(f"❌ Error listing cloned voices: {exc}")
        return

    removed = 0
    errors = 0
    for entry in entries:
        ident = str(entry.get("voice_id") or entry.get("voice") or "").strip()
        if not ident:
            continue
        # One failing voice must not stop the rest from being removed.
        try:
            manager.delete_cloned_voice(ident)
        except Exception:
            errors += 1
        else:
            removed += 1

    summary = f"✅ Deleted {removed} cloned voice(s)."
    if errors:
        summary += f" ⚠️ Failed: {errors}"
    print(summary)
|
|
1956
|
+
|
|
1957
|
+
def do_clone_export(self, arg):
    """Export a cloned voice bundle (.zip).

    Usage:
      /clone_export <id-or-name> <path.zip>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    # First token selects the voice; the remainder is the destination path
    # (so paths containing spaces work without quoting).
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_export <id-or-name> <path.zip>")
        return
    # Export touches the store and the filesystem; report failures instead of
    # letting them escape the command handler.
    try:
        vid = self._resolve_clone_id(parts[0])
        if not vid:
            print(f"❌ Unknown cloned voice: {parts[0]}. Use /clones to list.")
            return
        out = self.voice_manager.export_voice(vid, parts[1])
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return
    print(f"✅ Exported: {out}")
|
|
1976
|
+
|
|
1977
|
+
def do_clone_import(self, arg):
    """Import a cloned voice bundle (.zip).

    Usage:
      /clone_import <path.zip>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    path = arg.strip()
    if not path:
        print("Usage: /clone_import <path.zip>")
        return
    # A missing or invalid bundle should produce a message rather than an
    # exception escaping the command handler.
    try:
        vid = self.voice_manager.import_voice(path)
    except Exception as e:
        print(f"❌ Import failed: {e}")
        return
    print(f"✅ Imported as: {vid}")
|
|
1992
|
+
|
|
1993
|
+
def do_clone(self, arg):
    """Clone a voice from a reference file or folder.

    Usage:
      /clone <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = 'Usage: /clone <path> [name] [--engine f5_tts|chroma] [--text "..."]'
    try:
        parts = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    # Separate option values from positionals; each option takes one value.
    engine = None
    reference_text = None
    pos = []
    tokens = iter(parts)
    for tok in tokens:
        if tok == "--engine" or tok in ("--text", "--reference-text", "--reference_text"):
            value = next(tokens, None)
            if value is None:
                print(usage)
                return
            if tok == "--engine":
                engine = value
            else:
                reference_text = value
        else:
            pos.append(tok)

    if not pos:
        print(usage)
        return

    path = pos[0]
    name = pos[1] if len(pos) > 1 else None
    has_ref_text = bool((reference_text or "").strip())
    try:
        t0 = time.monotonic()
        voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine)
        t1 = time.monotonic()

        # Read back what the store recorded; purely informational.
        eng = ""
        ref_src = ""
        try:
            info = self.voice_manager.get_cloned_voice(voice_id) or {}
            eng = str(info.get("engine") or "").strip()
            ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
        except Exception:
            eng = ""
            ref_src = ""

        eng_txt = f" (engine: {eng})" if eng else ""
        print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
        if has_ref_text:
            # Mirrors /clone_use: no "set reference text" tip when one was given.
            print(" (Reference text provided)")
        else:
            # /clone_set_ref_text stores everything after the id verbatim, so
            # the hint must not suggest wrapping the text in quotes.
            print(" Tip: set reference text for best quality:")
            print(" /clone_set_ref_text <id-or-name> <text...>")
        if not self._is_cloning_runtime_ready(voice_id=voice_id):
            print(" (Cloning runtime not ready yet; run /cloning_status and /cloning_download first.)")
        effective_engine = str(eng or (engine or self.cloning_engine) or "").strip().lower()
        if effective_engine == "chroma" and not has_ref_text:
            print("ℹ️ No reference transcript provided.")
            print(" We will auto-generate it via STT on first speak (offline-first: requires cached STT model).")
            print(" Optional (often best quality): /clone_set_ref_text <id-or-name> <text...> (or re-run /clone ... --text \"...\")")

        if self.verbose_mode:
            n_files, ref_audio_s = self._summarize_audio_source(path)
            n_txt = str(n_files) if isinstance(n_files, int) else "--"
            src_txt = ref_src or ("manual" if has_ref_text else "--")
            msg = f"CLONE {eng or (engine or self.cloning_engine)} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
            print(f"{Colors.YELLOW}{msg}{Colors.END}")
    except Exception as e:
        print(f"❌ Clone failed: {e}")
|
|
2077
|
+
|
|
2078
|
+
def do_clone_use(self, arg):
    """Clone a voice (or reuse an existing one) and immediately select it.

    Usage:
      /clone_use <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    Shortcut:
      - Paste a WAV/FLAC/OGG path directly (optionally: `path.wav | transcript`).
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = 'Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text "..."]'
    try:
        parts = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    # Separate option values from positionals; each option takes one value.
    engine = None
    reference_text = None
    pos = []
    tokens = iter(parts)
    for tok in tokens:
        if tok == "--engine" or tok in ("--text", "--reference-text", "--reference_text"):
            value = next(tokens, None)
            if value is None:
                print(usage)
                return
            if tok == "--engine":
                engine = value
            else:
                reference_text = value
        else:
            pos.append(tok)

    if not pos:
        print(usage)
        return

    path = pos[0]
    name = pos[1] if len(pos) > 1 else None

    engine_name = str(engine or self.cloning_engine or "f5_tts").strip().lower()

    # If name isn't provided, use something stable for UX.
    if not name:
        try:
            from pathlib import Path

            # Expand `~` first so `is_file()` sees the real path; otherwise a
            # `~/voice.wav` argument would be named "voice.wav" instead of "voice".
            p = Path(path).expanduser()
            name = p.stem if p.is_file() else p.name
        except Exception:
            name = None

    # Reuse a prior clone created from the same source path + engine.
    voice_id = self._resolve_clone_id_by_source(path, engine=engine_name)
    if voice_id:
        if reference_text:
            try:
                self.voice_manager.set_cloned_voice_reference_text(voice_id, reference_text)
                print("✅ Reusing cloned voice and updating reference text.")
            except Exception as e:
                # Still reuse the voice, but tell the user the transcript was
                # NOT updated instead of hiding the failure.
                print("✅ Reusing cloned voice.")
                print(f"⚠️ Could not update reference text: {e}")
        else:
            print("✅ Reusing cloned voice.")
    else:
        try:
            t0 = time.monotonic()
            voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine_name)
            t1 = time.monotonic()

            # Read back what the store recorded; purely informational.
            eng = ""
            ref_src = ""
            try:
                info = self.voice_manager.get_cloned_voice(voice_id) or {}
                eng = str(info.get("engine") or "").strip()
                ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
            except Exception:
                eng = ""
                ref_src = ""

            eng_txt = f" (engine: {eng})" if eng else ""
            print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
            if reference_text:
                print(" (Reference text provided)")
            else:
                # /clone_set_ref_text stores everything after the id verbatim,
                # so the hint must not suggest wrapping the text in quotes.
                print(" Tip: set reference text for best quality:")
                print(" /clone_set_ref_text <id-or-name> <text...>")
                if str(eng or engine_name or "").strip().lower() == "chroma":
                    print(" ℹ️ No transcript provided; STT auto-fallback runs on first speak (requires cached STT model).")

            if self.verbose_mode:
                n_files, ref_audio_s = self._summarize_audio_source(path)
                n_txt = str(n_files) if isinstance(n_files, int) else "--"
                src_txt = ref_src or ("manual" if (reference_text or "").strip() else "--")
                msg = f"CLONE {eng or engine_name} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
                print(f"{Colors.YELLOW}{msg}{Colors.END}")
        except Exception as e:
            print(f"❌ Clone failed: {e}")
            return

    # Select if runtime is ready (no surprise downloads).
    if not self._is_cloning_runtime_ready(voice_id=voice_id):
        print("ℹ️ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    self.current_tts_voice = voice_id
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(voice_id) or {}
        eng = str(info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {voice_id}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (important for large backends like Chroma).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
|
|
2219
|
+
|
|
2220
|
+
def do_clone_set_ref_text(self, arg):
    """Set the reference transcript for a cloned voice (quality fix).

    Usage:
      /clone_set_ref_text <id-or-name> <text...>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    # Deliberately a plain split (not shlex): everything after the first token
    # is stored verbatim as the transcript, quotes included.
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_set_ref_text <id-or-name> <text...>")
        return

    wanted, text = parts[0], parts[1]
    try:
        # Use the shared resolver instead of duplicating its matching rules,
        # and keep the store listing inside the error handling.
        match = self._resolve_clone_id(wanted)
        if not match:
            print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
            return
        self.voice_manager.set_cloned_voice_reference_text(match, text)
        print("✅ Updated reference text.")
    except Exception as e:
        print(f"❌ Failed to update reference text: {e}")
|
|
2253
|
+
|
|
2254
|
+
def do_tts_voice(self, arg):
    """Select which voice is used for speaking.

    Usage:
      /tts_voice piper
      /tts_voice clone <voice_id_or_name>
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    parts = arg.strip().split()
    if not parts:
        # No argument: report the current selection, then the usage line.
        if self.current_tts_voice:
            vid = self.current_tts_voice
            try:
                info = self.voice_manager.get_cloned_voice(vid) or {}
                name = (info.get("name") or "").strip()
                eng = (info.get("engine") or "").strip()
                label = name or vid
                suffix = f" (engine: {eng})" if eng else ""
                print(f"Current TTS voice: {label}{suffix}")
            except Exception:
                print(f"Current TTS voice: {vid}")
        else:
            print("Current TTS voice: piper")
        print("Usage: /tts_voice piper | /tts_voice clone <id-or-name>")
        return

    if parts[0] == "piper":
        self.current_tts_voice = None
        # Free any heavy cloning engines when switching back to Piper.
        try:
            if hasattr(self.voice_manager, "unload_cloning_engines"):
                self.voice_manager.unload_cloning_engines()
        except Exception:
            pass
        # If Piper was previously unloaded to save memory, reload it now (offline-first).
        try:
            if self.voice_manager and getattr(self.voice_manager, "tts_adapter", None):
                a = getattr(self.voice_manager, "tts_adapter", None)
                if hasattr(a, "is_available") and not bool(a.is_available()):
                    self.voice_manager.set_language(self.current_language)
        except Exception:
            pass
        print("✅ Using Piper (default) voice")
        return

    if parts[0] != "clone" or len(parts) < 2:
        print("Usage: /tts_voice piper | /tts_voice clone <id-or-name>")
        return

    # Voice names may contain spaces (/clone_rename accepts them), so try
    # everything after "clone" first and fall back to the first token, which
    # is what this command historically used.
    wanted = arg.strip().split(maxsplit=1)[1].strip()
    match = self._resolve_clone_id(wanted)
    if not match and wanted != parts[1]:
        match = self._resolve_clone_id(parts[1])
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    # Do not allow selecting a cloned voice unless the runtime is ready.
    if not self._is_cloning_runtime_ready(voice_id=match):
        print("❌ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    # Allow selecting voices without reference_text; we will auto-fallback at speak-time
    # if the STT model is already cached locally (no downloads in REPL).

    self.current_tts_voice = match
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(match) or {}
        eng = (info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {match}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (e.g. unloading Chroma when switching to F5, or vice-versa).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
|
|
2344
|
+
|
|
2345
|
+
def do_clone_my_voice(self, arg):
    """Interactive voice cloning from microphone.

    This records a short prompt to WAV and adds it to the voice store.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    prompt = "Good evening, Dave."
    seconds = 6.0
    print("You will record a short reference sample for voice cloning.")
    print(f"Please read this aloud (once): {prompt}")
    try:
        input("Press Enter to start recording...")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C at the prompt means "don't record".
        print("\nCancelled.")
        return
    try:
        import appdirs
        from pathlib import Path
        from abstractvoice.audio import record_wav

        out_dir = Path(appdirs.user_data_dir("abstractvoice")) / "recordings"
        # Make sure the target folder exists before recording into it.
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "my_voice.wav"
        record_wav(out_path, seconds=seconds, sample_rate=24000, channels=1)
        # The spoken prompt doubles as the reference transcript.
        voice_id = self.voice_manager.clone_voice(str(out_path), name="my_voice", reference_text=prompt)
        print(f"✅ Recorded and cloned: {voice_id}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
    except Exception as e:
        print(f"❌ /clone-my-voice failed: {e}")
|
|
2372
|
+
|
|
2373
|
+
def do_cloning_status(self, arg):
    """Show whether cloning runtime is ready locally (no downloads)."""
    # Torch diagnostics are optional: any failure here (including torch not
    # being importable) silently skips the remaining torch lines.
    try:
        import torch

        try:
            has_mps = bool(torch.backends.mps.is_available())
        except Exception:
            has_mps = False
        print(f"torch: {getattr(torch, '__version__', '?')}")
        print(f"cuda_available: {bool(torch.cuda.is_available())}")
        print(f"mps_available: {has_mps}")
    except Exception:
        pass

    print(f"default_cloning_engine: {self.cloning_engine}")

    def _installed(module_name):
        # True when the module can be located without importing it.
        return importlib.util.find_spec(module_name) is not None

    if not _installed("f5_tts"):
        print("ℹ️ OpenF5 runtime: not installed (missing: f5_tts)")
        print(" Install: pip install \"abstractvoice[cloning]\"")
    elif self._is_openf5_cached():
        print("✅ OpenF5 artifacts: present (cached)")
    else:
        print("ℹ️ OpenF5 artifacts: not present (will require ~5.4GB download)")
        print(" Run: /cloning_download f5_tts")

    if not (_installed("transformers") and _installed("torch")):
        print("ℹ️ Chroma runtime: not installed (missing: transformers/torch)")
        print(" Install: pip install \"abstractvoice[chroma]\"")
    elif self._is_chroma_cached():
        print("✅ Chroma artifacts: present (cached)")
    else:
        print("ℹ️ Chroma artifacts: not present (will require a large download + HF access)")
        print(" Run: /cloning_download chroma")

    # Runtime details are best-effort and only shown when the manager reports any.
    try:
        if self.voice_manager:
            runtime = self.voice_manager.get_cloning_runtime_info()
            if runtime:
                print(f"cloning_resolved_device: {runtime.get('resolved_device')}")
                print(f"cloning_model_param_device: {runtime.get('model_param_device','?')}")
                print(f"cloning_quality_preset: {runtime.get('quality_preset')}")
    except Exception:
        pass
|
|
2419
|
+
|
|
2420
|
+
def do_clone_quality(self, arg):
    """Choose the speed/quality preset used for cloned-voice synthesis.

    Usage:
      /clone_quality fast|balanced|high

    The argument is trimmed and lowercased before validation. Nothing is
    changed when voice features are disabled or the preset is unknown; in
    both cases a hint is printed instead.
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    requested = (arg or "").strip().lower()
    is_known = requested in {"fast", "balanced", "high"}
    if not is_known:
        print("Usage: /clone_quality fast|balanced|high")
        return

    # Any failure reported by the voice manager is shown to the user rather
    # than propagated into the REPL loop.
    try:
        manager.set_cloned_tts_quality(requested)
        print(f"✅ Cloned TTS quality preset: {requested}")
    except Exception as exc:
        print(f"❌ Failed to set preset: {exc}")
|
|
2438
|
+
|
|
2439
|
+
def do_cloning_download(self, arg):
    """Download the artifacts of a cloning engine on explicit request.

    ``arg`` names the engine: ``openf5``, ``f5`` and ``f5_tts`` all map to
    ``f5_tts``; ``chroma`` selects Chroma. With no argument the value of
    ``self.cloning_engine`` is used. Any other name prints the usage line.

    Before downloading, the required Python package is probed with
    ``importlib.util.find_spec`` (``f5_tts`` for OpenF5, ``huggingface_hub``
    for Chroma) and an install hint is printed if it is missing. Download
    errors are reported on stdout, never raised.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    requested = (arg or "").strip().lower() or self.cloning_engine
    engine_name = "f5_tts" if requested in ("openf5", "f5", "f5_tts") else requested

    if engine_name not in ("f5_tts", "chroma"):
        print("Usage: /cloning_download [f5_tts|chroma]")
        return
    wants_openf5 = engine_name == "f5_tts"

    if wants_openf5:
        if importlib.util.find_spec("f5_tts") is None:
            print("❌ OpenF5 runtime not installed in this environment (missing: f5_tts).")
            print(" Install: pip install \"abstractvoice[cloning]\"")
            return
    elif importlib.util.find_spec("huggingface_hub") is None:
        # Artifacts download uses huggingface_hub and does not require loading the model.
        print("❌ huggingface_hub is required to download Chroma artifacts.")
        print(" Install: pip install huggingface_hub")
        return

    try:
        # REPL convenience: reach through the manager to the engine, because
        # an explicit download is an engine concern.
        backend = self.voice_manager._get_voice_cloner()._get_engine(engine_name)
        if wants_openf5:
            print("Downloading OpenF5 artifacts (~5.4GB). This is a one-time cache per machine.")
            backend.ensure_openf5_artifacts_downloaded()
        else:
            print("Downloading Chroma artifacts (very large; requires HF access). This is a one-time cache per machine.")
            backend.ensure_chroma_artifacts_downloaded()
        print("✅ Download complete.")
    except Exception as exc:
        print(f"❌ Download failed: {exc}")
|
|
2474
|
+
|
|
2475
|
+
def _is_openf5_cached(self) -> bool:
|
|
2476
|
+
"""Heuristic local check that avoids importing huggingface_hub."""
|
|
2477
|
+
from pathlib import Path
|
|
2478
|
+
import os
|
|
2479
|
+
|
|
2480
|
+
root = Path(os.path.expanduser("~/.cache/abstractvoice/openf5"))
|
|
2481
|
+
if not root.exists():
|
|
2482
|
+
return False
|
|
2483
|
+
cfg = next(iter(root.rglob("*.yaml")), None) or next(iter(root.rglob("*.yml")), None)
|
|
2484
|
+
ckpt = next(iter(root.rglob("*.pt")), None)
|
|
2485
|
+
vocab = next(iter(root.rglob("vocab*.txt")), None) or next(iter(root.rglob("*.txt")), None)
|
|
2486
|
+
return bool(cfg and ckpt and vocab)
|
|
2487
|
+
|
|
2488
|
+
def _is_chroma_cached(self) -> bool:
|
|
2489
|
+
"""Heuristic local check that avoids importing huggingface_hub."""
|
|
2490
|
+
from pathlib import Path
|
|
2491
|
+
import os
|
|
2492
|
+
|
|
2493
|
+
root = Path(os.path.expanduser("~/.cache/abstractvoice/chroma"))
|
|
2494
|
+
if not root.exists():
|
|
2495
|
+
return False
|
|
2496
|
+
required = [
|
|
2497
|
+
"config.json",
|
|
2498
|
+
"processor_config.json",
|
|
2499
|
+
"model.safetensors.index.json",
|
|
2500
|
+
"modeling_chroma.py",
|
|
2501
|
+
"processing_chroma.py",
|
|
2502
|
+
"configuration_chroma.py",
|
|
2503
|
+
]
|
|
2504
|
+
return all((root / name).exists() for name in required)
|
|
2505
|
+
|
|
2506
|
+
def _is_cloning_runtime_ready(self, *, voice_id: str | None = None, engine: str | None = None) -> bool:
|
|
2507
|
+
"""Return whether the selected cloning engine is ready locally (no downloads)."""
|
|
2508
|
+
eng = str(engine or "").strip().lower()
|
|
2509
|
+
if not eng and voice_id and self.voice_manager:
|
|
2510
|
+
try:
|
|
2511
|
+
info = self.voice_manager.get_cloned_voice(voice_id)
|
|
2512
|
+
eng = str((info or {}).get("engine") or "").strip().lower()
|
|
2513
|
+
except Exception:
|
|
2514
|
+
eng = ""
|
|
2515
|
+
if not eng:
|
|
2516
|
+
eng = str(getattr(self, "cloning_engine", "f5_tts") or "f5_tts").strip().lower()
|
|
2517
|
+
|
|
2518
|
+
if eng == "chroma":
|
|
2519
|
+
return (
|
|
2520
|
+
importlib.util.find_spec("torch") is not None
|
|
2521
|
+
and importlib.util.find_spec("transformers") is not None
|
|
2522
|
+
and self._is_chroma_cached()
|
|
2523
|
+
)
|
|
2524
|
+
return importlib.util.find_spec("f5_tts") is not None and self._is_openf5_cached()
|
|
2525
|
+
|
|
2526
|
+
def _seed_hal9000_voice(self):
|
|
2527
|
+
"""Seed a default 'hal9000' cloned voice if sample WAVs are present."""
|
|
2528
|
+
if not self.voice_manager:
|
|
2529
|
+
return
|
|
2530
|
+
try:
|
|
2531
|
+
from pathlib import Path
|
|
2532
|
+
|
|
2533
|
+
sample_dir = Path("audio_samples") / "hal9000"
|
|
2534
|
+
if not sample_dir.exists():
|
|
2535
|
+
return
|
|
2536
|
+
|
|
2537
|
+
# If already present, do nothing.
|
|
2538
|
+
existing_hal = None
|
|
2539
|
+
for v in self.voice_manager.list_cloned_voices():
|
|
2540
|
+
if (v.get("name") or "").lower() == "hal9000":
|
|
2541
|
+
existing_hal = v.get("voice_id")
|
|
2542
|
+
break
|
|
2543
|
+
|
|
2544
|
+
# Seed from the clean short WAV sample to avoid noisy auto-transcriptions.
|
|
2545
|
+
# This avoids repeated artifacts like "how are you hal" bleeding into outputs.
|
|
2546
|
+
if existing_hal is None:
|
|
2547
|
+
ref = sample_dir / "hal9000_hello.wav"
|
|
2548
|
+
if ref.exists():
|
|
2549
|
+
existing_hal = self.voice_manager.clone_voice(
|
|
2550
|
+
str(ref),
|
|
2551
|
+
name="hal9000",
|
|
2552
|
+
reference_text="Hello, Dave.",
|
|
2553
|
+
)
|
|
2554
|
+
else:
|
|
2555
|
+
existing_hal = self.voice_manager.clone_voice(str(sample_dir), name="hal9000")
|
|
2556
|
+
if self.debug_mode:
|
|
2557
|
+
print(f"Seeded cloned voice 'hal9000': {existing_hal}")
|
|
2558
|
+
|
|
2559
|
+
# Do NOT auto-select here; selecting a clone without explicit user action
|
|
2560
|
+
# can cause surprise multi-GB downloads. Users can opt in via /tts_voice.
|
|
2561
|
+
except Exception:
|
|
2562
|
+
# Best-effort only; never block REPL start.
|
|
2563
|
+
return
|
|
2564
|
+
|
|
2565
|
+
def do_tts_engine(self, arg):
    """Select TTS engine: auto|piper.

    The current voice manager (if any) is cleaned up on a best-effort basis
    and replaced by a new ``VoiceManager`` built from the REPL's language,
    initial TTS model, debug flag and cloning engine. The new manager is
    always created with downloads and cloned-TTS streaming turned off.
    """
    choice = arg.strip().lower()
    if choice != "auto" and choice != "piper":
        print("Usage: /tts_engine auto|piper")
        return

    previous = self.voice_manager
    if previous:
        try:
            previous.cleanup()
        except Exception:
            # Cleanup problems must not prevent switching engines.
            pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=choice,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ TTS engine set to: {choice}")
|
|
2591
|
+
|
|
2592
|
+
def do_aec(self, arg):
    """Enable/disable optional AEC (echo cancellation) for true barge-in.

    Usage:
      /aec on [delay_ms]
      /aec off

    With no argument the current state is printed, read from the voice
    manager's ``_aec_enabled`` and ``_aec_stream_delay_ms`` attributes.
    The ``on``/``off`` keyword is matched case-insensitively, like the other
    REPL commands. ``delay_ms`` must be an integer and defaults to 0.
    Failures reported by the voice manager are printed, not raised.
    """
    usage = "Usage: /aec on [delay_ms] | /aec off"
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    parts = (arg or "").strip().split()
    if not parts:
        enabled = bool(getattr(manager, "_aec_enabled", False))
        # `or 0` keeps the status line working when the attribute exists but
        # holds None; int(None) would otherwise raise out of the command.
        delay = int(getattr(manager, "_aec_stream_delay_ms", 0) or 0)
        print(f"AEC: {'on' if enabled else 'off'} (delay_ms={delay})")
        print(usage)
        return

    action = parts[0].lower()
    if action == "off":
        try:
            manager.enable_aec(False)
            print("✅ AEC disabled")
        except Exception as e:
            print(f"❌ AEC disable failed: {e}")
        return

    if action != "on":
        print(usage)
        return

    delay_ms = 0
    if len(parts) > 1:
        try:
            delay_ms = int(parts[1])
        except ValueError:
            # int() on a str raises only ValueError; anything else is a real bug.
            print(usage)
            return

    try:
        manager.enable_aec(True, stream_delay_ms=delay_ms)
        print(f"✅ AEC enabled (delay_ms={delay_ms}).")
        print("Tip: use /voice full for barge-in behavior when AEC is enabled.")
    except Exception as e:
        print(f"❌ AEC enable failed: {e}")
|
|
2637
|
+
|
|
2638
|
+
def do_stt_engine(self, arg):
    """Select STT engine: auto|faster_whisper|whisper.

    The engine name is validated first; then voice features must be enabled.
    The current voice manager is cleaned up on a best-effort basis and
    replaced by a new ``VoiceManager`` that keeps the previous manager's
    ``_tts_engine_preference`` (``"auto"`` when that attribute is absent).
    """
    choice = arg.strip().lower()
    if choice not in ("auto", "faster_whisper", "whisper"):
        print("Usage: /stt_engine auto|faster_whisper|whisper")
        return

    current = self.voice_manager
    if not current:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    # Carry the TTS engine preference over to the new manager; if the current
    # engine is unknown, let it auto-select.
    tts_choice = getattr(current, "_tts_engine_preference", "auto")

    try:
        current.cleanup()
    except Exception:
        # Cleanup problems must not prevent switching engines.
        pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=tts_choice,
        stt_engine=choice,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ STT engine set to: {choice}")
|
|
2672
|
+
|
|
2673
|
+
def do_transcribe(self, arg):
    """Transcribe an audio file via the library STT path (faster-whisper by default).

    Usage:
      /transcribe path/to/audio.wav

    Notes:
      - This is the simplest way to validate STT without requiring microphone capture.
      - The default engine is faster-whisper; legacy openai-whisper remains optional.

    The transcript is printed in cyan. Errors are printed on stdout, and a
    traceback is added when debug mode is on.
    """
    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    audio_path = arg.strip()
    if audio_path == "":
        print("Usage: /transcribe <path/to/audio.wav>")
        return

    try:
        transcript = self.voice_manager.transcribe_file(audio_path)
        print(f"{Colors.CYAN}{transcript}{Colors.END}")
    except Exception as exc:
        print(f"❌ Transcription failed: {exc}")
        if self.debug_mode:
            import traceback

            traceback.print_exc()
|
|
679
2700
|
|
|
680
2701
|
def do_clear(self, arg):
    """Clear chat history.

    Delegates the reset to ``_clear_history`` and then confirms on stdout.
    ``arg`` is ignored.
    """
    self._clear_history()
    print("History cleared")
|
|
2705
|
+
|
|
2706
|
+
def do_reset(self, arg):
    """Reset the session (history + current voice selection).

    Steps, each best-effort so that one failure cannot stop the reset:
    stop any ongoing speech, forget the selected TTS voice, unload cloning
    engines when the voice manager offers that hook, and re-apply the
    current language if the TTS adapter reports it is not available.
    The chat history is cleared last and a confirmation is printed.
    """
    try:
        if self.voice_manager:
            self.voice_manager.stop_speaking()
    except Exception:
        pass

    # Reset voice selection back to Piper (default).
    self.current_tts_voice = None

    # Free any heavy cloning engines as part of reset.
    try:
        manager = self.voice_manager
        if manager and hasattr(manager, "unload_cloning_engines"):
            manager.unload_cloning_engines()
    except Exception:
        pass

    # Ensure Piper is ready (in case it was unloaded to save memory).
    try:
        adapter = getattr(self.voice_manager, "tts_adapter", None) if self.voice_manager else None
        if adapter and hasattr(adapter, "is_available") and not bool(adapter.is_available()):
            self.voice_manager.set_language(self.current_language)
    except Exception:
        pass

    # Clear chat history.
    self._clear_history()
    print("✅ Reset.")
|
|
2734
|
+
|
|
2735
|
+
def _clear_history(self) -> None:
|
|
682
2736
|
self.messages = [{"role": "system", "content": self.system_prompt}]
|
|
683
2737
|
# Reset token counters
|
|
684
2738
|
self.system_tokens = 0
|
|
685
2739
|
self.user_tokens = 0
|
|
686
2740
|
self.assistant_tokens = 0
|
|
2741
|
+
# Reset word counters
|
|
2742
|
+
self.system_words = 0
|
|
2743
|
+
self.user_words = 0
|
|
2744
|
+
self.assistant_words = 0
|
|
687
2745
|
# Recalculate system tokens
|
|
688
2746
|
self._count_system_tokens()
|
|
689
|
-
|
|
2747
|
+
self._count_system_words()
|
|
690
2748
|
|
|
691
2749
|
def do_system(self, arg):
    """Set the system prompt, or show it when no text is given.

    A non-blank ``arg`` (surrounding whitespace removed) becomes the new
    system prompt and the chat history is cleared so the conversation
    restarts from it. A blank ``arg`` only prints the current prompt.
    """
    new_prompt = arg.strip()
    if not new_prompt:
        print(f"Current system prompt: {self.system_prompt}")
        return

    self.system_prompt = new_prompt
    self._clear_history()
    print(f"System prompt set to: {self.system_prompt}")
|
|
699
2757
|
|
|
700
2758
|
def do_exit(self, arg):
    """Exit the REPL.

    Clears the push-to-talk flags, then stops listening, stops speaking and
    cleans up the voice manager. Every shutdown step is best-effort, so an
    error in one does not prevent the others or the exit itself.

    Returns:
        True, which ends the command loop.
    """
    # Stop any PTT session cleanly.
    for flag in ("_ptt_session_active", "_ptt_recording", "_ptt_busy"):
        setattr(self, flag, False)

    # Stop voice mode / audio best-effort.
    try:
        if self.voice_manager:
            for stopper in ("stop_listening", "stop_speaking"):
                try:
                    getattr(self.voice_manager, stopper)()
                except Exception:
                    pass
    except Exception:
        pass

    try:
        if self.voice_manager:
            self.voice_manager.cleanup()
    except Exception:
        pass

    if self.debug_mode:
        print("Goodbye!")
    return True
|
|
@@ -781,37 +2862,81 @@ class VoiceREPL(cmd.Cmd):
|
|
|
781
2862
|
|
|
782
2863
|
# If neither voice mode nor TTS is active - don't show any message
|
|
783
2864
|
pass
|
|
2865
|
+
|
|
2866
|
+
def do_verbose(self, arg):
    """Toggle verbose per-turn performance stats.

    Usage:
      /verbose (toggle)
      /verbose on|off

    ``on`` also accepts 1/true/yes/y and ``off`` accepts 0/false/no/n,
    case-insensitively. Anything else prints the usage line and leaves the
    setting untouched. The resulting state is printed.
    """
    token = (arg or "").strip().lower()

    if token in ("", "toggle"):
        enabled = not bool(getattr(self, "verbose_mode", False))
    elif token in ("on", "1", "true", "yes", "y"):
        enabled = True
    elif token in ("off", "0", "false", "no", "n"):
        enabled = False
    else:
        print("Usage: /verbose [on|off]")
        return

    self.verbose_mode = enabled
    print(f"Verbose mode: {'on' if self.verbose_mode else 'off'}")
|
|
784
2884
|
|
|
785
2885
|
def do_help(self, arg):
    """Show help information.

    Prints the fixed command reference, one line per command, followed by a
    blank line and three usage notes. ``arg`` is ignored: no per-command
    help is looked up.
    """
    help_lines = (
        "Commands:",
        " /exit, /q, /quit Exit REPL",
        " /clear Clear history",
        " /reset Reset (history + voice)",
        " /tts on|off Toggle TTS",
        " /voice <mode> Voice input: off|full|wait|stop|ptt",
        " /voice ptt Push-to-talk session (SPACE captures, ESC exits)",
        " /language <lang> Switch voice language (en, fr, es, de, ru, zh)",
        " /setvoice [id] List Piper voices or set one (lang.voice_id)",
        " /lang_info Show current language information",
        " /list_languages List all supported languages",
        " /speed <number> Set TTS speed (0.5-2.0, default: 1.0, pitch preserved)",
        " /tts_voice ... Select Piper vs cloned voice (see below)",
        " /tts_engine <e> Switch TTS engine: auto|piper",
        " /whisper <model> Switch Whisper model: tiny|base|small|medium|large",
        " /stt_engine <e> Switch STT engine: auto|faster_whisper|whisper (whisper is optional extra)",
        " /speak <text> Speak text (no LLM call)",
        " /transcribe <path> Transcribe an audio file (faster-whisper by default)",
        " /system <prompt> Set system prompt",
        " /stop Stop voice mode or TTS playback",
        " /pause Pause current TTS playback",
        " /resume Resume paused TTS playback",
        " /aec on|off Optional echo cancellation for true barge-in (requires [aec])",
        " /tokens Display token usage stats",
        " /verbose [on|off] Toggle verbose per-turn stats",
        " /help Show this help",
        " /clones List cloned voices",
        " /clone_info <id> Show cloned voice details",
        " /clone_ref <id> Show cloned voice reference text",
        " /clone_rename ... Rename a cloned voice",
        " /clone_rm <id> Delete a cloned voice",
        " /clone_rm_all --yes Delete ALL cloned voices",
        " /clone_export ... Export a cloned voice (.zip)",
        " /clone_import ... Import a cloned voice (.zip)",
        " /clone <path> [nm] Add a cloned voice from WAV/FLAC/OGG",
        " /clone_use <path> Clone+select voice (or reuse)",
        " /clone-my-voice Record a short prompt and clone it",
        " /tts_voice piper Speak with Piper (default)",
        " /tts_voice clone X Speak with a cloned voice (requires cloning runtime + cache)",
        " /cloning_status Show cloning readiness (no downloads)",
        # The command takes an optional engine name and also handles Chroma,
        # so the help must not describe it as OpenF5-only.
        " /cloning_download [e] Explicitly download cloning artifacts: f5_tts|chroma (OpenF5 is ~5.4GB)",
        " /clone_quality Set cloned TTS speed/quality: fast|balanced|high",
        " /save <filename> Save chat history to file",
        " /load <filename> Load chat history from file",
        " /model <name> Change the LLM model",
        " /temperature <val> Set temperature (0.0-2.0, default: 0.7)",
        " /max_tokens <num> Set max tokens (default: 4096)",
        " stop (deprecated) use /voice off or say 'stop' during STOP mode",
        " <message> Send to LLM (text mode)",
        "",
        "Note: ALL commands must start with / except 'stop'",
        "In STOP mode, say 'stop' / 'ok stop' to stop speaking (does not exit voice mode).",
        "Shortcut: paste a WAV/FLAC/OGG path to clone+select (optionally: `path | transcript`).",
    )
    for line in help_lines:
        print(line)
|
|
815
2940
|
|
|
816
2941
|
def emptyline(self):
|
|
817
2942
|
"""Handle empty line input."""
|
|
@@ -821,6 +2946,10 @@ class VoiceREPL(cmd.Cmd):
|
|
|
821
2946
|
def do_tokens(self, arg):
|
|
822
2947
|
"""Display token usage information."""
|
|
823
2948
|
try:
|
|
2949
|
+
if self._get_tiktoken_encoding() is None:
|
|
2950
|
+
print("Token counting is not available (install: pip install tiktoken).")
|
|
2951
|
+
return
|
|
2952
|
+
|
|
824
2953
|
# Always recalculate tokens to ensure accuracy
|
|
825
2954
|
self._reset_and_recalculate_tokens()
|
|
826
2955
|
|
|
@@ -998,15 +3127,26 @@ class VoiceREPL(cmd.Cmd):
|
|
|
998
3127
|
print(f"Failed to load chat history from {filename}")
|
|
999
3128
|
|
|
1000
3129
|
def _reset_and_recalculate_tokens(self):
|
|
1001
|
-
"""Reset token counts and recalculate for all messages."""
|
|
3130
|
+
"""Reset token/word counts and recalculate for all messages."""
|
|
1002
3131
|
self.system_tokens = 0
|
|
1003
3132
|
self.user_tokens = 0
|
|
1004
3133
|
self.assistant_tokens = 0
|
|
3134
|
+
self.system_words = 0
|
|
3135
|
+
self.user_words = 0
|
|
3136
|
+
self.assistant_words = 0
|
|
1005
3137
|
|
|
1006
3138
|
# Count tokens for all messages
|
|
1007
3139
|
for msg in self.messages:
|
|
1008
3140
|
if isinstance(msg, dict) and "content" in msg and "role" in msg:
|
|
1009
3141
|
self._count_tokens(msg["content"], msg["role"])
|
|
3142
|
+
w = self._count_words(msg["content"])
|
|
3143
|
+
r = msg.get("role")
|
|
3144
|
+
if r == "system":
|
|
3145
|
+
self.system_words = int(w)
|
|
3146
|
+
elif r == "user":
|
|
3147
|
+
self.user_words += int(w)
|
|
3148
|
+
elif r == "assistant":
|
|
3149
|
+
self.assistant_words += int(w)
|
|
1010
3150
|
|
|
1011
3151
|
def _ensure_system_message(self):
|
|
1012
3152
|
"""Ensure there's a system message at the start of messages."""
|
|
@@ -1070,13 +3210,30 @@ def parse_args():
|
|
|
1070
3210
|
"""Parse command line arguments."""
|
|
1071
3211
|
parser = argparse.ArgumentParser(description="AbstractVoice CLI Example")
|
|
1072
3212
|
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
3213
|
+
parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
|
|
1073
3214
|
parser.add_argument("--api", default="http://localhost:11434/api/chat",
|
|
1074
3215
|
help="LLM API URL")
|
|
1075
|
-
parser.add_argument("--model", default="
|
|
3216
|
+
parser.add_argument("--model", default="cogito:3b",
|
|
1076
3217
|
help="LLM model name")
|
|
1077
|
-
parser.add_argument(
|
|
1078
|
-
|
|
1079
|
-
|
|
3218
|
+
parser.add_argument(
|
|
3219
|
+
"--cloning-engine",
|
|
3220
|
+
default="f5_tts",
|
|
3221
|
+
choices=["f5_tts", "chroma"],
|
|
3222
|
+
help="Default cloning backend for new voices (f5_tts|chroma)",
|
|
3223
|
+
)
|
|
3224
|
+
parser.add_argument(
|
|
3225
|
+
"--voice-mode",
|
|
3226
|
+
default="off",
|
|
3227
|
+
choices=["off", "wait", "stop", "full", "ptt"],
|
|
3228
|
+
help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
|
|
3229
|
+
)
|
|
3230
|
+
parser.add_argument(
|
|
3231
|
+
"--language",
|
|
3232
|
+
"--lang",
|
|
3233
|
+
default="en",
|
|
3234
|
+
choices=["en", "fr", "de", "es", "ru", "zh"],
|
|
3235
|
+
help="Voice language for default Piper TTS (en|fr|de|es|ru|zh).",
|
|
3236
|
+
)
|
|
1080
3237
|
parser.add_argument("--tts-model",
|
|
1081
3238
|
help="Specific TTS model to use (overrides language default)")
|
|
1082
3239
|
return parser.parse_args()
|
|
@@ -1093,8 +3250,11 @@ def main():
|
|
|
1093
3250
|
api_url=args.api,
|
|
1094
3251
|
model=args.model,
|
|
1095
3252
|
debug_mode=args.debug,
|
|
3253
|
+
verbose_mode=args.verbose,
|
|
1096
3254
|
language=args.language,
|
|
1097
|
-
tts_model=args.tts_model
|
|
3255
|
+
tts_model=args.tts_model,
|
|
3256
|
+
voice_mode=args.voice_mode,
|
|
3257
|
+
cloning_engine=args.cloning_engine,
|
|
1098
3258
|
)
|
|
1099
3259
|
repl.cmdloop()
|
|
1100
3260
|
except KeyboardInterrupt:
|
|
@@ -1104,4 +3264,4 @@ def main():
|
|
|
1104
3264
|
|
|
1105
3265
|
|
|
1106
3266
|
if __name__ == "__main__":
|
|
1107
|
-
main()
|
|
3267
|
+
main()
|