abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2408 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/text_sanitize.py +33 -0
  32. abstractvoice/tts/__init__.py +3 -3
  33. abstractvoice/tts/adapter_tts_engine.py +210 -0
  34. abstractvoice/tts/tts_engine.py +257 -1208
  35. abstractvoice/vm/__init__.py +2 -0
  36. abstractvoice/vm/common.py +21 -0
  37. abstractvoice/vm/core.py +139 -0
  38. abstractvoice/vm/manager.py +108 -0
  39. abstractvoice/vm/stt_mixin.py +158 -0
  40. abstractvoice/vm/tts_mixin.py +550 -0
  41. abstractvoice/voice_manager.py +6 -1061
  42. abstractvoice-0.6.2.dist-info/METADATA +213 -0
  43. abstractvoice-0.6.2.dist-info/RECORD +53 -0
  44. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/WHEEL +1 -1
  45. abstractvoice-0.6.2.dist-info/entry_points.txt +6 -0
  46. abstractvoice/instant_setup.py +0 -83
  47. abstractvoice/simple_model_manager.py +0 -539
  48. abstractvoice-0.5.2.dist-info/METADATA +0 -1458
  49. abstractvoice-0.5.2.dist-info/RECORD +0 -23
  50. abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
  51. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/licenses/LICENSE +0 -0
  52. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/top_level.txt +0 -0
@@ -8,11 +8,18 @@ that interacts with an LLM API for text generation.
8
8
 
9
9
  import argparse
10
10
  import cmd
11
+ import atexit
11
12
  import json
12
13
  import re
14
+ import shlex
15
+ import shutil
13
16
  import sys
17
+ import importlib.util
18
+ import threading
19
+ import time
14
20
  import requests
15
21
  from abstractvoice import VoiceManager
22
+ from abstractvoice.text_sanitize import sanitize_markdown_for_speech
16
23
 
17
24
 
18
25
  # ANSI color codes
@@ -31,18 +38,34 @@ class VoiceREPL(cmd.Cmd):
31
38
  """Voice-enabled REPL for LLM interaction."""
32
39
 
33
40
  intro = "" # Will be set in __init__ to include help
34
- prompt = f"{Colors.GREEN}> {Colors.END}"
41
+ prompt = "> "
35
42
 
36
43
  # Override cmd module settings
37
44
  ruler = "" # No horizontal rule line
38
45
  use_rawinput = True
39
46
 
40
- def __init__(self, api_url="http://localhost:11434/api/chat",
41
- model="granite3.3:2b", debug_mode=False, language="en", tts_model=None, disable_tts=False):
47
+ def __init__(
48
+ self,
49
+ api_url="http://localhost:11434/api/chat",
50
+ model="cogito:3b",
51
+ debug_mode=False,
52
+ verbose_mode: bool = False,
53
+ language="en",
54
+ tts_model=None,
55
+ voice_mode: str = "off",
56
+ disable_tts=False,
57
+ cloning_engine: str = "f5_tts",
58
+ ):
42
59
  super().__init__()
43
60
 
61
+ # Best-effort: enable proper line editing + history (Up/Down arrows).
62
+ # Some Python builds (notably when built without readline/libedit) will
63
+ # otherwise treat arrow keys as escape sequences and corrupt the prompt.
64
+ self._init_readline()
65
+
44
66
  # Debug mode
45
67
  self.debug_mode = debug_mode
68
+ self.verbose_mode = bool(verbose_mode)
46
69
 
47
70
  # API settings
48
71
  self.api_url = api_url
@@ -52,6 +75,8 @@ class VoiceREPL(cmd.Cmd):
52
75
 
53
76
  # Language settings
54
77
  self.current_language = language
78
+ self._initial_tts_model = tts_model
79
+ self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
55
80
 
56
81
  # Initialize voice manager with language support
57
82
  if disable_tts:
@@ -61,19 +86,36 @@ class VoiceREPL(cmd.Cmd):
61
86
  self.voice_manager = VoiceManager(
62
87
  language=language,
63
88
  tts_model=tts_model,
64
- debug_mode=debug_mode
89
+ debug_mode=debug_mode,
90
+ allow_downloads=False,
91
+ cloned_tts_streaming=False,
92
+ cloning_engine=self.cloning_engine,
65
93
  )
94
+
95
+ # Current speaking voice:
96
+ # - None => Piper (default, language-driven)
97
+ # - str => cloned voice_id
98
+ self.current_tts_voice: str | None = None
99
+
100
+ # When reference_text is auto-generated via ASR ("asr" source), print a
101
+ # ready-to-copy `/clone_set_ref_text ...` hint once per voice for easy correction.
102
+ self._printed_asr_ref_text_hint: set[str] = set()
103
+
104
+ # Seed a default cloned voice (HAL9000) if samples are present.
105
+ self._seed_hal9000_voice()
66
106
 
67
107
  # Settings
68
108
  self.use_tts = True
69
- self.voice_mode = "off" # off, full, wait, stop, ptt
109
+ # Voice input mode (mic). Default: OFF for fast startup + offline-first.
110
+ # Use `--voice-mode stop` (or `/voice stop`) to enable hands-free.
111
+ self.voice_mode = (voice_mode or "off").strip().lower() # off, full, wait, stop, ptt
70
112
  self.voice_mode_active = False # Is voice recognition running?
113
+ self._ptt_session_active = False
114
+ self._ptt_recording = False
115
+ self._ptt_busy = False
71
116
 
72
117
  # System prompt
73
- self.system_prompt = """
74
- You are a Helpful Voice Assistant. By design, your answers are short and more conversational, unless specifically asked to detail something.
75
- You only speak, so never use any text formatting or markdown. Write for a speaker.
76
- """
118
+ self.system_prompt = "You are a Helpful Voice Assistant. By design, your answers are short and conversational, unless specifically asked to detail something. You only speak, so never use any text formatting, hinting, *emotions*, emojis or markdown. Incarnate the speaker, never comment your instructions."
77
119
 
78
120
  # Message history
79
121
  self.messages = [{"role": "system", "content": self.system_prompt}]
@@ -82,27 +124,136 @@ class VoiceREPL(cmd.Cmd):
82
124
  self.system_tokens = 0
83
125
  self.user_tokens = 0
84
126
  self.assistant_tokens = 0
127
+ # LLM token totals (best-effort, Ollama API `eval_count`).
128
+ self.total_llm_out_tokens = 0
129
+ # Word counting
130
+ self.system_words = 0
131
+ self.user_words = 0
132
+ self.assistant_words = 0
133
+ # Best-effort tokenizer cache (tiktoken optional).
134
+ self._tiktoken_encoding = None
135
+ self._tiktoken_unavailable = False
85
136
  self._count_system_tokens()
86
-
137
+ self._count_system_words()
138
+
139
+ # Best-effort metrics captured from voice input paths.
140
+ self._pending_stt_metrics: dict | None = None
141
+
87
142
  if self.debug_mode:
88
143
  print(f"Initialized with API URL: {api_url}")
89
144
  print(f"Using model: {model}")
90
-
145
+
146
+ # Optionally auto-start voice input (mic). Keep OFF by default to avoid
147
+ # loading STT models (slow) unless the user explicitly opts in.
148
+ if self.voice_manager and self.voice_mode and self.voice_mode != "off":
149
+ try:
150
+ self.do_voice(self.voice_mode)
151
+ except Exception:
152
+ # Never block REPL start.
153
+ self.voice_mode = "off"
154
+ self.voice_mode_active = False
155
+
91
156
  # Set intro with help information
92
157
  self.intro = self._get_intro()
158
+
159
+ def _init_readline(self) -> None:
160
+ """Initialize readline history + make ANSI prompts safe (best-effort)."""
161
+ rl = None
162
+ try:
163
+ import readline as _readline # type: ignore
164
+
165
+ rl = _readline
166
+ except Exception:
167
+ # Windows users may have pyreadline3 installed.
168
+ try:
169
+ import pyreadline3 as _readline # type: ignore
170
+
171
+ rl = _readline
172
+ except Exception:
173
+ rl = None
174
+
175
+ if rl is None:
176
+ # Keep prompt simple and avoid ANSI; prevents strange cursor behavior
177
+ # when arrow keys emit escape codes in cooked terminals.
178
+ self.prompt = "> "
179
+ return
180
+
181
+ # Keep prompt plain when readline is enabled. ANSI prompts are fragile
182
+ # across readline/libedit builds and can corrupt redraw/history behavior.
183
+ self.prompt = "> "
184
+
185
+ # Persist history across sessions (best-effort).
186
+ try:
187
+ from pathlib import Path
188
+
189
+ try:
190
+ import appdirs
191
+
192
+ hist_dir = Path(appdirs.user_data_dir("abstractvoice"))
193
+ except Exception:
194
+ hist_dir = Path.home() / ".abstractvoice"
195
+
196
+ hist_dir.mkdir(parents=True, exist_ok=True)
197
+ hist_path = hist_dir / "repl_history"
198
+
199
+ try:
200
+ rl.read_history_file(str(hist_path))
201
+ except FileNotFoundError:
202
+ pass
203
+ except Exception:
204
+ pass
205
+
206
+ try:
207
+ rl.set_history_length(2000)
208
+ except Exception:
209
+ pass
210
+
211
+ def _save_history():
212
+ try:
213
+ rl.write_history_file(str(hist_path))
214
+ except Exception:
215
+ pass
216
+
217
+ atexit.register(_save_history)
218
+ except Exception:
219
+ pass
220
+
221
+ # Ensure Up/Down arrows traverse history reliably across GNU readline and
222
+ # macOS libedit-backed readline. Some libedit defaults perform prefix
223
+ # search/completion, which can look like text is being appended.
224
+ try:
225
+ doc = getattr(rl, "__doc__", "") or ""
226
+ is_libedit = "libedit" in doc.lower()
227
+ if is_libedit:
228
+ # libedit syntax
229
+ rl.parse_and_bind("bind ^[[A ed-prev-history")
230
+ rl.parse_and_bind("bind ^[[B ed-next-history")
231
+ rl.parse_and_bind("bind ^[[OA ed-prev-history")
232
+ rl.parse_and_bind("bind ^[[OB ed-next-history")
233
+ else:
234
+ # GNU readline syntax
235
+ rl.parse_and_bind('"\\e[A": previous-history')
236
+ rl.parse_and_bind('"\\e[B": next-history')
237
+ rl.parse_and_bind('"\\eOA": previous-history')
238
+ rl.parse_and_bind('"\\eOB": next-history')
239
+ except Exception:
240
+ pass
93
241
 
94
242
  def _get_intro(self):
95
243
  """Generate intro message with help."""
96
244
  intro = f"\n{Colors.BOLD}Welcome to AbstractVoice CLI REPL{Colors.END}\n"
97
245
  if self.voice_manager:
98
246
  lang_name = self.voice_manager.get_language_name()
99
- intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name}\n"
247
+ mic = (self.voice_mode or "off").upper()
248
+ intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name} | Mic: {mic} | Cloning: {self.cloning_engine}\n"
100
249
  else:
101
250
  intro += f"API: {self.api_url} | Model: {self.model} | Voice: Disabled\n"
102
251
  intro += f"\n{Colors.CYAN}Quick Start:{Colors.END}\n"
103
252
  intro += " • Type messages to chat with the LLM\n"
104
- intro += " • Use /voice <mode> to enable voice input\n"
253
+ intro += " • Voice input (mic): off by default. Enable: /voice stop (or start with --voice-mode stop)\n"
254
+ intro += " • PTT: /voice ptt then SPACE to capture (ESC exits)\n"
105
255
  intro += " • Use /language <lang> to switch voice language\n"
256
+ intro += " • Use /clones and /tts_voice to use cloned voices\n"
106
257
  intro += " • Type /help for full command list\n"
107
258
  intro += " • Type /exit or /q to quit\n"
108
259
  return intro
@@ -110,6 +261,236 @@ class VoiceREPL(cmd.Cmd):
110
261
  def _count_system_tokens(self):
111
262
  """Count tokens in the system prompt."""
112
263
  self._count_tokens(self.system_prompt, "system")
264
+
265
+ def _count_system_words(self):
266
+ self.system_words = self._count_words(self.system_prompt)
267
+
268
+ def _count_words(self, text: str) -> int:
269
+ s = str(text or "").strip()
270
+ if not s:
271
+ return 0
272
+ # A "word" here is whitespace-delimited for simplicity across languages.
273
+ return len([w for w in re.split(r"\s+", s) if w])
274
+
275
+ def _get_tiktoken_encoding(self):
276
+ if getattr(self, "_tiktoken_unavailable", False):
277
+ return None
278
+ enc = getattr(self, "_tiktoken_encoding", None)
279
+ if enc is not None:
280
+ return enc
281
+ try:
282
+ import tiktoken
283
+ except ImportError:
284
+ self._tiktoken_unavailable = True
285
+ return None
286
+
287
+ try:
288
+ enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
289
+ except Exception:
290
+ try:
291
+ enc = tiktoken.get_encoding("cl100k_base")
292
+ except Exception:
293
+ self._tiktoken_unavailable = True
294
+ return None
295
+
296
+ self._tiktoken_encoding = enc
297
+ return enc
298
+
299
+ def _fmt_s(self, seconds: float | None) -> str:
300
+ try:
301
+ if seconds is None:
302
+ return "--"
303
+ s = float(seconds)
304
+ if s < 0:
305
+ return "--"
306
+ # Keep it compact but readable.
307
+ if s < 10:
308
+ return f"{s:.2f}s"
309
+ return f"{s:.1f}s"
310
+ except Exception:
311
+ return "--"
312
+
313
+ def _fmt_num(self, x: float | None, *, digits: int = 2) -> str:
314
+ try:
315
+ if x is None:
316
+ return "--"
317
+ return f"{float(x):.{int(digits)}f}"
318
+ except Exception:
319
+ return "--"
320
+
321
+ def _fmt_wtok(self, words: int | None, tokens: int | None) -> str:
322
+ w = int(words) if isinstance(words, int) else (int(words) if words is not None else 0)
323
+ if isinstance(tokens, int):
324
+ return f"{w}w/{int(tokens)}tok"
325
+ return f"{w}w/--tok"
326
+
327
+ def _summarize_audio_source(self, source: str) -> tuple[int | None, float | None]:
328
+ """Best-effort: return (file_count, total_seconds) for an audio source path."""
329
+ try:
330
+ from pathlib import Path
331
+
332
+ p = Path(str(source)).expanduser()
333
+ except Exception:
334
+ return None, None
335
+
336
+ try:
337
+ import soundfile as sf
338
+ except Exception:
339
+ return None, None
340
+
341
+ supported = {".wav", ".flac", ".ogg"}
342
+ files = []
343
+ try:
344
+ if p.is_file():
345
+ files = [p]
346
+ elif p.is_dir():
347
+ files = sorted([x for x in p.iterdir() if x.is_file() and x.suffix.lower() in supported])
348
+ else:
349
+ return None, None
350
+ except Exception:
351
+ return None, None
352
+
353
+ total_s = 0.0
354
+ max_files = 25
355
+ for fp in files[:max_files]:
356
+ try:
357
+ info = sf.info(str(fp))
358
+ d = float(getattr(info, "duration", 0.0) or 0.0)
359
+ if d > 0:
360
+ total_s += d
361
+ except Exception:
362
+ continue
363
+
364
+ # If there are too many files, the displayed duration is a lower bound.
365
+ return (int(len(files)) if files else 0), (float(total_s) if total_s > 0 else None)
366
+
367
+ def _print_verbose_turn_stats(self, turn: dict) -> None:
368
+ if not bool(getattr(self, "verbose_mode", False)):
369
+ return
370
+ if not isinstance(turn, dict):
371
+ return
372
+
373
+ stt = turn.get("stt") if isinstance(turn.get("stt"), dict) else None
374
+ llm = turn.get("llm") if isinstance(turn.get("llm"), dict) else {}
375
+ counts = turn.get("counts") if isinstance(turn.get("counts"), dict) else {}
376
+ tts = turn.get("tts") if isinstance(turn.get("tts"), dict) else None
377
+
378
+ in_w = counts.get("in_words")
379
+ out_w = counts.get("out_words")
380
+ in_t = counts.get("in_tokens")
381
+ out_t = counts.get("out_tokens")
382
+
383
+ llm_s = llm.get("s")
384
+ api = llm.get("api") if isinstance(llm.get("api"), dict) else {}
385
+ api_prompt_tok = api.get("prompt_eval_count") if isinstance(api.get("prompt_eval_count"), int) else None
386
+ api_out_tok = api.get("eval_count") if isinstance(api.get("eval_count"), int) else None
387
+
388
+ # Line 1: STT (if any) + LLM + in/out counts and written speed.
389
+ parts1 = []
390
+ if stt:
391
+ stt_s = stt.get("stt_s")
392
+ stt_a = stt.get("audio_s")
393
+ stt_rtf = stt.get("rtf")
394
+ stt_txt = f"STT {self._fmt_s(stt_s)}"
395
+ if stt_a:
396
+ stt_txt += f"(a{self._fmt_s(stt_a)})"
397
+ if stt_rtf is not None:
398
+ stt_txt += f" rtf{self._fmt_num(stt_rtf, digits=2)}"
399
+ parts1.append(stt_txt)
400
+
401
+ if llm_s is not None or api_prompt_tok is not None or api_out_tok is not None:
402
+ llm_txt = f"LLM {self._fmt_s(llm_s)}"
403
+ if api_prompt_tok is not None or api_out_tok is not None:
404
+ p = str(api_prompt_tok) if api_prompt_tok is not None else "--"
405
+ o = str(api_out_tok) if api_out_tok is not None else "--"
406
+ llm_txt += f" (api p{p} o{o})"
407
+ parts1.append(llm_txt)
408
+
409
+ in_txt = f"in {self._fmt_wtok(in_w, in_t)}"
410
+ out_txt = f"out {self._fmt_wtok(out_w, out_t)}"
411
+
412
+ wps_written = None
413
+ try:
414
+ if isinstance(out_w, int) and out_w > 0 and llm_s and float(llm_s) > 0:
415
+ wps_written = float(out_w) / float(llm_s)
416
+ except Exception:
417
+ wps_written = None
418
+
419
+ if wps_written is not None:
420
+ out_txt += f" ({self._fmt_num(wps_written, digits=1)}w/s)"
421
+
422
+ parts1.append(in_txt)
423
+ parts1.append(out_txt)
424
+
425
+ line1 = " | ".join(parts1)
426
+
427
+ # Line 2: TTS (if any) + spoken speed + totals.
428
+ parts2 = []
429
+ if self.voice_manager and self.use_tts:
430
+ if not tts:
431
+ parts2.append("TTS --")
432
+ else:
433
+ eng = str(tts.get("engine") or "").strip().lower()
434
+ if eng == "clone":
435
+ ce = tts.get("clone_engine")
436
+ label = f"clone[{ce}]" if ce else "clone"
437
+ elif eng:
438
+ label = eng
439
+ else:
440
+ label = "tts"
441
+
442
+ err = (tts.get("error") or "").strip()
443
+ if err:
444
+ # Keep single-line and short.
445
+ msg = " ".join(err.split())
446
+ if len(msg) > 120:
447
+ msg = msg[:120].rstrip() + "…"
448
+ parts2.append(f"TTS {label} ERR {msg}")
449
+ else:
450
+ synth_s = tts.get("synth_s")
451
+ audio_s = tts.get("audio_s")
452
+ rtf = tts.get("rtf")
453
+ tts_txt = f"TTS {label} {self._fmt_s(synth_s)}→{self._fmt_s(audio_s)}"
454
+ if rtf is not None:
455
+ tts_txt += f" rtf{self._fmt_num(rtf, digits=2)}"
456
+
457
+ # Extra clone streaming details when available.
458
+ if eng == "clone" and bool(tts.get("streaming")):
459
+ ttfb_s = tts.get("ttfb_s")
460
+ if ttfb_s is not None:
461
+ tts_txt += f" ttfb{self._fmt_s(ttfb_s)}"
462
+ ch = tts.get("chunks")
463
+ if isinstance(ch, int):
464
+ tts_txt += f" ch{ch}"
465
+
466
+ wps_spoken = None
467
+ try:
468
+ if isinstance(out_w, int) and out_w > 0 and audio_s and float(audio_s) > 0:
469
+ wps_spoken = float(out_w) / float(audio_s)
470
+ except Exception:
471
+ wps_spoken = None
472
+ if wps_spoken is not None:
473
+ tts_txt += f" ({self._fmt_num(wps_spoken, digits=1)}w/s)"
474
+
475
+ parts2.append(tts_txt)
476
+ else:
477
+ parts2.append("TTS off")
478
+
479
+ total_words = int(getattr(self, "system_words", 0) + getattr(self, "user_words", 0) + getattr(self, "assistant_words", 0))
480
+ total_tokens = None
481
+ if self._get_tiktoken_encoding() is not None:
482
+ total_tokens = int(getattr(self, "system_tokens", 0) + getattr(self, "user_tokens", 0) + getattr(self, "assistant_tokens", 0))
483
+
484
+ tot_txt = f"tot {self._fmt_wtok(total_words, total_tokens)}"
485
+ if isinstance(getattr(self, "total_llm_out_tokens", None), int) and getattr(self, "total_llm_out_tokens") > 0:
486
+ tot_txt += f" (api out {int(getattr(self, 'total_llm_out_tokens'))}tok)"
487
+ parts2.append(tot_txt)
488
+
489
+ line2 = " | ".join(parts2)
490
+
491
+ # Keep it readable; two lines max.
492
+ print(f"{Colors.YELLOW}{line1}{Colors.END}")
493
+ print(f"{Colors.YELLOW}{line2}{Colors.END}")
113
494
 
114
495
  def parseline(self, line):
115
496
  """Parse the line to extract command and arguments.
@@ -117,14 +498,11 @@ class VoiceREPL(cmd.Cmd):
117
498
  Override to handle / prefix for commands. This ensures /voice, /help, etc.
118
499
  are recognized as commands by stripping the leading / before parsing.
119
500
  """
120
- line = line.strip()
121
-
122
- # If line starts with /, remove it for command processing
123
- if line.startswith('/'):
124
- line = line[1:].strip()
125
-
126
- # Call parent parseline to do the actual parsing
127
- return super().parseline(line)
501
+ # Commands still use leading "/". In PTT mode we don't accept typed input.
502
+ s = line.strip()
503
+ if s.startswith("/"):
504
+ return super().parseline(s[1:].strip())
505
+ return super().parseline(line.strip())
128
506
 
129
507
  def default(self, line):
130
508
  """Handle regular text input.
@@ -133,29 +511,123 @@ class VoiceREPL(cmd.Cmd):
133
511
  All other commands MUST use / prefix.
134
512
  """
135
513
  # Skip empty lines
136
- if not line.strip():
514
+ text = line.strip()
515
+ if not text:
137
516
  return
138
517
 
139
- # ONLY 'stop' is recognized without / (for voice mode convenience)
140
- if line.strip().lower() == "stop":
141
- return self.do_stop("")
142
-
518
+ # In PTT mode we do not accept typed input.
519
+ if self.voice_mode == "ptt":
520
+ print("PTT mode: press SPACE to speak, ESC to exit.")
521
+ return
522
+
143
523
  # Check if in voice mode - don't send to LLM
144
524
  if self.voice_mode_active:
145
525
  if self.debug_mode:
146
- print(f"Voice mode active ({self.voice_mode}). Use /voice off or say 'stop' to exit.")
526
+ print(f"Voice mode active ({self.voice_mode}). Use /voice off to disable.")
527
+ return
528
+
529
+ # Interrupt any ongoing TTS playback immediately when the user types.
530
+ # This is the expected “barge-in by typing” UX for a REPL.
531
+ try:
532
+ if self.voice_manager:
533
+ self.voice_manager.stop_speaking()
534
+ except Exception:
535
+ pass
536
+
537
+ # Shortcut: paste a reference audio path to clone+use a voice.
538
+ # Examples:
539
+ # audio_samples/hal9000/hal9000_hello.wav
540
+ # audio_samples/hal9000/hal9000_hello.wav | Hello, Dave.
541
+ if self._maybe_handle_clone_shortcut(text):
147
542
  return
148
543
 
149
544
  # Everything else goes to LLM
150
- self.process_query(line.strip())
545
+ self._pending_stt_metrics = None
546
+ self.process_query(text)
547
+
548
+ # NOTE: PTT is implemented as a dedicated key-loop session (no typing).
549
+
550
+ def _maybe_handle_clone_shortcut(self, text: str) -> bool:
551
+ """Best-effort: treat a pasted WAV/FLAC/OGG path as `/clone_use`."""
552
+ if not self.voice_manager:
553
+ return False
554
+
555
+ raw = (text or "").strip()
556
+ if not raw:
557
+ return False
558
+ if raw.startswith("/"):
559
+ return False
560
+
561
+ # Optional transcript with a simple pipe syntax:
562
+ # path.wav | Hello.
563
+ left, sep, right = raw.partition("|")
564
+ path_str = left.strip()
565
+ ref_text = right.strip() if sep else ""
566
+ reference_text = ref_text or None
567
+
568
+ # Strip naive wrapping quotes.
569
+ if (path_str.startswith('"') and path_str.endswith('"')) or (path_str.startswith("'") and path_str.endswith("'")):
570
+ path_str = path_str[1:-1].strip()
571
+
572
+ try:
573
+ from pathlib import Path
574
+
575
+ p = Path(path_str).expanduser()
576
+ except Exception:
577
+ return False
578
+
579
+ if not p.exists():
580
+ return False
581
+
582
+ exts = {".wav", ".flac", ".ogg"}
583
+ if p.is_file() and p.suffix.lower() not in exts:
584
+ return False
585
+ if p.is_dir():
586
+ try:
587
+ has_audio = any(x.is_file() and x.suffix.lower() in exts for x in p.iterdir())
588
+ except Exception:
589
+ has_audio = False
590
+ if not has_audio:
591
+ return False
592
+
593
+ # Build a `/clone_use` call with a stable name.
594
+ import shlex as _shlex
595
+
596
+ default_name = p.stem if p.is_file() else p.name
597
+ args = f"{_shlex.quote(str(p))} {_shlex.quote(default_name)}"
598
+ if reference_text:
599
+ args += f" --text {_shlex.quote(reference_text)}"
600
+ try:
601
+ self.do_clone_use(args)
602
+ except Exception as e:
603
+ print(f"❌ Clone shortcut failed: {e}")
604
+ if self.debug_mode:
605
+ import traceback
606
+
607
+ traceback.print_exc()
608
+ return True
151
609
 
152
610
  def process_query(self, query):
153
611
  """Process a query and get a response from the LLM."""
154
612
  if not query:
155
613
  return
614
+
615
+ # Consume any pending STT metrics for this turn (voice/PTT input).
616
+ stt_metrics = getattr(self, "_pending_stt_metrics", None)
617
+ self._pending_stt_metrics = None
618
+
619
+ # If audio is currently playing, stop it so the new request can be handled
620
+ # without overlapping speech.
621
+ try:
622
+ if self.voice_manager:
623
+ self.voice_manager.stop_speaking()
624
+ except Exception:
625
+ pass
156
626
 
157
- # Count user message tokens
158
- self._count_tokens(query, "user")
627
+ # Per-turn counts
628
+ user_words = self._count_words(query)
629
+ self.user_words += int(user_words)
630
+ user_tokens = self._count_tokens(query, "user")
159
631
 
160
632
  # Create the message
161
633
  user_message = {"role": "user", "content": query}
@@ -175,6 +647,7 @@ class VoiceREPL(cmd.Cmd):
175
647
  }
176
648
 
177
649
  # Make API request
650
+ llm_t0 = time.monotonic()
178
651
  response = requests.post(self.api_url, json=payload)
179
652
  response.raise_for_status()
180
653
 
@@ -182,6 +655,22 @@ class VoiceREPL(cmd.Cmd):
182
655
  try:
183
656
  # First, try to parse as JSON
184
657
  response_data = response.json()
658
+ api_llm_metrics = {}
659
+ try:
660
+ # Ollama exposes timing + token counts (nanoseconds).
661
+ # Keep best-effort: if fields are missing, we just omit them.
662
+ for k in (
663
+ "total_duration",
664
+ "load_duration",
665
+ "prompt_eval_count",
666
+ "prompt_eval_duration",
667
+ "eval_count",
668
+ "eval_duration",
669
+ ):
670
+ if k in response_data:
671
+ api_llm_metrics[k] = response_data.get(k)
672
+ except Exception:
673
+ api_llm_metrics = {}
185
674
 
186
675
  # Check for different API formats
187
676
  if "message" in response_data and "content" in response_data["message"]:
@@ -200,6 +689,7 @@ class VoiceREPL(cmd.Cmd):
200
689
 
201
690
  # Handle streaming or non-JSON response
202
691
  response_text = response.text.strip()
692
+ api_llm_metrics = {}
203
693
 
204
694
  # Try to extract content from streaming format if possible
205
695
  if response_text.startswith("{") and "content" in response_text:
@@ -228,9 +718,13 @@ class VoiceREPL(cmd.Cmd):
228
718
  except Exception as e:
229
719
  if self.debug_mode:
230
720
  print(f"Error extracting content from streaming response: {e}")
721
+ llm_t1 = time.monotonic()
722
+ llm_s = float(llm_t1 - llm_t0)
231
723
 
232
- # Count assistant message tokens
233
- self._count_tokens(response_text, "assistant")
724
+ # Per-turn counts
725
+ assistant_words = self._count_words(response_text)
726
+ self.assistant_words += int(assistant_words)
727
+ assistant_tokens = self._count_tokens(response_text, "assistant")
234
728
 
235
729
  # Add to message history
236
730
  self.messages.append({"role": "assistant", "content": response_text})
@@ -238,9 +732,61 @@ class VoiceREPL(cmd.Cmd):
238
732
  # Display the response with color
239
733
  print(f"{Colors.CYAN}{response_text}{Colors.END}")
240
734
 
735
+ # Record last-turn stats (best-effort; printed only in verbose mode).
736
+ self._last_turn_metrics = {
737
+ "stt": stt_metrics,
738
+ "llm": {
739
+ "s": llm_s,
740
+ "api": api_llm_metrics,
741
+ },
742
+ "counts": {
743
+ "in_words": int(user_words),
744
+ "out_words": int(assistant_words),
745
+ "in_tokens": int(user_tokens) if isinstance(user_tokens, int) else None,
746
+ "out_tokens": int(assistant_tokens) if isinstance(assistant_tokens, int) else None,
747
+ },
748
+ }
749
+ try:
750
+ out_tok = api_llm_metrics.get("eval_count") if isinstance(api_llm_metrics, dict) else None
751
+ if isinstance(out_tok, int) and out_tok >= 0:
752
+ self.total_llm_out_tokens += int(out_tok)
753
+ except Exception:
754
+ pass
755
+
241
756
  # Speak the response if voice manager is available
242
757
  if self.voice_manager and self.use_tts:
243
- self.voice_manager.speak(response_text)
758
+ try:
759
+ # UX guard: never trigger big cloning downloads during normal chat.
760
+ if self.current_tts_voice and not self._is_cloning_runtime_ready(voice_id=self.current_tts_voice):
761
+ print(
762
+ "ℹ️ Cloned voice selected but cloning runtime is not ready.\n"
763
+ " Run /cloning_status then /cloning_download, or switch back with /tts_voice piper."
764
+ )
765
+ else:
766
+ self._speak_with_spinner_until_audio_starts(response_text)
767
+ except Exception as e:
768
+ print(f"❌ TTS failed: {e}")
769
+
770
+ # Capture best-effort TTS metrics (Piper or cloned).
771
+ tts_metrics = None
772
+ try:
773
+ if self.voice_manager and hasattr(self.voice_manager, "pop_last_tts_metrics"):
774
+ tts_metrics = self.voice_manager.pop_last_tts_metrics()
775
+ except Exception:
776
+ tts_metrics = None
777
+
778
+ try:
779
+ if isinstance(getattr(self, "_last_turn_metrics", None), dict):
780
+ self._last_turn_metrics["tts"] = tts_metrics
781
+ except Exception:
782
+ pass
783
+
784
+ # Verbose stats (max 2 lines).
785
+ try:
786
+ if self.verbose_mode and isinstance(getattr(self, "_last_turn_metrics", None), dict):
787
+ self._print_verbose_turn_stats(self._last_turn_metrics)
788
+ except Exception:
789
+ pass
244
790
 
245
791
  except requests.exceptions.ConnectionError as e:
246
792
  print(f"❌ Cannot connect to Ollama API at {self.api_url}")
@@ -274,37 +820,29 @@ class VoiceREPL(cmd.Cmd):
274
820
 
275
821
  def _count_tokens(self, text, role):
276
822
  """Count tokens in text."""
823
+ encoding = self._get_tiktoken_encoding()
824
+ if encoding is None:
825
+ return None
277
826
  try:
278
- import tiktoken
279
-
280
- # Initialize the tokenizer
281
- encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
282
-
283
- # Count tokens
284
- token_count = len(encoding.encode(text))
285
-
286
- # Update the token counts based on role
287
- if role == "system":
288
- self.system_tokens = token_count
289
- elif role == "user":
290
- self.user_tokens += token_count
291
- elif role == "assistant":
292
- self.assistant_tokens += token_count
293
-
294
- # Calculate total tokens
295
- total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
296
-
297
- if self.debug_mode:
298
- print(f"{role.capitalize()} tokens: {token_count}")
299
- print(f"Total tokens: {total_tokens}")
300
-
301
- except ImportError:
302
- # If tiktoken is not available, just don't count tokens
303
- pass
827
+ token_count = len(encoding.encode(str(text or "")))
304
828
  except Exception as e:
305
829
  if self.debug_mode:
306
830
  print(f"Error counting tokens: {e}")
307
- pass
831
+ return None
832
+
833
+ # Update the token counts based on role
834
+ if role == "system":
835
+ self.system_tokens = int(token_count)
836
+ elif role == "user":
837
+ self.user_tokens += int(token_count)
838
+ elif role == "assistant":
839
+ self.assistant_tokens += int(token_count)
840
+
841
+ if self.debug_mode:
842
+ total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
843
+ print(f"{role.capitalize()} tokens: {token_count}")
844
+ print(f"Total tokens: {total_tokens}")
845
+ return int(token_count)
308
846
 
309
847
  def _clean_response(self, text):
310
848
  """Clean LLM response text."""
@@ -323,8 +861,12 @@ class VoiceREPL(cmd.Cmd):
323
861
  """Switch voice language.
324
862
 
325
863
  Usage: /language <lang>
326
- Available languages: en, fr, es, de, it
864
+ Available languages: en, fr, es, de, ru, zh
327
865
  """
866
+ if not self.voice_manager:
867
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
868
+ return
869
+
328
870
  if not args:
329
871
  current_name = self.voice_manager.get_language_name()
330
872
  current_code = self.voice_manager.get_language()
@@ -359,10 +901,13 @@ class VoiceREPL(cmd.Cmd):
359
901
  'fr': "Langue changée en français.",
360
902
  'es': "Idioma cambiado a español.",
361
903
  'de': "Sprache auf Deutsch umgestellt.",
362
- 'it': "Lingua cambiata in italiano."
904
+ 'ru': "Язык переключен на русский.",
905
+ 'zh': "语言已切换到中文。"
363
906
  }
364
907
  test_msg = test_messages.get(language, "Language switched.")
365
- self.voice_manager.speak(test_msg)
908
+ # Respect TTS toggle: if the user disabled TTS, don't speak test messages.
909
+ if getattr(self, "use_tts", True):
910
+ self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
366
911
 
367
912
  # Restart voice mode if it was active
368
913
  if was_active:
@@ -383,10 +928,13 @@ class VoiceREPL(cmd.Cmd):
383
928
  /setvoice <voice_id> # Set voice (format: language.voice_id)
384
929
 
385
930
  Examples:
386
- /setvoice # List all voices with JSON-like info
387
- /setvoice fr.css10_vits # Set French CSS10 VITS voice
388
- /setvoice it.mai_male_vits # Set Italian male VITS voice
931
+ /setvoice # List all Piper voices
932
+ /setvoice fr.siwis # Switch to French (voice id is best-effort)
389
933
  """
934
+ if not self.voice_manager:
935
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
936
+ return
937
+
390
938
  if not args:
391
939
  # Show all available voices with metadata
392
940
  print(f"\n{Colors.CYAN}Available Voice Models:{Colors.END}")
@@ -398,7 +946,7 @@ class VoiceREPL(cmd.Cmd):
398
946
  # Get language name
399
947
  lang_names = {
400
948
  'en': 'English', 'fr': 'French', 'es': 'Spanish',
401
- 'de': 'German', 'it': 'Italian'
949
+ 'de': 'German', 'ru': 'Russian', 'zh': 'Chinese'
402
950
  }
403
951
  lang_name = lang_names.get(language, language.upper())
404
952
 
@@ -406,24 +954,22 @@ class VoiceREPL(cmd.Cmd):
406
954
 
407
955
  for voice_id, voice_info in voices.items():
408
956
  cached_icon = "✅" if voice_info.get('cached', False) else "📥"
409
- quality_icon = "✨" if voice_info['quality'] == 'excellent' else "🔧"
410
- size_text = f"{voice_info['size_mb']}MB"
957
+ quality_icon = "🔧"
958
+ size_text = f"{voice_info.get('size_mb', 0)}MB"
411
959
 
412
960
  print(f" {cached_icon} {quality_icon} {language}.{voice_id}")
413
961
  print(f" {voice_info['name']} ({size_text})")
414
962
  print(f" {voice_info['description']}")
415
- if voice_info.get('requires_espeak', False):
416
- print(f" ⚠️ Requires espeak-ng")
963
+ # Piper has no system deps.
417
964
 
418
965
  print(f"\n{Colors.YELLOW}Usage:{Colors.END}")
419
966
  print(" /setvoice <language>.<voice_id>")
420
- print(" Example: /setvoice fr.css10_vits")
421
- print("\n📥 = Download needed ✅ = Ready ✨ = High quality 🔧 = Good quality")
967
+ print(" Example: /setvoice fr.siwis")
968
+ print("\n📥 = Download needed ✅ = Ready")
422
969
 
423
970
  except Exception as e:
424
971
  print(f"❌ Error listing models: {e}")
425
- # Fallback to old method
426
- self.voice_manager.list_voices()
972
+ print(" (No fallback available)")
427
973
  return
428
974
 
429
975
  voice_spec = args.strip()
@@ -451,39 +997,28 @@ class VoiceREPL(cmd.Cmd):
451
997
  # Download and set the specific voice using programmatic API
452
998
  try:
453
999
  print(f"🔄 Setting voice {voice_spec}...")
454
-
455
- # Use the programmatic download API
456
- success = self.voice_manager.download_model(voice_spec)
1000
+ success = self.voice_manager.set_voice(language, voice_id)
457
1001
 
458
1002
  if success:
459
- # Now set the language to match
460
- success = self.voice_manager.set_language(language)
461
-
462
- if success:
463
- # Update current language
464
- self.current_language = language
465
-
466
- print(f"✅ Voice set to {voice_spec}")
467
-
468
- # Test the voice
469
- test_messages = {
470
- 'en': 'Voice changed to English.',
471
- 'fr': 'Voix changée en français.',
472
- 'es': 'Voz cambiada al español.',
473
- 'de': 'Stimme auf Deutsch geändert.',
474
- 'it': 'Voce cambiata in italiano.'
475
- }
476
- test_msg = test_messages.get(language, f'Voice changed to {language}.')
477
- self.voice_manager.speak(test_msg)
1003
+ self.current_language = language
1004
+ print(f"✅ Voice set to {voice_spec}")
478
1005
 
479
- # Restart voice mode if it was active
480
- if was_active:
481
- self.do_voice(self.voice_mode)
482
- else:
483
- print(f"❌ Failed to set language: {language}")
1006
+ test_messages = {
1007
+ 'en': 'Voice changed to English.',
1008
+ 'fr': 'Voix changée en français.',
1009
+ 'es': 'Voz cambiada al español.',
1010
+ 'de': 'Stimme auf Deutsch geändert.',
1011
+ 'ru': 'Голос изменён на русский.',
1012
+ 'zh': '语音已切换到中文。'
1013
+ }
1014
+ test_msg = test_messages.get(language, f'Voice changed to {language}.')
1015
+ if getattr(self, "use_tts", True):
1016
+ self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
1017
+
1018
+ if was_active:
1019
+ self.do_voice(self.voice_mode)
484
1020
  else:
485
- print(f"❌ Failed to download voice: {voice_spec}")
486
- print(" Check your internet connection or try a different voice")
1021
+ print(f"❌ Failed to set voice: {voice_spec}")
487
1022
 
488
1023
  except Exception as e:
489
1024
  print(f"❌ Error setting voice: {e}")
@@ -521,185 +1056,1736 @@ class VoiceREPL(cmd.Cmd):
521
1056
  off - Disable voice input
522
1057
  full - Continuous listening, interrupts TTS on speech detection
523
1058
  wait - Pause listening while TTS is speaking (recommended)
524
- stop - Only stops TTS on 'stop' keyword (planned)
525
- ptt - Push-to-talk mode (planned)
1059
+ stop - Keep listening while speaking, but only stop TTS on stop phrase
1060
+ ptt - Push-to-talk (use /ptt to record one utterance)
526
1061
  """
527
- arg = arg.lower().strip()
1062
+ arg = (arg or "").lower().strip()
528
1063
 
529
1064
  # Handle legacy "on" argument
530
1065
  if arg == "on":
531
1066
  arg = "wait"
532
1067
 
533
1068
  if arg in ["off", "full", "wait", "stop", "ptt"]:
534
- # If switching from one mode to another, stop current mode first
535
- if self.voice_mode_active and arg != "off":
536
- self._voice_stop_callback()
537
-
1069
+ if not self.voice_manager:
1070
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
1071
+ return
1072
+
1073
+ # Exit PTT session if running.
1074
+ if self._ptt_session_active:
1075
+ self._ptt_session_active = False
1076
+ self._ptt_recording = False
1077
+ self._ptt_busy = False
1078
+
1079
+ # Stop any ongoing mic session.
1080
+ try:
1081
+ self.voice_manager.stop_listening()
1082
+ except Exception:
1083
+ pass
1084
+ self.voice_mode_active = False
1085
+
538
1086
  self.voice_mode = arg
539
1087
  self.voice_manager.set_voice_mode(arg)
540
-
1088
+
541
1089
  if arg == "off":
542
- if self.voice_mode_active:
543
- self._voice_stop_callback()
544
- else:
545
- # Start voice recognition for non-off modes
546
- self.voice_mode_active = True
547
-
548
- # Start listening with callbacks
1090
+ print("Voice mode disabled.")
1091
+ return
1092
+
1093
+ if arg == "ptt":
1094
+ # PTT is a dedicated session: no text entry.
1095
+ print("Voice mode: PTT - Push-to-talk (no typing).")
1096
+ print("SPACE: start/stop recording (transcribe on stop)")
1097
+ print("ESC: exit PTT mode")
1098
+ self._run_ptt_session()
1099
+ return
1100
+
1101
+ # Continuous listening modes.
1102
+ try:
549
1103
  self.voice_manager.listen(
550
1104
  on_transcription=self._voice_callback,
551
- on_stop=lambda: self._voice_stop_callback()
1105
+ # Stop phrase interrupts TTS; keep listening.
1106
+ on_stop=lambda: (
1107
+ print("\n⏹️ Stopped speaking.\n") if (self.voice_manager and self.voice_manager.is_speaking()) else None
1108
+ ),
552
1109
  )
553
-
554
- # Print mode-specific instructions
555
- if arg == "full":
556
- print("Voice mode: FULL - Continuous listening, interrupts TTS on speech.")
557
- print("Say 'stop' to exit.")
558
- elif arg == "wait":
559
- print("Voice mode: WAIT - Pauses listening while speaking (recommended).")
560
- print("Say 'stop' to exit.")
561
- elif arg == "stop":
562
- print("Voice mode: STOP (Planned) - Only stops TTS on 'stop' keyword.")
563
- print("Currently same as WAIT mode.")
564
- elif arg == "ptt":
565
- print("Voice mode: PTT (Planned) - Push-to-talk functionality.")
566
- print("Currently same as WAIT mode.")
1110
+ self.voice_mode_active = True
1111
+ except Exception as e:
1112
+ self.voice_mode_active = False
1113
+ self.voice_mode = "off"
1114
+ print(f" Failed to start microphone listening: {e}")
1115
+ print(" Tip: check microphone permissions/device availability.")
1116
+ return
1117
+
1118
+ if arg == "wait":
1119
+ print("Voice mode: WAIT - Listens continuously except while speaking.")
1120
+ print("Use /voice off to disable.")
1121
+ elif arg == "stop":
1122
+ print("Voice mode: STOP - Always listens; stop phrase stops TTS.")
1123
+ print("Use /voice off to disable.")
1124
+ elif arg == "full":
1125
+ print("Voice mode: FULL - Interrupts TTS on any speech (best with AEC/headset).")
1126
+ print("Use /voice off to disable.")
567
1127
  else:
568
1128
  print("Usage: /voice off | full | wait | stop | ptt")
569
1129
  print(" off - Disable voice input")
570
1130
  print(" full - Continuous listening, interrupts TTS on speech")
571
- print(" wait - Pause listening while speaking (recommended)")
572
- print(" stop - Only stop TTS on 'stop' keyword (planned)")
573
- print(" ptt - Push-to-talk mode (planned)")
574
-
575
- def _voice_callback(self, text):
576
- """Callback for voice recognition."""
577
- # Print what the user said
578
- print(f"\n> {text}")
579
-
580
- # Check if the user said 'stop' to exit voice mode
581
- if text.lower() == "stop":
582
- self._voice_stop_callback()
583
- # Don't process "stop" as a query
1131
+ print(" wait - Listen except while speaking")
1132
+ print(" stop - Always listen; stop phrase stops TTS")
1133
+ print(" ptt - Push-to-talk (no typing; SPACE triggers capture)")
1134
+
1135
+ def do_ptt(self, arg):
1136
+ """Push-to-talk: record a single utterance, then process it.
1137
+
1138
+ Usage:
1139
+ /ptt
1140
+ """
1141
+ if not self.voice_manager:
1142
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
584
1143
  return
585
-
586
- # Mode-specific handling
587
- if self.voice_mode == "stop":
588
- # In 'stop' mode, don't interrupt TTS - just queue the message
589
- # But since we're in callback, TTS interrupt is already paused
590
- pass
591
- elif self.voice_mode == "ptt":
592
- # In PTT mode, process immediately
1144
+ print("❌ /ptt is deprecated. Use: /voice ptt (then SPACE)")
1145
+ return
1146
+
1147
+ # Ensure we are not already listening.
1148
+ try:
1149
+ self.voice_manager.stop_listening()
1150
+ except Exception:
593
1151
  pass
594
- # 'full' mode has default behavior
595
-
596
- # Process the user's query
597
- self.process_query(text)
598
-
599
- def _voice_stop_callback(self):
600
- """Callback when voice mode is stopped."""
601
- self.voice_mode = "off"
602
- self.voice_mode_active = False
603
- self.voice_manager.stop_listening()
604
- print("Voice mode disabled.")
605
-
606
- def do_tts(self, arg):
607
- """Toggle text-to-speech."""
608
- arg = arg.lower().strip()
609
-
610
- if arg == "on":
611
- self.use_tts = True
612
- print("TTS enabled" if self.debug_mode else "")
613
- elif arg == "off":
614
- self.use_tts = False
615
- print("TTS disabled" if self.debug_mode else "")
616
- else:
617
- print("Usage: /tts on | off")
618
-
619
- def do_speed(self, arg):
620
- """Set the TTS speed multiplier."""
621
- if not arg.strip():
622
- print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
1152
+
1153
+ return
1154
+
1155
+ def _run_ptt_session(self) -> None:
1156
+ """PTT mode key loop (no typing).
1157
+
1158
+ Clean semantics:
1159
+ - SPACE toggles recording (start/stop)
1160
+ - on stop: transcribe immediately and send to the LLM
1161
+ - ESC exits PTT mode (returns to STOP mode)
1162
+
1163
+ This avoids relying on VAD end-of-utterance, which is fragile when speaker
1164
+ echo is present (common on laptop speakers).
1165
+ """
1166
+ if not self.voice_manager:
623
1167
  return
624
-
1168
+ self._ptt_session_active = True
1169
+ self._ptt_recording = False
1170
+ self._ptt_busy = False
1171
+
1172
+ # Lazy imports: keep REPL startup snappy.
1173
+ import io
1174
+ import wave
1175
+
625
1176
  try:
626
- speed = float(arg.strip())
1177
+ import sounddevice as sd
1178
+ except Exception as e:
1179
+ print(f"❌ PTT requires sounddevice: {e}")
1180
+ self._ptt_session_active = False
1181
+ return
1182
+
1183
+ sr = 16000
1184
+ frames: list[bytes] = []
1185
+ stream = {"obj": None}
1186
+ cols = 80
1187
+ try:
1188
+ cols = int(shutil.get_terminal_size((80, 20)).columns)
1189
+ except Exception:
1190
+ cols = 80
1191
+
1192
+ def _clear_status() -> None:
1193
+ try:
1194
+ sys.stdout.write("\r" + (" " * max(10, cols - 1)) + "\r")
1195
+ sys.stdout.flush()
1196
+ except Exception:
1197
+ pass
1198
+
1199
+ def _status_line(msg: str) -> None:
1200
+ # Render on a single line (no newline) so SPACE can be pressed repeatedly.
1201
+ try:
1202
+ _clear_status()
1203
+ sys.stdout.write(str(msg)[: max(0, cols - 1)])
1204
+ sys.stdout.flush()
1205
+ except Exception:
1206
+ pass
1207
+
1208
+ def _println(msg: str = "") -> None:
1209
+ # When in raw terminal mode, '\n' does NOT reliably return to column 0.
1210
+ # Use CRLF explicitly to prevent "diagonal drifting" rendering.
1211
+ try:
1212
+ _clear_status()
1213
+ sys.stdout.write("\r\n" + str(msg) + "\r\n")
1214
+ sys.stdout.flush()
1215
+ except Exception:
1216
+ pass
1217
+
1218
+ def _start_recording() -> None:
1219
+ nonlocal frames
1220
+ if self._ptt_recording:
1221
+ return
1222
+ if self._ptt_busy:
1223
+ return
1224
+ frames = []
1225
+
1226
+ # Interrupt any speech immediately.
1227
+ try:
1228
+ self.voice_manager.stop_speaking()
1229
+ except Exception:
1230
+ pass
1231
+
1232
+ def _cb(indata, _frames, _time, status):
1233
+ if status and self.debug_mode:
1234
+ pass
1235
+ try:
1236
+ frames.append(indata.copy().tobytes())
1237
+ except Exception:
1238
+ pass
1239
+
1240
+ try:
1241
+ stream["obj"] = sd.InputStream(
1242
+ samplerate=sr,
1243
+ channels=1,
1244
+ dtype="int16",
1245
+ callback=_cb,
1246
+ blocksize=int(sr * 0.03),
1247
+ )
1248
+ stream["obj"].start()
1249
+ self._ptt_recording = True
1250
+ _status_line("🎙️ Recording… (SPACE to send, ESC to exit)")
1251
+ except Exception as e:
1252
+ self._ptt_recording = False
1253
+ stream["obj"] = None
1254
+ _clear_status()
1255
+ _println(f"❌ Failed to start microphone stream: {e}")
1256
+
1257
+ def _stop_recording_and_send() -> None:
1258
+ if not self._ptt_recording:
1259
+ return
1260
+ self._ptt_recording = False
1261
+ _clear_status()
1262
+
1263
+ try:
1264
+ if stream["obj"] is not None:
1265
+ try:
1266
+ stream["obj"].stop()
1267
+ except Exception:
1268
+ pass
1269
+ try:
1270
+ stream["obj"].close()
1271
+ except Exception:
1272
+ pass
1273
+ finally:
1274
+ stream["obj"] = None
1275
+
1276
+ pcm = b"".join(frames)
1277
+ if len(pcm) < int(sr * 0.25) * 2:
1278
+ _println("…(too short, try again)")
1279
+ return
1280
+
1281
+ buf = io.BytesIO()
1282
+ with wave.open(buf, "wb") as w:
1283
+ w.setnchannels(1)
1284
+ w.setsampwidth(2)
1285
+ w.setframerate(sr)
1286
+ w.writeframes(pcm)
1287
+ wav_bytes = buf.getvalue()
1288
+
1289
+ self._ptt_busy = True
1290
+ try:
1291
+ audio_s = 0.0
1292
+ try:
1293
+ if sr and sr > 0:
1294
+ audio_s = float(len(pcm)) / float(int(sr) * 2)
1295
+ except Exception:
1296
+ audio_s = 0.0
1297
+
1298
+ t0 = time.monotonic()
1299
+ text = (self.voice_manager.transcribe_from_bytes(wav_bytes, language=self.current_language) or "").strip()
1300
+ t1 = time.monotonic()
1301
+ stt_s = float(t1 - t0)
1302
+ self._pending_stt_metrics = {
1303
+ "stt_s": stt_s,
1304
+ "audio_s": float(audio_s),
1305
+ "rtf": (stt_s / float(audio_s)) if audio_s else None,
1306
+ "sample_rate": int(sr),
1307
+ "chunks": None,
1308
+ "chunk_ms": None,
1309
+ "profile": "ptt",
1310
+ "ts": time.time(),
1311
+ }
1312
+ except Exception as e:
1313
+ self._ptt_busy = False
1314
+ _println(f"❌ Transcription failed: {e}")
1315
+ return
1316
+ self._ptt_busy = False
1317
+
1318
+ if not text:
1319
+ _println("…(no transcription)")
1320
+ return
1321
+
1322
+ _println(f"> {text}")
1323
+ self.process_query(text)
1324
+
1325
+ # Platform key read.
1326
+ import sys
1327
+ if sys.platform == "win32":
1328
+ import msvcrt
1329
+
1330
+ while self._ptt_session_active:
1331
+ ch = msvcrt.getwch()
1332
+ if ch == "\x1b": # ESC
1333
+ break
1334
+ if self._ptt_busy:
1335
+ continue
1336
+ if ch == " ":
1337
+ if not self._ptt_recording:
1338
+ _start_recording()
1339
+ else:
1340
+ _stop_recording_and_send()
1341
+ else:
1342
+ import termios
1343
+ import tty
1344
+
1345
+ fd = sys.stdin.fileno()
1346
+ old = termios.tcgetattr(fd)
1347
+ try:
1348
+ tty.setraw(fd)
1349
+
1350
+ def _run_in_cooked(block):
1351
+ """Run a block with normal tty settings.
1352
+
1353
+ In raw mode, many terminals treat '\n' as LF without CR, so prints from
1354
+ deeper code paths (LLM responses) can drift/indent. We temporarily
1355
+ restore the terminal mode to keep output rendering stable.
1356
+ """
1357
+ try:
1358
+ termios.tcsetattr(fd, termios.TCSADRAIN, old)
1359
+ except Exception:
1360
+ pass
1361
+ try:
1362
+ block()
1363
+ finally:
1364
+ try:
1365
+ tty.setraw(fd)
1366
+ except Exception:
1367
+ pass
1368
+
1369
+ while self._ptt_session_active:
1370
+ ch = sys.stdin.read(1)
1371
+ if ch == "\x1b": # ESC
1372
+ break
1373
+ if self._ptt_busy:
1374
+ continue
1375
+ if ch == " ":
1376
+ if not self._ptt_recording:
1377
+ _start_recording()
1378
+ else:
1379
+ _run_in_cooked(_stop_recording_and_send)
1380
+ finally:
1381
+ termios.tcsetattr(fd, termios.TCSADRAIN, old)
1382
+
1383
+ self._ptt_session_active = False
1384
+ self._ptt_recording = False
1385
+ self._ptt_busy = False
1386
+ try:
1387
+ if stream["obj"] is not None:
1388
+ stream["obj"].stop()
1389
+ stream["obj"].close()
1390
+ except Exception:
1391
+ pass
1392
+ _clear_status()
1393
+ # Ensure we end on a clean line before restoring other modes.
1394
+ try:
1395
+ sys.stdout.write("\r\n")
1396
+ sys.stdout.flush()
1397
+ except Exception:
1398
+ pass
1399
+ # Restore to STOP after exiting PTT.
1400
+ try:
1401
+ self.do_voice("stop")
1402
+ except Exception:
1403
+ pass
1404
+
1405
+ def _voice_callback(self, text):
1406
+ """Callback for voice recognition."""
1407
+ # Capture best-effort STT metrics from the recognizer (for verbose stats).
1408
+ stt_metrics = None
1409
+ try:
1410
+ vm = self.voice_manager
1411
+ rec = getattr(vm, "voice_recognizer", None) if vm else None
1412
+ if rec is not None and hasattr(rec, "pop_last_stt_metrics"):
1413
+ stt_metrics = rec.pop_last_stt_metrics()
1414
+ except Exception:
1415
+ stt_metrics = None
1416
+ self._pending_stt_metrics = stt_metrics
1417
+
1418
+ # Print what the user said
1419
+ print(f"\n> {text}")
1420
+ # NOTE: stop phrases are handled by the stop_callback path (interrupt TTS).
1421
+ # We do not use "stop" to exit voice mode; use /voice off explicitly.
1422
+
1423
+ # Mode-specific handling
1424
+ if self.voice_mode == "stop":
1425
+ # In 'stop' mode, don't interrupt TTS - just queue the message
1426
+ # But since we're in callback, TTS interrupt is already paused
1427
+ pass
1428
+ elif self.voice_mode == "ptt":
1429
+ # In PTT mode, process immediately
1430
+ pass
1431
+ # 'full' mode has default behavior
1432
+
1433
+ # Process the user's query
1434
+ self.process_query(text)
1435
+
1436
+ def _voice_stop_callback(self):
1437
+ """Callback when voice mode is stopped."""
1438
+ self.voice_mode = "off"
1439
+ self.voice_mode_active = False
1440
+ self.voice_manager.stop_listening()
1441
+ print("Voice mode disabled.")
1442
+
1443
+ def do_tts(self, arg):
1444
+ """Toggle text-to-speech."""
1445
+ arg = arg.lower().strip()
1446
+
1447
+ if arg == "on":
1448
+ self.use_tts = True
1449
+ if self.voice_manager is None:
1450
+ # Re-enable voice features (TTS/STT) by creating a VoiceManager.
1451
+ self.voice_manager = VoiceManager(
1452
+ language=self.current_language,
1453
+ tts_model=self._initial_tts_model,
1454
+ debug_mode=self.debug_mode,
1455
+ allow_downloads=False,
1456
+ cloned_tts_streaming=False,
1457
+ cloning_engine=self.cloning_engine,
1458
+ )
1459
+ print("TTS enabled" if self.debug_mode else "")
1460
+ elif arg == "off":
1461
+ self.use_tts = False
1462
+ print("TTS disabled" if self.debug_mode else "")
1463
+ else:
1464
+ print("Usage: /tts on | off")
1465
+
1466
+ def do_speed(self, arg):
1467
+ """Set the TTS speed multiplier."""
1468
+ if not self.voice_manager:
1469
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1470
+ return
1471
+ if not arg.strip():
1472
+ print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
1473
+ return
1474
+
1475
+ try:
1476
+ speed = float(arg.strip())
627
1477
  if 0.5 <= speed <= 2.0:
628
1478
  self.voice_manager.set_speed(speed)
629
1479
  print(f"TTS speed set to {speed}x")
630
1480
  else:
631
- print("Speed should be between 0.5 and 2.0")
632
- except ValueError:
633
- print("Usage: /speed <number> (e.g., /speed 1.5)")
634
-
635
- def do_tts_model(self, arg):
636
- """Change TTS model.
637
-
638
- Available models (quality ranking):
639
- vits - BEST quality (requires espeak-ng)
640
- fast_pitch - Good quality (works everywhere)
641
- glow-tts - Alternative fallback
642
- tacotron2-DDC - Legacy
643
-
1481
+ print("Speed should be between 0.5 and 2.0")
1482
+ except ValueError:
1483
+ print("Usage: /speed <number> (e.g., /speed 1.5)")
1484
+
1485
+ def do_tts_model(self, arg):
1486
+ """Deprecated: legacy TTS model switching.
1487
+
1488
+ AbstractVoice core is Piper-first; use `/setvoice` (Piper voices) or cloned voices.
1489
+ """
1490
+ print("❌ /tts_model is not supported (Piper-first core).")
1491
+ print(" Use /setvoice for Piper voices, or /tts_voice clone <id> for cloned voices.")
1492
+
1493
+ def do_whisper(self, arg):
1494
+ """Change Whisper model."""
1495
+ if not self.voice_manager:
1496
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
1497
+ return
1498
+ model = arg.strip()
1499
+ if not model:
1500
+ print(f"Current Whisper model: {self.voice_manager.get_whisper()}")
1501
+ return
1502
+
1503
+ self.voice_manager.set_whisper(model)
1504
+
1505
+ def do_speak(self, arg):
1506
+ """Speak a text immediately (without calling the LLM).
1507
+
1508
+ Usage:
1509
+ /speak Hello world
1510
+ """
1511
+ if not self.voice_manager:
1512
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1513
+ return
1514
+
1515
+ text = arg.strip()
1516
+ if not text:
1517
+ print("Usage: /speak <text>")
1518
+ return
1519
+
1520
+ try:
1521
+ self._speak_with_spinner_until_audio_starts(text)
1522
+ if self.verbose_mode:
1523
+ out_words = self._count_words(text)
1524
+ out_tokens = None
1525
+ try:
1526
+ enc = self._get_tiktoken_encoding()
1527
+ if enc is not None:
1528
+ out_tokens = int(len(enc.encode(str(text or ""))))
1529
+ except Exception:
1530
+ out_tokens = None
1531
+
1532
+ tts_metrics = None
1533
+ try:
1534
+ if hasattr(self.voice_manager, "pop_last_tts_metrics"):
1535
+ tts_metrics = self.voice_manager.pop_last_tts_metrics()
1536
+ except Exception:
1537
+ tts_metrics = None
1538
+
1539
+ turn = {
1540
+ "stt": None,
1541
+ "llm": {},
1542
+ "counts": {
1543
+ "in_words": 0,
1544
+ "out_words": int(out_words),
1545
+ "in_tokens": None,
1546
+ "out_tokens": out_tokens,
1547
+ },
1548
+ "tts": tts_metrics,
1549
+ }
1550
+ self._print_verbose_turn_stats(turn)
1551
+ except Exception as e:
1552
+ print(f"❌ Speak failed: {e}")
1553
+ if self.debug_mode:
1554
+ import traceback
1555
+ traceback.print_exc()
1556
+
1557
+ def _speak_with_spinner_until_audio_starts(self, text: str) -> None:
1558
+ """REPL UX: show spinner while waiting for first audio, then stop.
1559
+
1560
+ This avoids corrupting the `cmd` prompt while still giving feedback during
1561
+ long cloned-TTS synthesis. Once playback starts, the prompt is displayed
1562
+ normally so the user can interrupt anytime by typing.
1563
+ """
1564
+ if not self.voice_manager:
1565
+ return
1566
+
1567
+ # LLM output often contains Markdown. Strip the most common formatting
1568
+ # tokens so TTS stays natural (do not change what is printed).
1569
+ speak_text = sanitize_markdown_for_speech(text)
1570
+
1571
+ is_clone = bool(self.current_tts_voice)
1572
+ if not is_clone:
1573
+ # Offline-first: Piper voices must be explicitly cached. Provide a clear
1574
+ # message instead of hanging on implicit downloads.
1575
+ try:
1576
+ a = getattr(self.voice_manager, "tts_adapter", None)
1577
+ if a is not None and hasattr(a, "is_available") and not bool(a.is_available()):
1578
+ lang = str(getattr(self, "current_language", "en") or "en").strip().lower()
1579
+ raise RuntimeError(
1580
+ f"Piper voice model for '{lang}' is not available locally.\n"
1581
+ f"Run: python -m abstractvoice download --piper {lang}"
1582
+ )
1583
+ except RuntimeError:
1584
+ raise
1585
+ except Exception:
1586
+ pass
1587
+ ind = self._busy_indicator(enabled=is_clone)
1588
+ try:
1589
+ if is_clone:
1590
+ ind.start()
1591
+ self.voice_manager.speak(speak_text, voice=self.current_tts_voice)
1592
+
1593
+ if not is_clone:
1594
+ return
1595
+
1596
+ # Wait until audio playback actually starts (or synthesis ends without audio).
1597
+ vm = self.voice_manager
1598
+ while True:
1599
+ try:
1600
+ playing = bool(vm.is_speaking())
1601
+ synth_active = bool(
1602
+ getattr(vm, "_cloned_synthesis_active", None) and vm._cloned_synthesis_active.is_set()
1603
+ )
1604
+ except Exception:
1605
+ playing, synth_active = False, False
1606
+
1607
+ if playing:
1608
+ break
1609
+
1610
+ # If synthesis is no longer active and we aren't playing, stop the spinner
1611
+ # (either done very quickly or failed).
1612
+ if not synth_active:
1613
+ break
1614
+
1615
+ time.sleep(0.05)
1616
+ finally:
1617
+ try:
1618
+ ind.stop()
1619
+ except Exception:
1620
+ pass
1621
+ # If ASR auto-generated the clone's reference_text, print an easy override command
1622
+ # (once). We do this after stopping the spinner to avoid corrupting the prompt line.
1623
+ try:
1624
+ if is_clone and self.current_tts_voice:
1625
+ self._maybe_print_asr_ref_text_override(self.current_tts_voice)
1626
+ except Exception:
1627
+ pass
1628
+ # Do not print the prompt manually: `cmd` will render it on return,
1629
+ # and printing here can result in duplicate prompts (`> >`).
1630
+
1631
+ def _maybe_print_asr_ref_text_override(self, voice_id: str) -> None:
1632
+ """If `reference_text` was auto-generated via ASR, print a paste-ready override hint.
1633
+
1634
+ Important: `/clone_set_ref_text` uses a simple `split(maxsplit=1)`, so quoting is not
1635
+ interpreted. We therefore print the command *without* quotes to avoid storing them.
1636
+ """
1637
+ if not self.voice_manager:
1638
+ return
1639
+ vid = str(voice_id or "").strip()
1640
+ if not vid:
1641
+ return
1642
+ if vid in self._printed_asr_ref_text_hint:
1643
+ return
1644
+ try:
1645
+ info = self.voice_manager.get_cloned_voice(vid) or {}
1646
+ except Exception:
1647
+ return
1648
+ meta = info.get("meta") or {}
1649
+ src = str(meta.get("reference_text_source") or "").strip().lower()
1650
+ ref_text = str(info.get("reference_text") or "").strip()
1651
+ if not ref_text:
1652
+ return
1653
+ if src != "asr":
1654
+ return
1655
+
1656
+ # Mark first so any printing errors won't cause repeated spam.
1657
+ self._printed_asr_ref_text_hint.add(vid)
1658
+
1659
+ prefix = vid[:8] if len(vid) >= 8 else vid
1660
+ name = str(info.get("name") or "").strip()
1661
+ label = f"{name} ({prefix})" if name else prefix
1662
+ print("ℹ️ Auto-generated reference transcript (ASR).")
1663
+ print(f" Voice: {label}")
1664
+ print(" If you want to correct it, copy/paste and edit the text after the id:")
1665
+ print(f" /clone_set_ref_text {prefix} {ref_text}")
1666
+
1667
+ class _busy_indicator:
1668
+ """A minimal, discreet spinner (no extra lines)."""
1669
+
1670
+ def __init__(self, enabled: bool = False):
1671
+ self.enabled = bool(enabled)
1672
+ self._stop = threading.Event()
1673
+ self._thread = None
1674
+
1675
+ def start(self):
1676
+ if not self.enabled:
1677
+ return
1678
+ if self._thread and self._thread.is_alive():
1679
+ return
1680
+
1681
+ def _run():
1682
+ frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
1683
+ i = 0
1684
+ t0 = time.time()
1685
+ # Small delay so fast operations don't flash.
1686
+ time.sleep(0.25)
1687
+ if self._stop.is_set():
1688
+ return
1689
+ # Hide cursor for a cleaner look.
1690
+ try:
1691
+ sys.stdout.write("\033[?25l")
1692
+ sys.stdout.flush()
1693
+ except Exception:
1694
+ pass
1695
+ while not self._stop.is_set():
1696
+ elapsed = time.time() - t0
1697
+ sys.stdout.write(f"\r(synthesizing {elapsed:0.1f}s) {frames[i % len(frames)]}")
1698
+ sys.stdout.flush()
1699
+ i += 1
1700
+ time.sleep(0.1)
1701
+
1702
+ self._thread = threading.Thread(target=_run, daemon=True)
1703
+ self._thread.start()
1704
+
1705
+ def stop(self):
1706
+ if not self.enabled:
1707
+ return
1708
+ self._stop.set()
1709
+ try:
1710
+ if self._thread:
1711
+ self._thread.join(timeout=0.5)
1712
+ except Exception:
1713
+ pass
1714
+ # Clear spinner line.
1715
+ try:
1716
+ # `\033[2K` clears the entire line (more robust than fixed spaces).
1717
+ sys.stdout.write("\r\033[2K\r")
1718
+ # Restore cursor.
1719
+ sys.stdout.write("\033[?25h")
1720
+ sys.stdout.flush()
1721
+ except Exception:
1722
+ pass
1723
+
1724
+ def __enter__(self):
1725
+ self.start()
1726
+ return self
1727
+
1728
+ def __exit__(self, exc_type, exc, tb):
1729
+ self.stop()
1730
+ return False
1731
+
1732
+ # NOTE: We intentionally do not keep a background spinner running while the REPL
1733
+ # is waiting for user input (it corrupts the prompt line). Instead, we show a
1734
+ # spinner only until the first audio actually starts, then stop it so the prompt
1735
+ # stays usable for interruption-by-typing.
1736
+
1737
+ def do_clones(self, arg):
1738
+ """List cloned voices in the local store."""
1739
+ if not self.voice_manager:
1740
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1741
+ return
1742
+ try:
1743
+ voices = self.voice_manager.list_cloned_voices()
1744
+ if not voices:
1745
+ print("No cloned voices yet. Use /clone <path> or /clone-my-voice.")
1746
+ return
1747
+ print(f"\n{Colors.CYAN}Cloned voices:{Colors.END}")
1748
+ for v in voices:
1749
+ vid = v.get("voice_id") or v.get("voice", "")
1750
+ name = v.get("name", "")
1751
+ eng = (v.get("engine") or "").strip()
1752
+ eng_txt = f" [{eng}]" if eng else ""
1753
+ src = (v.get("meta") or {}).get("reference_text_source", "")
1754
+ src_txt = f" [{src}]" if src else ""
1755
+ current = " (current)" if self.current_tts_voice == vid else ""
1756
+ print(f" - {name}: {vid}{eng_txt}{src_txt}{current}")
1757
+ print("Tip: /clone_rm <id-or-name> deletes one; /clone_rm_all --yes deletes all.")
1758
+ except Exception as e:
1759
+ print(f"❌ Error listing cloned voices: {e}")
1760
+
1761
+ def _resolve_clone_id(self, wanted: str) -> str | None:
1762
+ voices = self.voice_manager.list_cloned_voices()
1763
+ for v in voices:
1764
+ vid = v.get("voice_id") or ""
1765
+ name = v.get("name") or ""
1766
+ if wanted == vid or vid.startswith(wanted) or wanted == name:
1767
+ return vid
1768
+ return None
1769
+
1770
+ def _resolve_clone_id_by_source(self, source: str, *, engine: str | None = None) -> str | None:
1771
+ """Find a cloned voice by its stored meta.source (best-effort)."""
1772
+ if not self.voice_manager:
1773
+ return None
1774
+
1775
+ try:
1776
+ from pathlib import Path
1777
+
1778
+ target = Path(str(source)).expanduser()
1779
+ try:
1780
+ target_norm = str(target.resolve())
1781
+ except Exception:
1782
+ target_norm = str(target)
1783
+ except Exception:
1784
+ target_norm = str(source)
1785
+
1786
+ try:
1787
+ voices = self.voice_manager.list_cloned_voices()
1788
+ except Exception:
1789
+ return None
1790
+
1791
+ wanted_engine = (str(engine).strip().lower() if engine else None) or None
1792
+ for v in voices:
1793
+ meta = v.get("meta") or {}
1794
+ src = meta.get("source")
1795
+ if not src:
1796
+ continue
1797
+ try:
1798
+ from pathlib import Path
1799
+
1800
+ p = Path(str(src)).expanduser()
1801
+ try:
1802
+ src_norm = str(p.resolve())
1803
+ except Exception:
1804
+ src_norm = str(p)
1805
+ except Exception:
1806
+ src_norm = str(src)
1807
+
1808
+ if src_norm != target_norm:
1809
+ continue
1810
+ if wanted_engine and (str(v.get("engine") or "").strip().lower() != wanted_engine):
1811
+ continue
1812
+ return str(v.get("voice_id") or "").strip() or None
1813
+ return None
1814
+
1815
def do_clone_info(self, arg):
    """Print the stored details of one cloned voice.

    Usage:
    /clone_info <id-or-name>

    Shows id, name, engine, number of reference files, the source of the
    reference text, and the reference text itself (cut to 200 characters).
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_info <id-or-name>")
        return

    voice_id = self._resolve_clone_id(wanted)
    if not voice_id:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    try:
        record = self.voice_manager.get_cloned_voice(voice_id)
        meta = record.get("meta") or {}
        print(f"\n{Colors.CYAN}Cloned voice info:{Colors.END}")
        print(f" id: {record.get('voice_id')}")
        print(f" name: {record.get('name')}")
        print(f" engine: {record.get('engine')}")
        print(f" refs: {len(record.get('reference_files') or [])}")
        print(f" ref_text_source: {meta.get('reference_text_source','')}")
        ref_text = (record.get("reference_text") or "").strip()
        if not ref_text:
            print(" reference_text: (missing)")
        else:
            # Keep the listing compact; /clone_ref prints the full text.
            preview = ref_text if len(ref_text) <= 200 else ref_text[:200] + "…"
            print(f" reference_text: {preview}")
    except Exception as e:
        print(f"❌ Error: {e}")
1849
+
1850
def do_clone_ref(self, arg):
    """Print the full reference_text for a cloned voice.

    Usage:
    /clone_ref <id-or-name>

    The store lookup is guarded so a failing store prints an error
    instead of raising out of the command handler, matching
    ``do_clone_info``.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_ref <id-or-name>")
        return
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    try:
        # Tolerate a lookup that yields None for a vanished record.
        info = self.voice_manager.get_cloned_voice(vid) or {}
        text = (info.get("reference_text") or "").strip()
    except Exception as e:
        print(f"❌ Error: {e}")
        return
    print(text)
1869
+
1870
def do_clone_rename(self, arg):
    """Rename a cloned voice.

    Usage:
    /clone_rename <id-or-name> <new_name>

    The first whitespace-separated token selects the voice; everything
    after it becomes the new name. Store errors are reported instead of
    raising out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_rename <id-or-name> <new_name>")
        return
    wanted, new_name = parts
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    try:
        self.voice_manager.rename_cloned_voice(vid, new_name)
    except Exception as e:
        print(f"❌ Rename failed: {e}")
        return
    print("✅ Renamed.")
1889
+
1890
def do_clone_rm(self, arg):
    """Remove a cloned voice from the store.

    Usage:
    /clone_rm <id-or-name>

    The voice is deleted first and the active selection is cleared only
    after the delete succeeded, so a failed delete no longer drops the
    user's current voice. Store errors are reported instead of raising
    out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_rm <id-or-name>")
        return
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    try:
        self.voice_manager.delete_cloned_voice(vid)
    except Exception as e:
        print(f"❌ Failed to delete cloned voice: {e}")
        return
    # If the deleted voice was the selected one, switch back to Piper.
    if self.current_tts_voice == vid:
        self.current_tts_voice = None
    print("✅ Deleted.")
1912
+
1913
def do_clone_rm_all(self, arg):
    """Remove ALL cloned voices from the local store.

    Usage:
    /clone_rm_all --yes

    Without a confirmation token (``--yes``, ``-y`` or ``yes``) only a
    warning with the number of voices is printed. With confirmation,
    every listed voice is deleted best-effort and a summary is printed.

    The active selection is reset to Piper after the deletions, and only
    when the selected voice did not fail to delete. Previously it was
    reset up front, even when listing failed or the selected voice
    survived.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    confirm = (arg or "").strip().lower()
    if confirm not in ("--yes", "-y", "yes"):
        try:
            n = len(self.voice_manager.list_cloned_voices() or [])
        except Exception:
            n = 0
        if n <= 0:
            print("No cloned voices to delete.")
            return
        print(f"⚠️ This will permanently delete {n} cloned voice(s).")
        print("Re-run with: /clone_rm_all --yes")
        return

    try:
        voices = list(self.voice_manager.list_cloned_voices() or [])
    except Exception as e:
        print(f"❌ Error listing cloned voices: {e}")
        return

    deleted = 0
    failed = 0
    failed_ids = set()
    for v in voices:
        vid = str(v.get("voice_id") or v.get("voice") or "").strip()
        if not vid:
            continue
        try:
            self.voice_manager.delete_cloned_voice(vid)
            deleted += 1
        except Exception:
            failed += 1
            failed_ids.add(vid)

    # Fall back to Piper unless the selected voice is still in the store.
    if self.current_tts_voice not in failed_ids:
        self.current_tts_voice = None

    if failed:
        print(f"✅ Deleted {deleted} cloned voice(s). ⚠️ Failed: {failed}")
    else:
        print(f"✅ Deleted {deleted} cloned voice(s).")
1961
+
1962
def do_clone_export(self, arg):
    """Export a cloned voice bundle (.zip).

    Usage:
    /clone_export <id-or-name> <path.zip>

    The first token selects the voice; the rest of the line is the
    destination path. Export errors are reported instead of raising out
    of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_export <id-or-name> <path.zip>")
        return
    wanted, destination = parts
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    try:
        out = self.voice_manager.export_voice(vid, destination)
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return
    print(f"✅ Exported: {out}")
1981
+
1982
def do_clone_import(self, arg):
    """Import a cloned voice bundle (.zip).

    Usage:
    /clone_import <path.zip>

    Import errors (for example an unreadable or malformed bundle) are
    reported instead of raising out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    path = arg.strip()
    if not path:
        print("Usage: /clone_import <path.zip>")
        return
    try:
        vid = self.voice_manager.import_voice(path)
    except Exception as e:
        print(f"❌ Import failed: {e}")
        return
    print(f"✅ Imported as: {vid}")
1997
+
1998
def do_clone(self, arg):
    """Clone a voice from a reference file or folder.

    Usage:
    /clone <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    The argument line is tokenised with ``shlex`` so quoted paths and
    transcripts survive. ``--engine`` and ``--text`` (aliases
    ``--reference-text`` / ``--reference_text``) each consume the next
    token; all other tokens are positional (path, then optional name).

    The "set reference text" tip is printed only when no transcript was
    supplied, matching ``do_clone_use``. Previously it was shown even
    right after the user passed ``--text``.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = "Usage: /clone <path> [name] [--engine f5_tts|chroma] [--text \"...\"]"
    try:
        parts = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    if not parts:
        print(usage)
        return

    engine = None
    reference_text = None
    pos = []
    text_flags = ("--text", "--reference-text", "--reference_text")
    i = 0
    while i < len(parts):
        tok = parts[i]
        if tok == "--engine" or tok in text_flags:
            if i + 1 >= len(parts):
                # An option flag at the end of the line has no value.
                print(usage)
                return
            if tok == "--engine":
                engine = parts[i + 1]
            else:
                reference_text = parts[i + 1]
            i += 2
            continue
        pos.append(tok)
        i += 1

    if not pos:
        print(usage)
        return

    path = pos[0]
    name = pos[1] if len(pos) > 1 else None
    has_reference_text = bool((reference_text or "").strip())
    try:
        t0 = time.monotonic()
        voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine)
        t1 = time.monotonic()

        # Best-effort read-back of what the store recorded for the new voice.
        eng = ""
        ref_src = ""
        try:
            info = self.voice_manager.get_cloned_voice(voice_id) or {}
            eng = str(info.get("engine") or "").strip()
            ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
        except Exception:
            eng = ""
            ref_src = ""

        eng_txt = f" (engine: {eng})" if eng else ""
        print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
        if not has_reference_text:
            print(" Tip: set reference text for best quality:")
            print(" /clone_set_ref_text <id-or-name> \"...\"")
        if not self._is_cloning_runtime_ready(voice_id=voice_id):
            print(" (Cloning runtime not ready yet; run /cloning_status and /cloning_download first.)")
        effective_engine = str(eng or (engine or self.cloning_engine) or "").strip().lower()
        if effective_engine == "chroma" and not has_reference_text:
            print("ℹ️ No reference transcript provided.")
            print(" We will auto-generate it via STT on first speak (offline-first: requires cached STT model).")
            print(" Optional (often best quality): /clone_set_ref_text <id-or-name> \"...\" (or re-run /clone ... --text \"...\")")

        if self.verbose_mode:
            n_files, ref_audio_s = self._summarize_audio_source(path)
            n_txt = str(n_files) if isinstance(n_files, int) else "--"
            src_txt = ref_src or ("manual" if has_reference_text else "--")
            msg = f"CLONE {eng or (engine or self.cloning_engine)} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
            print(f"{Colors.YELLOW}{msg}{Colors.END}")
    except Exception as e:
        print(f"❌ Clone failed: {e}")
2082
+
2083
def do_clone_use(self, arg):
    """Clone a voice (or reuse a matching existing clone) and select it.

    Usage:
    /clone_use <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    Shortcut:
    - Paste a WAV/FLAC/OGG path directly (optionally: `path.wav | transcript`).

    A clone previously created from the same source path and engine is
    reused; otherwise a new one is created. The voice is selected only
    when the cloning runtime for it is ready locally.
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = "Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"]"
    try:
        tokens = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return
    if not tokens:
        print(usage)
        return

    engine = None
    reference_text = None
    positional = []
    text_flags = ("--text", "--reference-text", "--reference_text")
    cursor = 0
    total = len(tokens)
    while cursor < total:
        token = tokens[cursor]
        takes_value = token == "--engine" or token in text_flags
        if not takes_value:
            positional.append(token)
            cursor += 1
            continue
        if cursor + 1 >= total:
            # Option flag without a value.
            print(usage)
            return
        if token == "--engine":
            engine = tokens[cursor + 1]
        else:
            reference_text = tokens[cursor + 1]
        cursor += 2

    if not positional:
        print(usage)
        return

    path = positional[0]
    name = positional[1] if len(positional) > 1 else None
    engine_name = str(engine or self.cloning_engine or "f5_tts").strip().lower()

    # Without an explicit name, derive a stable one from the source path.
    if not name:
        try:
            from pathlib import Path

            source_path = Path(path)
            name = source_path.stem if source_path.is_file() else source_path.name
        except Exception:
            name = None

    # Reuse a prior clone created from the same source path + engine.
    voice_id = self._resolve_clone_id_by_source(path, engine=engine_name)
    if voice_id:
        if not reference_text:
            print("✅ Reusing cloned voice.")
        else:
            try:
                manager.set_cloned_voice_reference_text(voice_id, reference_text)
                print("✅ Reusing cloned voice and updating reference text.")
            except Exception:
                print("✅ Reusing cloned voice.")
    else:
        try:
            started = time.monotonic()
            voice_id = manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine_name)
            finished = time.monotonic()

            # Best-effort read-back of what the store recorded.
            stored_engine = ""
            ref_src = ""
            try:
                record = manager.get_cloned_voice(voice_id) or {}
                stored_engine = str(record.get("engine") or "").strip()
                ref_src = str((record.get("meta") or {}).get("reference_text_source") or "").strip()
            except Exception:
                stored_engine = ""
                ref_src = ""

            engine_suffix = f" (engine: {stored_engine})" if stored_engine else ""
            print(f"✅ Cloned voice created: {voice_id}{engine_suffix}")
            if reference_text:
                print(" (Reference text provided)")
            else:
                print(" Tip: set reference text for best quality:")
                print(" /clone_set_ref_text <id-or-name> \"...\"")
                if str(stored_engine or engine_name or "").strip().lower() == "chroma":
                    print(" ℹ️ No transcript provided; STT auto-fallback runs on first speak (requires cached STT model).")

            if self.verbose_mode:
                n_files, ref_audio_s = self._summarize_audio_source(path)
                n_txt = str(n_files) if isinstance(n_files, int) else "--"
                src_txt = ref_src or ("manual" if (reference_text or "").strip() else "--")
                msg = f"CLONE {stored_engine or engine_name} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(finished - started))}"
                print(f"{Colors.YELLOW}{msg}{Colors.END}")
        except Exception as e:
            print(f"❌ Clone failed: {e}")
            return

    # Select if runtime is ready (no surprise downloads).
    if not self._is_cloning_runtime_ready(voice_id=voice_id):
        print("ℹ️ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    self.current_tts_voice = voice_id
    selected_engine = ""
    try:
        record = self.voice_manager.get_cloned_voice(voice_id) or {}
        selected_engine = str(record.get("engine") or "").strip()
    except Exception:
        selected_engine = ""
    selected_suffix = f" (engine: {selected_engine})" if selected_engine else ""
    print(f"✅ Using cloned voice: {voice_id}{selected_suffix}")
    if selected_engine and str(selected_engine).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {selected_engine}.")
    # Free memory from other cloning engines (important for large backends like Chroma).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(selected_engine or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
2224
+
2225
def do_clone_set_ref_text(self, arg):
    """Set the reference transcript for a cloned voice (quality fix).

    Usage:
    /clone_set_ref_text <id-or-name> <text...>

    The first token selects the voice; the rest of the line is stored as
    its reference text. Voice lookup goes through ``_resolve_clone_id``
    like the other clone commands, instead of a private copy of the same
    matching loop. Lookup errors are reported instead of raising out of
    the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_set_ref_text <id-or-name> <text...>")
        return

    wanted, text = parts
    try:
        match = self._resolve_clone_id(wanted)
    except Exception as e:
        print(f" Failed to update reference text: {e}")
        return
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    try:
        self.voice_manager.set_cloned_voice_reference_text(match, text)
        print(" Updated reference text.")
    except Exception as e:
        print(f" Failed to update reference text: {e}")
2258
+
2259
def do_tts_voice(self, arg):
    """Select which voice is used for speaking.

    Usage:
    /tts_voice piper
    /tts_voice clone <voice_id_or_name>

    With no argument the current selection is printed. For ``clone`` the
    whole remainder of the line is tried as the id-or-name first, so
    names containing spaces can be selected. If that does not resolve,
    the first word alone is tried, which was the previous behaviour.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = "Usage: /tts_voice piper | /tts_voice clone <id-or-name>"
    parts = arg.strip().split()
    if not parts:
        if self.current_tts_voice:
            vid = self.current_tts_voice
            try:
                info = self.voice_manager.get_cloned_voice(vid) or {}
                name = (info.get("name") or "").strip()
                eng = (info.get("engine") or "").strip()
                label = name or vid
                suffix = f" (engine: {eng})" if eng else ""
                print(f"Current TTS voice: {label}{suffix}")
            except Exception:
                print(f"Current TTS voice: {vid}")
        else:
            print("Current TTS voice: piper")
        print(usage)
        return

    if parts[0] == "piper":
        self.current_tts_voice = None
        # Free any heavy cloning engines when switching back to Piper.
        try:
            if hasattr(self.voice_manager, "unload_cloning_engines"):
                self.voice_manager.unload_cloning_engines()
        except Exception:
            pass
        # If Piper was previously unloaded to save memory, reload it now (offline-first).
        try:
            if self.voice_manager and getattr(self.voice_manager, "tts_adapter", None):
                a = getattr(self.voice_manager, "tts_adapter", None)
                if hasattr(a, "is_available") and not bool(a.is_available()):
                    self.voice_manager.set_language(self.current_language)
        except Exception:
            pass
        print("✅ Using Piper (default) voice")
        return

    if parts[0] != "clone" or len(parts) < 2:
        print(usage)
        return

    # Everything after the sub-command, with inner spacing preserved.
    wanted = arg.strip().split(maxsplit=1)[1].strip()
    match = self._resolve_clone_id(wanted)
    if not match and wanted != parts[1]:
        # Backward compatible: trailing tokens used to be ignored.
        match = self._resolve_clone_id(parts[1])
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    # Do not allow selecting a cloned voice unless the runtime is ready.
    if not self._is_cloning_runtime_ready(voice_id=match):
        print("❌ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    # Allow selecting voices without reference_text; we will auto-fallback at speak-time
    # if the STT model is already cached locally (no downloads in REPL).

    self.current_tts_voice = match
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(match) or {}
        eng = (info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {match}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (e.g. unloading Chroma when switching to F5, or vice-versa).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
2349
+
2350
def do_clone_my_voice(self, arg):
    """Interactive voice cloning from microphone.

    This records a short prompt to WAV and adds it to the voice store
    under the name ``my_voice``, using the prompt sentence as the
    reference text.

    The "press Enter" prompt is guarded: EOF on stdin or Ctrl-C cancels
    the recording instead of raising out of the command handler. The
    recordings directory is created before recording so a first run does
    not depend on the recorder creating it.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    prompt = "Good evening, Dave."
    seconds = 6.0
    print("You will record a short reference sample for voice cloning.")
    print(f"Please read this aloud (once): {prompt}")
    try:
        input("Press Enter to start recording...")
    except (EOFError, KeyboardInterrupt):
        print("\nRecording cancelled.")
        return
    try:
        import appdirs
        from pathlib import Path
        from abstractvoice.audio import record_wav

        out_dir = Path(appdirs.user_data_dir("abstractvoice")) / "recordings"
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "my_voice.wav"
        record_wav(out_path, seconds=seconds, sample_rate=24000, channels=1)
        voice_id = self.voice_manager.clone_voice(str(out_path), name="my_voice", reference_text=prompt)
        print(f"✅ Recorded and cloned: {voice_id}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
    except Exception as e:
        print(f"❌ /clone-my-voice failed: {e}")
2377
+
2378
def do_cloning_status(self, arg):
    """Report local readiness of the cloning runtimes without downloading.

    Prints, in order: torch version and accelerator availability (when
    torch imports), the default cloning engine, the OpenF5 and Chroma
    install/cache state, and the runtime info reported by the voice
    manager when one is active. ``arg`` is ignored.
    """
    try:
        import torch

        try:
            has_mps = bool(torch.backends.mps.is_available())
        except Exception:
            has_mps = False
        print(f"torch: {getattr(torch, '__version__', '?')}")
        print(f"cuda_available: {bool(torch.cuda.is_available())}")
        print(f"mps_available: {has_mps}")
    except Exception:
        # torch is optional; the rest of the report does not need it.
        pass

    print(f"default_cloning_engine: {self.cloning_engine}")

    find_spec = importlib.util.find_spec

    if find_spec("f5_tts") is None:
        print("ℹ️ OpenF5 runtime: not installed (missing: f5_tts)")
        print(" Install: pip install \"abstractvoice[cloning]\"")
    elif self._is_openf5_cached():
        print("✅ OpenF5 artifacts: present (cached)")
    else:
        print("ℹ️ OpenF5 artifacts: not present (will require ~5.4GB download)")
        print(" Run: /cloning_download f5_tts")

    if any(find_spec(module) is None for module in ("transformers", "torch")):
        print("ℹ️ Chroma runtime: not installed (missing: transformers/torch)")
        print(" Install: pip install \"abstractvoice[chroma]\"")
    elif self._is_chroma_cached():
        print("✅ Chroma artifacts: present (cached)")
    else:
        print("ℹ️ Chroma artifacts: not present (will require a large download + HF access)")
        print(" Run: /cloning_download chroma")

    try:
        runtime = self.voice_manager.get_cloning_runtime_info() if self.voice_manager else None
        if runtime:
            print(f"cloning_resolved_device: {runtime.get('resolved_device')}")
            print(f"cloning_model_param_device: {runtime.get('model_param_device','?')}")
            print(f"cloning_quality_preset: {runtime.get('quality_preset')}")
    except Exception:
        pass
2424
+
2425
def do_clone_quality(self, arg):
    """Choose the speed/quality preset used for cloned TTS.

    Usage:
    /clone_quality fast|balanced|high

    The argument is matched case-insensitively; anything else prints the
    usage line.
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    choice = (arg or "").strip().lower()
    allowed = ("fast", "balanced", "high")
    if choice not in allowed:
        print("Usage: /clone_quality fast|balanced|high")
        return

    try:
        manager.set_cloned_tts_quality(choice)
        print(f"✅ Cloned TTS quality preset: {choice}")
    except Exception as e:
        print(f"❌ Failed to set preset: {e}")
2443
+
2444
def do_cloning_download(self, arg):
    """Explicitly fetch the artifacts of a cloning engine (can be slow).

    The optional argument names the engine (``f5_tts`` with aliases
    ``openf5``/``f5``, or ``chroma``); without it the default cloning
    engine is used. The required Python package is checked first, then
    the engine's own download routine is invoked.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    requested = (arg or "").strip().lower() or self.cloning_engine
    engine_name = "f5_tts" if requested in ("openf5", "f5", "f5_tts") else requested

    if engine_name not in ("f5_tts", "chroma"):
        print("Usage: /cloning_download [f5_tts|chroma]")
        return

    wants_f5 = engine_name == "f5_tts"
    if wants_f5:
        if importlib.util.find_spec("f5_tts") is None:
            print("❌ OpenF5 runtime not installed in this environment (missing: f5_tts).")
            print(" Install: pip install \"abstractvoice[cloning]\"")
            return
    else:
        # Artifacts download uses huggingface_hub and does not require loading the model.
        if importlib.util.find_spec("huggingface_hub") is None:
            print("❌ huggingface_hub is required to download Chroma artifacts.")
            print(" Install: pip install huggingface_hub")
            return

    try:
        cloner = self.voice_manager._get_voice_cloner()  # REPL convenience
        backend = cloner._get_engine(engine_name)  # explicit download is an engine concern
        if wants_f5:
            print("Downloading OpenF5 artifacts (~5.4GB). This is a one-time cache per machine.")
            backend.ensure_openf5_artifacts_downloaded()
        else:
            print("Downloading Chroma artifacts (very large; requires HF access). This is a one-time cache per machine.")
            backend.ensure_chroma_artifacts_downloaded()
        print("✅ Download complete.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
2479
+
2480
def _is_openf5_cached(self) -> bool:
    """Heuristically detect cached OpenF5 artifacts without huggingface_hub.

    Searches ``~/.cache/abstractvoice/openf5`` recursively and reports
    ``True`` only when a YAML config, a ``.pt`` checkpoint and a text
    vocabulary file are all found somewhere below it.
    """
    from pathlib import Path
    import os

    cache_root = Path(os.path.expanduser("~/.cache/abstractvoice/openf5"))
    if not cache_root.exists():
        return False

    def _found(*patterns):
        # True as soon as any of the glob patterns yields a file.
        return any(next(iter(cache_root.rglob(pattern)), None) for pattern in patterns)

    has_config = _found("*.yaml", "*.yml")
    has_checkpoint = _found("*.pt")
    has_vocab = _found("vocab*.txt", "*.txt")
    return bool(has_config and has_checkpoint and has_vocab)
2492
+
2493
def _is_chroma_cached(self) -> bool:
    """Heuristically detect cached Chroma artifacts without huggingface_hub.

    Reports ``True`` only when ``~/.cache/abstractvoice/chroma`` exists
    and directly contains every file in the required list below.
    """
    from pathlib import Path
    import os

    cache_root = Path(os.path.expanduser("~/.cache/abstractvoice/chroma"))
    if not cache_root.exists():
        return False

    required_files = (
        "config.json",
        "processor_config.json",
        "model.safetensors.index.json",
        "modeling_chroma.py",
        "processing_chroma.py",
        "configuration_chroma.py",
    )
    for filename in required_files:
        if not (cache_root / filename).exists():
            return False
    return True
2510
+
2511
+ def _is_cloning_runtime_ready(self, *, voice_id: str | None = None, engine: str | None = None) -> bool:
2512
+ """Return whether the selected cloning engine is ready locally (no downloads)."""
2513
+ eng = str(engine or "").strip().lower()
2514
+ if not eng and voice_id and self.voice_manager:
2515
+ try:
2516
+ info = self.voice_manager.get_cloned_voice(voice_id)
2517
+ eng = str((info or {}).get("engine") or "").strip().lower()
2518
+ except Exception:
2519
+ eng = ""
2520
+ if not eng:
2521
+ eng = str(getattr(self, "cloning_engine", "f5_tts") or "f5_tts").strip().lower()
2522
+
2523
+ if eng == "chroma":
2524
+ return (
2525
+ importlib.util.find_spec("torch") is not None
2526
+ and importlib.util.find_spec("transformers") is not None
2527
+ and self._is_chroma_cached()
2528
+ )
2529
+ return importlib.util.find_spec("f5_tts") is not None and self._is_openf5_cached()
2530
+
2531
def _seed_hal9000_voice(self):
    """Create a default 'hal9000' cloned voice when bundled samples exist.

    Looks for ``audio_samples/hal9000`` relative to the current working
    directory. Nothing happens when the directory is missing or a voice
    named ``hal9000`` (case-insensitive) already has an id in the store.
    Any failure is swallowed: this is best-effort and must never block
    REPL start.
    """
    if not self.voice_manager:
        return
    try:
        from pathlib import Path

        sample_dir = Path("audio_samples") / "hal9000"
        if not sample_dir.exists():
            return

        # Id of the first stored voice named hal9000, if any.
        hal_id = next(
            (
                voice.get("voice_id")
                for voice in self.voice_manager.list_cloned_voices()
                if (voice.get("name") or "").lower() == "hal9000"
            ),
            None,
        )
        if hal_id is not None:
            return

        # Prefer the clean short WAV with a known transcript, which avoids
        # noisy auto-transcriptions bleeding into outputs.
        hello_wav = sample_dir / "hal9000_hello.wav"
        if hello_wav.exists():
            hal_id = self.voice_manager.clone_voice(
                str(hello_wav),
                name="hal9000",
                reference_text="Hello, Dave.",
            )
        else:
            hal_id = self.voice_manager.clone_voice(str(sample_dir), name="hal9000")
        if self.debug_mode:
            print(f"Seeded cloned voice 'hal9000': {hal_id}")

        # Do NOT auto-select here; selecting a clone without explicit user action
        # can cause surprise multi-GB downloads. Users can opt in via /tts_voice.
    except Exception:
        # Best-effort only; never block REPL start.
        return
2569
+
2570
def do_tts_engine(self, arg):
    """Switch the TTS engine (auto|piper).

    The current VoiceManager, if any, is cleaned up best-effort and a
    new instance is created with the requested engine, downloads
    disabled and cloned-TTS streaming off.
    """
    requested = arg.strip().lower()
    if requested != "auto" and requested != "piper":
        print("Usage: /tts_engine auto|piper")
        return

    previous = self.voice_manager
    if previous:
        try:
            previous.cleanup()
        except Exception:
            # A failing cleanup must not prevent the engine switch.
            pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=requested,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ TTS engine set to: {requested}")
2596
+
2597
def do_aec(self, arg):
    """Turn optional AEC (echo cancellation) on or off for true barge-in.

    Usage:
    /aec on [delay_ms]
    /aec off

    With no argument the current state is printed, read from the voice
    manager's ``_aec_enabled`` / ``_aec_stream_delay_ms`` attributes.
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    usage = "Usage: /aec on [delay_ms] | /aec off"
    tokens = arg.strip().split()

    if not tokens:
        enabled = bool(getattr(manager, "_aec_enabled", False))
        delay = int(getattr(manager, "_aec_stream_delay_ms", 0))
        state = "on" if enabled else "off"
        print(f"AEC: {state} (delay_ms={delay})")
        print(usage)
        return

    action = tokens[0]
    if action == "off":
        try:
            manager.enable_aec(False)
            print("✅ AEC disabled")
        except Exception as e:
            print(f"❌ AEC disable failed: {e}")
        return

    if action != "on":
        print(usage)
        return

    delay_ms = 0
    if len(tokens) > 1:
        try:
            delay_ms = int(tokens[1])
        except ValueError:
            # A non-numeric delay is a usage error.
            print(usage)
            return

    try:
        manager.enable_aec(True, stream_delay_ms=delay_ms)
        print(f"✅ AEC enabled (delay_ms={delay_ms}).")
        print("Tip: use /voice full for barge-in behavior when AEC is enabled.")
    except Exception as e:
        print(f"❌ AEC enable failed: {e}")
2642
+
2643
+ def do_stt_engine(self, arg):
2644
+ """Select STT engine: auto|faster_whisper|whisper.
2645
+
2646
+ This recreates the internal VoiceManager instance.
2647
+ """
2648
+ engine = arg.strip().lower()
2649
+ if engine not in ("auto", "faster_whisper", "whisper"):
2650
+ print("Usage: /stt_engine auto|faster_whisper|whisper")
2651
+ return
2652
+
2653
+ if not self.voice_manager:
2654
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
2655
+ return
2656
+
2657
+ # Recreate VoiceManager preserving current TTS engine preference.
2658
+ # If the current engine is unknown, let it auto-select.
2659
+ tts_engine = getattr(self.voice_manager, "_tts_engine_preference", "auto")
2660
+
2661
+ try:
2662
+ self.voice_manager.cleanup()
2663
+ except Exception:
2664
+ pass
2665
+
2666
+ self.voice_manager = VoiceManager(
2667
+ language=self.current_language,
2668
+ tts_model=self._initial_tts_model,
2669
+ debug_mode=self.debug_mode,
2670
+ tts_engine=tts_engine,
2671
+ stt_engine=engine,
2672
+ allow_downloads=False,
2673
+ cloned_tts_streaming=False,
2674
+ cloning_engine=self.cloning_engine,
2675
+ )
2676
+ print(f"✅ STT engine set to: {engine}")
2677
+
2678
+ def do_transcribe(self, arg):
2679
+ """Transcribe an audio file via the library STT path (faster-whisper by default).
2680
+
2681
+ Usage:
2682
+ /transcribe path/to/audio.wav
2683
+
2684
+ Notes:
2685
+ - This is the simplest way to validate STT without requiring microphone capture.
2686
+ - The default engine is faster-whisper; legacy openai-whisper remains optional.
2687
+ """
2688
+ if not self.voice_manager:
2689
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
2690
+ return
2691
+
2692
+ path = arg.strip()
2693
+ if not path:
2694
+ print("Usage: /transcribe <path/to/audio.wav>")
2695
+ return
2696
+
2697
+ try:
2698
+ text = self.voice_manager.transcribe_file(path)
2699
+ print(f"{Colors.CYAN}{text}{Colors.END}")
2700
+ except Exception as e:
2701
+ print(f"❌ Transcription failed: {e}")
2702
+ if self.debug_mode:
2703
+ import traceback
2704
+ traceback.print_exc()
679
2705
 
680
2706
  def do_clear(self, arg):
681
2707
  """Clear chat history."""
2708
+ self._clear_history()
2709
+ print("History cleared")
2710
+
2711
+ def do_reset(self, arg):
2712
+ """Reset the session (history + current voice selection)."""
2713
+ try:
2714
+ if self.voice_manager:
2715
+ self.voice_manager.stop_speaking()
2716
+ except Exception:
2717
+ pass
2718
+
2719
+ # Reset voice selection back to Piper (default).
2720
+ self.current_tts_voice = None
2721
+ # Free any heavy cloning engines as part of reset.
2722
+ try:
2723
+ if self.voice_manager and hasattr(self.voice_manager, "unload_cloning_engines"):
2724
+ self.voice_manager.unload_cloning_engines()
2725
+ except Exception:
2726
+ pass
2727
+ # Ensure Piper is ready (in case it was unloaded to save memory).
2728
+ try:
2729
+ if self.voice_manager and getattr(self.voice_manager, "tts_adapter", None):
2730
+ a = getattr(self.voice_manager, "tts_adapter", None)
2731
+ if hasattr(a, "is_available") and not bool(a.is_available()):
2732
+ self.voice_manager.set_language(self.current_language)
2733
+ except Exception:
2734
+ pass
2735
+
2736
+ # Clear chat history.
2737
+ self._clear_history()
2738
+ print("✅ Reset.")
2739
+
2740
+ def _clear_history(self) -> None:
682
2741
  self.messages = [{"role": "system", "content": self.system_prompt}]
683
2742
  # Reset token counters
684
2743
  self.system_tokens = 0
685
2744
  self.user_tokens = 0
686
2745
  self.assistant_tokens = 0
2746
+ # Reset word counters
2747
+ self.system_words = 0
2748
+ self.user_words = 0
2749
+ self.assistant_words = 0
687
2750
  # Recalculate system tokens
688
2751
  self._count_system_tokens()
689
- print("History cleared")
2752
+ self._count_system_words()
690
2753
 
691
2754
  def do_system(self, arg):
692
2755
  """Set the system prompt."""
693
2756
  if arg.strip():
694
2757
  self.system_prompt = arg.strip()
695
- self.messages = [{"role": "system", "content": self.system_prompt}]
2758
+ self._clear_history()
696
2759
  print(f"System prompt set to: {self.system_prompt}")
697
2760
  else:
698
2761
  print(f"Current system prompt: {self.system_prompt}")
699
2762
 
700
2763
  def do_exit(self, arg):
701
2764
  """Exit the REPL."""
702
- self.voice_manager.cleanup()
2765
+ # Stop any PTT session cleanly.
2766
+ self._ptt_session_active = False
2767
+ self._ptt_recording = False
2768
+ self._ptt_busy = False
2769
+
2770
+ # Stop voice mode / audio best-effort.
2771
+ try:
2772
+ if self.voice_manager:
2773
+ try:
2774
+ self.voice_manager.stop_listening()
2775
+ except Exception:
2776
+ pass
2777
+ try:
2778
+ self.voice_manager.stop_speaking()
2779
+ except Exception:
2780
+ pass
2781
+ except Exception:
2782
+ pass
2783
+
2784
+ try:
2785
+ if self.voice_manager:
2786
+ self.voice_manager.cleanup()
2787
+ except Exception:
2788
+ pass
703
2789
  if self.debug_mode:
704
2790
  print("Goodbye!")
705
2791
  return True
@@ -781,37 +2867,81 @@ class VoiceREPL(cmd.Cmd):
781
2867
 
782
2868
  # If neither voice mode nor TTS is active - don't show any message
783
2869
  pass
2870
+
2871
+ def do_verbose(self, arg):
2872
+ """Toggle verbose per-turn performance stats.
2873
+
2874
+ Usage:
2875
+ /verbose (toggle)
2876
+ /verbose on|off
2877
+ """
2878
+ s = (arg or "").strip().lower()
2879
+ if s in ("", "toggle"):
2880
+ self.verbose_mode = not bool(getattr(self, "verbose_mode", False))
2881
+ elif s in ("on", "1", "true", "yes", "y"):
2882
+ self.verbose_mode = True
2883
+ elif s in ("off", "0", "false", "no", "n"):
2884
+ self.verbose_mode = False
2885
+ else:
2886
+ print("Usage: /verbose [on|off]")
2887
+ return
2888
+ print(f"Verbose mode: {'on' if self.verbose_mode else 'off'}")
784
2889
 
785
2890
  def do_help(self, arg):
786
2891
  """Show help information."""
787
2892
  print("Commands:")
788
2893
  print(" /exit, /q, /quit Exit REPL")
789
2894
  print(" /clear Clear history")
2895
+ print(" /reset Reset (history + voice)")
790
2896
  print(" /tts on|off Toggle TTS")
791
2897
  print(" /voice <mode> Voice input: off|full|wait|stop|ptt")
792
- print(" /language <lang> Switch voice language (en, fr, es, de, it)")
793
- print(" /setvoice [id] List voices or set specific voice (lang.voice_id)")
2898
+ print(" /voice ptt Push-to-talk session (SPACE captures, ESC exits)")
2899
+ print(" /language <lang> Switch voice language (en, fr, es, de, ru, zh)")
2900
+ print(" /setvoice [id] List Piper voices or set one (lang.voice_id)")
794
2901
  print(" /lang_info Show current language information")
795
2902
  print(" /list_languages List all supported languages")
796
2903
  print(" /speed <number> Set TTS speed (0.5-2.0, default: 1.0, pitch preserved)")
797
- print(" /tts_model <model> Switch TTS model: vits(best)|fast_pitch|glow-tts|tacotron2-DDC")
2904
+ print(" /tts_voice ... Select Piper vs cloned voice (see below)")
2905
+ print(" /tts_engine <e> Switch TTS engine: auto|piper")
798
2906
  print(" /whisper <model> Switch Whisper model: tiny|base|small|medium|large")
2907
+ print(" /stt_engine <e> Switch STT engine: auto|faster_whisper|whisper (whisper is optional extra)")
2908
+ print(" /speak <text> Speak text (no LLM call)")
2909
+ print(" /transcribe <path> Transcribe an audio file (faster-whisper by default)")
799
2910
  print(" /system <prompt> Set system prompt")
800
2911
  print(" /stop Stop voice mode or TTS playback")
801
2912
  print(" /pause Pause current TTS playback")
802
2913
  print(" /resume Resume paused TTS playback")
2914
+ print(" /aec on|off Optional echo cancellation for true barge-in (requires [aec])")
803
2915
  print(" /tokens Display token usage stats")
2916
+ print(" /verbose [on|off] Toggle verbose per-turn stats")
804
2917
  print(" /help Show this help")
2918
+ print(" /clones List cloned voices")
2919
+ print(" /clone_info <id> Show cloned voice details")
2920
+ print(" /clone_ref <id> Show cloned voice reference text")
2921
+ print(" /clone_rename ... Rename a cloned voice")
2922
+ print(" /clone_rm <id> Delete a cloned voice")
2923
+ print(" /clone_rm_all --yes Delete ALL cloned voices")
2924
+ print(" /clone_export ... Export a cloned voice (.zip)")
2925
+ print(" /clone_import ... Import a cloned voice (.zip)")
2926
+ print(" /clone <path> [nm] Add a cloned voice from WAV/FLAC/OGG")
2927
+ print(" /clone_use <path> Clone+select voice (or reuse)")
2928
+ print(" /clone-my-voice Record a short prompt and clone it")
2929
+ print(" /tts_voice piper Speak with Piper (default)")
2930
+ print(" /tts_voice clone X Speak with a cloned voice (requires cloning runtime + cache)")
2931
+ print(" /cloning_status Show cloning readiness (no downloads)")
2932
+ print(" /cloning_download Explicitly download OpenF5 artifacts (~5.4GB)")
2933
+ print(" /clone_quality Set cloned TTS speed/quality: fast|balanced|high")
805
2934
  print(" /save <filename> Save chat history to file")
806
2935
  print(" /load <filename> Load chat history from file")
807
2936
  print(" /model <name> Change the LLM model")
808
2937
  print(" /temperature <val> Set temperature (0.0-2.0, default: 0.7)")
809
2938
  print(" /max_tokens <num> Set max tokens (default: 4096)")
810
- print(" stop Stop voice mode or TTS (voice command)")
2939
+ print(" stop (deprecated) use /voice off or say 'stop' during STOP mode")
811
2940
  print(" <message> Send to LLM (text mode)")
812
2941
  print()
813
2942
  print("Note: ALL commands must start with / except 'stop'")
814
- print("In voice mode, say 'stop' to exit voice mode.")
2943
+ print("In STOP mode, say 'stop' / 'ok stop' to stop speaking (does not exit voice mode).")
2944
+ print("Shortcut: paste a WAV/FLAC/OGG path to clone+select (optionally: `path | transcript`).")
815
2945
 
816
2946
  def emptyline(self):
817
2947
  """Handle empty line input."""
@@ -821,6 +2951,10 @@ class VoiceREPL(cmd.Cmd):
821
2951
  def do_tokens(self, arg):
822
2952
  """Display token usage information."""
823
2953
  try:
2954
+ if self._get_tiktoken_encoding() is None:
2955
+ print("Token counting is not available (install: pip install tiktoken).")
2956
+ return
2957
+
824
2958
  # Always recalculate tokens to ensure accuracy
825
2959
  self._reset_and_recalculate_tokens()
826
2960
 
@@ -998,15 +3132,26 @@ class VoiceREPL(cmd.Cmd):
998
3132
  print(f"Failed to load chat history from {filename}")
999
3133
 
1000
3134
  def _reset_and_recalculate_tokens(self):
1001
- """Reset token counts and recalculate for all messages."""
3135
+ """Reset token/word counts and recalculate for all messages."""
1002
3136
  self.system_tokens = 0
1003
3137
  self.user_tokens = 0
1004
3138
  self.assistant_tokens = 0
3139
+ self.system_words = 0
3140
+ self.user_words = 0
3141
+ self.assistant_words = 0
1005
3142
 
1006
3143
  # Count tokens for all messages
1007
3144
  for msg in self.messages:
1008
3145
  if isinstance(msg, dict) and "content" in msg and "role" in msg:
1009
3146
  self._count_tokens(msg["content"], msg["role"])
3147
+ w = self._count_words(msg["content"])
3148
+ r = msg.get("role")
3149
+ if r == "system":
3150
+ self.system_words = int(w)
3151
+ elif r == "user":
3152
+ self.user_words += int(w)
3153
+ elif r == "assistant":
3154
+ self.assistant_words += int(w)
1010
3155
 
1011
3156
  def _ensure_system_message(self):
1012
3157
  """Ensure there's a system message at the start of messages."""
@@ -1070,13 +3215,30 @@ def parse_args():
1070
3215
  """Parse command line arguments."""
1071
3216
  parser = argparse.ArgumentParser(description="AbstractVoice CLI Example")
1072
3217
  parser.add_argument("--debug", action="store_true", help="Enable debug mode")
3218
+ parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
1073
3219
  parser.add_argument("--api", default="http://localhost:11434/api/chat",
1074
3220
  help="LLM API URL")
1075
- parser.add_argument("--model", default="granite3.3:2b",
3221
+ parser.add_argument("--model", default="cogito:3b",
1076
3222
  help="LLM model name")
1077
- parser.add_argument("--language", "--lang", default="en",
1078
- choices=["en", "fr", "es", "de", "it", "ru", "multilingual"],
1079
- help="Voice language (en=English, fr=French, es=Spanish, de=German, it=Italian, ru=Russian, multilingual=All)")
3223
+ parser.add_argument(
3224
+ "--cloning-engine",
3225
+ default="f5_tts",
3226
+ choices=["f5_tts", "chroma"],
3227
+ help="Default cloning backend for new voices (f5_tts|chroma)",
3228
+ )
3229
+ parser.add_argument(
3230
+ "--voice-mode",
3231
+ default="off",
3232
+ choices=["off", "wait", "stop", "full", "ptt"],
3233
+ help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
3234
+ )
3235
+ parser.add_argument(
3236
+ "--language",
3237
+ "--lang",
3238
+ default="en",
3239
+ choices=["en", "fr", "de", "es", "ru", "zh"],
3240
+ help="Voice language for default Piper TTS (en|fr|de|es|ru|zh).",
3241
+ )
1080
3242
  parser.add_argument("--tts-model",
1081
3243
  help="Specific TTS model to use (overrides language default)")
1082
3244
  return parser.parse_args()
@@ -1093,8 +3255,11 @@ def main():
1093
3255
  api_url=args.api,
1094
3256
  model=args.model,
1095
3257
  debug_mode=args.debug,
3258
+ verbose_mode=args.verbose,
1096
3259
  language=args.language,
1097
- tts_model=args.tts_model
3260
+ tts_model=args.tts_model,
3261
+ voice_mode=args.voice_mode,
3262
+ cloning_engine=args.cloning_engine,
1098
3263
  )
1099
3264
  repl.cmdloop()
1100
3265
  except KeyboardInterrupt:
@@ -1104,4 +3269,4 @@ def main():
1104
3269
 
1105
3270
 
1106
3271
  if __name__ == "__main__":
1107
- main()
3272
+ main()