abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
@@ -8,9 +8,15 @@ that interacts with an LLM API for text generation.
8
8
 
9
9
  import argparse
10
10
  import cmd
11
+ import atexit
11
12
  import json
12
13
  import re
14
+ import shlex
15
+ import shutil
13
16
  import sys
17
+ import importlib.util
18
+ import threading
19
+ import time
14
20
  import requests
15
21
  from abstractvoice import VoiceManager
16
22
 
@@ -31,18 +37,34 @@ class VoiceREPL(cmd.Cmd):
31
37
  """Voice-enabled REPL for LLM interaction."""
32
38
 
33
39
  intro = "" # Will be set in __init__ to include help
34
- prompt = f"{Colors.GREEN}> {Colors.END}"
40
+ prompt = "> "
35
41
 
36
42
  # Override cmd module settings
37
43
  ruler = "" # No horizontal rule line
38
44
  use_rawinput = True
39
45
 
40
- def __init__(self, api_url="http://localhost:11434/api/chat",
41
- model="granite3.3:2b", debug_mode=False, language="en", tts_model=None, disable_tts=False):
46
+ def __init__(
47
+ self,
48
+ api_url="http://localhost:11434/api/chat",
49
+ model="cogito:3b",
50
+ debug_mode=False,
51
+ verbose_mode: bool = False,
52
+ language="en",
53
+ tts_model=None,
54
+ voice_mode: str = "off",
55
+ disable_tts=False,
56
+ cloning_engine: str = "f5_tts",
57
+ ):
42
58
  super().__init__()
43
59
 
60
+ # Best-effort: enable proper line editing + history (Up/Down arrows).
61
+ # Some Python builds (notably when built without readline/libedit) will
62
+ # otherwise treat arrow keys as escape sequences and corrupt the prompt.
63
+ self._init_readline()
64
+
44
65
  # Debug mode
45
66
  self.debug_mode = debug_mode
67
+ self.verbose_mode = bool(verbose_mode)
46
68
 
47
69
  # API settings
48
70
  self.api_url = api_url
@@ -52,6 +74,8 @@ class VoiceREPL(cmd.Cmd):
52
74
 
53
75
  # Language settings
54
76
  self.current_language = language
77
+ self._initial_tts_model = tts_model
78
+ self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
55
79
 
56
80
  # Initialize voice manager with language support
57
81
  if disable_tts:
@@ -61,19 +85,36 @@ class VoiceREPL(cmd.Cmd):
61
85
  self.voice_manager = VoiceManager(
62
86
  language=language,
63
87
  tts_model=tts_model,
64
- debug_mode=debug_mode
88
+ debug_mode=debug_mode,
89
+ allow_downloads=False,
90
+ cloned_tts_streaming=False,
91
+ cloning_engine=self.cloning_engine,
65
92
  )
93
+
94
+ # Current speaking voice:
95
+ # - None => Piper (default, language-driven)
96
+ # - str => cloned voice_id
97
+ self.current_tts_voice: str | None = None
98
+
99
+ # When reference_text is auto-generated via ASR ("asr" source), print a
100
+ # ready-to-copy `/clone_set_ref_text ...` hint once per voice for easy correction.
101
+ self._printed_asr_ref_text_hint: set[str] = set()
102
+
103
+ # Seed a default cloned voice (HAL9000) if samples are present.
104
+ self._seed_hal9000_voice()
66
105
 
67
106
  # Settings
68
107
  self.use_tts = True
69
- self.voice_mode = "off" # off, full, wait, stop, ptt
108
+ # Voice input mode (mic). Default: OFF for fast startup + offline-first.
109
+ # Use `--voice-mode stop` (or `/voice stop`) to enable hands-free.
110
+ self.voice_mode = (voice_mode or "off").strip().lower() # off, full, wait, stop, ptt
70
111
  self.voice_mode_active = False # Is voice recognition running?
112
+ self._ptt_session_active = False
113
+ self._ptt_recording = False
114
+ self._ptt_busy = False
71
115
 
72
116
  # System prompt
73
- self.system_prompt = """
74
- You are a Helpful Voice Assistant. By design, your answers are short and more conversational, unless specifically asked to detail something.
75
- You only speak, so never use any text formatting or markdown. Write for a speaker.
76
- """
117
+ self.system_prompt = "You are a Helpful Voice Assistant. By design, your answers are short and conversational, unless specifically asked to detail something. You only speak, so never use any text formatting, hinting, *emotions*, emojis or markdown. Incarnate the speaker, never comment your instructions."
77
118
 
78
119
  # Message history
79
120
  self.messages = [{"role": "system", "content": self.system_prompt}]
@@ -82,27 +123,136 @@ class VoiceREPL(cmd.Cmd):
82
123
  self.system_tokens = 0
83
124
  self.user_tokens = 0
84
125
  self.assistant_tokens = 0
126
+ # LLM token totals (best-effort, Ollama API `eval_count`).
127
+ self.total_llm_out_tokens = 0
128
+ # Word counting
129
+ self.system_words = 0
130
+ self.user_words = 0
131
+ self.assistant_words = 0
132
+ # Best-effort tokenizer cache (tiktoken optional).
133
+ self._tiktoken_encoding = None
134
+ self._tiktoken_unavailable = False
85
135
  self._count_system_tokens()
86
-
136
+ self._count_system_words()
137
+
138
+ # Best-effort metrics captured from voice input paths.
139
+ self._pending_stt_metrics: dict | None = None
140
+
87
141
  if self.debug_mode:
88
142
  print(f"Initialized with API URL: {api_url}")
89
143
  print(f"Using model: {model}")
90
-
144
+
145
+ # Optionally auto-start voice input (mic). Keep OFF by default to avoid
146
+ # loading STT models (slow) unless the user explicitly opts in.
147
+ if self.voice_manager and self.voice_mode and self.voice_mode != "off":
148
+ try:
149
+ self.do_voice(self.voice_mode)
150
+ except Exception:
151
+ # Never block REPL start.
152
+ self.voice_mode = "off"
153
+ self.voice_mode_active = False
154
+
91
155
  # Set intro with help information
92
156
  self.intro = self._get_intro()
157
+
158
+ def _init_readline(self) -> None:
159
+ """Initialize readline history + make ANSI prompts safe (best-effort)."""
160
+ rl = None
161
+ try:
162
+ import readline as _readline # type: ignore
163
+
164
+ rl = _readline
165
+ except Exception:
166
+ # Windows users may have pyreadline3 installed.
167
+ try:
168
+ import pyreadline3 as _readline # type: ignore
169
+
170
+ rl = _readline
171
+ except Exception:
172
+ rl = None
173
+
174
+ if rl is None:
175
+ # Keep prompt simple and avoid ANSI; prevents strange cursor behavior
176
+ # when arrow keys emit escape codes in cooked terminals.
177
+ self.prompt = "> "
178
+ return
179
+
180
+ # Keep prompt plain when readline is enabled. ANSI prompts are fragile
181
+ # across readline/libedit builds and can corrupt redraw/history behavior.
182
+ self.prompt = "> "
183
+
184
+ # Persist history across sessions (best-effort).
185
+ try:
186
+ from pathlib import Path
187
+
188
+ try:
189
+ import appdirs
190
+
191
+ hist_dir = Path(appdirs.user_data_dir("abstractvoice"))
192
+ except Exception:
193
+ hist_dir = Path.home() / ".abstractvoice"
194
+
195
+ hist_dir.mkdir(parents=True, exist_ok=True)
196
+ hist_path = hist_dir / "repl_history"
197
+
198
+ try:
199
+ rl.read_history_file(str(hist_path))
200
+ except FileNotFoundError:
201
+ pass
202
+ except Exception:
203
+ pass
204
+
205
+ try:
206
+ rl.set_history_length(2000)
207
+ except Exception:
208
+ pass
209
+
210
+ def _save_history():
211
+ try:
212
+ rl.write_history_file(str(hist_path))
213
+ except Exception:
214
+ pass
215
+
216
+ atexit.register(_save_history)
217
+ except Exception:
218
+ pass
219
+
220
+ # Ensure Up/Down arrows traverse history reliably across GNU readline and
221
+ # macOS libedit-backed readline. Some libedit defaults perform prefix
222
+ # search/completion, which can look like text is being appended.
223
+ try:
224
+ doc = getattr(rl, "__doc__", "") or ""
225
+ is_libedit = "libedit" in doc.lower()
226
+ if is_libedit:
227
+ # libedit syntax
228
+ rl.parse_and_bind("bind ^[[A ed-prev-history")
229
+ rl.parse_and_bind("bind ^[[B ed-next-history")
230
+ rl.parse_and_bind("bind ^[[OA ed-prev-history")
231
+ rl.parse_and_bind("bind ^[[OB ed-next-history")
232
+ else:
233
+ # GNU readline syntax
234
+ rl.parse_and_bind('"\\e[A": previous-history')
235
+ rl.parse_and_bind('"\\e[B": next-history')
236
+ rl.parse_and_bind('"\\eOA": previous-history')
237
+ rl.parse_and_bind('"\\eOB": next-history')
238
+ except Exception:
239
+ pass
93
240
 
94
241
  def _get_intro(self):
95
242
  """Generate intro message with help."""
96
243
  intro = f"\n{Colors.BOLD}Welcome to AbstractVoice CLI REPL{Colors.END}\n"
97
244
  if self.voice_manager:
98
245
  lang_name = self.voice_manager.get_language_name()
99
- intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name}\n"
246
+ mic = (self.voice_mode or "off").upper()
247
+ intro += f"API: {self.api_url} | Model: {self.model} | Voice: {lang_name} | Mic: {mic} | Cloning: {self.cloning_engine}\n"
100
248
  else:
101
249
  intro += f"API: {self.api_url} | Model: {self.model} | Voice: Disabled\n"
102
250
  intro += f"\n{Colors.CYAN}Quick Start:{Colors.END}\n"
103
251
  intro += " • Type messages to chat with the LLM\n"
104
- intro += " • Use /voice <mode> to enable voice input\n"
252
+ intro += " • Voice input (mic): off by default. Enable: /voice stop (or start with --voice-mode stop)\n"
253
+ intro += " • PTT: /voice ptt then SPACE to capture (ESC exits)\n"
105
254
  intro += " • Use /language <lang> to switch voice language\n"
255
+ intro += " • Use /clones and /tts_voice to use cloned voices\n"
106
256
  intro += " • Type /help for full command list\n"
107
257
  intro += " • Type /exit or /q to quit\n"
108
258
  return intro
@@ -110,6 +260,236 @@ class VoiceREPL(cmd.Cmd):
110
260
  def _count_system_tokens(self):
111
261
  """Count tokens in the system prompt."""
112
262
  self._count_tokens(self.system_prompt, "system")
263
+
264
+ def _count_system_words(self):
265
+ self.system_words = self._count_words(self.system_prompt)
266
+
267
+ def _count_words(self, text: str) -> int:
268
+ s = str(text or "").strip()
269
+ if not s:
270
+ return 0
271
+ # A "word" here is whitespace-delimited for simplicity across languages.
272
+ return len([w for w in re.split(r"\s+", s) if w])
273
+
274
+ def _get_tiktoken_encoding(self):
275
+ if getattr(self, "_tiktoken_unavailable", False):
276
+ return None
277
+ enc = getattr(self, "_tiktoken_encoding", None)
278
+ if enc is not None:
279
+ return enc
280
+ try:
281
+ import tiktoken
282
+ except ImportError:
283
+ self._tiktoken_unavailable = True
284
+ return None
285
+
286
+ try:
287
+ enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
288
+ except Exception:
289
+ try:
290
+ enc = tiktoken.get_encoding("cl100k_base")
291
+ except Exception:
292
+ self._tiktoken_unavailable = True
293
+ return None
294
+
295
+ self._tiktoken_encoding = enc
296
+ return enc
297
+
298
+ def _fmt_s(self, seconds: float | None) -> str:
299
+ try:
300
+ if seconds is None:
301
+ return "--"
302
+ s = float(seconds)
303
+ if s < 0:
304
+ return "--"
305
+ # Keep it compact but readable.
306
+ if s < 10:
307
+ return f"{s:.2f}s"
308
+ return f"{s:.1f}s"
309
+ except Exception:
310
+ return "--"
311
+
312
+ def _fmt_num(self, x: float | None, *, digits: int = 2) -> str:
313
+ try:
314
+ if x is None:
315
+ return "--"
316
+ return f"{float(x):.{int(digits)}f}"
317
+ except Exception:
318
+ return "--"
319
+
320
+ def _fmt_wtok(self, words: int | None, tokens: int | None) -> str:
321
+ w = int(words) if isinstance(words, int) else (int(words) if words is not None else 0)
322
+ if isinstance(tokens, int):
323
+ return f"{w}w/{int(tokens)}tok"
324
+ return f"{w}w/--tok"
325
+
326
+ def _summarize_audio_source(self, source: str) -> tuple[int | None, float | None]:
327
+ """Best-effort: return (file_count, total_seconds) for an audio source path."""
328
+ try:
329
+ from pathlib import Path
330
+
331
+ p = Path(str(source)).expanduser()
332
+ except Exception:
333
+ return None, None
334
+
335
+ try:
336
+ import soundfile as sf
337
+ except Exception:
338
+ return None, None
339
+
340
+ supported = {".wav", ".flac", ".ogg"}
341
+ files = []
342
+ try:
343
+ if p.is_file():
344
+ files = [p]
345
+ elif p.is_dir():
346
+ files = sorted([x for x in p.iterdir() if x.is_file() and x.suffix.lower() in supported])
347
+ else:
348
+ return None, None
349
+ except Exception:
350
+ return None, None
351
+
352
+ total_s = 0.0
353
+ max_files = 25
354
+ for fp in files[:max_files]:
355
+ try:
356
+ info = sf.info(str(fp))
357
+ d = float(getattr(info, "duration", 0.0) or 0.0)
358
+ if d > 0:
359
+ total_s += d
360
+ except Exception:
361
+ continue
362
+
363
+ # If there are too many files, the displayed duration is a lower bound.
364
+ return (int(len(files)) if files else 0), (float(total_s) if total_s > 0 else None)
365
+
366
+ def _print_verbose_turn_stats(self, turn: dict) -> None:
367
+ if not bool(getattr(self, "verbose_mode", False)):
368
+ return
369
+ if not isinstance(turn, dict):
370
+ return
371
+
372
+ stt = turn.get("stt") if isinstance(turn.get("stt"), dict) else None
373
+ llm = turn.get("llm") if isinstance(turn.get("llm"), dict) else {}
374
+ counts = turn.get("counts") if isinstance(turn.get("counts"), dict) else {}
375
+ tts = turn.get("tts") if isinstance(turn.get("tts"), dict) else None
376
+
377
+ in_w = counts.get("in_words")
378
+ out_w = counts.get("out_words")
379
+ in_t = counts.get("in_tokens")
380
+ out_t = counts.get("out_tokens")
381
+
382
+ llm_s = llm.get("s")
383
+ api = llm.get("api") if isinstance(llm.get("api"), dict) else {}
384
+ api_prompt_tok = api.get("prompt_eval_count") if isinstance(api.get("prompt_eval_count"), int) else None
385
+ api_out_tok = api.get("eval_count") if isinstance(api.get("eval_count"), int) else None
386
+
387
+ # Line 1: STT (if any) + LLM + in/out counts and written speed.
388
+ parts1 = []
389
+ if stt:
390
+ stt_s = stt.get("stt_s")
391
+ stt_a = stt.get("audio_s")
392
+ stt_rtf = stt.get("rtf")
393
+ stt_txt = f"STT {self._fmt_s(stt_s)}"
394
+ if stt_a:
395
+ stt_txt += f"(a{self._fmt_s(stt_a)})"
396
+ if stt_rtf is not None:
397
+ stt_txt += f" rtf{self._fmt_num(stt_rtf, digits=2)}"
398
+ parts1.append(stt_txt)
399
+
400
+ if llm_s is not None or api_prompt_tok is not None or api_out_tok is not None:
401
+ llm_txt = f"LLM {self._fmt_s(llm_s)}"
402
+ if api_prompt_tok is not None or api_out_tok is not None:
403
+ p = str(api_prompt_tok) if api_prompt_tok is not None else "--"
404
+ o = str(api_out_tok) if api_out_tok is not None else "--"
405
+ llm_txt += f" (api p{p} o{o})"
406
+ parts1.append(llm_txt)
407
+
408
+ in_txt = f"in {self._fmt_wtok(in_w, in_t)}"
409
+ out_txt = f"out {self._fmt_wtok(out_w, out_t)}"
410
+
411
+ wps_written = None
412
+ try:
413
+ if isinstance(out_w, int) and out_w > 0 and llm_s and float(llm_s) > 0:
414
+ wps_written = float(out_w) / float(llm_s)
415
+ except Exception:
416
+ wps_written = None
417
+
418
+ if wps_written is not None:
419
+ out_txt += f" ({self._fmt_num(wps_written, digits=1)}w/s)"
420
+
421
+ parts1.append(in_txt)
422
+ parts1.append(out_txt)
423
+
424
+ line1 = " | ".join(parts1)
425
+
426
+ # Line 2: TTS (if any) + spoken speed + totals.
427
+ parts2 = []
428
+ if self.voice_manager and self.use_tts:
429
+ if not tts:
430
+ parts2.append("TTS --")
431
+ else:
432
+ eng = str(tts.get("engine") or "").strip().lower()
433
+ if eng == "clone":
434
+ ce = tts.get("clone_engine")
435
+ label = f"clone[{ce}]" if ce else "clone"
436
+ elif eng:
437
+ label = eng
438
+ else:
439
+ label = "tts"
440
+
441
+ err = (tts.get("error") or "").strip()
442
+ if err:
443
+ # Keep single-line and short.
444
+ msg = " ".join(err.split())
445
+ if len(msg) > 120:
446
+ msg = msg[:120].rstrip() + "…"
447
+ parts2.append(f"TTS {label} ERR {msg}")
448
+ else:
449
+ synth_s = tts.get("synth_s")
450
+ audio_s = tts.get("audio_s")
451
+ rtf = tts.get("rtf")
452
+ tts_txt = f"TTS {label} {self._fmt_s(synth_s)}→{self._fmt_s(audio_s)}"
453
+ if rtf is not None:
454
+ tts_txt += f" rtf{self._fmt_num(rtf, digits=2)}"
455
+
456
+ # Extra clone streaming details when available.
457
+ if eng == "clone" and bool(tts.get("streaming")):
458
+ ttfb_s = tts.get("ttfb_s")
459
+ if ttfb_s is not None:
460
+ tts_txt += f" ttfb{self._fmt_s(ttfb_s)}"
461
+ ch = tts.get("chunks")
462
+ if isinstance(ch, int):
463
+ tts_txt += f" ch{ch}"
464
+
465
+ wps_spoken = None
466
+ try:
467
+ if isinstance(out_w, int) and out_w > 0 and audio_s and float(audio_s) > 0:
468
+ wps_spoken = float(out_w) / float(audio_s)
469
+ except Exception:
470
+ wps_spoken = None
471
+ if wps_spoken is not None:
472
+ tts_txt += f" ({self._fmt_num(wps_spoken, digits=1)}w/s)"
473
+
474
+ parts2.append(tts_txt)
475
+ else:
476
+ parts2.append("TTS off")
477
+
478
+ total_words = int(getattr(self, "system_words", 0) + getattr(self, "user_words", 0) + getattr(self, "assistant_words", 0))
479
+ total_tokens = None
480
+ if self._get_tiktoken_encoding() is not None:
481
+ total_tokens = int(getattr(self, "system_tokens", 0) + getattr(self, "user_tokens", 0) + getattr(self, "assistant_tokens", 0))
482
+
483
+ tot_txt = f"tot {self._fmt_wtok(total_words, total_tokens)}"
484
+ if isinstance(getattr(self, "total_llm_out_tokens", None), int) and getattr(self, "total_llm_out_tokens") > 0:
485
+ tot_txt += f" (api out {int(getattr(self, 'total_llm_out_tokens'))}tok)"
486
+ parts2.append(tot_txt)
487
+
488
+ line2 = " | ".join(parts2)
489
+
490
+ # Keep it readable; two lines max.
491
+ print(f"{Colors.YELLOW}{line1}{Colors.END}")
492
+ print(f"{Colors.YELLOW}{line2}{Colors.END}")
113
493
 
114
494
  def parseline(self, line):
115
495
  """Parse the line to extract command and arguments.
@@ -117,14 +497,11 @@ class VoiceREPL(cmd.Cmd):
117
497
  Override to handle / prefix for commands. This ensures /voice, /help, etc.
118
498
  are recognized as commands by stripping the leading / before parsing.
119
499
  """
120
- line = line.strip()
121
-
122
- # If line starts with /, remove it for command processing
123
- if line.startswith('/'):
124
- line = line[1:].strip()
125
-
126
- # Call parent parseline to do the actual parsing
127
- return super().parseline(line)
500
+ # Commands still use leading "/". In PTT mode we don't accept typed input.
501
+ s = line.strip()
502
+ if s.startswith("/"):
503
+ return super().parseline(s[1:].strip())
504
+ return super().parseline(line.strip())
128
505
 
129
506
  def default(self, line):
130
507
  """Handle regular text input.
@@ -133,29 +510,123 @@ class VoiceREPL(cmd.Cmd):
133
510
  All other commands MUST use / prefix.
134
511
  """
135
512
  # Skip empty lines
136
- if not line.strip():
513
+ text = line.strip()
514
+ if not text:
137
515
  return
138
516
 
139
- # ONLY 'stop' is recognized without / (for voice mode convenience)
140
- if line.strip().lower() == "stop":
141
- return self.do_stop("")
142
-
517
+ # In PTT mode we do not accept typed input.
518
+ if self.voice_mode == "ptt":
519
+ print("PTT mode: press SPACE to speak, ESC to exit.")
520
+ return
521
+
143
522
  # Check if in voice mode - don't send to LLM
144
523
  if self.voice_mode_active:
145
524
  if self.debug_mode:
146
- print(f"Voice mode active ({self.voice_mode}). Use /voice off or say 'stop' to exit.")
525
+ print(f"Voice mode active ({self.voice_mode}). Use /voice off to disable.")
526
+ return
527
+
528
+ # Interrupt any ongoing TTS playback immediately when the user types.
529
+ # This is the expected “barge-in by typing” UX for a REPL.
530
+ try:
531
+ if self.voice_manager:
532
+ self.voice_manager.stop_speaking()
533
+ except Exception:
534
+ pass
535
+
536
+ # Shortcut: paste a reference audio path to clone+use a voice.
537
+ # Examples:
538
+ # audio_samples/hal9000/hal9000_hello.wav
539
+ # audio_samples/hal9000/hal9000_hello.wav | Hello, Dave.
540
+ if self._maybe_handle_clone_shortcut(text):
147
541
  return
148
542
 
149
543
  # Everything else goes to LLM
150
- self.process_query(line.strip())
544
+ self._pending_stt_metrics = None
545
+ self.process_query(text)
546
+
547
+ # NOTE: PTT is implemented as a dedicated key-loop session (no typing).
548
+
549
+ def _maybe_handle_clone_shortcut(self, text: str) -> bool:
550
+ """Best-effort: treat a pasted WAV/FLAC/OGG path as `/clone_use`."""
551
+ if not self.voice_manager:
552
+ return False
553
+
554
+ raw = (text or "").strip()
555
+ if not raw:
556
+ return False
557
+ if raw.startswith("/"):
558
+ return False
559
+
560
+ # Optional transcript with a simple pipe syntax:
561
+ # path.wav | Hello.
562
+ left, sep, right = raw.partition("|")
563
+ path_str = left.strip()
564
+ ref_text = right.strip() if sep else ""
565
+ reference_text = ref_text or None
566
+
567
+ # Strip naive wrapping quotes.
568
+ if (path_str.startswith('"') and path_str.endswith('"')) or (path_str.startswith("'") and path_str.endswith("'")):
569
+ path_str = path_str[1:-1].strip()
570
+
571
+ try:
572
+ from pathlib import Path
573
+
574
+ p = Path(path_str).expanduser()
575
+ except Exception:
576
+ return False
577
+
578
+ if not p.exists():
579
+ return False
580
+
581
+ exts = {".wav", ".flac", ".ogg"}
582
+ if p.is_file() and p.suffix.lower() not in exts:
583
+ return False
584
+ if p.is_dir():
585
+ try:
586
+ has_audio = any(x.is_file() and x.suffix.lower() in exts for x in p.iterdir())
587
+ except Exception:
588
+ has_audio = False
589
+ if not has_audio:
590
+ return False
591
+
592
+ # Build a `/clone_use` call with a stable name.
593
+ import shlex as _shlex
594
+
595
+ default_name = p.stem if p.is_file() else p.name
596
+ args = f"{_shlex.quote(str(p))} {_shlex.quote(default_name)}"
597
+ if reference_text:
598
+ args += f" --text {_shlex.quote(reference_text)}"
599
+ try:
600
+ self.do_clone_use(args)
601
+ except Exception as e:
602
+ print(f"❌ Clone shortcut failed: {e}")
603
+ if self.debug_mode:
604
+ import traceback
605
+
606
+ traceback.print_exc()
607
+ return True
151
608
 
152
609
  def process_query(self, query):
153
610
  """Process a query and get a response from the LLM."""
154
611
  if not query:
155
612
  return
613
+
614
+ # Consume any pending STT metrics for this turn (voice/PTT input).
615
+ stt_metrics = getattr(self, "_pending_stt_metrics", None)
616
+ self._pending_stt_metrics = None
617
+
618
+ # If audio is currently playing, stop it so the new request can be handled
619
+ # without overlapping speech.
620
+ try:
621
+ if self.voice_manager:
622
+ self.voice_manager.stop_speaking()
623
+ except Exception:
624
+ pass
156
625
 
157
- # Count user message tokens
158
- self._count_tokens(query, "user")
626
+ # Per-turn counts
627
+ user_words = self._count_words(query)
628
+ self.user_words += int(user_words)
629
+ user_tokens = self._count_tokens(query, "user")
159
630
 
160
631
  # Create the message
161
632
  user_message = {"role": "user", "content": query}
@@ -175,6 +646,7 @@ class VoiceREPL(cmd.Cmd):
175
646
  }
176
647
 
177
648
  # Make API request
649
+ llm_t0 = time.monotonic()
178
650
  response = requests.post(self.api_url, json=payload)
179
651
  response.raise_for_status()
180
652
 
@@ -182,6 +654,22 @@ class VoiceREPL(cmd.Cmd):
182
654
  try:
183
655
  # First, try to parse as JSON
184
656
  response_data = response.json()
657
+ api_llm_metrics = {}
658
+ try:
659
+ # Ollama exposes timing + token counts (nanoseconds).
660
+ # Keep best-effort: if fields are missing, we just omit them.
661
+ for k in (
662
+ "total_duration",
663
+ "load_duration",
664
+ "prompt_eval_count",
665
+ "prompt_eval_duration",
666
+ "eval_count",
667
+ "eval_duration",
668
+ ):
669
+ if k in response_data:
670
+ api_llm_metrics[k] = response_data.get(k)
671
+ except Exception:
672
+ api_llm_metrics = {}
185
673
 
186
674
  # Check for different API formats
187
675
  if "message" in response_data and "content" in response_data["message"]:
@@ -200,6 +688,7 @@ class VoiceREPL(cmd.Cmd):
200
688
 
201
689
  # Handle streaming or non-JSON response
202
690
  response_text = response.text.strip()
691
+ api_llm_metrics = {}
203
692
 
204
693
  # Try to extract content from streaming format if possible
205
694
  if response_text.startswith("{") and "content" in response_text:
@@ -228,9 +717,13 @@ class VoiceREPL(cmd.Cmd):
228
717
  except Exception as e:
229
718
  if self.debug_mode:
230
719
  print(f"Error extracting content from streaming response: {e}")
720
+ llm_t1 = time.monotonic()
721
+ llm_s = float(llm_t1 - llm_t0)
231
722
 
232
- # Count assistant message tokens
233
- self._count_tokens(response_text, "assistant")
723
+ # Per-turn counts
724
+ assistant_words = self._count_words(response_text)
725
+ self.assistant_words += int(assistant_words)
726
+ assistant_tokens = self._count_tokens(response_text, "assistant")
234
727
 
235
728
  # Add to message history
236
729
  self.messages.append({"role": "assistant", "content": response_text})
@@ -238,9 +731,61 @@ class VoiceREPL(cmd.Cmd):
238
731
  # Display the response with color
239
732
  print(f"{Colors.CYAN}{response_text}{Colors.END}")
240
733
 
734
+ # Record last-turn stats (best-effort; printed only in verbose mode).
735
+ self._last_turn_metrics = {
736
+ "stt": stt_metrics,
737
+ "llm": {
738
+ "s": llm_s,
739
+ "api": api_llm_metrics,
740
+ },
741
+ "counts": {
742
+ "in_words": int(user_words),
743
+ "out_words": int(assistant_words),
744
+ "in_tokens": int(user_tokens) if isinstance(user_tokens, int) else None,
745
+ "out_tokens": int(assistant_tokens) if isinstance(assistant_tokens, int) else None,
746
+ },
747
+ }
748
+ try:
749
+ out_tok = api_llm_metrics.get("eval_count") if isinstance(api_llm_metrics, dict) else None
750
+ if isinstance(out_tok, int) and out_tok >= 0:
751
+ self.total_llm_out_tokens += int(out_tok)
752
+ except Exception:
753
+ pass
754
+
241
755
  # Speak the response if voice manager is available
242
756
  if self.voice_manager and self.use_tts:
243
- self.voice_manager.speak(response_text)
757
+ try:
758
+ # UX guard: never trigger big cloning downloads during normal chat.
759
+ if self.current_tts_voice and not self._is_cloning_runtime_ready(voice_id=self.current_tts_voice):
760
+ print(
761
+ "ℹ️ Cloned voice selected but cloning runtime is not ready.\n"
762
+ " Run /cloning_status then /cloning_download, or switch back with /tts_voice piper."
763
+ )
764
+ else:
765
+ self._speak_with_spinner_until_audio_starts(response_text)
766
+ except Exception as e:
767
+ print(f"❌ TTS failed: {e}")
768
+
769
+ # Capture best-effort TTS metrics (Piper or cloned).
770
+ tts_metrics = None
771
+ try:
772
+ if self.voice_manager and hasattr(self.voice_manager, "pop_last_tts_metrics"):
773
+ tts_metrics = self.voice_manager.pop_last_tts_metrics()
774
+ except Exception:
775
+ tts_metrics = None
776
+
777
+ try:
778
+ if isinstance(getattr(self, "_last_turn_metrics", None), dict):
779
+ self._last_turn_metrics["tts"] = tts_metrics
780
+ except Exception:
781
+ pass
782
+
783
+ # Verbose stats (max 2 lines).
784
+ try:
785
+ if self.verbose_mode and isinstance(getattr(self, "_last_turn_metrics", None), dict):
786
+ self._print_verbose_turn_stats(self._last_turn_metrics)
787
+ except Exception:
788
+ pass
244
789
 
245
790
  except requests.exceptions.ConnectionError as e:
246
791
  print(f"❌ Cannot connect to Ollama API at {self.api_url}")
@@ -274,37 +819,29 @@ class VoiceREPL(cmd.Cmd):
274
819
 
275
820
  def _count_tokens(self, text, role):
276
821
  """Count tokens in text."""
822
+ encoding = self._get_tiktoken_encoding()
823
+ if encoding is None:
824
+ return None
277
825
  try:
278
- import tiktoken
279
-
280
- # Initialize the tokenizer
281
- encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
282
-
283
- # Count tokens
284
- token_count = len(encoding.encode(text))
285
-
286
- # Update the token counts based on role
287
- if role == "system":
288
- self.system_tokens = token_count
289
- elif role == "user":
290
- self.user_tokens += token_count
291
- elif role == "assistant":
292
- self.assistant_tokens += token_count
293
-
294
- # Calculate total tokens
295
- total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
296
-
297
- if self.debug_mode:
298
- print(f"{role.capitalize()} tokens: {token_count}")
299
- print(f"Total tokens: {total_tokens}")
300
-
301
- except ImportError:
302
- # If tiktoken is not available, just don't count tokens
303
- pass
826
+ token_count = len(encoding.encode(str(text or "")))
304
827
  except Exception as e:
305
828
  if self.debug_mode:
306
829
  print(f"Error counting tokens: {e}")
307
- pass
830
+ return None
831
+
832
+ # Update the token counts based on role
833
+ if role == "system":
834
+ self.system_tokens = int(token_count)
835
+ elif role == "user":
836
+ self.user_tokens += int(token_count)
837
+ elif role == "assistant":
838
+ self.assistant_tokens += int(token_count)
839
+
840
+ if self.debug_mode:
841
+ total_tokens = self.system_tokens + self.user_tokens + self.assistant_tokens
842
+ print(f"{role.capitalize()} tokens: {token_count}")
843
+ print(f"Total tokens: {total_tokens}")
844
+ return int(token_count)
308
845
 
309
846
  def _clean_response(self, text):
310
847
  """Clean LLM response text."""
@@ -323,8 +860,12 @@ class VoiceREPL(cmd.Cmd):
323
860
  """Switch voice language.
324
861
 
325
862
  Usage: /language <lang>
326
- Available languages: en, fr, es, de, it
863
+ Available languages: en, fr, es, de, ru, zh
327
864
  """
865
+ if not self.voice_manager:
866
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
867
+ return
868
+
328
869
  if not args:
329
870
  current_name = self.voice_manager.get_language_name()
330
871
  current_code = self.voice_manager.get_language()
@@ -359,10 +900,13 @@ class VoiceREPL(cmd.Cmd):
359
900
  'fr': "Langue changée en français.",
360
901
  'es': "Idioma cambiado a español.",
361
902
  'de': "Sprache auf Deutsch umgestellt.",
362
- 'it': "Lingua cambiata in italiano."
903
+ 'ru': "Язык переключен на русский.",
904
+ 'zh': "语言已切换到中文。"
363
905
  }
364
906
  test_msg = test_messages.get(language, "Language switched.")
365
- self.voice_manager.speak(test_msg)
907
+ # Respect TTS toggle: if the user disabled TTS, don't speak test messages.
908
+ if getattr(self, "use_tts", True):
909
+ self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
366
910
 
367
911
  # Restart voice mode if it was active
368
912
  if was_active:
@@ -383,10 +927,13 @@ class VoiceREPL(cmd.Cmd):
383
927
  /setvoice <voice_id> # Set voice (format: language.voice_id)
384
928
 
385
929
  Examples:
386
- /setvoice # List all voices with JSON-like info
387
- /setvoice fr.css10_vits # Set French CSS10 VITS voice
388
- /setvoice it.mai_male_vits # Set Italian male VITS voice
930
+ /setvoice # List all Piper voices
931
+ /setvoice fr.siwis # Switch to French (voice id is best-effort)
389
932
  """
933
+ if not self.voice_manager:
934
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
935
+ return
936
+
390
937
  if not args:
391
938
  # Show all available voices with metadata
392
939
  print(f"\n{Colors.CYAN}Available Voice Models:{Colors.END}")
@@ -398,7 +945,7 @@ class VoiceREPL(cmd.Cmd):
398
945
  # Get language name
399
946
  lang_names = {
400
947
  'en': 'English', 'fr': 'French', 'es': 'Spanish',
401
- 'de': 'German', 'it': 'Italian'
948
+ 'de': 'German', 'ru': 'Russian', 'zh': 'Chinese'
402
949
  }
403
950
  lang_name = lang_names.get(language, language.upper())
404
951
 
@@ -406,24 +953,22 @@ class VoiceREPL(cmd.Cmd):
406
953
 
407
954
  for voice_id, voice_info in voices.items():
408
955
  cached_icon = "✅" if voice_info.get('cached', False) else "📥"
409
- quality_icon = "✨" if voice_info['quality'] == 'excellent' else "🔧"
410
- size_text = f"{voice_info['size_mb']}MB"
956
+ quality_icon = "🔧"
957
+ size_text = f"{voice_info.get('size_mb', 0)}MB"
411
958
 
412
959
  print(f" {cached_icon} {quality_icon} {language}.{voice_id}")
413
960
  print(f" {voice_info['name']} ({size_text})")
414
961
  print(f" {voice_info['description']}")
415
- if voice_info.get('requires_espeak', False):
416
- print(f" ⚠️ Requires espeak-ng")
962
+ # Piper has no system deps.
417
963
 
418
964
  print(f"\n{Colors.YELLOW}Usage:{Colors.END}")
419
965
  print(" /setvoice <language>.<voice_id>")
420
- print(" Example: /setvoice fr.css10_vits")
421
- print("\n📥 = Download needed ✅ = Ready ✨ = High quality 🔧 = Good quality")
966
+ print(" Example: /setvoice fr.siwis")
967
+ print("\n📥 = Download needed ✅ = Ready")
422
968
 
423
969
  except Exception as e:
424
970
  print(f"❌ Error listing models: {e}")
425
- # Fallback to old method
426
- self.voice_manager.list_voices()
971
+ print(" (No fallback available)")
427
972
  return
428
973
 
429
974
  voice_spec = args.strip()
@@ -451,39 +996,28 @@ class VoiceREPL(cmd.Cmd):
451
996
  # Download and set the specific voice using programmatic API
452
997
  try:
453
998
  print(f"🔄 Setting voice {voice_spec}...")
454
-
455
- # Use the programmatic download API
456
- success = self.voice_manager.download_model(voice_spec)
999
+ success = self.voice_manager.set_voice(language, voice_id)
457
1000
 
458
1001
  if success:
459
- # Now set the language to match
460
- success = self.voice_manager.set_language(language)
461
-
462
- if success:
463
- # Update current language
464
- self.current_language = language
465
-
466
- print(f"✅ Voice set to {voice_spec}")
467
-
468
- # Test the voice
469
- test_messages = {
470
- 'en': 'Voice changed to English.',
471
- 'fr': 'Voix changée en français.',
472
- 'es': 'Voz cambiada al español.',
473
- 'de': 'Stimme auf Deutsch geändert.',
474
- 'it': 'Voce cambiata in italiano.'
475
- }
476
- test_msg = test_messages.get(language, f'Voice changed to {language}.')
477
- self.voice_manager.speak(test_msg)
1002
+ self.current_language = language
1003
+ print(f"✅ Voice set to {voice_spec}")
478
1004
 
479
- # Restart voice mode if it was active
480
- if was_active:
481
- self.do_voice(self.voice_mode)
482
- else:
483
- print(f"❌ Failed to set language: {language}")
1005
+ test_messages = {
1006
+ 'en': 'Voice changed to English.',
1007
+ 'fr': 'Voix changée en français.',
1008
+ 'es': 'Voz cambiada al español.',
1009
+ 'de': 'Stimme auf Deutsch geändert.',
1010
+ 'ru': 'Голос изменён на русский.',
1011
+ 'zh': '语音已切换到中文。'
1012
+ }
1013
+ test_msg = test_messages.get(language, f'Voice changed to {language}.')
1014
+ if getattr(self, "use_tts", True):
1015
+ self.voice_manager.speak(test_msg, voice=self.current_tts_voice)
1016
+
1017
+ if was_active:
1018
+ self.do_voice(self.voice_mode)
484
1019
  else:
485
- print(f"❌ Failed to download voice: {voice_spec}")
486
- print(" Check your internet connection or try a different voice")
1020
+ print(f"❌ Failed to set voice: {voice_spec}")
487
1021
 
488
1022
  except Exception as e:
489
1023
  print(f"❌ Error setting voice: {e}")
@@ -521,185 +1055,1732 @@ class VoiceREPL(cmd.Cmd):
521
1055
  off - Disable voice input
522
1056
  full - Continuous listening, interrupts TTS on speech detection
523
1057
  wait - Pause listening while TTS is speaking (recommended)
524
- stop - Only stops TTS on 'stop' keyword (planned)
525
- ptt - Push-to-talk mode (planned)
1058
+ stop - Keep listening while speaking, but only stop TTS on stop phrase
1059
+ ptt - Push-to-talk (use /ptt to record one utterance)
526
1060
  """
527
- arg = arg.lower().strip()
1061
+ arg = (arg or "").lower().strip()
528
1062
 
529
1063
  # Handle legacy "on" argument
530
1064
  if arg == "on":
531
1065
  arg = "wait"
532
1066
 
533
1067
  if arg in ["off", "full", "wait", "stop", "ptt"]:
534
- # If switching from one mode to another, stop current mode first
535
- if self.voice_mode_active and arg != "off":
536
- self._voice_stop_callback()
537
-
1068
+ if not self.voice_manager:
1069
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
1070
+ return
1071
+
1072
+ # Exit PTT session if running.
1073
+ if self._ptt_session_active:
1074
+ self._ptt_session_active = False
1075
+ self._ptt_recording = False
1076
+ self._ptt_busy = False
1077
+
1078
+ # Stop any ongoing mic session.
1079
+ try:
1080
+ self.voice_manager.stop_listening()
1081
+ except Exception:
1082
+ pass
1083
+ self.voice_mode_active = False
1084
+
538
1085
  self.voice_mode = arg
539
1086
  self.voice_manager.set_voice_mode(arg)
540
-
1087
+
541
1088
  if arg == "off":
542
- if self.voice_mode_active:
543
- self._voice_stop_callback()
544
- else:
545
- # Start voice recognition for non-off modes
546
- self.voice_mode_active = True
547
-
548
- # Start listening with callbacks
1089
+ print("Voice mode disabled.")
1090
+ return
1091
+
1092
+ if arg == "ptt":
1093
+ # PTT is a dedicated session: no text entry.
1094
+ print("Voice mode: PTT - Push-to-talk (no typing).")
1095
+ print("SPACE: start/stop recording (transcribe on stop)")
1096
+ print("ESC: exit PTT mode")
1097
+ self._run_ptt_session()
1098
+ return
1099
+
1100
+ # Continuous listening modes.
1101
+ try:
549
1102
  self.voice_manager.listen(
550
1103
  on_transcription=self._voice_callback,
551
- on_stop=lambda: self._voice_stop_callback()
1104
+ # Stop phrase interrupts TTS; keep listening.
1105
+ on_stop=lambda: (
1106
+ print("\n⏹️ Stopped speaking.\n") if (self.voice_manager and self.voice_manager.is_speaking()) else None
1107
+ ),
552
1108
  )
553
-
554
- # Print mode-specific instructions
555
- if arg == "full":
556
- print("Voice mode: FULL - Continuous listening, interrupts TTS on speech.")
557
- print("Say 'stop' to exit.")
558
- elif arg == "wait":
559
- print("Voice mode: WAIT - Pauses listening while speaking (recommended).")
560
- print("Say 'stop' to exit.")
561
- elif arg == "stop":
562
- print("Voice mode: STOP (Planned) - Only stops TTS on 'stop' keyword.")
563
- print("Currently same as WAIT mode.")
564
- elif arg == "ptt":
565
- print("Voice mode: PTT (Planned) - Push-to-talk functionality.")
566
- print("Currently same as WAIT mode.")
1109
+ self.voice_mode_active = True
1110
+ except Exception as e:
1111
+ self.voice_mode_active = False
1112
+ self.voice_mode = "off"
1113
+ print(f" Failed to start microphone listening: {e}")
1114
+ print(" Tip: check microphone permissions/device availability.")
1115
+ return
1116
+
1117
+ if arg == "wait":
1118
+ print("Voice mode: WAIT - Listens continuously except while speaking.")
1119
+ print("Use /voice off to disable.")
1120
+ elif arg == "stop":
1121
+ print("Voice mode: STOP - Always listens; stop phrase stops TTS.")
1122
+ print("Use /voice off to disable.")
1123
+ elif arg == "full":
1124
+ print("Voice mode: FULL - Interrupts TTS on any speech (best with AEC/headset).")
1125
+ print("Use /voice off to disable.")
567
1126
  else:
568
1127
  print("Usage: /voice off | full | wait | stop | ptt")
569
1128
  print(" off - Disable voice input")
570
1129
  print(" full - Continuous listening, interrupts TTS on speech")
571
- print(" wait - Pause listening while speaking (recommended)")
572
- print(" stop - Only stop TTS on 'stop' keyword (planned)")
573
- print(" ptt - Push-to-talk mode (planned)")
574
-
575
- def _voice_callback(self, text):
576
- """Callback for voice recognition."""
577
- # Print what the user said
578
- print(f"\n> {text}")
579
-
580
- # Check if the user said 'stop' to exit voice mode
581
- if text.lower() == "stop":
582
- self._voice_stop_callback()
583
- # Don't process "stop" as a query
1130
+ print(" wait - Listen except while speaking")
1131
+ print(" stop - Always listen; stop phrase stops TTS")
1132
+ print(" ptt - Push-to-talk (no typing; SPACE triggers capture)")
1133
+
1134
+ def do_ptt(self, arg):
1135
+ """Push-to-talk: record a single utterance, then process it.
1136
+
1137
+ Usage:
1138
+ /ptt
1139
+ """
1140
+ if not self.voice_manager:
1141
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
584
1142
  return
585
-
586
- # Mode-specific handling
587
- if self.voice_mode == "stop":
588
- # In 'stop' mode, don't interrupt TTS - just queue the message
589
- # But since we're in callback, TTS interrupt is already paused
590
- pass
591
- elif self.voice_mode == "ptt":
592
- # In PTT mode, process immediately
1143
+ print("❌ /ptt is deprecated. Use: /voice ptt (then SPACE)")
1144
+ return
1145
+
1146
+ # Ensure we are not already listening.
1147
+ try:
1148
+ self.voice_manager.stop_listening()
1149
+ except Exception:
593
1150
  pass
594
- # 'full' mode has default behavior
595
-
596
- # Process the user's query
597
- self.process_query(text)
598
-
599
- def _voice_stop_callback(self):
600
- """Callback when voice mode is stopped."""
601
- self.voice_mode = "off"
602
- self.voice_mode_active = False
603
- self.voice_manager.stop_listening()
604
- print("Voice mode disabled.")
605
-
606
- def do_tts(self, arg):
607
- """Toggle text-to-speech."""
608
- arg = arg.lower().strip()
609
-
610
- if arg == "on":
611
- self.use_tts = True
612
- print("TTS enabled" if self.debug_mode else "")
613
- elif arg == "off":
614
- self.use_tts = False
615
- print("TTS disabled" if self.debug_mode else "")
616
- else:
617
- print("Usage: /tts on | off")
618
-
619
- def do_speed(self, arg):
620
- """Set the TTS speed multiplier."""
621
- if not arg.strip():
622
- print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
1151
+
1152
+ return
1153
+
1154
+ def _run_ptt_session(self) -> None:
1155
+ """PTT mode key loop (no typing).
1156
+
1157
+ Clean semantics:
1158
+ - SPACE toggles recording (start/stop)
1159
+ - on stop: transcribe immediately and send to the LLM
1160
+ - ESC exits PTT mode (returns to STOP mode)
1161
+
1162
+ This avoids relying on VAD end-of-utterance, which is fragile when speaker
1163
+ echo is present (common on laptop speakers).
1164
+ """
1165
+ if not self.voice_manager:
623
1166
  return
624
-
1167
+ self._ptt_session_active = True
1168
+ self._ptt_recording = False
1169
+ self._ptt_busy = False
1170
+
1171
+ # Lazy imports: keep REPL startup snappy.
1172
+ import io
1173
+ import wave
1174
+
625
1175
  try:
626
- speed = float(arg.strip())
1176
+ import sounddevice as sd
1177
+ except Exception as e:
1178
+ print(f"❌ PTT requires sounddevice: {e}")
1179
+ self._ptt_session_active = False
1180
+ return
1181
+
1182
+ sr = 16000
1183
+ frames: list[bytes] = []
1184
+ stream = {"obj": None}
1185
+ cols = 80
1186
+ try:
1187
+ cols = int(shutil.get_terminal_size((80, 20)).columns)
1188
+ except Exception:
1189
+ cols = 80
1190
+
1191
+ def _clear_status() -> None:
1192
+ try:
1193
+ sys.stdout.write("\r" + (" " * max(10, cols - 1)) + "\r")
1194
+ sys.stdout.flush()
1195
+ except Exception:
1196
+ pass
1197
+
1198
+ def _status_line(msg: str) -> None:
1199
+ # Render on a single line (no newline) so SPACE can be pressed repeatedly.
1200
+ try:
1201
+ _clear_status()
1202
+ sys.stdout.write(str(msg)[: max(0, cols - 1)])
1203
+ sys.stdout.flush()
1204
+ except Exception:
1205
+ pass
1206
+
1207
+ def _println(msg: str = "") -> None:
1208
+ # When in raw terminal mode, '\n' does NOT reliably return to column 0.
1209
+ # Use CRLF explicitly to prevent "diagonal drifting" rendering.
1210
+ try:
1211
+ _clear_status()
1212
+ sys.stdout.write("\r\n" + str(msg) + "\r\n")
1213
+ sys.stdout.flush()
1214
+ except Exception:
1215
+ pass
1216
+
1217
+ def _start_recording() -> None:
1218
+ nonlocal frames
1219
+ if self._ptt_recording:
1220
+ return
1221
+ if self._ptt_busy:
1222
+ return
1223
+ frames = []
1224
+
1225
+ # Interrupt any speech immediately.
1226
+ try:
1227
+ self.voice_manager.stop_speaking()
1228
+ except Exception:
1229
+ pass
1230
+
1231
+ def _cb(indata, _frames, _time, status):
1232
+ if status and self.debug_mode:
1233
+ pass
1234
+ try:
1235
+ frames.append(indata.copy().tobytes())
1236
+ except Exception:
1237
+ pass
1238
+
1239
+ try:
1240
+ stream["obj"] = sd.InputStream(
1241
+ samplerate=sr,
1242
+ channels=1,
1243
+ dtype="int16",
1244
+ callback=_cb,
1245
+ blocksize=int(sr * 0.03),
1246
+ )
1247
+ stream["obj"].start()
1248
+ self._ptt_recording = True
1249
+ _status_line("🎙️ Recording… (SPACE to send, ESC to exit)")
1250
+ except Exception as e:
1251
+ self._ptt_recording = False
1252
+ stream["obj"] = None
1253
+ _clear_status()
1254
+ _println(f"❌ Failed to start microphone stream: {e}")
1255
+
1256
+ def _stop_recording_and_send() -> None:
1257
+ if not self._ptt_recording:
1258
+ return
1259
+ self._ptt_recording = False
1260
+ _clear_status()
1261
+
1262
+ try:
1263
+ if stream["obj"] is not None:
1264
+ try:
1265
+ stream["obj"].stop()
1266
+ except Exception:
1267
+ pass
1268
+ try:
1269
+ stream["obj"].close()
1270
+ except Exception:
1271
+ pass
1272
+ finally:
1273
+ stream["obj"] = None
1274
+
1275
+ pcm = b"".join(frames)
1276
+ if len(pcm) < int(sr * 0.25) * 2:
1277
+ _println("…(too short, try again)")
1278
+ return
1279
+
1280
+ buf = io.BytesIO()
1281
+ with wave.open(buf, "wb") as w:
1282
+ w.setnchannels(1)
1283
+ w.setsampwidth(2)
1284
+ w.setframerate(sr)
1285
+ w.writeframes(pcm)
1286
+ wav_bytes = buf.getvalue()
1287
+
1288
+ self._ptt_busy = True
1289
+ try:
1290
+ audio_s = 0.0
1291
+ try:
1292
+ if sr and sr > 0:
1293
+ audio_s = float(len(pcm)) / float(int(sr) * 2)
1294
+ except Exception:
1295
+ audio_s = 0.0
1296
+
1297
+ t0 = time.monotonic()
1298
+ text = (self.voice_manager.transcribe_from_bytes(wav_bytes, language=self.current_language) or "").strip()
1299
+ t1 = time.monotonic()
1300
+ stt_s = float(t1 - t0)
1301
+ self._pending_stt_metrics = {
1302
+ "stt_s": stt_s,
1303
+ "audio_s": float(audio_s),
1304
+ "rtf": (stt_s / float(audio_s)) if audio_s else None,
1305
+ "sample_rate": int(sr),
1306
+ "chunks": None,
1307
+ "chunk_ms": None,
1308
+ "profile": "ptt",
1309
+ "ts": time.time(),
1310
+ }
1311
+ except Exception as e:
1312
+ self._ptt_busy = False
1313
+ _println(f"❌ Transcription failed: {e}")
1314
+ return
1315
+ self._ptt_busy = False
1316
+
1317
+ if not text:
1318
+ _println("…(no transcription)")
1319
+ return
1320
+
1321
+ _println(f"> {text}")
1322
+ self.process_query(text)
1323
+
1324
+ # Platform key read.
1325
+ import sys
1326
+ if sys.platform == "win32":
1327
+ import msvcrt
1328
+
1329
+ while self._ptt_session_active:
1330
+ ch = msvcrt.getwch()
1331
+ if ch == "\x1b": # ESC
1332
+ break
1333
+ if self._ptt_busy:
1334
+ continue
1335
+ if ch == " ":
1336
+ if not self._ptt_recording:
1337
+ _start_recording()
1338
+ else:
1339
+ _stop_recording_and_send()
1340
+ else:
1341
+ import termios
1342
+ import tty
1343
+
1344
+ fd = sys.stdin.fileno()
1345
+ old = termios.tcgetattr(fd)
1346
+ try:
1347
+ tty.setraw(fd)
1348
+
1349
+ def _run_in_cooked(block):
1350
+ """Run a block with normal tty settings.
1351
+
1352
+ In raw mode, many terminals treat '\n' as LF without CR, so prints from
1353
+ deeper code paths (LLM responses) can drift/indent. We temporarily
1354
+ restore the terminal mode to keep output rendering stable.
1355
+ """
1356
+ try:
1357
+ termios.tcsetattr(fd, termios.TCSADRAIN, old)
1358
+ except Exception:
1359
+ pass
1360
+ try:
1361
+ block()
1362
+ finally:
1363
+ try:
1364
+ tty.setraw(fd)
1365
+ except Exception:
1366
+ pass
1367
+
1368
+ while self._ptt_session_active:
1369
+ ch = sys.stdin.read(1)
1370
+ if ch == "\x1b": # ESC
1371
+ break
1372
+ if self._ptt_busy:
1373
+ continue
1374
+ if ch == " ":
1375
+ if not self._ptt_recording:
1376
+ _start_recording()
1377
+ else:
1378
+ _run_in_cooked(_stop_recording_and_send)
1379
+ finally:
1380
+ termios.tcsetattr(fd, termios.TCSADRAIN, old)
1381
+
1382
+ self._ptt_session_active = False
1383
+ self._ptt_recording = False
1384
+ self._ptt_busy = False
1385
+ try:
1386
+ if stream["obj"] is not None:
1387
+ stream["obj"].stop()
1388
+ stream["obj"].close()
1389
+ except Exception:
1390
+ pass
1391
+ _clear_status()
1392
+ # Ensure we end on a clean line before restoring other modes.
1393
+ try:
1394
+ sys.stdout.write("\r\n")
1395
+ sys.stdout.flush()
1396
+ except Exception:
1397
+ pass
1398
+ # Restore to STOP after exiting PTT.
1399
+ try:
1400
+ self.do_voice("stop")
1401
+ except Exception:
1402
+ pass
1403
+
1404
+ def _voice_callback(self, text):
1405
+ """Callback for voice recognition."""
1406
+ # Capture best-effort STT metrics from the recognizer (for verbose stats).
1407
+ stt_metrics = None
1408
+ try:
1409
+ vm = self.voice_manager
1410
+ rec = getattr(vm, "voice_recognizer", None) if vm else None
1411
+ if rec is not None and hasattr(rec, "pop_last_stt_metrics"):
1412
+ stt_metrics = rec.pop_last_stt_metrics()
1413
+ except Exception:
1414
+ stt_metrics = None
1415
+ self._pending_stt_metrics = stt_metrics
1416
+
1417
+ # Print what the user said
1418
+ print(f"\n> {text}")
1419
+ # NOTE: stop phrases are handled by the stop_callback path (interrupt TTS).
1420
+ # We do not use "stop" to exit voice mode; use /voice off explicitly.
1421
+
1422
+ # Mode-specific handling
1423
+ if self.voice_mode == "stop":
1424
+ # In 'stop' mode, don't interrupt TTS - just queue the message
1425
+ # But since we're in callback, TTS interrupt is already paused
1426
+ pass
1427
+ elif self.voice_mode == "ptt":
1428
+ # In PTT mode, process immediately
1429
+ pass
1430
+ # 'full' mode has default behavior
1431
+
1432
+ # Process the user's query
1433
+ self.process_query(text)
1434
+
1435
+ def _voice_stop_callback(self):
1436
+ """Callback when voice mode is stopped."""
1437
+ self.voice_mode = "off"
1438
+ self.voice_mode_active = False
1439
+ self.voice_manager.stop_listening()
1440
+ print("Voice mode disabled.")
1441
+
1442
+ def do_tts(self, arg):
1443
+ """Toggle text-to-speech."""
1444
+ arg = arg.lower().strip()
1445
+
1446
+ if arg == "on":
1447
+ self.use_tts = True
1448
+ if self.voice_manager is None:
1449
+ # Re-enable voice features (TTS/STT) by creating a VoiceManager.
1450
+ self.voice_manager = VoiceManager(
1451
+ language=self.current_language,
1452
+ tts_model=self._initial_tts_model,
1453
+ debug_mode=self.debug_mode,
1454
+ allow_downloads=False,
1455
+ cloned_tts_streaming=False,
1456
+ cloning_engine=self.cloning_engine,
1457
+ )
1458
+ print("TTS enabled" if self.debug_mode else "")
1459
+ elif arg == "off":
1460
+ self.use_tts = False
1461
+ print("TTS disabled" if self.debug_mode else "")
1462
+ else:
1463
+ print("Usage: /tts on | off")
1464
+
1465
+ def do_speed(self, arg):
1466
+ """Set the TTS speed multiplier."""
1467
+ if not self.voice_manager:
1468
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1469
+ return
1470
+ if not arg.strip():
1471
+ print(f"Current TTS speed: {self.voice_manager.get_speed()}x")
1472
+ return
1473
+
1474
+ try:
1475
+ speed = float(arg.strip())
627
1476
  if 0.5 <= speed <= 2.0:
628
1477
  self.voice_manager.set_speed(speed)
629
1478
  print(f"TTS speed set to {speed}x")
630
1479
  else:
631
- print("Speed should be between 0.5 and 2.0")
632
- except ValueError:
633
- print("Usage: /speed <number> (e.g., /speed 1.5)")
634
-
635
- def do_tts_model(self, arg):
636
- """Change TTS model.
637
-
638
- Available models (quality ranking):
639
- vits - BEST quality (requires espeak-ng)
640
- fast_pitch - Good quality (works everywhere)
641
- glow-tts - Alternative fallback
642
- tacotron2-DDC - Legacy
643
-
1480
+ print("Speed should be between 0.5 and 2.0")
1481
+ except ValueError:
1482
+ print("Usage: /speed <number> (e.g., /speed 1.5)")
1483
+
1484
+ def do_tts_model(self, arg):
1485
+ """Deprecated: legacy TTS model switching.
1486
+
1487
+ AbstractVoice core is Piper-first; use `/setvoice` (Piper voices) or cloned voices.
1488
+ """
1489
+ print("❌ /tts_model is not supported (Piper-first core).")
1490
+ print(" Use /setvoice for Piper voices, or /tts_voice clone <id> for cloned voices.")
1491
+
1492
+ def do_whisper(self, arg):
1493
+ """Change Whisper model."""
1494
+ if not self.voice_manager:
1495
+ print("🔇 Voice features are disabled. Use '/tts on' to enable.")
1496
+ return
1497
+ model = arg.strip()
1498
+ if not model:
1499
+ print(f"Current Whisper model: {self.voice_manager.get_whisper()}")
1500
+ return
1501
+
1502
+ self.voice_manager.set_whisper(model)
1503
+
1504
+ def do_speak(self, arg):
1505
+ """Speak a text immediately (without calling the LLM).
1506
+
1507
+ Usage:
1508
+ /speak Hello world
1509
+ """
1510
+ if not self.voice_manager:
1511
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1512
+ return
1513
+
1514
+ text = arg.strip()
1515
+ if not text:
1516
+ print("Usage: /speak <text>")
1517
+ return
1518
+
1519
+ try:
1520
+ self._speak_with_spinner_until_audio_starts(text)
1521
+ if self.verbose_mode:
1522
+ out_words = self._count_words(text)
1523
+ out_tokens = None
1524
+ try:
1525
+ enc = self._get_tiktoken_encoding()
1526
+ if enc is not None:
1527
+ out_tokens = int(len(enc.encode(str(text or ""))))
1528
+ except Exception:
1529
+ out_tokens = None
1530
+
1531
+ tts_metrics = None
1532
+ try:
1533
+ if hasattr(self.voice_manager, "pop_last_tts_metrics"):
1534
+ tts_metrics = self.voice_manager.pop_last_tts_metrics()
1535
+ except Exception:
1536
+ tts_metrics = None
1537
+
1538
+ turn = {
1539
+ "stt": None,
1540
+ "llm": {},
1541
+ "counts": {
1542
+ "in_words": 0,
1543
+ "out_words": int(out_words),
1544
+ "in_tokens": None,
1545
+ "out_tokens": out_tokens,
1546
+ },
1547
+ "tts": tts_metrics,
1548
+ }
1549
+ self._print_verbose_turn_stats(turn)
1550
+ except Exception as e:
1551
+ print(f"❌ Speak failed: {e}")
1552
+ if self.debug_mode:
1553
+ import traceback
1554
+ traceback.print_exc()
1555
+
1556
+ def _speak_with_spinner_until_audio_starts(self, text: str) -> None:
1557
+ """REPL UX: show spinner while waiting for first audio, then stop.
1558
+
1559
+ This avoids corrupting the `cmd` prompt while still giving feedback during
1560
+ long cloned-TTS synthesis. Once playback starts, the prompt is displayed
1561
+ normally so the user can interrupt anytime by typing.
1562
+ """
1563
+ if not self.voice_manager:
1564
+ return
1565
+
1566
+ is_clone = bool(self.current_tts_voice)
1567
+ if not is_clone:
1568
+ # Offline-first: Piper voices must be explicitly cached. Provide a clear
1569
+ # message instead of hanging on implicit downloads.
1570
+ try:
1571
+ a = getattr(self.voice_manager, "tts_adapter", None)
1572
+ if a is not None and hasattr(a, "is_available") and not bool(a.is_available()):
1573
+ lang = str(getattr(self, "current_language", "en") or "en").strip().lower()
1574
+ raise RuntimeError(
1575
+ f"Piper voice model for '{lang}' is not available locally.\n"
1576
+ f"Run: python -m abstractvoice download --piper {lang}"
1577
+ )
1578
+ except RuntimeError:
1579
+ raise
1580
+ except Exception:
1581
+ pass
1582
+ ind = self._busy_indicator(enabled=is_clone)
1583
+ try:
1584
+ if is_clone:
1585
+ ind.start()
1586
+ self.voice_manager.speak(text, voice=self.current_tts_voice)
1587
+
1588
+ if not is_clone:
1589
+ return
1590
+
1591
+ # Wait until audio playback actually starts (or synthesis ends without audio).
1592
+ vm = self.voice_manager
1593
+ while True:
1594
+ try:
1595
+ playing = bool(vm.is_speaking())
1596
+ synth_active = bool(
1597
+ getattr(vm, "_cloned_synthesis_active", None) and vm._cloned_synthesis_active.is_set()
1598
+ )
1599
+ except Exception:
1600
+ playing, synth_active = False, False
1601
+
1602
+ if playing:
1603
+ break
1604
+
1605
+ # If synthesis is no longer active and we aren't playing, stop the spinner
1606
+ # (either done very quickly or failed).
1607
+ if not synth_active:
1608
+ break
1609
+
1610
+ time.sleep(0.05)
1611
+ finally:
1612
+ try:
1613
+ ind.stop()
1614
+ except Exception:
1615
+ pass
1616
+ # If ASR auto-generated the clone's reference_text, print an easy override command
1617
+ # (once). We do this after stopping the spinner to avoid corrupting the prompt line.
1618
+ try:
1619
+ if is_clone and self.current_tts_voice:
1620
+ self._maybe_print_asr_ref_text_override(self.current_tts_voice)
1621
+ except Exception:
1622
+ pass
1623
+ # Do not print the prompt manually: `cmd` will render it on return,
1624
+ # and printing here can result in duplicate prompts (`> >`).
1625
+
1626
+ def _maybe_print_asr_ref_text_override(self, voice_id: str) -> None:
1627
+ """If `reference_text` was auto-generated via ASR, print a paste-ready override hint.
1628
+
1629
+ Important: `/clone_set_ref_text` uses a simple `split(maxsplit=1)`, so quoting is not
1630
+ interpreted. We therefore print the command *without* quotes to avoid storing them.
1631
+ """
1632
+ if not self.voice_manager:
1633
+ return
1634
+ vid = str(voice_id or "").strip()
1635
+ if not vid:
1636
+ return
1637
+ if vid in self._printed_asr_ref_text_hint:
1638
+ return
1639
+ try:
1640
+ info = self.voice_manager.get_cloned_voice(vid) or {}
1641
+ except Exception:
1642
+ return
1643
+ meta = info.get("meta") or {}
1644
+ src = str(meta.get("reference_text_source") or "").strip().lower()
1645
+ ref_text = str(info.get("reference_text") or "").strip()
1646
+ if not ref_text:
1647
+ return
1648
+ if src != "asr":
1649
+ return
1650
+
1651
+ # Mark first so any printing errors won't cause repeated spam.
1652
+ self._printed_asr_ref_text_hint.add(vid)
1653
+
1654
+ prefix = vid[:8] if len(vid) >= 8 else vid
1655
+ name = str(info.get("name") or "").strip()
1656
+ label = f"{name} ({prefix})" if name else prefix
1657
+ print("ℹ️ Auto-generated reference transcript (ASR).")
1658
+ print(f" Voice: {label}")
1659
+ print(" If you want to correct it, copy/paste and edit the text after the id:")
1660
+ print(f" /clone_set_ref_text {prefix} {ref_text}")
1661
+
1662
+ class _busy_indicator:
1663
+ """A minimal, discreet spinner (no extra lines)."""
1664
+
1665
+ def __init__(self, enabled: bool = False):
1666
+ self.enabled = bool(enabled)
1667
+ self._stop = threading.Event()
1668
+ self._thread = None
1669
+
1670
+ def start(self):
1671
+ if not self.enabled:
1672
+ return
1673
+ if self._thread and self._thread.is_alive():
1674
+ return
1675
+
1676
+ def _run():
1677
+ frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
1678
+ i = 0
1679
+ t0 = time.time()
1680
+ # Small delay so fast operations don't flash.
1681
+ time.sleep(0.25)
1682
+ if self._stop.is_set():
1683
+ return
1684
+ # Hide cursor for a cleaner look.
1685
+ try:
1686
+ sys.stdout.write("\033[?25l")
1687
+ sys.stdout.flush()
1688
+ except Exception:
1689
+ pass
1690
+ while not self._stop.is_set():
1691
+ elapsed = time.time() - t0
1692
+ sys.stdout.write(f"\r(synthesizing {elapsed:0.1f}s) {frames[i % len(frames)]}")
1693
+ sys.stdout.flush()
1694
+ i += 1
1695
+ time.sleep(0.1)
1696
+
1697
+ self._thread = threading.Thread(target=_run, daemon=True)
1698
+ self._thread.start()
1699
+
1700
+ def stop(self):
1701
+ if not self.enabled:
1702
+ return
1703
+ self._stop.set()
1704
+ try:
1705
+ if self._thread:
1706
+ self._thread.join(timeout=0.5)
1707
+ except Exception:
1708
+ pass
1709
+ # Clear spinner line.
1710
+ try:
1711
+ # `\033[2K` clears the entire line (more robust than fixed spaces).
1712
+ sys.stdout.write("\r\033[2K\r")
1713
+ # Restore cursor.
1714
+ sys.stdout.write("\033[?25h")
1715
+ sys.stdout.flush()
1716
+ except Exception:
1717
+ pass
1718
+
1719
+ def __enter__(self):
1720
+ self.start()
1721
+ return self
1722
+
1723
+ def __exit__(self, exc_type, exc, tb):
1724
+ self.stop()
1725
+ return False
1726
+
1727
+ # NOTE: We intentionally do not keep a background spinner running while the REPL
1728
+ # is waiting for user input (it corrupts the prompt line). Instead, we show a
1729
+ # spinner only until the first audio actually starts, then stop it so the prompt
1730
+ # stays usable for interruption-by-typing.
1731
+
1732
+ def do_clones(self, arg):
1733
+ """List cloned voices in the local store."""
1734
+ if not self.voice_manager:
1735
+ print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
1736
+ return
1737
+ try:
1738
+ voices = self.voice_manager.list_cloned_voices()
1739
+ if not voices:
1740
+ print("No cloned voices yet. Use /clone <path> or /clone-my-voice.")
1741
+ return
1742
+ print(f"\n{Colors.CYAN}Cloned voices:{Colors.END}")
1743
+ for v in voices:
1744
+ vid = v.get("voice_id") or v.get("voice", "")
1745
+ name = v.get("name", "")
1746
+ eng = (v.get("engine") or "").strip()
1747
+ eng_txt = f" [{eng}]" if eng else ""
1748
+ src = (v.get("meta") or {}).get("reference_text_source", "")
1749
+ src_txt = f" [{src}]" if src else ""
1750
+ current = " (current)" if self.current_tts_voice == vid else ""
1751
+ print(f" - {name}: {vid}{eng_txt}{src_txt}{current}")
1752
+ print("Tip: /clone_rm <id-or-name> deletes one; /clone_rm_all --yes deletes all.")
1753
+ except Exception as e:
1754
+ print(f"❌ Error listing cloned voices: {e}")
1755
+
1756
+ def _resolve_clone_id(self, wanted: str) -> str | None:
1757
+ voices = self.voice_manager.list_cloned_voices()
1758
+ for v in voices:
1759
+ vid = v.get("voice_id") or ""
1760
+ name = v.get("name") or ""
1761
+ if wanted == vid or vid.startswith(wanted) or wanted == name:
1762
+ return vid
1763
+ return None
1764
+
1765
+ def _resolve_clone_id_by_source(self, source: str, *, engine: str | None = None) -> str | None:
1766
+ """Find a cloned voice by its stored meta.source (best-effort)."""
1767
+ if not self.voice_manager:
1768
+ return None
1769
+
1770
+ try:
1771
+ from pathlib import Path
1772
+
1773
+ target = Path(str(source)).expanduser()
1774
+ try:
1775
+ target_norm = str(target.resolve())
1776
+ except Exception:
1777
+ target_norm = str(target)
1778
+ except Exception:
1779
+ target_norm = str(source)
1780
+
1781
+ try:
1782
+ voices = self.voice_manager.list_cloned_voices()
1783
+ except Exception:
1784
+ return None
1785
+
1786
+ wanted_engine = (str(engine).strip().lower() if engine else None) or None
1787
+ for v in voices:
1788
+ meta = v.get("meta") or {}
1789
+ src = meta.get("source")
1790
+ if not src:
1791
+ continue
1792
+ try:
1793
+ from pathlib import Path
1794
+
1795
+ p = Path(str(src)).expanduser()
1796
+ try:
1797
+ src_norm = str(p.resolve())
1798
+ except Exception:
1799
+ src_norm = str(p)
1800
+ except Exception:
1801
+ src_norm = str(src)
1802
+
1803
+ if src_norm != target_norm:
1804
+ continue
1805
+ if wanted_engine and (str(v.get("engine") or "").strip().lower() != wanted_engine):
1806
+ continue
1807
+ return str(v.get("voice_id") or "").strip() or None
1808
+ return None
1809
+
1810
def do_clone_info(self, arg):
    """Print a summary of one cloned voice.

    Usage:
      /clone_info <id-or-name>

    Shows id, name, engine, number of reference files, the reference-text
    source, and the reference text itself (truncated to 200 characters).
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    query = arg.strip()
    if not query:
        print("Usage: /clone_info <id-or-name>")
        return

    voice_id = self._resolve_clone_id(query)
    if not voice_id:
        print(f"❌ Unknown cloned voice: {query}. Use /clones to list.")
        return

    try:
        record = self.voice_manager.get_cloned_voice(voice_id)
        details = record.get("meta") or {}
        print(f"\n{Colors.CYAN}Cloned voice info:{Colors.END}")
        print(f" id: {record.get('voice_id')}")
        print(f" name: {record.get('name')}")
        print(f" engine: {record.get('engine')}")
        print(f" refs: {len(record.get('reference_files') or [])}")
        print(f" ref_text_source: {details.get('reference_text_source','')}")
        transcript = (record.get('reference_text') or '').strip()
        if not transcript:
            print(" reference_text: (missing)")
        else:
            # Keep the summary compact; /clone_ref prints the full text.
            if len(transcript) > 200:
                preview = transcript[:200] + "…"
            else:
                preview = transcript
            print(f" reference_text: {preview}")
    except Exception as e:
        print(f"❌ Error: {e}")
1844
+
1845
def do_clone_ref(self, arg):
    """Print the full reference_text for a cloned voice.

    Usage:
      /clone_ref <id-or-name>

    Lookup failures are printed instead of raised, in the same way
    ``/clone_info`` reports them, so a store error cannot escape the handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_ref <id-or-name>")
        return
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    try:
        # Tolerate a missing record (None) as well as a raising store.
        info = self.voice_manager.get_cloned_voice(vid) or {}
        print((info.get("reference_text") or "").strip())
    except Exception as e:
        print(f"❌ Error: {e}")
1864
+
1865
def do_clone_rename(self, arg):
    """Rename a cloned voice.

    Usage:
      /clone_rename <id-or-name> <new_name>

    The first whitespace-delimited token selects the voice; everything after it
    becomes the new name. A failure raised by the voice manager is printed
    rather than propagated out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_rename <id-or-name> <new_name>")
        return
    vid = self._resolve_clone_id(parts[0])
    if not vid:
        print(f"❌ Unknown cloned voice: {parts[0]}. Use /clones to list.")
        return
    try:
        self.voice_manager.rename_cloned_voice(vid, parts[1])
    except Exception as e:
        print(f"❌ Rename failed: {e}")
        return
    print("✅ Renamed.")
1884
+
1885
def do_clone_rm(self, arg):
    """Remove a cloned voice from the store.

    Usage:
      /clone_rm <id-or-name>

    If the voice is the currently selected TTS voice, the selection is cleared
    first (falling back to Piper). A deletion error is printed rather than
    propagated out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    wanted = arg.strip()
    if not wanted:
        print("Usage: /clone_rm <id-or-name>")
        return
    vid = self._resolve_clone_id(wanted)
    if not vid:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return
    # If currently selected, switch back to Piper.
    if self.current_tts_voice == vid:
        self.current_tts_voice = None
    try:
        self.voice_manager.delete_cloned_voice(vid)
    except Exception as e:
        print(f"❌ Delete failed: {e}")
        return
    print("✅ Deleted.")
1907
+
1908
def do_clone_rm_all(self, arg):
    """Delete every cloned voice in the local store.

    Usage:
      /clone_rm_all --yes

    Without an explicit confirmation (``--yes``, ``-y`` or ``yes``) the command
    only reports how many voices would be removed. Individual deletion failures
    are counted and reported, not raised.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    answer = (arg or "").strip().lower()
    if answer not in ("--yes", "-y", "yes"):
        try:
            total = len(self.voice_manager.list_cloned_voices() or [])
        except Exception:
            total = 0
        if total <= 0:
            print("No cloned voices to delete.")
        else:
            print(f"⚠️ This will permanently delete {total} cloned voice(s).")
            print("Re-run with: /clone_rm_all --yes")
        return

    # Whatever was selected is about to disappear; fall back to Piper.
    self.current_tts_voice = None

    try:
        entries = list(self.voice_manager.list_cloned_voices() or [])
    except Exception as e:
        print(f"❌ Error listing cloned voices: {e}")
        return

    removed_count = 0
    error_count = 0
    # Lazy generator: each id is computed right before its deletion attempt.
    voice_ids = (str(item.get("voice_id") or item.get("voice") or "").strip() for item in entries)
    for voice_id in voice_ids:
        if not voice_id:
            continue
        try:
            self.voice_manager.delete_cloned_voice(voice_id)
        except Exception:
            error_count += 1
        else:
            removed_count += 1

    summary = f"✅ Deleted {removed_count} cloned voice(s)."
    if error_count:
        summary = f"✅ Deleted {removed_count} cloned voice(s). ⚠️ Failed: {error_count}"
    print(summary)
1956
+
1957
def do_clone_export(self, arg):
    """Export a cloned voice bundle (.zip).

    Usage:
      /clone_export <id-or-name> <path.zip>

    The first token selects the voice; the remainder is the output path. An
    export error is printed rather than propagated out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_export <id-or-name> <path.zip>")
        return
    vid = self._resolve_clone_id(parts[0])
    if not vid:
        print(f"❌ Unknown cloned voice: {parts[0]}. Use /clones to list.")
        return
    try:
        out = self.voice_manager.export_voice(vid, parts[1])
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return
    print(f"✅ Exported: {out}")
1976
+
1977
def do_clone_import(self, arg):
    """Import a cloned voice bundle (.zip).

    Usage:
      /clone_import <path.zip>

    An import error (missing file, bad archive, ...) is printed rather than
    propagated out of the command handler.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return
    path = arg.strip()
    if not path:
        print("Usage: /clone_import <path.zip>")
        return
    try:
        vid = self.voice_manager.import_voice(path)
    except Exception as e:
        print(f"❌ Import failed: {e}")
        return
    print(f"✅ Imported as: {vid}")
1992
+
1993
def do_clone(self, arg):
    """Create a cloned voice from a reference audio file or folder.

    Usage:
      /clone <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    The first positional token is the source path and the optional second one
    is the voice name. The new voice is not selected automatically.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    usage = "Usage: /clone <path> [name] [--engine f5_tts|chroma] [--text \"...\"]"

    try:
        tokens = shlex.split(arg.strip())
    except ValueError as e:
        print(f"{usage} (parse error: {e})")
        return

    if not tokens:
        print(usage)
        return

    engine = None
    reference_text = None
    positional = []
    text_flags = ("--text", "--reference-text", "--reference_text")
    stream = iter(tokens)
    for tok in stream:
        if tok == "--engine" or tok in text_flags:
            # shlex never yields None, so None safely marks "flag without value".
            value = next(stream, None)
            if value is None:
                print(usage)
                return
            if tok == "--engine":
                engine = value
            else:
                reference_text = value
        else:
            positional.append(tok)

    if not positional:
        print(usage)
        return

    path, name = positional[0], (positional[1] if len(positional) > 1 else None)
    try:
        started = time.monotonic()
        voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine)
        finished = time.monotonic()

        # Best-effort read-back of what the store recorded for the new voice.
        try:
            stored = self.voice_manager.get_cloned_voice(voice_id) or {}
            eng = str(stored.get("engine") or "").strip()
            ref_src = str((stored.get("meta") or {}).get("reference_text_source") or "").strip()
        except Exception:
            eng, ref_src = "", ""

        engine_note = f" (engine: {eng})" if eng else ""
        print(f"✅ Cloned voice created: {voice_id}{engine_note}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
        print(" Tip: set reference text for best quality:")
        print(" /clone_set_ref_text <id-or-name> \"...\"")
        if not self._is_cloning_runtime_ready(voice_id=voice_id):
            print(" (Cloning runtime not ready yet; run /cloning_status and /cloning_download first.)")

        effective_engine = str(eng or (engine or self.cloning_engine) or "").strip().lower()
        has_transcript = bool((reference_text or "").strip())
        if effective_engine == "chroma" and not has_transcript:
            print("ℹ️ No reference transcript provided.")
            print(" We will auto-generate it via STT on first speak (offline-first: requires cached STT model).")
            print(" Optional (often best quality): /clone_set_ref_text <id-or-name> \"...\" (or re-run /clone ... --text \"...\")")

        if self.verbose_mode:
            n_files, ref_audio_s = self._summarize_audio_source(path)
            files_txt = str(n_files) if isinstance(n_files, int) else "--"
            source_txt = ref_src or ("manual" if (reference_text or "").strip() else "--")
            msg = f"CLONE {eng or (engine or self.cloning_engine)} | refs {files_txt} a{self._fmt_s(ref_audio_s)} | ref_text {source_txt} | {self._fmt_s(float(finished - started))}"
            print(f"{Colors.YELLOW}{msg}{Colors.END}")
    except Exception as e:
        print(f"❌ Clone failed: {e}")
2077
+
2078
def do_clone_use(self, arg):
    """Clone a voice (or reuse an existing one) and immediately select it.

    Usage:
      /clone_use <path> [name] [--engine f5_tts|chroma] [--text "reference transcript"]

    Shortcut:
      - Paste a WAV/FLAC/OGG path directly (optionally: `path.wav | transcript`).

    A previous clone created from the same source path and engine is reused
    instead of cloning again. The voice is only selected when its cloning
    runtime is already available locally; otherwise a hint is printed and the
    current selection is left untouched.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    try:
        parts = shlex.split(arg.strip())
    except ValueError as e:
        print(f"Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"] (parse error: {e})")
        return

    if not parts:
        print("Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"]")
        return

    engine = None
    reference_text = None
    pos = []
    i = 0
    while i < len(parts):
        tok = parts[i]
        if tok in ("--engine",):
            if i + 1 >= len(parts):
                print("Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"]")
                return
            engine = parts[i + 1]
            i += 2
            continue
        if tok in ("--text", "--reference-text", "--reference_text"):
            if i + 1 >= len(parts):
                print("Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"]")
                return
            reference_text = parts[i + 1]
            i += 2
            continue
        pos.append(tok)
        i += 1

    if not pos:
        print("Usage: /clone_use <path> [name] [--engine f5_tts|chroma] [--text \"...\"]")
        return

    path = pos[0]
    name = pos[1] if len(pos) > 1 else None

    engine_name = str(engine or self.cloning_engine or "f5_tts").strip().lower()

    # If name isn't provided, use something stable for UX.
    if not name:
        try:
            from pathlib import Path

            # Expand "~" so the file check agrees with the source lookup below,
            # which also expands it; otherwise "~/x.wav" would be named "x.wav".
            p = Path(path).expanduser()
            name = p.stem if p.is_file() else p.name
        except Exception:
            name = None

    # Reuse a prior clone created from the same source path + engine.
    voice_id = self._resolve_clone_id_by_source(path, engine=engine_name)
    if voice_id:
        if reference_text:
            try:
                self.voice_manager.set_cloned_voice_reference_text(voice_id, reference_text)
                print("✅ Reusing cloned voice and updating reference text.")
            except Exception as e:
                # The voice is still usable; say that the transcript was not stored
                # instead of reporting a plain success.
                print("✅ Reusing cloned voice.")
                print(f"⚠️ Could not update reference text: {e}")
        else:
            print("✅ Reusing cloned voice.")
    else:
        try:
            t0 = time.monotonic()
            voice_id = self.voice_manager.clone_voice(path, name=name, reference_text=reference_text, engine=engine_name)
            t1 = time.monotonic()

            eng = ""
            ref_src = ""
            try:
                info = self.voice_manager.get_cloned_voice(voice_id) or {}
                eng = str(info.get("engine") or "").strip()
                ref_src = str((info.get("meta") or {}).get("reference_text_source") or "").strip()
            except Exception:
                eng = ""
                ref_src = ""

            eng_txt = f" (engine: {eng})" if eng else ""
            print(f"✅ Cloned voice created: {voice_id}{eng_txt}")
            if reference_text:
                print(" (Reference text provided)")
            else:
                print(" Tip: set reference text for best quality:")
                print(" /clone_set_ref_text <id-or-name> \"...\"")
                if str(eng or engine_name or "").strip().lower() == "chroma":
                    print(" ℹ️ No transcript provided; STT auto-fallback runs on first speak (requires cached STT model).")

            if self.verbose_mode:
                n_files, ref_audio_s = self._summarize_audio_source(path)
                n_txt = str(n_files) if isinstance(n_files, int) else "--"
                src_txt = ref_src or ("manual" if (reference_text or "").strip() else "--")
                msg = f"CLONE {eng or engine_name} | refs {n_txt} a{self._fmt_s(ref_audio_s)} | ref_text {src_txt} | {self._fmt_s(float(t1 - t0))}"
                print(f"{Colors.YELLOW}{msg}{Colors.END}")
        except Exception as e:
            print(f"❌ Clone failed: {e}")
            return

    # Select if runtime is ready (no surprise downloads).
    if not self._is_cloning_runtime_ready(voice_id=voice_id):
        print("ℹ️ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    self.current_tts_voice = voice_id
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(voice_id) or {}
        eng = str(info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {voice_id}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (important for large backends like Chroma).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
2219
+
2220
def do_clone_set_ref_text(self, arg):
    """Set the reference transcript for a cloned voice (quality fix).

    Usage:
      /clone_set_ref_text <id-or-name> <text...>

    The first whitespace-delimited token selects the voice (resolved through
    ``_resolve_clone_id``, like the other clone commands); the remainder of the
    line is stored as the reference text.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    parts = arg.strip().split(maxsplit=1)
    if len(parts) < 2:
        print("Usage: /clone_set_ref_text <id-or-name> <text...>")
        return

    wanted, text = parts[0], parts[1]
    # Shared lookup keeps id/prefix/name matching identical across commands.
    match = self._resolve_clone_id(wanted)
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    try:
        self.voice_manager.set_cloned_voice_reference_text(match, text)
        print(" Updated reference text.")
    except Exception as e:
        print(f" Failed to update reference text: {e}")
2253
+
2254
def do_tts_voice(self, arg):
    """Select which voice is used for speaking.

    Usage:
      /tts_voice piper
      /tts_voice clone <voice_id_or_name>

    With no argument the current selection is printed. Everything after
    ``clone`` is treated as the id or name, so names containing spaces (which
    ``/clone_rename`` can create) are selectable; if that lookup fails, the
    first token alone is tried, as before. A cloned voice is only selected when
    its runtime is already available locally.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    parts = arg.strip().split(maxsplit=1)
    if not parts:
        if self.current_tts_voice:
            vid = self.current_tts_voice
            try:
                info = self.voice_manager.get_cloned_voice(vid) or {}
                name = (info.get("name") or "").strip()
                eng = (info.get("engine") or "").strip()
                label = name or vid
                suffix = f" (engine: {eng})" if eng else ""
                print(f"Current TTS voice: {label}{suffix}")
            except Exception:
                print(f"Current TTS voice: {vid}")
        else:
            print("Current TTS voice: piper")
        print("Usage: /tts_voice piper | /tts_voice clone <id-or-name>")
        return

    if parts[0] == "piper":
        self.current_tts_voice = None
        # Free any heavy cloning engines when switching back to Piper.
        try:
            if hasattr(self.voice_manager, "unload_cloning_engines"):
                self.voice_manager.unload_cloning_engines()
        except Exception:
            pass
        # If Piper was previously unloaded to save memory, reload it now (offline-first).
        try:
            if self.voice_manager and getattr(self.voice_manager, "tts_adapter", None):
                a = getattr(self.voice_manager, "tts_adapter", None)
                if hasattr(a, "is_available") and not bool(a.is_available()):
                    self.voice_manager.set_language(self.current_language)
        except Exception:
            pass
        print("✅ Using Piper (default) voice")
        return

    if parts[0] != "clone" or len(parts) < 2:
        print("Usage: /tts_voice piper | /tts_voice clone <id-or-name>")
        return

    wanted = parts[1].strip()
    match = self._resolve_clone_id(wanted)
    if not match:
        # Backward compatibility: earlier versions only looked at the first token.
        first_token = wanted.split()[0]
        if first_token != wanted:
            match = self._resolve_clone_id(first_token)
    if not match:
        print(f"❌ Unknown cloned voice: {wanted}. Use /clones to list.")
        return

    # Do not allow selecting a cloned voice unless the runtime is ready.
    if not self._is_cloning_runtime_ready(voice_id=match):
        print("❌ Cloning runtime is not ready (would trigger large downloads).")
        print(" Run /cloning_status and /cloning_download, or use /tts_voice piper.")
        return

    # Allow selecting voices without reference_text; we will auto-fallback at speak-time
    # if the STT model is already cached locally (no downloads in REPL).

    self.current_tts_voice = match
    eng = ""
    try:
        info = self.voice_manager.get_cloned_voice(match) or {}
        eng = (info.get("engine") or "").strip()
    except Exception:
        eng = ""
    eng_txt = f" (engine: {eng})" if eng else ""
    print(f"✅ Using cloned voice: {match}{eng_txt}")
    if eng and str(eng).strip().lower() != str(self.cloning_engine).strip().lower():
        print(f"ℹ️ Default cloning engine is {self.cloning_engine}; this voice uses {eng}.")
    # Free memory from other cloning engines (e.g. unloading Chroma when switching to F5, or vice-versa).
    try:
        if hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines(keep_engine=str(eng or "").strip().lower() or None)
    except Exception:
        pass
    # Piper is not needed while speaking with a cloned voice; unload it to reduce memory pressure.
    try:
        if hasattr(self.voice_manager, "unload_piper_voice"):
            self.voice_manager.unload_piper_voice()
    except Exception:
        pass
2344
+
2345
def do_clone_my_voice(self, arg):
    """Interactive voice cloning from microphone.

    This records a short prompt to WAV and adds it to the voice store. The
    recording is written to ``recordings/my_voice.wav`` under the
    ``abstractvoice`` user-data directory and cloned under the name
    ``my_voice`` with the prompt as its reference text. The new voice is not
    selected automatically.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    prompt = "Good evening, Dave."
    seconds = 6.0
    print("You will record a short reference sample for voice cloning.")
    print(f"Please read this aloud (once): {prompt}")
    input("Press Enter to start recording...")
    try:
        import appdirs
        from pathlib import Path
        from abstractvoice.audio import record_wav

        out_dir = Path(appdirs.user_data_dir("abstractvoice")) / "recordings"
        # On a fresh machine the recordings folder does not exist yet; create it
        # here rather than relying on the recorder to do so.
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "my_voice.wav"
        record_wav(out_path, seconds=seconds, sample_rate=24000, channels=1)
        voice_id = self.voice_manager.clone_voice(str(out_path), name="my_voice", reference_text=prompt)
        print(f"✅ Recorded and cloned: {voice_id}")
        print(" Use /tts_voice clone <id-or-name> to select it.")
    except Exception as e:
        print(f"❌ /clone-my-voice failed: {e}")
2372
+
2373
def do_cloning_status(self, arg):
    """Report local readiness of the cloning runtimes (never downloads).

    Prints torch/accelerator availability when torch can be imported, the
    default cloning engine, the install/cache state of OpenF5 and Chroma, and,
    when the voice manager provides it, the resolved cloning runtime info.
    """
    try:
        import torch

        try:
            has_mps = bool(torch.backends.mps.is_available())
        except Exception:
            has_mps = False
        print(f"torch: {getattr(torch, '__version__', '?')}")
        print(f"cuda_available: {bool(torch.cuda.is_available())}")
        print(f"mps_available: {has_mps}")
    except Exception:
        # torch is optional; just skip the accelerator section.
        pass

    print(f"default_cloning_engine: {self.cloning_engine}")

    def _missing(module_name):
        return importlib.util.find_spec(module_name) is None

    if _missing("f5_tts"):
        print("ℹ️ OpenF5 runtime: not installed (missing: f5_tts)")
        print(" Install: pip install \"abstractvoice[cloning]\"")
    elif self._is_openf5_cached():
        print("✅ OpenF5 artifacts: present (cached)")
    else:
        print("ℹ️ OpenF5 artifacts: not present (will require ~5.4GB download)")
        print(" Run: /cloning_download f5_tts")

    if _missing("transformers") or _missing("torch"):
        print("ℹ️ Chroma runtime: not installed (missing: transformers/torch)")
        print(" Install: pip install \"abstractvoice[chroma]\"")
    elif self._is_chroma_cached():
        print("✅ Chroma artifacts: present (cached)")
    else:
        print("ℹ️ Chroma artifacts: not present (will require a large download + HF access)")
        print(" Run: /cloning_download chroma")

    try:
        runtime = self.voice_manager.get_cloning_runtime_info() if self.voice_manager else None
        if runtime:
            print(f"cloning_resolved_device: {runtime.get('resolved_device')}")
            print(f"cloning_model_param_device: {runtime.get('model_param_device','?')}")
            print(f"cloning_quality_preset: {runtime.get('quality_preset')}")
    except Exception:
        pass
2419
+
2420
def do_clone_quality(self, arg):
    """Choose the cloned-TTS quality preset (speed/quality tradeoff).

    Usage:
      /clone_quality fast|balanced|high

    The argument is matched case-insensitively; anything else prints the usage
    line. Errors from the voice manager are printed, not raised.
    """
    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    choice = (arg or "").strip().lower()
    if choice in ("fast", "balanced", "high"):
        try:
            self.voice_manager.set_cloned_tts_quality(choice)
            print(f"✅ Cloned TTS quality preset: {choice}")
        except Exception as e:
            print(f"❌ Failed to set preset: {e}")
    else:
        print("Usage: /clone_quality fast|balanced|high")
2438
+
2439
def do_cloning_download(self, arg):
    """Explicitly fetch cloning artifacts for one engine (may take a long time).

    The optional argument names the engine (``openf5``/``f5``/``f5_tts`` or
    ``chroma``); when omitted, the default cloning engine is used. The required
    Python package must already be installed, otherwise an install hint is shown.
    """
    if not self.voice_manager:
        print("🔇 TTS is disabled. Use '/tts on' to enable voice features.")
        return

    requested = (arg or "").strip().lower() or self.cloning_engine
    if requested in ("openf5", "f5", "f5_tts"):
        engine_name = "f5_tts"
    else:
        engine_name = requested

    if engine_name not in ("f5_tts", "chroma"):
        print("Usage: /cloning_download [f5_tts|chroma]")
        return

    if engine_name == "f5_tts":
        if importlib.util.find_spec("f5_tts") is None:
            print("❌ OpenF5 runtime not installed in this environment (missing: f5_tts).")
            print(" Install: pip install \"abstractvoice[cloning]\"")
            return
    # Artifacts download uses huggingface_hub and does not require loading the model.
    elif importlib.util.find_spec("huggingface_hub") is None:
        print("❌ huggingface_hub is required to download Chroma artifacts.")
        print(" Install: pip install huggingface_hub")
        return

    try:
        cloner = self.voice_manager._get_voice_cloner()  # REPL convenience
        backend = cloner._get_engine(engine_name)  # explicit download is an engine concern
        if engine_name != "f5_tts":
            print("Downloading Chroma artifacts (very large; requires HF access). This is a one-time cache per machine.")
            backend.ensure_chroma_artifacts_downloaded()
        else:
            print("Downloading OpenF5 artifacts (~5.4GB). This is a one-time cache per machine.")
            backend.ensure_openf5_artifacts_downloaded()
        print("✅ Download complete.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
2474
+
2475
+ def _is_openf5_cached(self) -> bool:
2476
+ """Heuristic local check that avoids importing huggingface_hub."""
2477
+ from pathlib import Path
2478
+ import os
2479
+
2480
+ root = Path(os.path.expanduser("~/.cache/abstractvoice/openf5"))
2481
+ if not root.exists():
2482
+ return False
2483
+ cfg = next(iter(root.rglob("*.yaml")), None) or next(iter(root.rglob("*.yml")), None)
2484
+ ckpt = next(iter(root.rglob("*.pt")), None)
2485
+ vocab = next(iter(root.rglob("vocab*.txt")), None) or next(iter(root.rglob("*.txt")), None)
2486
+ return bool(cfg and ckpt and vocab)
2487
+
2488
+ def _is_chroma_cached(self) -> bool:
2489
+ """Heuristic local check that avoids importing huggingface_hub."""
2490
+ from pathlib import Path
2491
+ import os
2492
+
2493
+ root = Path(os.path.expanduser("~/.cache/abstractvoice/chroma"))
2494
+ if not root.exists():
2495
+ return False
2496
+ required = [
2497
+ "config.json",
2498
+ "processor_config.json",
2499
+ "model.safetensors.index.json",
2500
+ "modeling_chroma.py",
2501
+ "processing_chroma.py",
2502
+ "configuration_chroma.py",
2503
+ ]
2504
+ return all((root / name).exists() for name in required)
2505
+
2506
+ def _is_cloning_runtime_ready(self, *, voice_id: str | None = None, engine: str | None = None) -> bool:
2507
+ """Return whether the selected cloning engine is ready locally (no downloads)."""
2508
+ eng = str(engine or "").strip().lower()
2509
+ if not eng and voice_id and self.voice_manager:
2510
+ try:
2511
+ info = self.voice_manager.get_cloned_voice(voice_id)
2512
+ eng = str((info or {}).get("engine") or "").strip().lower()
2513
+ except Exception:
2514
+ eng = ""
2515
+ if not eng:
2516
+ eng = str(getattr(self, "cloning_engine", "f5_tts") or "f5_tts").strip().lower()
2517
+
2518
+ if eng == "chroma":
2519
+ return (
2520
+ importlib.util.find_spec("torch") is not None
2521
+ and importlib.util.find_spec("transformers") is not None
2522
+ and self._is_chroma_cached()
2523
+ )
2524
+ return importlib.util.find_spec("f5_tts") is not None and self._is_openf5_cached()
2525
+
2526
+ def _seed_hal9000_voice(self):
2527
+ """Seed a default 'hal9000' cloned voice if sample WAVs are present."""
2528
+ if not self.voice_manager:
2529
+ return
2530
+ try:
2531
+ from pathlib import Path
2532
+
2533
+ sample_dir = Path("audio_samples") / "hal9000"
2534
+ if not sample_dir.exists():
2535
+ return
2536
+
2537
+ # If already present, do nothing.
2538
+ existing_hal = None
2539
+ for v in self.voice_manager.list_cloned_voices():
2540
+ if (v.get("name") or "").lower() == "hal9000":
2541
+ existing_hal = v.get("voice_id")
2542
+ break
2543
+
2544
+ # Seed from the clean short WAV sample to avoid noisy auto-transcriptions.
2545
+ # This avoids repeated artifacts like "how are you hal" bleeding into outputs.
2546
+ if existing_hal is None:
2547
+ ref = sample_dir / "hal9000_hello.wav"
2548
+ if ref.exists():
2549
+ existing_hal = self.voice_manager.clone_voice(
2550
+ str(ref),
2551
+ name="hal9000",
2552
+ reference_text="Hello, Dave.",
2553
+ )
2554
+ else:
2555
+ existing_hal = self.voice_manager.clone_voice(str(sample_dir), name="hal9000")
2556
+ if self.debug_mode:
2557
+ print(f"Seeded cloned voice 'hal9000': {existing_hal}")
2558
+
2559
+ # Do NOT auto-select here; selecting a clone without explicit user action
2560
+ # can cause surprise multi-GB downloads. Users can opt in via /tts_voice.
2561
+ except Exception:
2562
+ # Best-effort only; never block REPL start.
2563
+ return
2564
+
2565
def do_tts_engine(self, arg):
    """Switch the TTS engine (auto|piper).

    The current VoiceManager, if any, is cleaned up (best-effort) and replaced
    by a new instance built with the requested engine and the REPL's current
    language, initial TTS model, debug flag and cloning engine.
    """
    choice = arg.strip().lower()
    if choice != "auto" and choice != "piper":
        print("Usage: /tts_engine auto|piper")
        return

    previous = self.voice_manager
    if previous:
        try:
            previous.cleanup()
        except Exception:
            # A failed cleanup must not prevent switching engines.
            pass

    settings = dict(
        language=self.current_language,
        tts_model=self._initial_tts_model,
        debug_mode=self.debug_mode,
        tts_engine=choice,
        allow_downloads=False,
        cloned_tts_streaming=False,
        cloning_engine=self.cloning_engine,
    )
    self.voice_manager = VoiceManager(**settings)
    print(f"✅ TTS engine set to: {choice}")
2591
+
2592
def do_aec(self, arg):
    """Turn optional AEC (echo cancellation) on or off for true barge-in.

    Usage:
      /aec on [delay_ms]
      /aec off

    With no argument the current state is printed. ``delay_ms`` must parse as
    an integer and defaults to 0.
    """
    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    usage = "Usage: /aec on [delay_ms] | /aec off"
    tokens = arg.strip().split()

    if not tokens:
        is_on = bool(getattr(self.voice_manager, "_aec_enabled", False))
        current_delay = int(getattr(self.voice_manager, "_aec_stream_delay_ms", 0))
        print(f"AEC: {'on' if is_on else 'off'} (delay_ms={current_delay})")
        print(usage)
        return

    action = tokens[0]
    if action == "off":
        try:
            self.voice_manager.enable_aec(False)
            print("✅ AEC disabled")
        except Exception as e:
            print(f"❌ AEC disable failed: {e}")
        return

    if action != "on":
        print(usage)
        return

    delay_ms = 0
    if len(tokens) > 1:
        try:
            delay_ms = int(tokens[1])
        except Exception:
            print(usage)
            return

    try:
        self.voice_manager.enable_aec(True, stream_delay_ms=delay_ms)
        print(f"✅ AEC enabled (delay_ms={delay_ms}).")
        print("Tip: use /voice full for barge-in behavior when AEC is enabled.")
    except Exception as e:
        print(f"❌ AEC enable failed: {e}")
2637
+
2638
def do_stt_engine(self, arg):
    """Select STT engine: auto|faster_whisper|whisper.

    This recreates the internal VoiceManager instance, carrying over the
    current manager's ``_tts_engine_preference`` (falling back to ``"auto"``
    when that attribute is absent).

    Args:
        arg: Engine name typed after the command; matched case-insensitively.

    If the new manager cannot be constructed, the failure is printed and
    ``self.voice_manager`` is set to ``None`` so that no reference to the
    already cleaned-up instance is kept.
    """
    engine = arg.strip().lower()
    if engine not in ("auto", "faster_whisper", "whisper"):
        print("Usage: /stt_engine auto|faster_whisper|whisper")
        return

    if not self.voice_manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    # Recreate VoiceManager preserving current TTS engine preference.
    # If the current engine is unknown, let it auto-select.
    tts_engine = getattr(self.voice_manager, "_tts_engine_preference", "auto")

    try:
        self.voice_manager.cleanup()
    except Exception:
        # Best-effort teardown: a failing cleanup must not block the switch.
        pass

    try:
        self.voice_manager = VoiceManager(
            language=self.current_language,
            tts_model=self._initial_tts_model,
            debug_mode=self.debug_mode,
            tts_engine=tts_engine,
            stt_engine=engine,
            allow_downloads=False,
            cloned_tts_streaming=False,
            cloning_engine=self.cloning_engine,
        )
    except Exception as e:
        # The previous manager was already cleaned up above; keeping it would
        # leave the REPL holding a dead instance.
        self.voice_manager = None
        print(f"❌ STT engine switch failed: {e}")
        if self.debug_mode:
            import traceback
            traceback.print_exc()
        return
    print(f"✅ STT engine set to: {engine}")
2672
+
2673
def do_transcribe(self, arg):
    """Run the voice manager's file transcription on a path and print the text.

    Usage:
        /transcribe path/to/audio.wav

    Prints a hint when voice features are disabled or no path is given.
    Any exception raised while transcribing or printing is reported as a
    failure; a traceback is printed as well when debug mode is on.
    """
    manager = self.voice_manager
    if not manager:
        print("🔇 Voice features are disabled. Use '/tts on' to enable.")
        return

    audio_path = arg.strip()
    if audio_path == "":
        print("Usage: /transcribe <path/to/audio.wav>")
        return

    try:
        transcript = self.voice_manager.transcribe_file(audio_path)
        print(f"{Colors.CYAN}{transcript}{Colors.END}")
    except Exception as e:
        print(f"❌ Transcription failed: {e}")
        if self.debug_mode:
            from traceback import print_exc
            print_exc()
679
2700
 
680
2701
def do_clear(self, arg):
    """Wipe the conversation via ``_clear_history`` and confirm on stdout."""
    self._clear_history()
    print("History cleared")
2705
+
2706
def do_reset(self, arg):
    """Reset the session: stop speech, drop the voice selection, clear history.

    Every voice-manager step is best-effort; exceptions from it are swallowed
    so the history reset and the final confirmation always run.
    """
    # 1) Silence any ongoing playback.
    try:
        if self.voice_manager:
            self.voice_manager.stop_speaking()
    except Exception:
        pass

    # 2) Back to the default (Piper) voice selection.
    self.current_tts_voice = None

    # 3) Release heavy cloning engines when the manager supports it.
    try:
        if self.voice_manager and hasattr(self.voice_manager, "unload_cloning_engines"):
            self.voice_manager.unload_cloning_engines()
    except Exception:
        pass

    # 4) Re-initialise Piper if its adapter reports itself unavailable
    #    (it may have been unloaded to save memory).
    try:
        manager = self.voice_manager
        adapter = getattr(manager, "tts_adapter", None) if manager else None
        if adapter and hasattr(adapter, "is_available"):
            if not adapter.is_available():
                self.voice_manager.set_language(self.current_language)
    except Exception:
        pass

    # 5) Finally wipe the chat history.
    self._clear_history()
    print("✅ Reset.")
2734
+
2735
+ def _clear_history(self) -> None:
682
2736
  self.messages = [{"role": "system", "content": self.system_prompt}]
683
2737
  # Reset token counters
684
2738
  self.system_tokens = 0
685
2739
  self.user_tokens = 0
686
2740
  self.assistant_tokens = 0
2741
+ # Reset word counters
2742
+ self.system_words = 0
2743
+ self.user_words = 0
2744
+ self.assistant_words = 0
687
2745
  # Recalculate system tokens
688
2746
  self._count_system_tokens()
689
- print("History cleared")
2747
+ self._count_system_words()
690
2748
 
691
2749
def do_system(self, arg):
    """Show the system prompt, or replace it when an argument is given.

    Setting a new prompt also resets the conversation through
    ``_clear_history``.
    """
    new_prompt = arg.strip()
    if not new_prompt:
        print(f"Current system prompt: {self.system_prompt}")
        return

    self.system_prompt = new_prompt
    self._clear_history()
    print(f"System prompt set to: {self.system_prompt}")
699
2757
 
700
2758
def do_exit(self, arg):
    """Exit the REPL.

    Clears the push-to-talk flags, then stops listening, stops speaking and
    cleans up the voice manager. Each of those steps is best-effort and its
    errors are ignored. Returns True.
    """
    # Stop any PTT session cleanly.
    for flag in ("_ptt_session_active", "_ptt_recording", "_ptt_busy"):
        setattr(self, flag, False)

    # Shut down voice mode / audio, then release the manager; every step is
    # isolated so one failure cannot prevent the others.
    for step in ("stop_listening", "stop_speaking", "cleanup"):
        try:
            if self.voice_manager:
                getattr(self.voice_manager, step)()
        except Exception:
            pass

    if self.debug_mode:
        print("Goodbye!")
    return True
@@ -781,37 +2862,81 @@ class VoiceREPL(cmd.Cmd):
781
2862
 
782
2863
  # If neither voice mode nor TTS is active - don't show any message
783
2864
  pass
2865
+
2866
def do_verbose(self, arg):
    """Switch verbose per-turn performance stats on or off.

    Usage:
        /verbose            (toggle)
        /verbose on|off

    Also accepts 1/true/yes/y and 0/false/no/n. Any other value prints the
    usage line and leaves the setting untouched.
    """
    choice = (arg or "").strip().lower()

    if choice in ("on", "1", "true", "yes", "y"):
        new_state = True
    elif choice in ("off", "0", "false", "no", "n"):
        new_state = False
    elif choice in ("", "toggle"):
        # A missing attribute counts as "currently off".
        new_state = not getattr(self, "verbose_mode", False)
    else:
        print("Usage: /verbose [on|off]")
        return

    self.verbose_mode = new_state
    print(f"Verbose mode: {'on' if self.verbose_mode else 'off'}")
784
2884
 
785
2885
def do_help(self, arg):
    """Print the full command reference; the argument is ignored."""
    help_lines = (
        "Commands:",
        " /exit, /q, /quit Exit REPL",
        " /clear Clear history",
        " /reset Reset (history + voice)",
        " /tts on|off Toggle TTS",
        " /voice <mode> Voice input: off|full|wait|stop|ptt",
        " /voice ptt Push-to-talk session (SPACE captures, ESC exits)",
        " /language <lang> Switch voice language (en, fr, es, de, ru, zh)",
        " /setvoice [id] List Piper voices or set one (lang.voice_id)",
        " /lang_info Show current language information",
        " /list_languages List all supported languages",
        " /speed <number> Set TTS speed (0.5-2.0, default: 1.0, pitch preserved)",
        " /tts_voice ... Select Piper vs cloned voice (see below)",
        " /tts_engine <e> Switch TTS engine: auto|piper",
        " /whisper <model> Switch Whisper model: tiny|base|small|medium|large",
        " /stt_engine <e> Switch STT engine: auto|faster_whisper|whisper (whisper is optional extra)",
        " /speak <text> Speak text (no LLM call)",
        " /transcribe <path> Transcribe an audio file (faster-whisper by default)",
        " /system <prompt> Set system prompt",
        " /stop Stop voice mode or TTS playback",
        " /pause Pause current TTS playback",
        " /resume Resume paused TTS playback",
        " /aec on|off Optional echo cancellation for true barge-in (requires [aec])",
        " /tokens Display token usage stats",
        " /verbose [on|off] Toggle verbose per-turn stats",
        " /help Show this help",
        " /clones List cloned voices",
        " /clone_info <id> Show cloned voice details",
        " /clone_ref <id> Show cloned voice reference text",
        " /clone_rename ... Rename a cloned voice",
        " /clone_rm <id> Delete a cloned voice",
        " /clone_rm_all --yes Delete ALL cloned voices",
        " /clone_export ... Export a cloned voice (.zip)",
        " /clone_import ... Import a cloned voice (.zip)",
        " /clone <path> [nm] Add a cloned voice from WAV/FLAC/OGG",
        " /clone_use <path> Clone+select voice (or reuse)",
        " /clone-my-voice Record a short prompt and clone it",
        " /tts_voice piper Speak with Piper (default)",
        " /tts_voice clone X Speak with a cloned voice (requires cloning runtime + cache)",
        " /cloning_status Show cloning readiness (no downloads)",
        " /cloning_download Explicitly download OpenF5 artifacts (~5.4GB)",
        " /clone_quality Set cloned TTS speed/quality: fast|balanced|high",
        " /save <filename> Save chat history to file",
        " /load <filename> Load chat history from file",
        " /model <name> Change the LLM model",
        " /temperature <val> Set temperature (0.0-2.0, default: 0.7)",
        " /max_tokens <num> Set max tokens (default: 4096)",
        " stop (deprecated) use /voice off or say 'stop' during STOP mode",
        " <message> Send to LLM (text mode)",
        # Blank separator between the command table and the notes.
        "",
        "Note: ALL commands must start with / except 'stop'",
        "In STOP mode, say 'stop' / 'ok stop' to stop speaking (does not exit voice mode).",
        "Shortcut: paste a WAV/FLAC/OGG path to clone+select (optionally: `path | transcript`).",
    )
    print("\n".join(help_lines))
2940
 
816
2941
  def emptyline(self):
817
2942
  """Handle empty line input."""
@@ -821,6 +2946,10 @@ class VoiceREPL(cmd.Cmd):
821
2946
  def do_tokens(self, arg):
822
2947
  """Display token usage information."""
823
2948
  try:
2949
+ if self._get_tiktoken_encoding() is None:
2950
+ print("Token counting is not available (install: pip install tiktoken).")
2951
+ return
2952
+
824
2953
  # Always recalculate tokens to ensure accuracy
825
2954
  self._reset_and_recalculate_tokens()
826
2955
 
@@ -998,15 +3127,26 @@ class VoiceREPL(cmd.Cmd):
998
3127
  print(f"Failed to load chat history from {filename}")
999
3128
 
1000
3129
  def _reset_and_recalculate_tokens(self):
1001
- """Reset token counts and recalculate for all messages."""
3130
+ """Reset token/word counts and recalculate for all messages."""
1002
3131
  self.system_tokens = 0
1003
3132
  self.user_tokens = 0
1004
3133
  self.assistant_tokens = 0
3134
+ self.system_words = 0
3135
+ self.user_words = 0
3136
+ self.assistant_words = 0
1005
3137
 
1006
3138
  # Count tokens for all messages
1007
3139
  for msg in self.messages:
1008
3140
  if isinstance(msg, dict) and "content" in msg and "role" in msg:
1009
3141
  self._count_tokens(msg["content"], msg["role"])
3142
+ w = self._count_words(msg["content"])
3143
+ r = msg.get("role")
3144
+ if r == "system":
3145
+ self.system_words = int(w)
3146
+ elif r == "user":
3147
+ self.user_words += int(w)
3148
+ elif r == "assistant":
3149
+ self.assistant_words += int(w)
1010
3150
 
1011
3151
  def _ensure_system_message(self):
1012
3152
  """Ensure there's a system message at the start of messages."""
@@ -1070,13 +3210,30 @@ def parse_args():
1070
3210
  """Parse command line arguments."""
1071
3211
  parser = argparse.ArgumentParser(description="AbstractVoice CLI Example")
1072
3212
  parser.add_argument("--debug", action="store_true", help="Enable debug mode")
3213
+ parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
1073
3214
  parser.add_argument("--api", default="http://localhost:11434/api/chat",
1074
3215
  help="LLM API URL")
1075
- parser.add_argument("--model", default="granite3.3:2b",
3216
+ parser.add_argument("--model", default="cogito:3b",
1076
3217
  help="LLM model name")
1077
- parser.add_argument("--language", "--lang", default="en",
1078
- choices=["en", "fr", "es", "de", "it", "ru", "multilingual"],
1079
- help="Voice language (en=English, fr=French, es=Spanish, de=German, it=Italian, ru=Russian, multilingual=All)")
3218
+ parser.add_argument(
3219
+ "--cloning-engine",
3220
+ default="f5_tts",
3221
+ choices=["f5_tts", "chroma"],
3222
+ help="Default cloning backend for new voices (f5_tts|chroma)",
3223
+ )
3224
+ parser.add_argument(
3225
+ "--voice-mode",
3226
+ default="off",
3227
+ choices=["off", "wait", "stop", "full", "ptt"],
3228
+ help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
3229
+ )
3230
+ parser.add_argument(
3231
+ "--language",
3232
+ "--lang",
3233
+ default="en",
3234
+ choices=["en", "fr", "de", "es", "ru", "zh"],
3235
+ help="Voice language for default Piper TTS (en|fr|de|es|ru|zh).",
3236
+ )
1080
3237
  parser.add_argument("--tts-model",
1081
3238
  help="Specific TTS model to use (overrides language default)")
1082
3239
  return parser.parse_args()
@@ -1093,8 +3250,11 @@ def main():
1093
3250
  api_url=args.api,
1094
3251
  model=args.model,
1095
3252
  debug_mode=args.debug,
3253
+ verbose_mode=args.verbose,
1096
3254
  language=args.language,
1097
- tts_model=args.tts_model
3255
+ tts_model=args.tts_model,
3256
+ voice_mode=args.voice_mode,
3257
+ cloning_engine=args.cloning_engine,
1098
3258
  )
1099
3259
  repl.cmdloop()
1100
3260
  except KeyboardInterrupt:
@@ -1104,4 +3264,4 @@ def main():
1104
3264
 
1105
3265
 
1106
3266
  if __name__ == "__main__":
1107
- main()
3267
+ main()