abstractvoice 0.5.2__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.2.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.2.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
@@ -16,14 +16,12 @@ def print_examples():
16
16
  print(" web - Web API example")
17
17
  print(" simple - Simple usage example")
18
18
  print(" check-deps - Check dependency compatibility")
19
- print(" download-models - Download TTS models for offline use")
20
19
  print("\nUsage: abstractvoice <command> [--language <lang>] [args...]")
21
- print("\nSupported languages: en, fr, es, de, it, ru, multilingual")
20
+ print("\nSupported languages: en, fr, es, de, ru, zh")
22
21
  print("\nExamples:")
23
22
  print(" abstractvoice cli --language fr # French CLI")
24
23
  print(" abstractvoice simple --language ru # Russian simple example")
25
24
  print(" abstractvoice check-deps # Check dependencies")
26
- print(" abstractvoice download-models # Download models for offline use")
27
25
  print(" abstractvoice # Direct voice mode (default)")
28
26
 
29
27
  def simple_example():
@@ -97,33 +95,37 @@ def simple_example():
97
95
 
98
96
  def parse_args():
99
97
  """Parse command line arguments."""
100
- import sys
101
-
102
- # Check if it's a download-models command and handle separately
103
- if len(sys.argv) > 1 and sys.argv[1] == "download-models":
104
- # Return early with just the command to handle in main()
105
- class DownloadModelsArgs:
106
- command = "download-models"
107
- # Add dummy attributes to prevent AttributeError
108
- model = "granite3.3:2b"
109
- debug = False
110
- return DownloadModelsArgs()
111
-
112
98
  parser = argparse.ArgumentParser(description="AbstractVoice - Voice interactions with AI")
113
99
 
114
100
  # Examples and special commands
115
- parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps, download-models (default: voice mode)")
101
+ parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps (default: voice mode)")
116
102
 
117
103
  # Voice mode arguments
118
104
  parser.add_argument("--debug", action="store_true", help="Enable debug mode")
105
+ parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
119
106
  parser.add_argument("--api", default="http://localhost:11434/api/chat",
120
107
  help="LLM API URL")
121
- parser.add_argument("--model", default="granite3.3:2b",
108
+ parser.add_argument("--model", default="cogito:3b",
122
109
  help="LLM model name")
123
- parser.add_argument("--whisper", default="tiny",
124
- help="Whisper model to use (tiny, base, small, medium, large)")
110
+ parser.add_argument(
111
+ "--whisper",
112
+ default="base",
113
+ help="STT model size for faster-whisper (e.g. tiny|base|small|medium|large-v3).",
114
+ )
115
+ parser.add_argument(
116
+ "--cloning-engine",
117
+ default="f5_tts",
118
+ choices=["f5_tts", "chroma"],
119
+ help="Default cloning backend for new voices (f5_tts|chroma).",
120
+ )
121
+ parser.add_argument(
122
+ "--voice-mode",
123
+ default="off",
124
+ choices=["off", "wait", "stop", "full", "ptt"],
125
+ help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
126
+ )
125
127
  parser.add_argument("--no-listening", action="store_true",
126
- help="Disable speech-to-text (listening), TTS still works")
128
+ help="Disable speech-to-text (listening). Alias for --voice-mode off.")
127
129
  parser.add_argument("--no-tts", action="store_true",
128
130
  help="Disable text-to-speech (TTS), text-only mode")
129
131
  parser.add_argument("--system",
@@ -133,8 +135,8 @@ def parse_args():
133
135
  parser.add_argument("--max-tokens", type=int, default=4096,
134
136
  help="Set maximum tokens for the LLM response")
135
137
  parser.add_argument("--language", "--lang", default="en",
136
- choices=["en", "fr", "es", "de", "it", "ru", "multilingual"],
137
- help="Voice language (en=English, fr=French, es=Spanish, de=German, it=Italian, ru=Russian, multilingual=All)")
138
+ choices=["en", "fr", "es", "de", "ru", "zh"],
139
+ help="Voice language (en=English, fr=French, es=Spanish, de=German, ru=Russian, zh=Chinese)")
138
140
  parser.add_argument("--tts-model",
139
141
  help="Specific TTS model to use (overrides language default)")
140
142
  return parser.parse_args()
@@ -145,6 +147,10 @@ def main():
145
147
  # Parse command line arguments
146
148
  args = parse_args()
147
149
 
150
+ # Normalize aliases/compat flags.
151
+ if getattr(args, "no_listening", False):
152
+ args.voice_mode = "off"
153
+
148
154
  # Handle special commands and examples
149
155
  if args.command == "check-deps":
150
156
  from abstractvoice.dependency_check import check_dependencies
@@ -157,26 +163,25 @@ def main():
157
163
  import traceback
158
164
  traceback.print_exc()
159
165
  return
160
- elif args.command == "download-models":
161
- from abstractvoice.simple_model_manager import download_models_cli
162
- # Pass remaining arguments to download_models_cli
163
- import sys
164
- original_argv = sys.argv
165
- sys.argv = ["download-models"] + sys.argv[2:] # Remove script name and "download-models"
166
- try:
167
- download_models_cli()
168
- finally:
169
- sys.argv = original_argv
170
- return
171
166
  elif args.command == "cli":
172
167
  # Import and run CLI REPL example
173
168
  repl = VoiceREPL(
174
169
  api_url=args.api,
175
170
  model=args.model,
176
171
  debug_mode=args.debug,
172
+ verbose_mode=args.verbose,
177
173
  language=args.language,
178
- tts_model=args.tts_model
174
+ tts_model=args.tts_model,
175
+ voice_mode=args.voice_mode,
176
+ disable_tts=args.no_tts,
177
+ cloning_engine=args.cloning_engine,
179
178
  )
179
+ # Apply requested STT model size (best-effort).
180
+ try:
181
+ if getattr(repl, "voice_manager", None) is not None:
182
+ repl.voice_manager.set_whisper(str(args.whisper))
183
+ except Exception:
184
+ pass
180
185
  # Set temperature and max_tokens
181
186
  repl.temperature = args.temperature
182
187
  repl.max_tokens = args.max_tokens
@@ -200,23 +205,28 @@ def main():
200
205
  print_examples()
201
206
  return
202
207
 
203
- # Show language information
204
- language_names = {
205
- 'en': 'English', 'fr': 'French', 'es': 'Spanish',
206
- 'de': 'German', 'it': 'Italian', 'ru': 'Russian',
207
- 'multilingual': 'Multilingual'
208
- }
209
- lang_name = language_names.get(args.language, args.language)
210
- print(f"Starting AbstractVoice voice interface ({lang_name})...")
211
-
212
- # Initialize REPL with language support
208
+ # Default behavior: start the REPL (mic OFF unless --voice-mode is set).
209
+ lang_name = {
210
+ "en": "English",
211
+ "fr": "French",
212
+ "de": "German",
213
+ "es": "Spanish",
214
+ "ru": "Russian",
215
+ "zh": "Chinese",
216
+ }.get(str(args.language), str(args.language))
217
+ print(f"Starting AbstractVoice ({lang_name})…")
218
+
219
+ # Initialize REPL.
213
220
  repl = VoiceREPL(
214
221
  api_url=args.api,
215
222
  model=args.model,
216
223
  debug_mode=args.debug,
224
+ verbose_mode=args.verbose,
217
225
  language=args.language,
218
226
  tts_model=args.tts_model,
219
- disable_tts=args.no_tts
227
+ voice_mode=args.voice_mode,
228
+ disable_tts=args.no_tts,
229
+ cloning_engine=args.cloning_engine,
220
230
  )
221
231
 
222
232
  # Set custom system prompt if provided
@@ -233,17 +243,12 @@ def main():
233
243
  print(f"Temperature: {args.temperature}")
234
244
  print(f"Max tokens: {args.max_tokens}")
235
245
 
236
- # Change Whisper model if specified
237
- if args.whisper and args.whisper != "tiny":
238
- if repl.voice_manager.set_whisper(args.whisper):
239
- if args.debug:
240
- print(f"Using Whisper model: {args.whisper}")
241
-
242
- # Start in voice mode automatically unless --no-listening is specified
243
- if not args.no_listening:
244
- print("Activating voice mode. Say 'stop' to exit voice mode.")
245
- # Use the existing voice mode method
246
- repl.do_voice("on")
246
+ # Apply requested STT model size (best-effort).
247
+ try:
248
+ if getattr(repl, "voice_manager", None) is not None:
249
+ repl.voice_manager.set_whisper(str(args.whisper))
250
+ except Exception:
251
+ pass
247
252
 
248
253
  # Start the REPL
249
254
  repl.cmdloop()
@@ -258,7 +263,7 @@ def main():
258
263
  print(f"❌ TTS model download failed")
259
264
  print(f" This is a TTS voice model issue, not your Ollama model")
260
265
  print(f" Your Ollama model '{args.model}' is fine")
261
- print(f" Try: rm -rf ~/.cache/tts && pip install --force-reinstall coqui-tts")
266
+ print(" Try: pip install --upgrade abstractvoice")
262
267
  print(f" Or check network connectivity for model downloads")
263
268
  elif "ollama" in error_msg or "11434" in error_msg:
264
269
  print(f"❌ Cannot connect to Ollama at {args.api}")
@@ -267,11 +272,7 @@ def main():
267
272
  elif "importerror" in error_msg or "no module" in error_msg:
268
273
  print(f"❌ Missing dependencies")
269
274
  print(f" Try running: abstractvoice check-deps")
270
- print(f" Or install dependencies: pip install abstractvoice[voice-full]")
271
- elif "espeak" in error_msg or "phoneme" in error_msg:
272
- print(f"❌ Voice synthesis setup issue")
273
- print(f" Install espeak-ng for better voice quality: brew install espeak-ng")
274
- print(f" Or this might be a TTS model download issue")
275
+ print(f" Or install extras: pip install \"abstractvoice[all]\"")
275
276
  else:
276
277
  print(f"❌ Application error: {e}")
277
278
  print(f" Try running with --debug for more details")
@@ -282,4 +283,4 @@ def main():
282
283
  traceback.print_exc()
283
284
 
284
285
  if __name__ == "__main__":
285
- main()
286
+ main()
@@ -0,0 +1,2 @@
1
+ """Optional integration modules (kept dependency-light)."""
2
+
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+ from ..artifacts import MediaStore, RuntimeArtifactStoreAdapter, get_artifact_id, is_artifact_ref
7
+
8
+
9
+ def _require_abstractcore_tool():
10
+ try:
11
+ from abstractcore import tool # type: ignore
12
+ except Exception as e: # pragma: no cover
13
+ raise ImportError("AbstractCore is required for this integration. Install it via: pip install abstractcore") from e
14
+ return tool
15
+
16
+
17
+ def _decode_base64_bytes(value: str) -> bytes:
18
+ raw = str(value or "").strip()
19
+ if not raw:
20
+ return b""
21
+ if raw.startswith("data:") and "," in raw:
22
+ raw = raw.split(",", 1)[1].strip()
23
+ raw = "".join(raw.split())
24
+ pad = (-len(raw)) % 4
25
+ if pad:
26
+ raw = raw + ("=" * pad)
27
+ return base64.b64decode(raw, validate=False)
28
+
29
+
30
+ def _require_store(store: Any) -> MediaStore:
31
+ # If the caller passed an AbstractRuntime ArtifactStore, adapt it.
32
+ if hasattr(store, "store") and hasattr(store, "load") and not hasattr(store, "store_bytes"):
33
+ return RuntimeArtifactStoreAdapter(store)
34
+ if not hasattr(store, "store_bytes") or not hasattr(store, "load_bytes"):
35
+ raise TypeError("store must be a MediaStore-like object or an AbstractRuntime-like ArtifactStore")
36
+ return store # type: ignore[return-value]
37
+
38
+
39
+ def _resolve_audio_bytes(
40
+ *,
41
+ store: MediaStore,
42
+ artifact: Optional[Dict[str, Any]],
43
+ b64: Optional[str],
44
+ required: bool,
45
+ ) -> Optional[bytes]:
46
+ if artifact is not None:
47
+ if not is_artifact_ref(artifact):
48
+ raise ValueError("audio_artifact: expected an artifact ref dict like {'$artifact': '...'}")
49
+ return store.load_bytes(get_artifact_id(artifact))
50
+ if b64 is not None:
51
+ out = _decode_base64_bytes(b64)
52
+ if required and not out:
53
+ raise ValueError("audio_b64: decoded to empty bytes")
54
+ return out
55
+ if required:
56
+ raise ValueError("Either audio_artifact or audio_b64 is required")
57
+ return None
58
+
59
+
60
def make_voice_tools(
    *,
    voice_manager: Any,
    store: Any,
) -> List[Callable[..., Any]]:
    """Create AbstractCore tools for TTS/STT (artifact-first outputs).

    Args:
        voice_manager: Object providing ``speak_to_bytes`` and
            ``transcribe_from_bytes``.
        store: A MediaStore-like object, or an AbstractRuntime-like
            ArtifactStore (adapted automatically).

    Returns:
        ``[voice_tts, audio_transcribe]`` — the two decorated tool callables.

    Raises:
        ImportError: if AbstractCore is not importable.
        TypeError: if ``store`` is not a supported store object.
    """

    tool = _require_abstractcore_tool()
    media_store = _require_store(store)

    # NOTE: the tool functions are documented with comments rather than
    # docstrings so that nothing new is exposed to the ``tool`` decorator.

    # Synthesizes ``text`` and stores the audio, returning the store's result.
    @tool(
        name="voice_tts",
        description="Synthesize speech from text and return an audio artifact ref.",
        tags=["voice", "tts", "audio"],
        when_to_use="Use when you need to generate an audio rendition of text (TTS).",
    )
    def voice_tts(
        text: str,
        voice: Optional[str] = None,
        format: str = "wav",
        run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        fmt = str(format)
        extension = fmt.lower()
        audio = voice_manager.speak_to_bytes(str(text), format=fmt, voice=voice)
        stored_run_id = str(run_id) if run_id else None
        return media_store.store_bytes(
            bytes(audio),
            content_type=f"audio/{extension}",
            filename=f"tts.{extension}",
            run_id=stored_run_id,
            tags={"kind": "generated_media", "modality": "audio", "task": "tts"},
        )

    # Transcribes the supplied audio and also stores the transcript as text.
    @tool(
        name="audio_transcribe",
        description="Transcribe audio (speech-to-text) and return text plus a transcript artifact ref.",
        tags=["audio", "stt", "transcribe"],
        when_to_use="Use when you need to convert speech audio into text (STT).",
    )
    def audio_transcribe(
        audio_artifact: Optional[Dict[str, Any]] = None,
        audio_b64: Optional[str] = None,
        language: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        payload = _resolve_audio_bytes(
            store=media_store,
            artifact=audio_artifact,
            b64=audio_b64,
            required=True,
        )
        text = voice_manager.transcribe_from_bytes(bytes(payload or b""), language=language)

        stored_run_id = str(run_id) if run_id else None
        transcript_ref = media_store.store_bytes(
            str(text).encode("utf-8"),
            content_type="text/plain; charset=utf-8",
            filename="transcript.txt",
            run_id=stored_run_id,
            tags={"kind": "derived_text", "modality": "audio", "task": "stt"},
        )
        return {"text": text, "transcript_artifact": transcript_ref}

    return [voice_tts, audio_transcribe]
116
+
@@ -0,0 +1,253 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional, Union
4
+
5
+ from ..artifacts import RuntimeArtifactStoreAdapter, is_artifact_ref, get_artifact_id
6
+
7
+
8
+ class _BaseVoice:
9
+ def __init__(self, owner: Any):
10
+ self._owner = owner
11
+ self._vm = None
12
+
13
+ def _get_vm(self):
14
+ if self._vm is not None:
15
+ return self._vm
16
+
17
+ # Injection hook (tests / advanced embedding).
18
+ try:
19
+ cfg = getattr(self._owner, "config", None)
20
+ if isinstance(cfg, dict):
21
+ inst = cfg.get("voice_manager_instance")
22
+ if inst is not None:
23
+ self._vm = inst
24
+ return self._vm
25
+ factory = cfg.get("voice_manager_factory")
26
+ if callable(factory):
27
+ self._vm = factory(self._owner)
28
+ return self._vm
29
+ except Exception:
30
+ pass
31
+
32
+ # Lazy import (keeps plugin import-light).
33
+ from ..voice_manager import VoiceManager
34
+
35
+ # Best-effort config overrides (optional).
36
+ language = "en"
37
+ allow_downloads = True
38
+ try:
39
+ cfg = getattr(self._owner, "config", None)
40
+ if isinstance(cfg, dict):
41
+ if isinstance(cfg.get("voice_language"), str) and cfg["voice_language"].strip():
42
+ language = str(cfg["voice_language"]).strip().lower()
43
+ if "voice_allow_downloads" in cfg:
44
+ allow_downloads = bool(cfg.get("voice_allow_downloads"))
45
+ except Exception:
46
+ pass
47
+
48
+ self._vm = VoiceManager(language=language, allow_downloads=allow_downloads)
49
+ return self._vm
50
+
51
+ def _maybe_store_audio(
52
+ self,
53
+ audio_bytes: bytes,
54
+ *,
55
+ artifact_store: Any,
56
+ fmt: str,
57
+ run_id: Optional[str],
58
+ tags: Optional[Dict[str, str]],
59
+ metadata: Optional[Dict[str, Any]],
60
+ ):
61
+ if artifact_store is None:
62
+ return bytes(audio_bytes)
63
+ store = RuntimeArtifactStoreAdapter(artifact_store)
64
+ merged_tags: Dict[str, str] = {"kind": "generated_media", "modality": "audio", "task": "tts"}
65
+ if isinstance(tags, dict):
66
+ merged_tags.update({str(k): str(v) for k, v in tags.items()})
67
+ return store.store_bytes(
68
+ bytes(audio_bytes),
69
+ content_type=f"audio/{str(fmt).lower()}",
70
+ filename=f"tts.{str(fmt).lower()}",
71
+ run_id=str(run_id) if run_id else None,
72
+ tags=merged_tags,
73
+ metadata=metadata if isinstance(metadata, dict) else None,
74
+ )
75
+
76
+ def _resolve_audio_bytes(self, audio: Union[bytes, Dict[str, Any], str], *, artifact_store: Any) -> bytes:
77
+ if isinstance(audio, (bytes, bytearray)):
78
+ return bytes(audio)
79
+ if isinstance(audio, dict):
80
+ if not is_artifact_ref(audio):
81
+ raise ValueError("Expected an artifact ref dict like {'$artifact': '...'}")
82
+ if artifact_store is None:
83
+ raise ValueError("artifact_store is required to resolve artifact refs to bytes")
84
+ store = RuntimeArtifactStoreAdapter(artifact_store)
85
+ return store.load_bytes(get_artifact_id(audio))
86
+ if isinstance(audio, str):
87
+ from pathlib import Path
88
+
89
+ p = Path(audio).expanduser()
90
+ if p.exists() and p.is_file():
91
+ return p.read_bytes()
92
+ raise FileNotFoundError(f"File not found: {audio}")
93
+ raise TypeError("Unsupported input type; expected bytes, artifact-ref dict, or file path")
94
+
95
+ def _suffix_for_audio_ref(self, audio: Dict[str, Any], *, artifact_store: Any) -> str:
96
+ """Pick a best-effort file suffix for an audio artifact-ref dict."""
97
+ import mimetypes
98
+ from pathlib import Path
99
+
100
+ # Prefer explicit filename when provided (most clients include it).
101
+ try:
102
+ filename = audio.get("filename")
103
+ if isinstance(filename, str) and filename.strip():
104
+ suf = Path(filename.strip()).suffix
105
+ if isinstance(suf, str) and suf and len(suf) <= 10:
106
+ return suf
107
+ except Exception:
108
+ pass
109
+
110
+ # Next: content_type from ref (or artifact metadata when available).
111
+ content_type: Optional[str] = None
112
+ try:
113
+ ct = audio.get("content_type")
114
+ if isinstance(ct, str) and ct.strip():
115
+ content_type = ct.strip()
116
+ except Exception:
117
+ content_type = None
118
+
119
+ if content_type is None and artifact_store is not None:
120
+ try:
121
+ store = RuntimeArtifactStoreAdapter(artifact_store)
122
+ meta = store.get_metadata(get_artifact_id(audio))
123
+ if isinstance(meta, dict):
124
+ ct2 = meta.get("content_type")
125
+ if isinstance(ct2, str) and ct2.strip():
126
+ content_type = ct2.strip()
127
+ fn2 = meta.get("filename")
128
+ if isinstance(fn2, str) and fn2.strip():
129
+ suf = Path(fn2.strip()).suffix
130
+ if isinstance(suf, str) and suf and len(suf) <= 10:
131
+ return suf
132
+ except Exception:
133
+ pass
134
+
135
+ if isinstance(content_type, str) and content_type.strip():
136
+ # Drop charset/params (e.g. "audio/wav; codecs=...").
137
+ base = content_type.split(";", 1)[0].strip().lower()
138
+ ext = mimetypes.guess_extension(base) or ""
139
+ if ext:
140
+ return ext
141
+
142
+ return ".bin"
143
+
144
+
145
+ class _VoiceCapability(_BaseVoice):
146
+ backend_id = "abstractvoice:default"
147
+
148
+ def tts(
149
+ self,
150
+ text: str,
151
+ *,
152
+ voice: Optional[str] = None,
153
+ format: str = "wav",
154
+ artifact_store: Any = None,
155
+ run_id: Optional[str] = None,
156
+ tags: Optional[Dict[str, str]] = None,
157
+ metadata: Optional[Dict[str, Any]] = None,
158
+ **_kwargs: Any,
159
+ ):
160
+ vm = self._get_vm()
161
+ audio = vm.speak_to_bytes(str(text), format=str(format), voice=voice)
162
+ return self._maybe_store_audio(audio, artifact_store=artifact_store, fmt=str(format), run_id=run_id, tags=tags, metadata=metadata)
163
+
164
+ def stt(
165
+ self,
166
+ audio: Union[bytes, Dict[str, Any], str],
167
+ *,
168
+ language: Optional[str] = None,
169
+ artifact_store: Any = None,
170
+ metadata: Optional[Dict[str, Any]] = None,
171
+ **_kwargs: Any,
172
+ ) -> str:
173
+ _ = metadata
174
+ vm = self._get_vm()
175
+ if isinstance(audio, str):
176
+ return vm.transcribe_file(str(audio), language=language)
177
+
178
+ if isinstance(audio, dict):
179
+ import os
180
+ import tempfile
181
+
182
+ audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
183
+ suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
184
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
185
+ tmp_file.write(bytes(audio_bytes))
186
+ tmp_path = tmp_file.name
187
+ try:
188
+ return vm.transcribe_file(tmp_path, language=language)
189
+ finally:
190
+ try:
191
+ os.unlink(tmp_path)
192
+ except Exception:
193
+ pass
194
+
195
+ audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
196
+ return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
197
+
198
+
199
+ class _AudioCapability(_BaseVoice):
200
+ backend_id = "abstractvoice:stt"
201
+
202
+ def transcribe(
203
+ self,
204
+ audio: Union[bytes, Dict[str, Any], str],
205
+ *,
206
+ language: Optional[str] = None,
207
+ artifact_store: Any = None,
208
+ metadata: Optional[Dict[str, Any]] = None,
209
+ **_kwargs: Any,
210
+ ) -> str:
211
+ _ = metadata
212
+ vm = self._get_vm()
213
+ if isinstance(audio, str):
214
+ return vm.transcribe_file(str(audio), language=language)
215
+
216
+ if isinstance(audio, dict):
217
+ import os
218
+ import tempfile
219
+
220
+ audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
221
+ suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
222
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
223
+ tmp_file.write(bytes(audio_bytes))
224
+ tmp_path = tmp_file.name
225
+ try:
226
+ return vm.transcribe_file(tmp_path, language=language)
227
+ finally:
228
+ try:
229
+ os.unlink(tmp_path)
230
+ except Exception:
231
+ pass
232
+
233
+ audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
234
+ return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
235
+
236
+
237
def register(registry: Any) -> None:
    """Register AbstractVoice as an AbstractCore capability plugin.

    Adds one voice backend (TTS+STT) and one audio backend (STT only) to
    ``registry``, both at priority 0. Each factory receives the owner object
    and returns a fresh capability instance.
    """

    def _voice_factory(owner):
        return _VoiceCapability(owner)

    def _audio_factory(owner):
        return _AudioCapability(owner)

    voice_hint = "Install voices/models with `abstractvoice-prefetch` for offline use (or allow downloads)."
    registry.register_voice_backend(
        backend_id=_VoiceCapability.backend_id,
        factory=_voice_factory,
        priority=0,
        description="AbstractVoice VoiceManager (TTS+STT).",
        config_hint=voice_hint,
    )

    audio_hint = "Install STT models with `abstractvoice-prefetch --stt <size>` for offline use (or allow downloads)."
    registry.register_audio_backend(
        backend_id=_AudioCapability.backend_id,
        factory=_audio_factory,
        priority=0,
        description="AbstractVoice STT (speech-to-text).",
        config_hint=audio_hint,
    )
@@ -0,0 +1,82 @@
1
+ """Explicit model/artifact prefetch (cross-platform).
2
+
3
+ Design rule: This must never run implicitly during normal library usage.
4
+ Users/integrators call it explicitly after installation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+
11
+
12
def _build_prefetch_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``abstractvoice-prefetch`` command."""
    parser = argparse.ArgumentParser(prog="abstractvoice-prefetch", description="AbstractVoice explicit prefetch")
    parser.add_argument(
        "--stt",
        dest="stt_model",
        default=None,
        help="Prefetch faster-whisper model weights (e.g. tiny/base/small/medium/large-v3)",
    )
    parser.add_argument(
        "--openf5",
        action="store_true",
        help="Prefetch OpenF5 artifacts for cloning (~5.4GB, requires abstractvoice[cloning])",
    )
    parser.add_argument(
        "--chroma",
        action="store_true",
        help="Prefetch Chroma-4B artifacts (~14GB+, requires HF access; install abstractvoice[chroma] to run inference)",
    )
    parser.add_argument(
        "--piper",
        dest="piper_language",
        default=None,
        help="Prefetch Piper voice model for a language (e.g. en/fr/de).",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the explicit prefetch CLI.

    Each requested target is fetched in a fixed order: STT model, OpenF5
    artifacts, Chroma artifacts, Piper voice. Backend modules are imported
    only for the targets actually requested.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success, or ``2`` (after printing help) when no target was
        requested.

    Raises:
        RuntimeError: if the STT model is not available after loading, or
            the Piper model download reports failure.
    """
    parser = _build_prefetch_parser()
    args = parser.parse_args(argv)

    targets = (args.stt_model, args.openf5, args.chroma, args.piper_language)
    if not any(targets):
        parser.print_help()
        return 2

    if args.stt_model:
        from abstractvoice.adapters.stt_faster_whisper import FasterWhisperAdapter

        model_size = str(args.stt_model).strip()
        print(f"Downloading STT model (faster-whisper): {model_size}")
        adapter = FasterWhisperAdapter(
            model_size=model_size,
            device="cpu",
            compute_type="int8",
            allow_downloads=True,
        )
        if not adapter.is_available():
            raise RuntimeError("STT model download/load failed.")
        print("✅ STT model ready.")

    if args.openf5:
        from abstractvoice.cloning.engine_f5 import F5TTSVoiceCloningEngine

        print("Downloading OpenF5 artifacts (cloning)…")
        f5_engine = F5TTSVoiceCloningEngine(debug=True)
        f5_engine.ensure_openf5_artifacts_downloaded()
        print("✅ OpenF5 artifacts ready.")

    if args.chroma:
        from abstractvoice.cloning.engine_chroma import ChromaVoiceCloningEngine

        print("Downloading Chroma artifacts (cloning)…")
        chroma_engine = ChromaVoiceCloningEngine(debug=True)
        chroma_engine.ensure_chroma_artifacts_downloaded()
        print("✅ Chroma artifacts ready.")

    if args.piper_language:
        from abstractvoice.adapters.tts_piper import PiperTTSAdapter

        language = str(args.piper_language).strip().lower()
        print(f"Downloading Piper voice model: {language}")
        piper_adapter = PiperTTSAdapter(language=language, allow_downloads=True, auto_load=False)
        if not piper_adapter.ensure_model_downloaded(language):
            raise RuntimeError("Piper model download failed.")
        print("✅ Piper model ready.")

    return 0
79
+
80
+
81
+ if __name__ == "__main__":
82
+ raise SystemExit(main())