PyPI - abstractvoice - Versions diffs - 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

abstractvoice/__init__.py +2 -5
abstractvoice/__main__.py +82 -3
abstractvoice/adapters/__init__.py +12 -0
abstractvoice/adapters/base.py +207 -0
abstractvoice/adapters/stt_faster_whisper.py +401 -0
abstractvoice/adapters/tts_piper.py +480 -0
abstractvoice/aec/__init__.py +10 -0
abstractvoice/aec/webrtc_apm.py +56 -0
abstractvoice/artifacts.py +173 -0
abstractvoice/audio/__init__.py +7 -0
abstractvoice/audio/recorder.py +46 -0
abstractvoice/audio/resample.py +25 -0
abstractvoice/cloning/__init__.py +7 -0
abstractvoice/cloning/engine_chroma.py +738 -0
abstractvoice/cloning/engine_f5.py +546 -0
abstractvoice/cloning/manager.py +349 -0
abstractvoice/cloning/store.py +362 -0
abstractvoice/compute/__init__.py +6 -0
abstractvoice/compute/device.py +73 -0
abstractvoice/config/__init__.py +2 -0
abstractvoice/config/voice_catalog.py +19 -0
abstractvoice/dependency_check.py +0 -1
abstractvoice/examples/cli_repl.py +2408 -243
abstractvoice/examples/voice_cli.py +64 -63
abstractvoice/integrations/__init__.py +2 -0
abstractvoice/integrations/abstractcore.py +116 -0
abstractvoice/integrations/abstractcore_plugin.py +253 -0
abstractvoice/prefetch.py +82 -0
abstractvoice/recognition.py +424 -42
abstractvoice/stop_phrase.py +103 -0
abstractvoice/text_sanitize.py +33 -0
abstractvoice/tts/__init__.py +3 -3
abstractvoice/tts/adapter_tts_engine.py +210 -0
abstractvoice/tts/tts_engine.py +257 -1208
abstractvoice/vm/__init__.py +2 -0
abstractvoice/vm/common.py +21 -0
abstractvoice/vm/core.py +139 -0
abstractvoice/vm/manager.py +108 -0
abstractvoice/vm/stt_mixin.py +158 -0
abstractvoice/vm/tts_mixin.py +550 -0
abstractvoice/voice_manager.py +6 -1061
abstractvoice-0.6.2.dist-info/METADATA +213 -0
abstractvoice-0.6.2.dist-info/RECORD +53 -0
{abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/WHEEL +1 -1
abstractvoice-0.6.2.dist-info/entry_points.txt +6 -0
abstractvoice/instant_setup.py +0 -83
abstractvoice/simple_model_manager.py +0 -539
abstractvoice-0.5.2.dist-info/METADATA +0 -1458
abstractvoice-0.5.2.dist-info/RECORD +0 -23
abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
{abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/licenses/LICENSE +0 -0
{abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/top_level.txt +0 -0

abstractvoice/examples/voice_cli.py CHANGED Viewed

@@ -16,14 +16,12 @@ def print_examples():
     print("  web            - Web API example")
     print("  simple         - Simple usage example")
     print("  check-deps     - Check dependency compatibility")
-    print("  download-models - Download TTS models for offline use")
     print("\nUsage: abstractvoice <command> [--language <lang>] [args...]")
-    print("\nSupported languages: en, fr, es, de, it, ru, multilingual")
+    print("\nSupported languages: en, fr, es, de, ru, zh")
     print("\nExamples:")
     print("  abstractvoice cli --language fr     # French CLI")
     print("  abstractvoice simple --language ru  # Russian simple example")
     print("  abstractvoice check-deps            # Check dependencies")
-    print("  abstractvoice download-models       # Download models for offline use")
     print("  abstractvoice                       # Direct voice mode (default)")
 def simple_example():
@@ -97,33 +95,37 @@ def simple_example():
 def parse_args():
     """Parse command line arguments."""
-    import sys
-    # Check if it's a download-models command and handle separately
-    if len(sys.argv) > 1 and sys.argv[1] == "download-models":
-        # Return early with just the command to handle in main()
-        class DownloadModelsArgs:
-            command = "download-models"
-            # Add dummy attributes to prevent AttributeError
-            model = "granite3.3:2b"
-            debug = False
-        return DownloadModelsArgs()
     parser = argparse.ArgumentParser(description="AbstractVoice - Voice interactions with AI")
     # Examples and special commands
-    parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps, download-models (default: voice mode)")
+    parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps (default: voice mode)")
     # Voice mode arguments
     parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
     parser.add_argument("--api", default="http://localhost:11434/api/chat",
                       help="LLM API URL")
-    parser.add_argument("--model", default="granite3.3:2b",
+    parser.add_argument("--model", default="cogito:3b",
                       help="LLM model name")
-    parser.add_argument("--whisper", default="tiny",
-                      help="Whisper model to use (tiny, base, small, medium, large)")
+    parser.add_argument(
+        "--whisper",
+        default="base",
+        help="STT model size for faster-whisper (e.g. tiny|base|small|medium|large-v3).",
+    )
+    parser.add_argument(
+        "--cloning-engine",
+        default="f5_tts",
+        choices=["f5_tts", "chroma"],
+        help="Default cloning backend for new voices (f5_tts|chroma).",
+    )
+    parser.add_argument(
+        "--voice-mode",
+        default="off",
+        choices=["off", "wait", "stop", "full", "ptt"],
+        help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
+    )
     parser.add_argument("--no-listening", action="store_true",
-                      help="Disable speech-to-text (listening), TTS still works")
+                      help="Disable speech-to-text (listening). Alias for --voice-mode off.")
     parser.add_argument("--no-tts", action="store_true",
                       help="Disable text-to-speech (TTS), text-only mode")
     parser.add_argument("--system",
@@ -133,8 +135,8 @@ def parse_args():
     parser.add_argument("--max-tokens", type=int, default=4096,
                       help="Set maximum tokens for the LLM response")
     parser.add_argument("--language", "--lang", default="en",
-                      choices=["en", "fr", "es", "de", "it", "ru", "multilingual"],
-                      help="Voice language (en=English, fr=French, es=Spanish, de=German, it=Italian, ru=Russian, multilingual=All)")
+                      choices=["en", "fr", "es", "de", "ru", "zh"],
+                      help="Voice language (en=English, fr=French, es=Spanish, de=German, ru=Russian, zh=Chinese)")
     parser.add_argument("--tts-model",
                       help="Specific TTS model to use (overrides language default)")
     return parser.parse_args()
@@ -145,6 +147,10 @@ def main():
         # Parse command line arguments
         args = parse_args()
+        # Normalize aliases/compat flags.
+        if getattr(args, "no_listening", False):
+            args.voice_mode = "off"
         # Handle special commands and examples
         if args.command == "check-deps":
             from abstractvoice.dependency_check import check_dependencies
@@ -157,26 +163,25 @@ def main():
                     import traceback
                     traceback.print_exc()
             return
-        elif args.command == "download-models":
-            from abstractvoice.simple_model_manager import download_models_cli
-            # Pass remaining arguments to download_models_cli
-            import sys
-            original_argv = sys.argv
-            sys.argv = ["download-models"] + sys.argv[2:]  # Remove script name and "download-models"
-            try:
-                download_models_cli()
-            finally:
-                sys.argv = original_argv
-            return
         elif args.command == "cli":
             # Import and run CLI REPL example
             repl = VoiceREPL(
                 api_url=args.api,
                 model=args.model,
                 debug_mode=args.debug,
+                verbose_mode=args.verbose,
                 language=args.language,
-                tts_model=args.tts_model
+                tts_model=args.tts_model,
+                voice_mode=args.voice_mode,
+                disable_tts=args.no_tts,
+                cloning_engine=args.cloning_engine,
             )
+            # Apply requested STT model size (best-effort).
+            try:
+                if getattr(repl, "voice_manager", None) is not None:
+                    repl.voice_manager.set_whisper(str(args.whisper))
+            except Exception:
+                pass
             # Set temperature and max_tokens
             repl.temperature = args.temperature
             repl.max_tokens = args.max_tokens
@@ -200,23 +205,28 @@ def main():
             print_examples()
             return
-        # Show language information
-        language_names = {
-            'en': 'English', 'fr': 'French', 'es': 'Spanish',
-            'de': 'German', 'it': 'Italian', 'ru': 'Russian',
-            'multilingual': 'Multilingual'
-        }
-        lang_name = language_names.get(args.language, args.language)
-        print(f"Starting AbstractVoice voice interface ({lang_name})...")
-        # Initialize REPL with language support
+        # Default behavior: start the REPL (mic OFF unless --voice-mode is set).
+        lang_name = {
+            "en": "English",
+            "fr": "French",
+            "de": "German",
+            "es": "Spanish",
+            "ru": "Russian",
+            "zh": "Chinese",
+        }.get(str(args.language), str(args.language))
+        print(f"Starting AbstractVoice ({lang_name})…")
+        # Initialize REPL.
         repl = VoiceREPL(
             api_url=args.api,
             model=args.model,
             debug_mode=args.debug,
+            verbose_mode=args.verbose,
             language=args.language,
             tts_model=args.tts_model,
-            disable_tts=args.no_tts
+            voice_mode=args.voice_mode,
+            disable_tts=args.no_tts,
+            cloning_engine=args.cloning_engine,
         )
         # Set custom system prompt if provided
@@ -233,17 +243,12 @@ def main():
             print(f"Temperature: {args.temperature}")
             print(f"Max tokens: {args.max_tokens}")
-        # Change Whisper model if specified
-        if args.whisper and args.whisper != "tiny":
-            if repl.voice_manager.set_whisper(args.whisper):
-                if args.debug:
-                    print(f"Using Whisper model: {args.whisper}")
-        # Start in voice mode automatically unless --no-listening is specified
-        if not args.no_listening:
-            print("Activating voice mode. Say 'stop' to exit voice mode.")
-            # Use the existing voice mode method
-            repl.do_voice("on")
+        # Apply requested STT model size (best-effort).
+        try:
+            if getattr(repl, "voice_manager", None) is not None:
+                repl.voice_manager.set_whisper(str(args.whisper))
+        except Exception:
+            pass
         # Start the REPL
         repl.cmdloop()
@@ -258,7 +263,7 @@ def main():
             print(f"❌ TTS model download failed")
             print(f"   This is a TTS voice model issue, not your Ollama model")
             print(f"   Your Ollama model '{args.model}' is fine")
-            print(f"   Try: rm -rf ~/.cache/tts && pip install --force-reinstall coqui-tts")
+            print("   Try: pip install --upgrade abstractvoice")
             print(f"   Or check network connectivity for model downloads")
         elif "ollama" in error_msg or "11434" in error_msg:
             print(f"❌ Cannot connect to Ollama at {args.api}")
@@ -267,11 +272,7 @@ def main():
         elif "importerror" in error_msg or "no module" in error_msg:
             print(f"❌ Missing dependencies")
             print(f"   Try running: abstractvoice check-deps")
-            print(f"   Or install dependencies: pip install abstractvoice[voice-full]")
-        elif "espeak" in error_msg or "phoneme" in error_msg:
-            print(f"❌ Voice synthesis setup issue")
-            print(f"   Install espeak-ng for better voice quality: brew install espeak-ng")
-            print(f"   Or this might be a TTS model download issue")
+            print(f"   Or install extras: pip install \"abstractvoice[all]\"")
         else:
             print(f"❌ Application error: {e}")
             print(f"   Try running with --debug for more details")
@@ -282,4 +283,4 @@ def main():
             traceback.print_exc()
 if __name__ == "__main__":
-    main()
+    main()

abstractvoice/integrations/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ """Optional integration modules (kept dependency-light)."""
2	+

abstractvoice/integrations/abstractcore.py ADDED Viewed

@@ -0,0 +1,116 @@
+from __future__ import annotations
+import base64
+from typing import Any, Callable, Dict, List, Optional
+from ..artifacts import MediaStore, RuntimeArtifactStoreAdapter, get_artifact_id, is_artifact_ref
+def _require_abstractcore_tool():
+    try:
+        from abstractcore import tool  # type: ignore
+    except Exception as e:  # pragma: no cover
+        raise ImportError("AbstractCore is required for this integration. Install it via: pip install abstractcore") from e
+    return tool
+def _decode_base64_bytes(value: str) -> bytes:
+    raw = str(value or "").strip()
+    if not raw:
+        return b""
+    if raw.startswith("data:") and "," in raw:
+        raw = raw.split(",", 1)[1].strip()
+    raw = "".join(raw.split())
+    pad = (-len(raw)) % 4
+    if pad:
+        raw = raw + ("=" * pad)
+    return base64.b64decode(raw, validate=False)
+def _require_store(store: Any) -> MediaStore:
+    # If the caller passed an AbstractRuntime ArtifactStore, adapt it.
+    if hasattr(store, "store") and hasattr(store, "load") and not hasattr(store, "store_bytes"):
+        return RuntimeArtifactStoreAdapter(store)
+    if not hasattr(store, "store_bytes") or not hasattr(store, "load_bytes"):
+        raise TypeError("store must be a MediaStore-like object or an AbstractRuntime-like ArtifactStore")
+    return store  # type: ignore[return-value]
+def _resolve_audio_bytes(
+    *,
+    store: MediaStore,
+    artifact: Optional[Dict[str, Any]],
+    b64: Optional[str],
+    required: bool,
+) -> Optional[bytes]:
+    if artifact is not None:
+        if not is_artifact_ref(artifact):
+            raise ValueError("audio_artifact: expected an artifact ref dict like {'$artifact': '...'}")
+        return store.load_bytes(get_artifact_id(artifact))
+    if b64 is not None:
+        out = _decode_base64_bytes(b64)
+        if required and not out:
+            raise ValueError("audio_b64: decoded to empty bytes")
+        return out
+    if required:
+        raise ValueError("Either audio_artifact or audio_b64 is required")
+    return None
+def make_voice_tools(
+    *,
+    voice_manager: Any,
+    store: Any,
+) -> List[Callable[..., Any]]:
+    """Create AbstractCore tools for TTS/STT (artifact-first outputs)."""
+    tool = _require_abstractcore_tool()
+    media_store = _require_store(store)
+    @tool(
+        name="voice_tts",
+        description="Synthesize speech from text and return an audio artifact ref.",
+        tags=["voice", "tts", "audio"],
+        when_to_use="Use when you need to generate an audio rendition of text (TTS).",
+    )
+    def voice_tts(
+        text: str,
+        voice: Optional[str] = None,
+        format: str = "wav",
+        run_id: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        audio = voice_manager.speak_to_bytes(str(text), format=str(format), voice=voice)
+        return media_store.store_bytes(
+            bytes(audio),
+            content_type=f"audio/{str(format).lower()}",
+            filename=f"tts.{str(format).lower()}",
+            run_id=str(run_id) if run_id else None,
+            tags={"kind": "generated_media", "modality": "audio", "task": "tts"},
+        )
+    @tool(
+        name="audio_transcribe",
+        description="Transcribe audio (speech-to-text) and return text plus a transcript artifact ref.",
+        tags=["audio", "stt", "transcribe"],
+        when_to_use="Use when you need to convert speech audio into text (STT).",
+    )
+    def audio_transcribe(
+        audio_artifact: Optional[Dict[str, Any]] = None,
+        audio_b64: Optional[str] = None,
+        language: Optional[str] = None,
+        run_id: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        audio_bytes = _resolve_audio_bytes(store=media_store, artifact=audio_artifact, b64=audio_b64, required=True)
+        text = voice_manager.transcribe_from_bytes(bytes(audio_bytes or b""), language=language)
+        transcript_ref = media_store.store_bytes(
+            str(text).encode("utf-8"),
+            content_type="text/plain; charset=utf-8",
+            filename="transcript.txt",
+            run_id=str(run_id) if run_id else None,
+            tags={"kind": "derived_text", "modality": "audio", "task": "stt"},
+        )
+        return {"text": text, "transcript_artifact": transcript_ref}
+    return [voice_tts, audio_transcribe]

abstractvoice/integrations/abstractcore_plugin.py ADDED Viewed

@@ -0,0 +1,253 @@
+from __future__ import annotations
+from typing import Any, Dict, Optional, Union
+from ..artifacts import RuntimeArtifactStoreAdapter, is_artifact_ref, get_artifact_id
+class _BaseVoice:
+    def __init__(self, owner: Any):
+        self._owner = owner
+        self._vm = None
+    def _get_vm(self):
+        if self._vm is not None:
+            return self._vm
+        # Injection hook (tests / advanced embedding).
+        try:
+            cfg = getattr(self._owner, "config", None)
+            if isinstance(cfg, dict):
+                inst = cfg.get("voice_manager_instance")
+                if inst is not None:
+                    self._vm = inst
+                    return self._vm
+                factory = cfg.get("voice_manager_factory")
+                if callable(factory):
+                    self._vm = factory(self._owner)
+                    return self._vm
+        except Exception:
+            pass
+        # Lazy import (keeps plugin import-light).
+        from ..voice_manager import VoiceManager
+        # Best-effort config overrides (optional).
+        language = "en"
+        allow_downloads = True
+        try:
+            cfg = getattr(self._owner, "config", None)
+            if isinstance(cfg, dict):
+                if isinstance(cfg.get("voice_language"), str) and cfg["voice_language"].strip():
+                    language = str(cfg["voice_language"]).strip().lower()
+                if "voice_allow_downloads" in cfg:
+                    allow_downloads = bool(cfg.get("voice_allow_downloads"))
+        except Exception:
+            pass
+        self._vm = VoiceManager(language=language, allow_downloads=allow_downloads)
+        return self._vm
+    def _maybe_store_audio(
+        self,
+        audio_bytes: bytes,
+        *,
+        artifact_store: Any,
+        fmt: str,
+        run_id: Optional[str],
+        tags: Optional[Dict[str, str]],
+        metadata: Optional[Dict[str, Any]],
+    ):
+        if artifact_store is None:
+            return bytes(audio_bytes)
+        store = RuntimeArtifactStoreAdapter(artifact_store)
+        merged_tags: Dict[str, str] = {"kind": "generated_media", "modality": "audio", "task": "tts"}
+        if isinstance(tags, dict):
+            merged_tags.update({str(k): str(v) for k, v in tags.items()})
+        return store.store_bytes(
+            bytes(audio_bytes),
+            content_type=f"audio/{str(fmt).lower()}",
+            filename=f"tts.{str(fmt).lower()}",
+            run_id=str(run_id) if run_id else None,
+            tags=merged_tags,
+            metadata=metadata if isinstance(metadata, dict) else None,
+        )
+    def _resolve_audio_bytes(self, audio: Union[bytes, Dict[str, Any], str], *, artifact_store: Any) -> bytes:
+        if isinstance(audio, (bytes, bytearray)):
+            return bytes(audio)
+        if isinstance(audio, dict):
+            if not is_artifact_ref(audio):
+                raise ValueError("Expected an artifact ref dict like {'$artifact': '...'}")
+            if artifact_store is None:
+                raise ValueError("artifact_store is required to resolve artifact refs to bytes")
+            store = RuntimeArtifactStoreAdapter(artifact_store)
+            return store.load_bytes(get_artifact_id(audio))
+        if isinstance(audio, str):
+            from pathlib import Path
+            p = Path(audio).expanduser()
+            if p.exists() and p.is_file():
+                return p.read_bytes()
+            raise FileNotFoundError(f"File not found: {audio}")
+        raise TypeError("Unsupported input type; expected bytes, artifact-ref dict, or file path")
+    def _suffix_for_audio_ref(self, audio: Dict[str, Any], *, artifact_store: Any) -> str:
+        """Pick a best-effort file suffix for an audio artifact-ref dict."""
+        import mimetypes
+        from pathlib import Path
+        # Prefer explicit filename when provided (most clients include it).
+        try:
+            filename = audio.get("filename")
+            if isinstance(filename, str) and filename.strip():
+                suf = Path(filename.strip()).suffix
+                if isinstance(suf, str) and suf and len(suf) <= 10:
+                    return suf
+        except Exception:
+            pass
+        # Next: content_type from ref (or artifact metadata when available).
+        content_type: Optional[str] = None
+        try:
+            ct = audio.get("content_type")
+            if isinstance(ct, str) and ct.strip():
+                content_type = ct.strip()
+        except Exception:
+            content_type = None
+        if content_type is None and artifact_store is not None:
+            try:
+                store = RuntimeArtifactStoreAdapter(artifact_store)
+                meta = store.get_metadata(get_artifact_id(audio))
+                if isinstance(meta, dict):
+                    ct2 = meta.get("content_type")
+                    if isinstance(ct2, str) and ct2.strip():
+                        content_type = ct2.strip()
+                    fn2 = meta.get("filename")
+                    if isinstance(fn2, str) and fn2.strip():
+                        suf = Path(fn2.strip()).suffix
+                        if isinstance(suf, str) and suf and len(suf) <= 10:
+                            return suf
+            except Exception:
+                pass
+        if isinstance(content_type, str) and content_type.strip():
+            # Drop charset/params (e.g. "audio/wav; codecs=...").
+            base = content_type.split(";", 1)[0].strip().lower()
+            ext = mimetypes.guess_extension(base) or ""
+            if ext:
+                return ext
+        return ".bin"
+class _VoiceCapability(_BaseVoice):
+    backend_id = "abstractvoice:default"
+    def tts(
+        self,
+        text: str,
+        *,
+        voice: Optional[str] = None,
+        format: str = "wav",
+        artifact_store: Any = None,
+        run_id: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **_kwargs: Any,
+    ):
+        vm = self._get_vm()
+        audio = vm.speak_to_bytes(str(text), format=str(format), voice=voice)
+        return self._maybe_store_audio(audio, artifact_store=artifact_store, fmt=str(format), run_id=run_id, tags=tags, metadata=metadata)
+    def stt(
+        self,
+        audio: Union[bytes, Dict[str, Any], str],
+        *,
+        language: Optional[str] = None,
+        artifact_store: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **_kwargs: Any,
+    ) -> str:
+        _ = metadata
+        vm = self._get_vm()
+        if isinstance(audio, str):
+            return vm.transcribe_file(str(audio), language=language)
+        if isinstance(audio, dict):
+            import os
+            import tempfile
+            audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
+            suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
+                tmp_file.write(bytes(audio_bytes))
+                tmp_path = tmp_file.name
+            try:
+                return vm.transcribe_file(tmp_path, language=language)
+            finally:
+                try:
+                    os.unlink(tmp_path)
+                except Exception:
+                    pass
+        audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
+        return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
+class _AudioCapability(_BaseVoice):
+    backend_id = "abstractvoice:stt"
+    def transcribe(
+        self,
+        audio: Union[bytes, Dict[str, Any], str],
+        *,
+        language: Optional[str] = None,
+        artifact_store: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        **_kwargs: Any,
+    ) -> str:
+        _ = metadata
+        vm = self._get_vm()
+        if isinstance(audio, str):
+            return vm.transcribe_file(str(audio), language=language)
+        if isinstance(audio, dict):
+            import os
+            import tempfile
+            audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
+            suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
+                tmp_file.write(bytes(audio_bytes))
+                tmp_path = tmp_file.name
+            try:
+                return vm.transcribe_file(tmp_path, language=language)
+            finally:
+                try:
+                    os.unlink(tmp_path)
+                except Exception:
+                    pass
+        audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
+        return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
+def register(registry: Any) -> None:
+    """Register AbstractVoice as an AbstractCore capability plugin."""
+    registry.register_voice_backend(
+        backend_id=_VoiceCapability.backend_id,
+        factory=lambda owner: _VoiceCapability(owner),
+        priority=0,
+        description="AbstractVoice VoiceManager (TTS+STT).",
+        config_hint="Install voices/models with `abstractvoice-prefetch` for offline use (or allow downloads).",
+    )
+    registry.register_audio_backend(
+        backend_id=_AudioCapability.backend_id,
+        factory=lambda owner: _AudioCapability(owner),
+        priority=0,
+        description="AbstractVoice STT (speech-to-text).",
+        config_hint="Install STT models with `abstractvoice-prefetch --stt <size>` for offline use (or allow downloads).",
+    )

abstractvoice/prefetch.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Explicit model/artifact prefetch (cross-platform).
+Design rule: This must never run implicitly during normal library usage.
+Users/integrators call it explicitly after installation.
+"""
+from __future__ import annotations
+import argparse
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="abstractvoice-prefetch", description="AbstractVoice explicit prefetch")
+    parser.add_argument(
+        "--stt",
+        dest="stt_model",
+        default=None,
+        help="Prefetch faster-whisper model weights (e.g. tiny/base/small/medium/large-v3)",
+    )
+    parser.add_argument(
+        "--openf5",
+        action="store_true",
+        help="Prefetch OpenF5 artifacts for cloning (~5.4GB, requires abstractvoice[cloning])",
+    )
+    parser.add_argument(
+        "--chroma",
+        action="store_true",
+        help="Prefetch Chroma-4B artifacts (~14GB+, requires HF access; install abstractvoice[chroma] to run inference)",
+    )
+    parser.add_argument(
+        "--piper",
+        dest="piper_language",
+        default=None,
+        help="Prefetch Piper voice model for a language (e.g. en/fr/de).",
+    )
+    args = parser.parse_args(argv)
+    if not args.stt_model and not args.openf5 and not args.chroma and not args.piper_language:
+        parser.print_help()
+        return 2
+    if args.stt_model:
+        from abstractvoice.adapters.stt_faster_whisper import FasterWhisperAdapter
+        model = str(args.stt_model).strip()
+        print(f"Downloading STT model (faster-whisper): {model}")
+        stt = FasterWhisperAdapter(model_size=model, device="cpu", compute_type="int8", allow_downloads=True)
+        if not stt.is_available():
+            raise RuntimeError("STT model download/load failed.")
+        print("✅ STT model ready.")
+    if args.openf5:
+        from abstractvoice.cloning.engine_f5 import F5TTSVoiceCloningEngine
+        print("Downloading OpenF5 artifacts (cloning)…")
+        engine = F5TTSVoiceCloningEngine(debug=True)
+        engine.ensure_openf5_artifacts_downloaded()
+        print("✅ OpenF5 artifacts ready.")
+    if args.chroma:
+        from abstractvoice.cloning.engine_chroma import ChromaVoiceCloningEngine
+        print("Downloading Chroma artifacts (cloning)…")
+        engine = ChromaVoiceCloningEngine(debug=True)
+        engine.ensure_chroma_artifacts_downloaded()
+        print("✅ Chroma artifacts ready.")
+    if args.piper_language:
+        from abstractvoice.adapters.tts_piper import PiperTTSAdapter
+        lang = str(args.piper_language).strip().lower()
+        print(f"Downloading Piper voice model: {lang}")
+        piper = PiperTTSAdapter(language=lang, allow_downloads=True, auto_load=False)
+        if not piper.ensure_model_downloaded(lang):
+            raise RuntimeError("Piper model download failed.")
+        print("✅ Piper model ready.")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl

abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl