abstractvoice 0.5.2__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2408 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/text_sanitize.py +33 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.2.dist-info/METADATA +213 -0
- abstractvoice-0.6.2.dist-info/RECORD +53 -0
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.2.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.2.dist-info/METADATA +0 -1458
- abstractvoice-0.5.2.dist-info/RECORD +0 -23
- abstractvoice-0.5.2.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.2.dist-info → abstractvoice-0.6.2.dist-info}/top_level.txt +0 -0
|
@@ -16,14 +16,12 @@ def print_examples():
|
|
|
16
16
|
print(" web - Web API example")
|
|
17
17
|
print(" simple - Simple usage example")
|
|
18
18
|
print(" check-deps - Check dependency compatibility")
|
|
19
|
-
print(" download-models - Download TTS models for offline use")
|
|
20
19
|
print("\nUsage: abstractvoice <command> [--language <lang>] [args...]")
|
|
21
|
-
print("\nSupported languages: en, fr, es, de,
|
|
20
|
+
print("\nSupported languages: en, fr, es, de, ru, zh")
|
|
22
21
|
print("\nExamples:")
|
|
23
22
|
print(" abstractvoice cli --language fr # French CLI")
|
|
24
23
|
print(" abstractvoice simple --language ru # Russian simple example")
|
|
25
24
|
print(" abstractvoice check-deps # Check dependencies")
|
|
26
|
-
print(" abstractvoice download-models # Download models for offline use")
|
|
27
25
|
print(" abstractvoice # Direct voice mode (default)")
|
|
28
26
|
|
|
29
27
|
def simple_example():
|
|
@@ -97,33 +95,37 @@ def simple_example():
|
|
|
97
95
|
|
|
98
96
|
def parse_args():
|
|
99
97
|
"""Parse command line arguments."""
|
|
100
|
-
import sys
|
|
101
|
-
|
|
102
|
-
# Check if it's a download-models command and handle separately
|
|
103
|
-
if len(sys.argv) > 1 and sys.argv[1] == "download-models":
|
|
104
|
-
# Return early with just the command to handle in main()
|
|
105
|
-
class DownloadModelsArgs:
|
|
106
|
-
command = "download-models"
|
|
107
|
-
# Add dummy attributes to prevent AttributeError
|
|
108
|
-
model = "granite3.3:2b"
|
|
109
|
-
debug = False
|
|
110
|
-
return DownloadModelsArgs()
|
|
111
|
-
|
|
112
98
|
parser = argparse.ArgumentParser(description="AbstractVoice - Voice interactions with AI")
|
|
113
99
|
|
|
114
100
|
# Examples and special commands
|
|
115
|
-
parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps
|
|
101
|
+
parser.add_argument("command", nargs="?", help="Command to run: cli, web, simple, check-deps (default: voice mode)")
|
|
116
102
|
|
|
117
103
|
# Voice mode arguments
|
|
118
104
|
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
105
|
+
parser.add_argument("--verbose", action="store_true", help="Show per-turn performance stats")
|
|
119
106
|
parser.add_argument("--api", default="http://localhost:11434/api/chat",
|
|
120
107
|
help="LLM API URL")
|
|
121
|
-
parser.add_argument("--model", default="
|
|
108
|
+
parser.add_argument("--model", default="cogito:3b",
|
|
122
109
|
help="LLM model name")
|
|
123
|
-
parser.add_argument(
|
|
124
|
-
|
|
110
|
+
parser.add_argument(
|
|
111
|
+
"--whisper",
|
|
112
|
+
default="base",
|
|
113
|
+
help="STT model size for faster-whisper (e.g. tiny|base|small|medium|large-v3).",
|
|
114
|
+
)
|
|
115
|
+
parser.add_argument(
|
|
116
|
+
"--cloning-engine",
|
|
117
|
+
default="f5_tts",
|
|
118
|
+
choices=["f5_tts", "chroma"],
|
|
119
|
+
help="Default cloning backend for new voices (f5_tts|chroma).",
|
|
120
|
+
)
|
|
121
|
+
parser.add_argument(
|
|
122
|
+
"--voice-mode",
|
|
123
|
+
default="off",
|
|
124
|
+
choices=["off", "wait", "stop", "full", "ptt"],
|
|
125
|
+
help="Auto-start microphone voice mode (off|wait|stop|full|ptt). Default: off.",
|
|
126
|
+
)
|
|
125
127
|
parser.add_argument("--no-listening", action="store_true",
|
|
126
|
-
help="Disable speech-to-text (listening)
|
|
128
|
+
help="Disable speech-to-text (listening). Alias for --voice-mode off.")
|
|
127
129
|
parser.add_argument("--no-tts", action="store_true",
|
|
128
130
|
help="Disable text-to-speech (TTS), text-only mode")
|
|
129
131
|
parser.add_argument("--system",
|
|
@@ -133,8 +135,8 @@ def parse_args():
|
|
|
133
135
|
parser.add_argument("--max-tokens", type=int, default=4096,
|
|
134
136
|
help="Set maximum tokens for the LLM response")
|
|
135
137
|
parser.add_argument("--language", "--lang", default="en",
|
|
136
|
-
choices=["en", "fr", "es", "de", "
|
|
137
|
-
help="Voice language (en=English, fr=French, es=Spanish, de=German,
|
|
138
|
+
choices=["en", "fr", "es", "de", "ru", "zh"],
|
|
139
|
+
help="Voice language (en=English, fr=French, es=Spanish, de=German, ru=Russian, zh=Chinese)")
|
|
138
140
|
parser.add_argument("--tts-model",
|
|
139
141
|
help="Specific TTS model to use (overrides language default)")
|
|
140
142
|
return parser.parse_args()
|
|
@@ -145,6 +147,10 @@ def main():
|
|
|
145
147
|
# Parse command line arguments
|
|
146
148
|
args = parse_args()
|
|
147
149
|
|
|
150
|
+
# Normalize aliases/compat flags.
|
|
151
|
+
if getattr(args, "no_listening", False):
|
|
152
|
+
args.voice_mode = "off"
|
|
153
|
+
|
|
148
154
|
# Handle special commands and examples
|
|
149
155
|
if args.command == "check-deps":
|
|
150
156
|
from abstractvoice.dependency_check import check_dependencies
|
|
@@ -157,26 +163,25 @@ def main():
|
|
|
157
163
|
import traceback
|
|
158
164
|
traceback.print_exc()
|
|
159
165
|
return
|
|
160
|
-
elif args.command == "download-models":
|
|
161
|
-
from abstractvoice.simple_model_manager import download_models_cli
|
|
162
|
-
# Pass remaining arguments to download_models_cli
|
|
163
|
-
import sys
|
|
164
|
-
original_argv = sys.argv
|
|
165
|
-
sys.argv = ["download-models"] + sys.argv[2:] # Remove script name and "download-models"
|
|
166
|
-
try:
|
|
167
|
-
download_models_cli()
|
|
168
|
-
finally:
|
|
169
|
-
sys.argv = original_argv
|
|
170
|
-
return
|
|
171
166
|
elif args.command == "cli":
|
|
172
167
|
# Import and run CLI REPL example
|
|
173
168
|
repl = VoiceREPL(
|
|
174
169
|
api_url=args.api,
|
|
175
170
|
model=args.model,
|
|
176
171
|
debug_mode=args.debug,
|
|
172
|
+
verbose_mode=args.verbose,
|
|
177
173
|
language=args.language,
|
|
178
|
-
tts_model=args.tts_model
|
|
174
|
+
tts_model=args.tts_model,
|
|
175
|
+
voice_mode=args.voice_mode,
|
|
176
|
+
disable_tts=args.no_tts,
|
|
177
|
+
cloning_engine=args.cloning_engine,
|
|
179
178
|
)
|
|
179
|
+
# Apply requested STT model size (best-effort).
|
|
180
|
+
try:
|
|
181
|
+
if getattr(repl, "voice_manager", None) is not None:
|
|
182
|
+
repl.voice_manager.set_whisper(str(args.whisper))
|
|
183
|
+
except Exception:
|
|
184
|
+
pass
|
|
180
185
|
# Set temperature and max_tokens
|
|
181
186
|
repl.temperature = args.temperature
|
|
182
187
|
repl.max_tokens = args.max_tokens
|
|
@@ -200,23 +205,28 @@ def main():
|
|
|
200
205
|
print_examples()
|
|
201
206
|
return
|
|
202
207
|
|
|
203
|
-
#
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
208
|
+
# Default behavior: start the REPL (mic OFF unless --voice-mode is set).
|
|
209
|
+
lang_name = {
|
|
210
|
+
"en": "English",
|
|
211
|
+
"fr": "French",
|
|
212
|
+
"de": "German",
|
|
213
|
+
"es": "Spanish",
|
|
214
|
+
"ru": "Russian",
|
|
215
|
+
"zh": "Chinese",
|
|
216
|
+
}.get(str(args.language), str(args.language))
|
|
217
|
+
print(f"Starting AbstractVoice ({lang_name})…")
|
|
218
|
+
|
|
219
|
+
# Initialize REPL.
|
|
213
220
|
repl = VoiceREPL(
|
|
214
221
|
api_url=args.api,
|
|
215
222
|
model=args.model,
|
|
216
223
|
debug_mode=args.debug,
|
|
224
|
+
verbose_mode=args.verbose,
|
|
217
225
|
language=args.language,
|
|
218
226
|
tts_model=args.tts_model,
|
|
219
|
-
|
|
227
|
+
voice_mode=args.voice_mode,
|
|
228
|
+
disable_tts=args.no_tts,
|
|
229
|
+
cloning_engine=args.cloning_engine,
|
|
220
230
|
)
|
|
221
231
|
|
|
222
232
|
# Set custom system prompt if provided
|
|
@@ -233,17 +243,12 @@ def main():
|
|
|
233
243
|
print(f"Temperature: {args.temperature}")
|
|
234
244
|
print(f"Max tokens: {args.max_tokens}")
|
|
235
245
|
|
|
236
|
-
#
|
|
237
|
-
|
|
238
|
-
if repl
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
# Start in voice mode automatically unless --no-listening is specified
|
|
243
|
-
if not args.no_listening:
|
|
244
|
-
print("Activating voice mode. Say 'stop' to exit voice mode.")
|
|
245
|
-
# Use the existing voice mode method
|
|
246
|
-
repl.do_voice("on")
|
|
246
|
+
# Apply requested STT model size (best-effort).
|
|
247
|
+
try:
|
|
248
|
+
if getattr(repl, "voice_manager", None) is not None:
|
|
249
|
+
repl.voice_manager.set_whisper(str(args.whisper))
|
|
250
|
+
except Exception:
|
|
251
|
+
pass
|
|
247
252
|
|
|
248
253
|
# Start the REPL
|
|
249
254
|
repl.cmdloop()
|
|
@@ -258,7 +263,7 @@ def main():
|
|
|
258
263
|
print(f"❌ TTS model download failed")
|
|
259
264
|
print(f" This is a TTS voice model issue, not your Ollama model")
|
|
260
265
|
print(f" Your Ollama model '{args.model}' is fine")
|
|
261
|
-
print(
|
|
266
|
+
print(" Try: pip install --upgrade abstractvoice")
|
|
262
267
|
print(f" Or check network connectivity for model downloads")
|
|
263
268
|
elif "ollama" in error_msg or "11434" in error_msg:
|
|
264
269
|
print(f"❌ Cannot connect to Ollama at {args.api}")
|
|
@@ -267,11 +272,7 @@ def main():
|
|
|
267
272
|
elif "importerror" in error_msg or "no module" in error_msg:
|
|
268
273
|
print(f"❌ Missing dependencies")
|
|
269
274
|
print(f" Try running: abstractvoice check-deps")
|
|
270
|
-
print(f" Or install
|
|
271
|
-
elif "espeak" in error_msg or "phoneme" in error_msg:
|
|
272
|
-
print(f"❌ Voice synthesis setup issue")
|
|
273
|
-
print(f" Install espeak-ng for better voice quality: brew install espeak-ng")
|
|
274
|
-
print(f" Or this might be a TTS model download issue")
|
|
275
|
+
print(f" Or install extras: pip install \"abstractvoice[all]\"")
|
|
275
276
|
else:
|
|
276
277
|
print(f"❌ Application error: {e}")
|
|
277
278
|
print(f" Try running with --debug for more details")
|
|
@@ -282,4 +283,4 @@ def main():
|
|
|
282
283
|
traceback.print_exc()
|
|
283
284
|
|
|
284
285
|
if __name__ == "__main__":
|
|
285
|
-
main()
|
|
286
|
+
main()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from ..artifacts import MediaStore, RuntimeArtifactStoreAdapter, get_artifact_id, is_artifact_ref
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _require_abstractcore_tool():
|
|
10
|
+
try:
|
|
11
|
+
from abstractcore import tool # type: ignore
|
|
12
|
+
except Exception as e: # pragma: no cover
|
|
13
|
+
raise ImportError("AbstractCore is required for this integration. Install it via: pip install abstractcore") from e
|
|
14
|
+
return tool
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _decode_base64_bytes(value: str) -> bytes:
|
|
18
|
+
raw = str(value or "").strip()
|
|
19
|
+
if not raw:
|
|
20
|
+
return b""
|
|
21
|
+
if raw.startswith("data:") and "," in raw:
|
|
22
|
+
raw = raw.split(",", 1)[1].strip()
|
|
23
|
+
raw = "".join(raw.split())
|
|
24
|
+
pad = (-len(raw)) % 4
|
|
25
|
+
if pad:
|
|
26
|
+
raw = raw + ("=" * pad)
|
|
27
|
+
return base64.b64decode(raw, validate=False)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _require_store(store: Any) -> MediaStore:
|
|
31
|
+
# If the caller passed an AbstractRuntime ArtifactStore, adapt it.
|
|
32
|
+
if hasattr(store, "store") and hasattr(store, "load") and not hasattr(store, "store_bytes"):
|
|
33
|
+
return RuntimeArtifactStoreAdapter(store)
|
|
34
|
+
if not hasattr(store, "store_bytes") or not hasattr(store, "load_bytes"):
|
|
35
|
+
raise TypeError("store must be a MediaStore-like object or an AbstractRuntime-like ArtifactStore")
|
|
36
|
+
return store # type: ignore[return-value]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _resolve_audio_bytes(
|
|
40
|
+
*,
|
|
41
|
+
store: MediaStore,
|
|
42
|
+
artifact: Optional[Dict[str, Any]],
|
|
43
|
+
b64: Optional[str],
|
|
44
|
+
required: bool,
|
|
45
|
+
) -> Optional[bytes]:
|
|
46
|
+
if artifact is not None:
|
|
47
|
+
if not is_artifact_ref(artifact):
|
|
48
|
+
raise ValueError("audio_artifact: expected an artifact ref dict like {'$artifact': '...'}")
|
|
49
|
+
return store.load_bytes(get_artifact_id(artifact))
|
|
50
|
+
if b64 is not None:
|
|
51
|
+
out = _decode_base64_bytes(b64)
|
|
52
|
+
if required and not out:
|
|
53
|
+
raise ValueError("audio_b64: decoded to empty bytes")
|
|
54
|
+
return out
|
|
55
|
+
if required:
|
|
56
|
+
raise ValueError("Either audio_artifact or audio_b64 is required")
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def make_voice_tools(
    *,
    voice_manager: Any,
    store: Any,
) -> List[Callable[..., Any]]:
    """Build the AbstractCore TTS/STT tools bound to a voice manager and store.

    Args:
        voice_manager: Object providing ``speak_to_bytes`` and
            ``transcribe_from_bytes``.
        store: A MediaStore-like object, or an AbstractRuntime-like
            ArtifactStore (validated/adapted by ``_require_store``).

    Returns:
        ``[voice_tts, audio_transcribe]``, both wrapped with AbstractCore's
        ``tool`` decorator. Outputs are stored in ``store`` and returned as
        whatever ``store_bytes`` returns.
    """
    as_tool = _require_abstractcore_tool()
    artifacts = _require_store(store)

    def _run_id_or_none(run_id: Optional[str]) -> Optional[str]:
        # Falsy run ids (None, "") are stored as None.
        return str(run_id) if run_id else None

    @as_tool(
        name="voice_tts",
        description="Synthesize speech from text and return an audio artifact ref.",
        tags=["voice", "tts", "audio"],
        when_to_use="Use when you need to generate an audio rendition of text (TTS).",
    )
    def voice_tts(
        text: str,
        voice: Optional[str] = None,
        format: str = "wav",
        run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        # The voice manager receives the format as given; only the stored
        # content type and filename use the lower-cased form.
        rendered = voice_manager.speak_to_bytes(str(text), format=str(format), voice=voice)
        fmt = str(format).lower()
        return artifacts.store_bytes(
            bytes(rendered),
            content_type=f"audio/{fmt}",
            filename=f"tts.{fmt}",
            run_id=_run_id_or_none(run_id),
            tags={"kind": "generated_media", "modality": "audio", "task": "tts"},
        )

    @as_tool(
        name="audio_transcribe",
        description="Transcribe audio (speech-to-text) and return text plus a transcript artifact ref.",
        tags=["audio", "stt", "transcribe"],
        when_to_use="Use when you need to convert speech audio into text (STT).",
    )
    def audio_transcribe(
        audio_artifact: Optional[Dict[str, Any]] = None,
        audio_b64: Optional[str] = None,
        language: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        payload = _resolve_audio_bytes(
            store=artifacts,
            artifact=audio_artifact,
            b64=audio_b64,
            required=True,
        )
        text = voice_manager.transcribe_from_bytes(bytes(payload or b""), language=language)

        # Persist the transcript too, so callers can pass it around by ref.
        transcript_ref = artifacts.store_bytes(
            str(text).encode("utf-8"),
            content_type="text/plain; charset=utf-8",
            filename="transcript.txt",
            run_id=_run_id_or_none(run_id),
            tags={"kind": "derived_text", "modality": "audio", "task": "stt"},
        )
        return {"text": text, "transcript_artifact": transcript_ref}

    return [voice_tts, audio_transcribe]
|
|
116
|
+
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional, Union
|
|
4
|
+
|
|
5
|
+
from ..artifacts import RuntimeArtifactStoreAdapter, is_artifact_ref, get_artifact_id
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _BaseVoice:
|
|
9
|
+
def __init__(self, owner: Any):
|
|
10
|
+
self._owner = owner
|
|
11
|
+
self._vm = None
|
|
12
|
+
|
|
13
|
+
def _get_vm(self):
|
|
14
|
+
if self._vm is not None:
|
|
15
|
+
return self._vm
|
|
16
|
+
|
|
17
|
+
# Injection hook (tests / advanced embedding).
|
|
18
|
+
try:
|
|
19
|
+
cfg = getattr(self._owner, "config", None)
|
|
20
|
+
if isinstance(cfg, dict):
|
|
21
|
+
inst = cfg.get("voice_manager_instance")
|
|
22
|
+
if inst is not None:
|
|
23
|
+
self._vm = inst
|
|
24
|
+
return self._vm
|
|
25
|
+
factory = cfg.get("voice_manager_factory")
|
|
26
|
+
if callable(factory):
|
|
27
|
+
self._vm = factory(self._owner)
|
|
28
|
+
return self._vm
|
|
29
|
+
except Exception:
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
# Lazy import (keeps plugin import-light).
|
|
33
|
+
from ..voice_manager import VoiceManager
|
|
34
|
+
|
|
35
|
+
# Best-effort config overrides (optional).
|
|
36
|
+
language = "en"
|
|
37
|
+
allow_downloads = True
|
|
38
|
+
try:
|
|
39
|
+
cfg = getattr(self._owner, "config", None)
|
|
40
|
+
if isinstance(cfg, dict):
|
|
41
|
+
if isinstance(cfg.get("voice_language"), str) and cfg["voice_language"].strip():
|
|
42
|
+
language = str(cfg["voice_language"]).strip().lower()
|
|
43
|
+
if "voice_allow_downloads" in cfg:
|
|
44
|
+
allow_downloads = bool(cfg.get("voice_allow_downloads"))
|
|
45
|
+
except Exception:
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
self._vm = VoiceManager(language=language, allow_downloads=allow_downloads)
|
|
49
|
+
return self._vm
|
|
50
|
+
|
|
51
|
+
def _maybe_store_audio(
|
|
52
|
+
self,
|
|
53
|
+
audio_bytes: bytes,
|
|
54
|
+
*,
|
|
55
|
+
artifact_store: Any,
|
|
56
|
+
fmt: str,
|
|
57
|
+
run_id: Optional[str],
|
|
58
|
+
tags: Optional[Dict[str, str]],
|
|
59
|
+
metadata: Optional[Dict[str, Any]],
|
|
60
|
+
):
|
|
61
|
+
if artifact_store is None:
|
|
62
|
+
return bytes(audio_bytes)
|
|
63
|
+
store = RuntimeArtifactStoreAdapter(artifact_store)
|
|
64
|
+
merged_tags: Dict[str, str] = {"kind": "generated_media", "modality": "audio", "task": "tts"}
|
|
65
|
+
if isinstance(tags, dict):
|
|
66
|
+
merged_tags.update({str(k): str(v) for k, v in tags.items()})
|
|
67
|
+
return store.store_bytes(
|
|
68
|
+
bytes(audio_bytes),
|
|
69
|
+
content_type=f"audio/{str(fmt).lower()}",
|
|
70
|
+
filename=f"tts.{str(fmt).lower()}",
|
|
71
|
+
run_id=str(run_id) if run_id else None,
|
|
72
|
+
tags=merged_tags,
|
|
73
|
+
metadata=metadata if isinstance(metadata, dict) else None,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
def _resolve_audio_bytes(self, audio: Union[bytes, Dict[str, Any], str], *, artifact_store: Any) -> bytes:
|
|
77
|
+
if isinstance(audio, (bytes, bytearray)):
|
|
78
|
+
return bytes(audio)
|
|
79
|
+
if isinstance(audio, dict):
|
|
80
|
+
if not is_artifact_ref(audio):
|
|
81
|
+
raise ValueError("Expected an artifact ref dict like {'$artifact': '...'}")
|
|
82
|
+
if artifact_store is None:
|
|
83
|
+
raise ValueError("artifact_store is required to resolve artifact refs to bytes")
|
|
84
|
+
store = RuntimeArtifactStoreAdapter(artifact_store)
|
|
85
|
+
return store.load_bytes(get_artifact_id(audio))
|
|
86
|
+
if isinstance(audio, str):
|
|
87
|
+
from pathlib import Path
|
|
88
|
+
|
|
89
|
+
p = Path(audio).expanduser()
|
|
90
|
+
if p.exists() and p.is_file():
|
|
91
|
+
return p.read_bytes()
|
|
92
|
+
raise FileNotFoundError(f"File not found: {audio}")
|
|
93
|
+
raise TypeError("Unsupported input type; expected bytes, artifact-ref dict, or file path")
|
|
94
|
+
|
|
95
|
+
def _suffix_for_audio_ref(self, audio: Dict[str, Any], *, artifact_store: Any) -> str:
|
|
96
|
+
"""Pick a best-effort file suffix for an audio artifact-ref dict."""
|
|
97
|
+
import mimetypes
|
|
98
|
+
from pathlib import Path
|
|
99
|
+
|
|
100
|
+
# Prefer explicit filename when provided (most clients include it).
|
|
101
|
+
try:
|
|
102
|
+
filename = audio.get("filename")
|
|
103
|
+
if isinstance(filename, str) and filename.strip():
|
|
104
|
+
suf = Path(filename.strip()).suffix
|
|
105
|
+
if isinstance(suf, str) and suf and len(suf) <= 10:
|
|
106
|
+
return suf
|
|
107
|
+
except Exception:
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
# Next: content_type from ref (or artifact metadata when available).
|
|
111
|
+
content_type: Optional[str] = None
|
|
112
|
+
try:
|
|
113
|
+
ct = audio.get("content_type")
|
|
114
|
+
if isinstance(ct, str) and ct.strip():
|
|
115
|
+
content_type = ct.strip()
|
|
116
|
+
except Exception:
|
|
117
|
+
content_type = None
|
|
118
|
+
|
|
119
|
+
if content_type is None and artifact_store is not None:
|
|
120
|
+
try:
|
|
121
|
+
store = RuntimeArtifactStoreAdapter(artifact_store)
|
|
122
|
+
meta = store.get_metadata(get_artifact_id(audio))
|
|
123
|
+
if isinstance(meta, dict):
|
|
124
|
+
ct2 = meta.get("content_type")
|
|
125
|
+
if isinstance(ct2, str) and ct2.strip():
|
|
126
|
+
content_type = ct2.strip()
|
|
127
|
+
fn2 = meta.get("filename")
|
|
128
|
+
if isinstance(fn2, str) and fn2.strip():
|
|
129
|
+
suf = Path(fn2.strip()).suffix
|
|
130
|
+
if isinstance(suf, str) and suf and len(suf) <= 10:
|
|
131
|
+
return suf
|
|
132
|
+
except Exception:
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
if isinstance(content_type, str) and content_type.strip():
|
|
136
|
+
# Drop charset/params (e.g. "audio/wav; codecs=...").
|
|
137
|
+
base = content_type.split(";", 1)[0].strip().lower()
|
|
138
|
+
ext = mimetypes.guess_extension(base) or ""
|
|
139
|
+
if ext:
|
|
140
|
+
return ext
|
|
141
|
+
|
|
142
|
+
return ".bin"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class _VoiceCapability(_BaseVoice):
    """Voice backend (TTS + STT) registered under ``abstractvoice:default``."""

    backend_id = "abstractvoice:default"

    def tts(
        self,
        text: str,
        *,
        voice: Optional[str] = None,
        format: str = "wav",
        artifact_store: Any = None,
        run_id: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **_kwargs: Any,
    ):
        """Synthesize ``text`` to audio.

        Returns raw ``bytes`` when ``artifact_store`` is ``None``; otherwise
        the audio is stored and the store's return value is passed back (see
        ``_maybe_store_audio``). Extra keyword arguments are accepted and
        ignored.
        """
        vm = self._get_vm()
        audio = vm.speak_to_bytes(str(text), format=str(format), voice=voice)
        return self._maybe_store_audio(audio, artifact_store=artifact_store, fmt=str(format), run_id=run_id, tags=tags, metadata=metadata)

    def stt(
        self,
        audio: Union[bytes, Dict[str, Any], str],
        *,
        language: Optional[str] = None,
        artifact_store: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
        **_kwargs: Any,
    ) -> str:
        """Transcribe audio given as bytes, an artifact-ref dict, or a file path.

        * ``str``: treated as a file path (a leading ``~`` is expanded) and
          handed to ``transcribe_file``.
        * ``dict``: resolved through ``artifact_store``, written to a temporary
          file with a best-effort suffix, transcribed, and the file removed.
        * anything else: resolved by ``_resolve_audio_bytes`` and handed to
          ``transcribe_from_bytes``.

        ``metadata`` and extra keyword arguments are accepted and ignored.
        """
        _ = metadata
        vm = self._get_vm()
        if isinstance(audio, str):
            import os

            # Expand "~" here: string inputs never reach _resolve_audio_bytes,
            # which is the only other place that expands it.
            return vm.transcribe_file(os.path.expanduser(audio), language=language)

        if isinstance(audio, dict):
            import os
            import tempfile

            audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
            suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
            tmp_path = None
            try:
                # delete=False so the file can be reopened by path; the
                # ``finally`` below removes it even if the write fails.
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
                    tmp_path = tmp_file.name
                    tmp_file.write(bytes(audio_bytes))
                return vm.transcribe_file(tmp_path, language=language)
            finally:
                if tmp_path is not None:
                    try:
                        os.unlink(tmp_path)
                    except Exception:
                        # Best-effort cleanup; never mask the transcription result/error.
                        pass

        audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
        return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class _AudioCapability(_BaseVoice):
    """Audio backend (STT only) registered under ``abstractvoice:stt``."""

    backend_id = "abstractvoice:stt"

    def transcribe(
        self,
        audio: Union[bytes, Dict[str, Any], str],
        *,
        language: Optional[str] = None,
        artifact_store: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
        **_kwargs: Any,
    ) -> str:
        """Transcribe audio given as bytes, an artifact-ref dict, or a file path.

        * ``str``: treated as a file path (a leading ``~`` is expanded) and
          handed to ``transcribe_file``.
        * ``dict``: resolved through ``artifact_store``, written to a temporary
          file with a best-effort suffix, transcribed, and the file removed.
        * anything else: resolved by ``_resolve_audio_bytes`` and handed to
          ``transcribe_from_bytes``.

        ``metadata`` and extra keyword arguments are accepted and ignored.
        """
        _ = metadata
        vm = self._get_vm()
        if isinstance(audio, str):
            import os

            # Expand "~" here: string inputs never reach _resolve_audio_bytes,
            # which is the only other place that expands it.
            return vm.transcribe_file(os.path.expanduser(audio), language=language)

        if isinstance(audio, dict):
            import os
            import tempfile

            audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
            suffix = self._suffix_for_audio_ref(audio, artifact_store=artifact_store)
            tmp_path = None
            try:
                # delete=False so the file can be reopened by path; the
                # ``finally`` below removes it even if the write fails.
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
                    tmp_path = tmp_file.name
                    tmp_file.write(bytes(audio_bytes))
                return vm.transcribe_file(tmp_path, language=language)
            finally:
                if tmp_path is not None:
                    try:
                        os.unlink(tmp_path)
                    except Exception:
                        # Best-effort cleanup; never mask the transcription result/error.
                        pass

        audio_bytes = self._resolve_audio_bytes(audio, artifact_store=artifact_store)
        return vm.transcribe_from_bytes(bytes(audio_bytes), language=language)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def register(registry: Any) -> None:
    """Register AbstractVoice's backends with an AbstractCore capability registry.

    Registers ``_VoiceCapability`` through ``registry.register_voice_backend``
    and ``_AudioCapability`` through ``registry.register_audio_backend``, both
    with priority 0 and a factory that takes the owning object.
    """

    def _build_voice(owner):
        return _VoiceCapability(owner)

    def _build_audio(owner):
        return _AudioCapability(owner)

    voice_backend = dict(
        backend_id=_VoiceCapability.backend_id,
        factory=_build_voice,
        priority=0,
        description="AbstractVoice VoiceManager (TTS+STT).",
        config_hint="Install voices/models with `abstractvoice-prefetch` for offline use (or allow downloads).",
    )
    registry.register_voice_backend(**voice_backend)

    audio_backend = dict(
        backend_id=_AudioCapability.backend_id,
        factory=_build_audio,
        priority=0,
        description="AbstractVoice STT (speech-to-text).",
        config_hint="Install STT models with `abstractvoice-prefetch --stt <size>` for offline use (or allow downloads).",
    )
    registry.register_audio_backend(**audio_backend)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Explicit model/artifact prefetch (cross-platform).
|
|
2
|
+
|
|
3
|
+
Design rule: This must never run implicitly during normal library usage.
|
|
4
|
+
Users/integrators call it explicitly after installation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for ``abstractvoice-prefetch``.

    Downloads whichever artifacts were requested, in this order: STT model,
    OpenF5 artifacts, Chroma artifacts, Piper voice model. Each backend module
    is imported only when its option is selected.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after all requested downloads complete, or ``2`` (after printing
        help) when no download option was given.

    Raises:
        RuntimeError: if the STT model is not available after loading, or the
            Piper model download reports failure.
    """
    cli = argparse.ArgumentParser(prog="abstractvoice-prefetch", description="AbstractVoice explicit prefetch")
    cli.add_argument(
        "--stt",
        dest="stt_model",
        default=None,
        help="Prefetch faster-whisper model weights (e.g. tiny/base/small/medium/large-v3)",
    )
    cli.add_argument(
        "--openf5",
        action="store_true",
        help="Prefetch OpenF5 artifacts for cloning (~5.4GB, requires abstractvoice[cloning])",
    )
    cli.add_argument(
        "--chroma",
        action="store_true",
        help="Prefetch Chroma-4B artifacts (~14GB+, requires HF access; install abstractvoice[chroma] to run inference)",
    )
    cli.add_argument(
        "--piper",
        dest="piper_language",
        default=None,
        help="Prefetch Piper voice model for a language (e.g. en/fr/de).",
    )
    opts = cli.parse_args(argv)

    # Nothing requested: show usage and signal a usage error.
    requested = (opts.stt_model, opts.openf5, opts.chroma, opts.piper_language)
    if not any(requested):
        cli.print_help()
        return 2

    if opts.stt_model:
        from abstractvoice.adapters.stt_faster_whisper import FasterWhisperAdapter

        model_size = str(opts.stt_model).strip()
        print(f"Downloading STT model (faster-whisper): {model_size}")
        adapter = FasterWhisperAdapter(
            model_size=model_size,
            device="cpu",
            compute_type="int8",
            allow_downloads=True,
        )
        if not adapter.is_available():
            raise RuntimeError("STT model download/load failed.")
        print("✅ STT model ready.")

    if opts.openf5:
        from abstractvoice.cloning.engine_f5 import F5TTSVoiceCloningEngine

        print("Downloading OpenF5 artifacts (cloning)…")
        f5_engine = F5TTSVoiceCloningEngine(debug=True)
        f5_engine.ensure_openf5_artifacts_downloaded()
        print("✅ OpenF5 artifacts ready.")

    if opts.chroma:
        from abstractvoice.cloning.engine_chroma import ChromaVoiceCloningEngine

        print("Downloading Chroma artifacts (cloning)…")
        chroma_engine = ChromaVoiceCloningEngine(debug=True)
        chroma_engine.ensure_chroma_artifacts_downloaded()
        print("✅ Chroma artifacts ready.")

    if opts.piper_language:
        from abstractvoice.adapters.tts_piper import PiperTTSAdapter

        language = str(opts.piper_language).strip().lower()
        print(f"Downloading Piper voice model: {language}")
        # auto_load=False is passed through as in the original call; the
        # download itself is triggered explicitly on the next line.
        voice = PiperTTSAdapter(language=language, allow_downloads=True, auto_load=False)
        if not voice.ensure_model_downloaded(language):
            raise RuntimeError("Piper model download failed.")
        print("✅ Piper model ready.")

    return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
raise SystemExit(main())
|