autobyteus 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autobyteus/agent/context/agent_config.py +6 -1
- autobyteus/agent/context/agent_runtime_state.py +7 -1
- autobyteus/agent/handlers/llm_user_message_ready_event_handler.py +30 -7
- autobyteus/agent/handlers/tool_result_event_handler.py +100 -88
- autobyteus/agent/handlers/user_input_message_event_handler.py +22 -25
- autobyteus/agent/llm_response_processor/provider_aware_tool_usage_processor.py +7 -1
- autobyteus/agent/message/__init__.py +7 -5
- autobyteus/agent/message/agent_input_user_message.py +6 -16
- autobyteus/agent/message/context_file.py +24 -24
- autobyteus/agent/message/context_file_type.py +29 -8
- autobyteus/agent/message/multimodal_message_builder.py +47 -0
- autobyteus/agent/streaming/stream_event_payloads.py +23 -4
- autobyteus/agent/system_prompt_processor/tool_manifest_injector_processor.py +6 -2
- autobyteus/agent/tool_invocation.py +27 -2
- autobyteus/agent_team/agent_team_builder.py +22 -1
- autobyteus/agent_team/bootstrap_steps/agent_configuration_preparation_step.py +9 -2
- autobyteus/agent_team/context/agent_team_config.py +1 -0
- autobyteus/agent_team/context/agent_team_runtime_state.py +0 -2
- autobyteus/llm/api/autobyteus_llm.py +33 -33
- autobyteus/llm/api/bedrock_llm.py +13 -5
- autobyteus/llm/api/claude_llm.py +13 -27
- autobyteus/llm/api/gemini_llm.py +108 -42
- autobyteus/llm/api/groq_llm.py +4 -3
- autobyteus/llm/api/mistral_llm.py +97 -51
- autobyteus/llm/api/nvidia_llm.py +6 -5
- autobyteus/llm/api/ollama_llm.py +37 -12
- autobyteus/llm/api/openai_compatible_llm.py +91 -91
- autobyteus/llm/autobyteus_provider.py +1 -1
- autobyteus/llm/base_llm.py +42 -139
- autobyteus/llm/extensions/base_extension.py +6 -6
- autobyteus/llm/extensions/token_usage_tracking_extension.py +3 -2
- autobyteus/llm/llm_factory.py +131 -61
- autobyteus/llm/ollama_provider_resolver.py +1 -0
- autobyteus/llm/providers.py +1 -0
- autobyteus/llm/token_counter/token_counter_factory.py +3 -1
- autobyteus/llm/user_message.py +43 -35
- autobyteus/llm/utils/llm_config.py +34 -18
- autobyteus/llm/utils/media_payload_formatter.py +99 -0
- autobyteus/llm/utils/messages.py +32 -25
- autobyteus/llm/utils/response_types.py +9 -3
- autobyteus/llm/utils/token_usage.py +6 -5
- autobyteus/multimedia/__init__.py +31 -0
- autobyteus/multimedia/audio/__init__.py +11 -0
- autobyteus/multimedia/audio/api/__init__.py +4 -0
- autobyteus/multimedia/audio/api/autobyteus_audio_client.py +59 -0
- autobyteus/multimedia/audio/api/gemini_audio_client.py +219 -0
- autobyteus/multimedia/audio/audio_client_factory.py +120 -0
- autobyteus/multimedia/audio/audio_model.py +97 -0
- autobyteus/multimedia/audio/autobyteus_audio_provider.py +108 -0
- autobyteus/multimedia/audio/base_audio_client.py +40 -0
- autobyteus/multimedia/image/__init__.py +11 -0
- autobyteus/multimedia/image/api/__init__.py +9 -0
- autobyteus/multimedia/image/api/autobyteus_image_client.py +97 -0
- autobyteus/multimedia/image/api/gemini_image_client.py +188 -0
- autobyteus/multimedia/image/api/openai_image_client.py +142 -0
- autobyteus/multimedia/image/autobyteus_image_provider.py +109 -0
- autobyteus/multimedia/image/base_image_client.py +67 -0
- autobyteus/multimedia/image/image_client_factory.py +118 -0
- autobyteus/multimedia/image/image_model.py +97 -0
- autobyteus/multimedia/providers.py +5 -0
- autobyteus/multimedia/runtimes.py +8 -0
- autobyteus/multimedia/utils/__init__.py +10 -0
- autobyteus/multimedia/utils/api_utils.py +19 -0
- autobyteus/multimedia/utils/multimedia_config.py +29 -0
- autobyteus/multimedia/utils/response_types.py +13 -0
- autobyteus/task_management/tools/publish_task_plan.py +4 -16
- autobyteus/task_management/tools/update_task_status.py +4 -19
- autobyteus/tools/__init__.py +5 -4
- autobyteus/tools/base_tool.py +98 -29
- autobyteus/tools/browser/standalone/__init__.py +0 -1
- autobyteus/tools/google_search.py +149 -0
- autobyteus/tools/mcp/schema_mapper.py +29 -71
- autobyteus/tools/multimedia/__init__.py +8 -0
- autobyteus/tools/multimedia/audio_tools.py +116 -0
- autobyteus/tools/multimedia/image_tools.py +186 -0
- autobyteus/tools/parameter_schema.py +82 -89
- autobyteus/tools/pydantic_schema_converter.py +81 -0
- autobyteus/tools/tool_category.py +1 -0
- autobyteus/tools/usage/formatters/default_json_example_formatter.py +89 -20
- autobyteus/tools/usage/formatters/default_xml_example_formatter.py +115 -41
- autobyteus/tools/usage/formatters/default_xml_schema_formatter.py +50 -20
- autobyteus/tools/usage/formatters/gemini_json_example_formatter.py +55 -22
- autobyteus/tools/usage/formatters/google_json_example_formatter.py +54 -21
- autobyteus/tools/usage/formatters/openai_json_example_formatter.py +53 -23
- autobyteus/tools/usage/parsers/default_xml_tool_usage_parser.py +270 -94
- autobyteus/tools/usage/parsers/provider_aware_tool_usage_parser.py +5 -2
- autobyteus/tools/usage/providers/tool_manifest_provider.py +43 -16
- autobyteus/tools/usage/registries/tool_formatting_registry.py +9 -2
- autobyteus/tools/usage/registries/tool_usage_parser_registry.py +9 -2
- autobyteus-1.1.7.dist-info/METADATA +204 -0
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.7.dist-info}/RECORD +98 -71
- examples/run_browser_agent.py +1 -1
- examples/run_google_slides_agent.py +2 -2
- examples/run_mcp_google_slides_client.py +1 -1
- examples/run_sqlite_agent.py +1 -1
- autobyteus/llm/utils/image_payload_formatter.py +0 -89
- autobyteus/tools/ask_user_input.py +0 -40
- autobyteus/tools/browser/standalone/factory/google_search_factory.py +0 -25
- autobyteus/tools/browser/standalone/google_search_ui.py +0 -126
- autobyteus-1.1.5.dist-info/METADATA +0 -161
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.7.dist-info}/WHEEL +0 -0
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.7.dist-info}/licenses/LICENSE +0 -0
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import base64
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import uuid
|
|
6
|
+
import wave
|
|
7
|
+
from typing import Optional, Dict, Any, TYPE_CHECKING, List
|
|
8
|
+
|
|
9
|
+
# Old/legacy Gemini SDK (as requested)
|
|
10
|
+
import google.generativeai as genai
|
|
11
|
+
|
|
12
|
+
from autobyteus.multimedia.audio.base_audio_client import BaseAudioClient
|
|
13
|
+
from autobyteus.multimedia.utils.response_types import SpeechGenerationResponse
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from autobyteus.multimedia.audio.audio_model import AudioModel
|
|
17
|
+
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _save_audio_bytes_to_wav(
|
|
23
|
+
pcm_bytes: bytes,
|
|
24
|
+
channels: int = 1,
|
|
25
|
+
rate: int = 24000,
|
|
26
|
+
sample_width: int = 2,
|
|
27
|
+
) -> str:
|
|
28
|
+
"""
|
|
29
|
+
Save raw PCM (s16le) audio bytes to a temporary WAV file and return the file path.
|
|
30
|
+
|
|
31
|
+
Gemini TTS models output mono, 24 kHz, 16-bit PCM by default.
|
|
32
|
+
"""
|
|
33
|
+
temp_dir = "/tmp/autobyteus_audio"
|
|
34
|
+
os.makedirs(temp_dir, exist_ok=True)
|
|
35
|
+
file_path = os.path.join(temp_dir, f"{uuid.uuid4()}.wav")
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
with wave.open(file_path, "wb") as wf:
|
|
39
|
+
wf.setnchannels(channels)
|
|
40
|
+
wf.setsampwidth(sample_width) # 2 bytes => 16-bit
|
|
41
|
+
wf.setframerate(rate)
|
|
42
|
+
wf.writeframes(pcm_bytes)
|
|
43
|
+
logger.info("Successfully saved generated audio to %s", file_path)
|
|
44
|
+
return file_path
|
|
45
|
+
except Exception as e:
|
|
46
|
+
logger.error("Failed to save audio to WAV file at %s: %s", file_path, e)
|
|
47
|
+
raise
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _extract_inline_audio_bytes(response) -> bytes:
|
|
51
|
+
"""
|
|
52
|
+
Extract inline audio bytes from a google.generativeai response.
|
|
53
|
+
|
|
54
|
+
The legacy SDK returns a Response object with candidates -> content -> parts[0].inline_data.data.
|
|
55
|
+
Depending on version, `.data` can be bytes or base64-encoded str.
|
|
56
|
+
"""
|
|
57
|
+
try:
|
|
58
|
+
# Access the first candidate's first part's inline_data
|
|
59
|
+
part = response.candidates[0].content.parts[0]
|
|
60
|
+
inline = getattr(part, "inline_data", None)
|
|
61
|
+
if not inline or not hasattr(inline, "data"):
|
|
62
|
+
raise ValueError("No inline audio data found in response.")
|
|
63
|
+
data = inline.data
|
|
64
|
+
if isinstance(data, bytes):
|
|
65
|
+
return data
|
|
66
|
+
if isinstance(data, str):
|
|
67
|
+
return base64.b64decode(data)
|
|
68
|
+
raise TypeError(f"Unexpected inline_data.data type: {type(data)}")
|
|
69
|
+
except Exception as e:
|
|
70
|
+
logger.error("Failed to extract audio from response: %s", e)
|
|
71
|
+
raise
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class GeminiAudioClient(BaseAudioClient):
    """
    An audio client that uses Google's Gemini models for TTS via the *legacy* SDK
    (`google.generativeai`).

    Usage notes:
    - Ensure your model value is a TTS-capable model (e.g. "gemini-2.5-flash-preview-tts"
      or "gemini-2.5-pro-preview-tts").
    - Single-speaker is default. For simple usage, provide `voice_name` (e.g. "Kore", "Puck")
      in MultimediaConfig or generation_config.
    - Multi-speaker preview exists in the API; if you want it, pass either an explicit
      speaker list:
        generation_config = {
            "mode": "multi-speaker",
            "speakers": [
                {"speaker": "Alice", "voice_name": "Kore"},
                {"speaker": "Bob", "voice_name": "Puck"},
            ]
        }
      or a name-to-voice mapping:
        generation_config = {
            "mode": "multi-speaker",
            "speaker_mapping": {"Alice": "Kore", "Bob": "Puck"},
        }
      and make sure your prompt contains lines for each named speaker.

    NOTE(review): whether the installed legacy SDK version accepts the
    `speech_config` / audio `response_mime_type` keys built below should be
    confirmed against the SDK documentation.
    """

    def __init__(self, model: "AudioModel", config: "MultimediaConfig"):
        """
        Configure the legacy SDK and create the model handle.

        Raises:
            ValueError: If the GEMINI_API_KEY environment variable is not set.
            RuntimeError: If configuring the SDK or creating the model fails
                (the original exception is chained as the cause).
        """
        super().__init__(model, config)
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Please set the GEMINI_API_KEY environment variable.")

        try:
            # Legacy library uses a global configure call
            genai.configure(api_key=api_key)
            # Create a GenerativeModel handle
            self._model = genai.GenerativeModel(self.model.value or "gemini-2.5-flash-preview-tts")
            logger.info("GeminiAudioClient (legacy SDK) configured for model '%s'.", self.model.value)
        except Exception as e:
            logger.error("Failed to configure Gemini client: %s", e)
            raise RuntimeError(f"Failed to configure Gemini client: {e}") from e

    @staticmethod
    def _build_single_speaker_generation_config(voice_name: str) -> Dict[str, Any]:
        """
        Build generation_config for single-speaker TTS in the legacy SDK.
        Key bits:
        - response_mime_type => request audio
        - speech_config.voice_config.prebuilt_voice_config.voice_name => set the voice
        """
        return {
            "response_mime_type": "audio/pcm",
            "speech_config": {
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice_name,
                    }
                }
            },
        }

    @staticmethod
    def _build_multi_speaker_generation_config(speakers: List[Dict[str, str]]) -> Dict[str, Any]:
        """
        Build generation_config for multi-speaker TTS (preview).
        `speakers` = [{"speaker": "...", "voice_name": "..."}, ...]

        Raises:
            ValueError: If an entry is not a dict or lacks 'speaker' / 'voice_name'.
        """
        speaker_voice_configs = []
        for s in speakers:
            spk = s.get("speaker") if isinstance(s, dict) else None
            vname = s.get("voice_name") if isinstance(s, dict) else None
            if not spk or not vname:
                raise ValueError("Each speaker must include 'speaker' and 'voice_name'.")
            speaker_voice_configs.append(
                {
                    "speaker": spk,
                    "voice_config": {
                        "prebuilt_voice_config": {
                            "voice_name": vname,
                        }
                    },
                }
            )
        return {
            "response_mime_type": "audio/pcm",
            "speech_config": {
                "multi_speaker_voice_config": {
                    "speaker_voice_configs": speaker_voice_configs
                }
            },
        }

    @staticmethod
    def _resolve_speakers(cfg: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Return the multi-speaker list from a merged config.

        Accepts either `speakers` (list of {'speaker', 'voice_name'} dicts) or
        `speaker_mapping` (dict of speaker name -> voice name). The latter is the
        key advertised by the registered parameter schema, so both are honoured.

        Raises:
            ValueError: If neither key holds a usable value.
        """
        speakers = cfg.get("speakers")
        if speakers and isinstance(speakers, list):
            return speakers

        mapping = cfg.get("speaker_mapping")
        if mapping and isinstance(mapping, dict):
            return [
                {"speaker": name, "voice_name": voice}
                for name, voice in mapping.items()
            ]

        raise ValueError(
            "For multi-speaker mode, provide generation_config['speakers'] "
            "as a list of {'speaker': <name>, 'voice_name': <prebuilt voice>}, "
            "or generation_config['speaker_mapping'] as a dict of "
            "{<name>: <prebuilt voice>}."
        )

    async def generate_speech(
        self,
        prompt: str,
        generation_config: Optional[Dict[str, Any]] = None
    ) -> SpeechGenerationResponse:
        """
        Generates spoken audio from text using a Gemini TTS model through the legacy SDK.

        Implementation details:
        - We call `GenerativeModel.generate_content(...)` with a `generation_config`
          that asks for AUDIO and sets the voice settings.
        - The legacy SDK call is synchronous; we offload to a worker thread.

        Args:
            prompt: Text to speak. If `style_instructions` is configured it is
                prepended as "<style_instructions>: <prompt>".
            generation_config: Per-call overrides merged over the client's config.

        Returns:
            SpeechGenerationResponse whose `audio_urls` holds the path of the WAV
            file written for this request.

        Raises:
            ValueError: On any failure; the original exception is chained as the cause.
        """
        try:
            logger.info("Generating speech with Gemini TTS (legacy SDK) model '%s'...", self.model.value)

            # Merge base config with per-call overrides
            final_cfg = self.config.to_dict().copy()
            if generation_config:
                final_cfg.update(generation_config)

            # Style instructions: prepend if provided
            style_instructions = final_cfg.get("style_instructions")
            final_prompt = f"{style_instructions}: {prompt}" if style_instructions else prompt
            logger.debug("Final prompt for TTS (truncated): '%s...'", final_prompt[:160])

            # Mode & voice
            mode = final_cfg.get("mode", "single-speaker")

            if mode == "multi-speaker":
                gen_config = self._build_multi_speaker_generation_config(
                    self._resolve_speakers(final_cfg)
                )
            else:
                # `or` also covers an explicit voice_name=None in the config.
                default_voice = final_cfg.get("voice_name") or "Kore"
                gen_config = self._build_single_speaker_generation_config(default_voice)

            # Run the blocking gen call in a thread so this coroutine stays non-blocking
            response = await asyncio.to_thread(
                self._model.generate_content,
                final_prompt,
                generation_config=gen_config,
            )

            audio_pcm = _extract_inline_audio_bytes(response)
            audio_path = _save_audio_bytes_to_wav(audio_pcm)

            return SpeechGenerationResponse(audio_urls=[audio_path])

        except Exception as e:
            logger.error("Error during Google Gemini speech generation (legacy SDK): %s", str(e))
            raise ValueError(f"Google Gemini speech generation failed: {str(e)}") from e

    async def cleanup(self):
        """No-op: this client only logs that cleanup was requested."""
        logger.debug("GeminiAudioClient cleanup called (legacy SDK; nothing to release).")
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
from autobyteus.multimedia.audio.base_audio_client import BaseAudioClient
|
|
5
|
+
from autobyteus.multimedia.audio.audio_model import AudioModel
|
|
6
|
+
from autobyteus.multimedia.providers import MultimediaProvider
|
|
7
|
+
from autobyteus.multimedia.audio.api.gemini_audio_client import GeminiAudioClient
|
|
8
|
+
from autobyteus.multimedia.audio.autobyteus_audio_provider import AutobyteusAudioModelProvider
|
|
9
|
+
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
10
|
+
from autobyteus.utils.singleton import SingletonMeta
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Prebuilt voice names offered as `allowed_values` for the Gemini TTS
# `voice_name` parameter (30 entries, order preserved).
GEMINI_TTS_VOICES = [
    "Zephyr", "Puck", "Charon", "Kore", "Fenrir",
    "Leda", "Orus", "Aoede", "Callirrhoe", "Autonoe",
    "Enceladus", "Iapetus", "Umbriel", "Algieba", "Despina",
    "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
    "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
    "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
]
|
|
21
|
+
|
|
22
|
+
class AudioClientFactory(metaclass=SingletonMeta):
    """
    Registry and factory for audio clients, keyed by each AudioModel's identifier.

    All state lives on the class; the registry is populated on first use.
    """
    _models_by_identifier: Dict[str, AudioModel] = {}
    _initialized = False

    @staticmethod
    def ensure_initialized():
        """Populate the registry once; later calls return immediately."""
        if AudioClientFactory._initialized:
            return
        AudioClientFactory._initialize_registry()
        AudioClientFactory._initialized = True

    @staticmethod
    def reinitialize():
        """Drop every registered model and run discovery again from scratch."""
        logger.info("Reinitializing Audio model registry...")
        AudioClientFactory._initialized = False
        AudioClientFactory._models_by_identifier.clear()
        AudioClientFactory.ensure_initialized()
        logger.info("Audio model registry reinitialized successfully.")

    @staticmethod
    def _initialize_registry():
        """Register the built-in models, then ask the Autobyteus provider to add remote ones."""
        # Parameter schema for the built-in Google Gemini TTS model.
        gemini_tts_schema = {
            "mode": {
                "type": "string",
                "default": "single-speaker",
                "allowed_values": ["single-speaker", "multi-speaker"],
                "description": "The speech generation mode. 'single-speaker' for a consistent voice, or 'multi-speaker' to assign different voices to speakers identified in the prompt."
            },
            "voice_name": {
                "type": "string",
                "default": "Kore",
                "allowed_values": GEMINI_TTS_VOICES,
                "description": "The voice to use for single-speaker generation."
            },
            "style_instructions": {
                "type": "string",
                "description": "Optional instructions on the style of speech, e.g., 'Say this in a dramatic whisper'."
            },
            "speaker_mapping": {
                "type": "object",
                "description": "Required for multi-speaker mode. An object mapping speaker names from the prompt (e.g., 'Joe') to a voice name (e.g., 'Puck')."
            }
        }

        builtin_models = [
            AudioModel(
                name="gemini-2.5-flash-tts",
                value="gemini-2.5-flash-preview-tts",
                provider=MultimediaProvider.GOOGLE,
                client_class=GeminiAudioClient,
                parameter_schema=gemini_tts_schema,
            ),
        ]

        for builtin in builtin_models:
            AudioClientFactory.register_model(builtin)

        logger.info("Default API-based audio models registered.")

        # Remote models are discovered after the built-ins are in place.
        AutobyteusAudioModelProvider.discover_and_register()

    @staticmethod
    def register_model(model: AudioModel):
        """
        Add `model` to the registry under its `model_identifier`.

        An existing entry with the same identifier is overwritten (with a
        warning). A provider that is not already a MultimediaProvider is
        coerced to one; if that fails the model is not registered.
        """
        registry = AudioClientFactory._models_by_identifier
        identifier = model.model_identifier

        if identifier in registry:
            logger.warning(f"Audio model '{identifier}' is already registered. Overwriting.")

        raw_provider = model.provider
        if not isinstance(raw_provider, MultimediaProvider):
            try:
                model.provider = MultimediaProvider(raw_provider)
            except ValueError:
                logger.error(f"Cannot register model '{identifier}' with unknown provider '{model.provider}'.")
                return

        registry[identifier] = model

    @staticmethod
    def create_audio_client(model_identifier: str, config_override: Optional[MultimediaConfig] = None) -> BaseAudioClient:
        """
        Build a client for the model registered as `model_identifier`.

        Raises:
            ValueError: If no model is registered under that identifier.
        """
        AudioClientFactory.ensure_initialized()

        registry = AudioClientFactory._models_by_identifier
        model = registry.get(model_identifier)
        if model:
            logger.info(f"Creating instance of audio client for model '{model_identifier}'.")
            return model.create_client(config_override)

        raise ValueError(f"No audio model registered with the name '{model_identifier}'. "
                         f"Available models: {list(AudioClientFactory._models_by_identifier.keys())}")
|
|
119
|
+
|
|
120
|
+
audio_client_factory = AudioClientFactory()
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import logging
|
|
3
|
+
from typing import TYPE_CHECKING, Type, Optional, Iterator, Dict, Any
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
from autobyteus.multimedia.providers import MultimediaProvider
|
|
7
|
+
from autobyteus.multimedia.runtimes import MultimediaRuntime
|
|
8
|
+
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from autobyteus.multimedia.audio.base_audio_client import BaseAudioClient
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
class AudioModelMeta(type):
    """
    Metaclass giving AudioModel Enum-like class-level access (iteration,
    subscripting, len) backed by the AudioClientFactory registry.
    """

    @staticmethod
    def _factory_models() -> Dict[str, AudioModel]:
        """Return the factory's identifier -> model registry, initializing it if needed."""
        # Imported lazily; presumably to avoid a circular import, since the
        # factory module imports AudioModel at module level.
        from autobyteus.multimedia.audio.audio_client_factory import AudioClientFactory
        AudioClientFactory.ensure_initialized()
        return AudioClientFactory._models_by_identifier

    def __iter__(cls) -> Iterator[AudioModel]:
        """Yield every registered model."""
        yield from AudioModelMeta._factory_models().values()

    def __getitem__(cls, name_or_identifier: str) -> AudioModel:
        """Look a model up by its registry key; raise KeyError when absent."""
        registry = AudioModelMeta._factory_models()
        found = registry.get(name_or_identifier)
        if not found:
            available_models = list(registry.keys())
            raise KeyError(f"Audio model '{name_or_identifier}' not found. Available models: {available_models}")
        return found

    def __len__(cls) -> int:
        """Number of registered models."""
        return len(AudioModelMeta._factory_models())
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class AudioModel(metaclass=AudioModelMeta):
    """
    Represents a single audio model's metadata.

    Holds the model's name/value, provider, client class, runtime, optional
    host URL and parameter schema, plus a `default_config` derived from the
    schema's `default` entries.
    """
    def __init__(
        self,
        name: str,
        value: str,
        provider: MultimediaProvider,
        client_class: Type["BaseAudioClient"],
        parameter_schema: Optional[Dict[str, Any]] = None,
        runtime: MultimediaRuntime = MultimediaRuntime.API,
        host_url: Optional[str] = None
    ):
        self.name = name
        self.value = value
        self.provider = provider
        self.client_class = client_class
        self.runtime = runtime
        self.host_url = host_url
        self.parameter_schema = parameter_schema if parameter_schema else {}

        # Automatically build default_config from the schema's default values.
        # Entries that are not dicts are skipped instead of raising, so an
        # unexpected schema shape cannot break model construction.
        default_params = {
            key: meta["default"]
            for key, meta in self.parameter_schema.items()
            if isinstance(meta, dict) and "default" in meta
        }
        self.default_config = MultimediaConfig(params=default_params)

    @property
    def model_identifier(self) -> str:
        """
        Returns the unique identifier for the model.

        For AUTOBYTEUS-runtime models with a host URL this is "<name>@<hostname>";
        when no hostname can be parsed from the URL the raw host URL is used
        instead. All other models are identified by their name alone.
        """
        if self.runtime == MultimediaRuntime.AUTOBYTEUS and self.host_url:
            try:
                host = urlparse(self.host_url).hostname
            except Exception:
                host = None
            # `hostname` is None for URLs without a host part; fall back to the
            # raw URL rather than emitting "<name>@None".
            return f"{self.name}@{host or self.host_url}"
        return self.name

    def create_client(self, config_override: Optional[MultimediaConfig] = None) -> "BaseAudioClient":
        """
        Instantiates the client class for this model.

        The client always receives its own copy of the default config (merged
        with `config_override` when given), so a client that mutates its config
        cannot alter this model's shared defaults.
        """
        from copy import deepcopy

        config_to_use = deepcopy(self.default_config)
        if config_override:
            config_to_use.merge_with(config_override)

        return self.client_class(model=self, config=config_to_use)

    def __repr__(self):
        # The provider may still be a plain value (register_model coerces
        # non-enum providers later), so fall back to it when there is no `.name`.
        provider_name = getattr(self.provider, "name", self.provider)
        runtime_value = getattr(self.runtime, "value", self.runtime)
        return (
            f"AudioModel(identifier='{self.model_identifier}', "
            f"provider='{provider_name}', runtime='{runtime_value}')"
        )
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List
|
|
3
|
+
import os
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
from autobyteus_llm_client import AutobyteusClient
|
|
7
|
+
from autobyteus.multimedia.audio.api.autobyteus_audio_client import AutobyteusAudioClient
|
|
8
|
+
from autobyteus.multimedia.audio.audio_model import AudioModel
|
|
9
|
+
from autobyteus.multimedia.providers import MultimediaProvider
|
|
10
|
+
from autobyteus.multimedia.runtimes import MultimediaRuntime
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
class AutobyteusAudioModelProvider:
    """
    Discovers and registers audio models from remote Autobyteus server instances.
    """
    DEFAULT_SERVER_URL = 'http://localhost:8000'

    @staticmethod
    def _get_hosts() -> List[str]:
        """
        Gets Autobyteus server hosts from env vars.

        Precedence: the comma-separated AUTOBYTEUS_LLM_SERVER_HOSTS, then the
        legacy single-host AUTOBYTEUS_LLM_SERVER_URL, then DEFAULT_SERVER_URL.
        Blank entries (e.g. from a trailing comma) are dropped; if nothing is
        left the next source in the precedence order is used.
        """
        hosts_str = os.getenv('AUTOBYTEUS_LLM_SERVER_HOSTS')
        if hosts_str:
            hosts = [host.strip() for host in hosts_str.split(',') if host.strip()]
            if hosts:
                return hosts

        legacy_host = os.getenv('AUTOBYTEUS_LLM_SERVER_URL')
        if legacy_host:
            return [legacy_host]

        return [AutobyteusAudioModelProvider.DEFAULT_SERVER_URL]

    @staticmethod
    def discover_and_register():
        """
        Discover and register audio models from all configured hosts.

        Failures are contained per host and per model: an unreachable host, an
        unexpected response shape or a malformed model entry is logged and
        skipped without stopping discovery on the remaining hosts. This method
        does not raise.
        """
        try:
            # Imported here rather than at module level; presumably to avoid a
            # circular import with the factory module.
            from autobyteus.multimedia.audio.audio_client_factory import AudioClientFactory

            hosts = AutobyteusAudioModelProvider._get_hosts()
            total_registered_count = 0

            for host_url in hosts:
                if not AutobyteusAudioModelProvider.is_valid_url(host_url):
                    logger.error(f"Invalid Autobyteus host URL for audio model discovery: {host_url}, skipping.")
                    continue

                logger.info(f"Discovering audio models from host: {host_url}")
                client = None
                try:
                    client = AutobyteusClient(server_url=host_url)
                    response = client.get_available_audio_models_sync()
                except Exception as e:
                    logger.warning(f"Could not fetch audio models from Autobyteus server at {host_url}: {e}")
                    continue
                finally:
                    if client:
                        client.sync_client.close()

                # A response without a `.get` (e.g. None) is treated as "no
                # models" instead of aborting discovery for every other host.
                models = response.get('models') if hasattr(response, 'get') else None
                if not models:
                    logger.info(f"No audio models found on host {host_url}.")
                    continue

                host_registered_count = 0
                for model_info in models:
                    # Non-dict entries are rejected up front; otherwise the
                    # error handler below would itself fail on `.get`.
                    if not isinstance(model_info, dict) or not all(
                        k in model_info for k in ["name", "value", "provider"]
                    ):
                        logger.warning(f"Skipping malformed audio model from {host_url}: {model_info}")
                        continue

                    try:
                        if "parameter_schema" not in model_info:
                            logger.debug(f"Skipping model from {host_url} as it lacks a parameter schema, likely not an audio model: {model_info.get('name')}")
                            continue

                        audio_model = AudioModel(
                            name=model_info["name"],
                            value=model_info["value"],
                            provider=MultimediaProvider(model_info["provider"]),
                            client_class=AutobyteusAudioClient,
                            runtime=MultimediaRuntime.AUTOBYTEUS,
                            host_url=host_url,
                            parameter_schema=model_info.get("parameter_schema")
                        )

                        AudioClientFactory.register_model(audio_model)
                        host_registered_count += 1

                    except Exception as e:
                        logger.error(f"Failed to register audio model '{model_info.get('name')}' from {host_url}: {e}")

                if host_registered_count > 0:
                    logger.info(f"Registered {host_registered_count} audio models from Autobyteus host {host_url}")
                    total_registered_count += host_registered_count

            if total_registered_count > 0:
                logger.info(f"Finished Autobyteus audio model discovery. Total models registered: {total_registered_count}")

        except Exception as e:
            logger.error(f"An unexpected error occurred during Autobyteus audio model discovery: {e}", exc_info=True)

    @staticmethod
    def is_valid_url(url: str) -> bool:
        """Return True when `url` parses with both a scheme and a network location."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import Optional, Dict, Any, List, TYPE_CHECKING
|
|
4
|
+
from autobyteus.multimedia.utils.response_types import SpeechGenerationResponse
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from autobyteus.multimedia.audio.audio_model import AudioModel
|
|
8
|
+
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseAudioClient(ABC):
    """
    Common interface for clients that turn text into audio using an audio model.

    Subclasses must implement `generate_speech`; `cleanup` is optional.
    """

    def __init__(self, model: "AudioModel", config: "MultimediaConfig"):
        # Keep both references as given; nothing is copied or validated here.
        self.config = config
        self.model = model

    @abstractmethod
    async def generate_speech(
        self,
        prompt: str,
        generation_config: Optional[Dict[str, Any]] = None
    ) -> SpeechGenerationResponse:
        """
        Convert text to spoken audio (Text-to-Speech).

        Args:
            prompt (str): Text that should be spoken.
            generation_config (Optional[Dict[str, Any]]): Provider-specific
                parameters (e.g., voice_name, speaker_mapping).

        Returns:
            SpeechGenerationResponse: Holds URLs or paths of the generated audio files.
        """
        return None

    async def cleanup(self):
        """Hook for releasing resources such as network clients; does nothing by default."""
        return None
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .image_client_factory import image_client_factory, ImageClientFactory
|
|
2
|
+
from .image_model import ImageModel
|
|
3
|
+
from .base_image_client import BaseImageClient
|
|
4
|
+
from .api import *
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"image_client_factory",
|
|
8
|
+
"ImageClientFactory",
|
|
9
|
+
"ImageModel",
|
|
10
|
+
"BaseImageClient",
|
|
11
|
+
]
|