autobyteus 1.1.7__py3-none-any.whl → 1.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autobyteus/agent/bootstrap_steps/system_prompt_processing_step.py +6 -2
- autobyteus/agent/handlers/inter_agent_message_event_handler.py +17 -19
- autobyteus/agent/handlers/llm_complete_response_received_event_handler.py +6 -3
- autobyteus/agent/handlers/tool_result_event_handler.py +86 -23
- autobyteus/agent/handlers/user_input_message_event_handler.py +19 -10
- autobyteus/agent/hooks/base_phase_hook.py +17 -0
- autobyteus/agent/hooks/hook_registry.py +15 -27
- autobyteus/agent/input_processor/base_user_input_processor.py +17 -1
- autobyteus/agent/input_processor/processor_registry.py +15 -27
- autobyteus/agent/llm_response_processor/base_processor.py +17 -1
- autobyteus/agent/llm_response_processor/processor_registry.py +15 -24
- autobyteus/agent/llm_response_processor/provider_aware_tool_usage_processor.py +14 -0
- autobyteus/agent/message/agent_input_user_message.py +15 -2
- autobyteus/agent/message/send_message_to.py +1 -1
- autobyteus/agent/processor_option.py +17 -0
- autobyteus/agent/sender_type.py +1 -0
- autobyteus/agent/system_prompt_processor/base_processor.py +17 -1
- autobyteus/agent/system_prompt_processor/processor_registry.py +15 -27
- autobyteus/agent/system_prompt_processor/tool_manifest_injector_processor.py +10 -0
- autobyteus/agent/tool_execution_result_processor/base_processor.py +17 -1
- autobyteus/agent/tool_execution_result_processor/processor_registry.py +15 -1
- autobyteus/agent/workspace/base_workspace.py +1 -1
- autobyteus/agent/workspace/workspace_definition.py +1 -1
- autobyteus/agent_team/bootstrap_steps/team_context_initialization_step.py +1 -1
- autobyteus/agent_team/streaming/agent_team_stream_event_payloads.py +2 -2
- autobyteus/agent_team/task_notification/__init__.py +4 -0
- autobyteus/agent_team/task_notification/activation_policy.py +70 -0
- autobyteus/agent_team/task_notification/system_event_driven_agent_task_notifier.py +56 -122
- autobyteus/agent_team/task_notification/task_activator.py +66 -0
- autobyteus/cli/agent_team_tui/state.py +17 -20
- autobyteus/cli/agent_team_tui/widgets/focus_pane.py +1 -1
- autobyteus/cli/agent_team_tui/widgets/task_board_panel.py +1 -1
- autobyteus/events/event_types.py +2 -2
- autobyteus/llm/api/gemini_llm.py +45 -54
- autobyteus/llm/api/qwen_llm.py +25 -0
- autobyteus/llm/autobyteus_provider.py +8 -2
- autobyteus/llm/llm_factory.py +16 -0
- autobyteus/multimedia/audio/api/autobyteus_audio_client.py +4 -1
- autobyteus/multimedia/audio/api/gemini_audio_client.py +84 -153
- autobyteus/multimedia/audio/audio_client_factory.py +47 -22
- autobyteus/multimedia/audio/audio_model.py +13 -6
- autobyteus/multimedia/audio/autobyteus_audio_provider.py +8 -2
- autobyteus/multimedia/audio/base_audio_client.py +3 -1
- autobyteus/multimedia/image/api/autobyteus_image_client.py +12 -5
- autobyteus/multimedia/image/api/gemini_image_client.py +72 -130
- autobyteus/multimedia/image/api/openai_image_client.py +4 -2
- autobyteus/multimedia/image/autobyteus_image_provider.py +8 -2
- autobyteus/multimedia/image/base_image_client.py +6 -2
- autobyteus/multimedia/image/image_client_factory.py +20 -19
- autobyteus/multimedia/image/image_model.py +13 -6
- autobyteus/multimedia/providers.py +1 -0
- autobyteus/task_management/__init__.py +9 -10
- autobyteus/task_management/base_task_board.py +14 -6
- autobyteus/task_management/converters/__init__.py +0 -2
- autobyteus/task_management/converters/task_board_converter.py +7 -16
- autobyteus/task_management/events.py +6 -6
- autobyteus/task_management/in_memory_task_board.py +48 -38
- autobyteus/task_management/schemas/__init__.py +2 -2
- autobyteus/task_management/schemas/{plan_definition.py → task_definition.py} +5 -6
- autobyteus/task_management/schemas/task_status_report.py +0 -1
- autobyteus/task_management/task.py +60 -0
- autobyteus/task_management/tools/__init__.py +4 -2
- autobyteus/task_management/tools/get_my_tasks.py +80 -0
- autobyteus/task_management/tools/get_task_board_status.py +3 -3
- autobyteus/task_management/tools/publish_task.py +77 -0
- autobyteus/task_management/tools/publish_tasks.py +74 -0
- autobyteus/task_management/tools/update_task_status.py +5 -5
- autobyteus/tools/__init__.py +3 -1
- autobyteus/tools/base_tool.py +4 -4
- autobyteus/tools/browser/session_aware/browser_session_aware_navigate_to.py +1 -1
- autobyteus/tools/browser/session_aware/browser_session_aware_web_element_trigger.py +1 -1
- autobyteus/tools/browser/session_aware/browser_session_aware_webpage_reader.py +1 -1
- autobyteus/tools/browser/session_aware/browser_session_aware_webpage_screenshot_taker.py +1 -1
- autobyteus/tools/browser/standalone/navigate_to.py +1 -1
- autobyteus/tools/browser/standalone/web_page_pdf_generator.py +1 -1
- autobyteus/tools/browser/standalone/webpage_image_downloader.py +1 -1
- autobyteus/tools/browser/standalone/webpage_reader.py +1 -1
- autobyteus/tools/browser/standalone/webpage_screenshot_taker.py +1 -1
- autobyteus/tools/functional_tool.py +1 -1
- autobyteus/tools/google_search.py +1 -1
- autobyteus/tools/image_downloader.py +1 -1
- autobyteus/tools/mcp/factory.py +1 -1
- autobyteus/tools/mcp/schema_mapper.py +1 -1
- autobyteus/tools/mcp/tool.py +1 -1
- autobyteus/tools/multimedia/__init__.py +2 -0
- autobyteus/tools/multimedia/audio_tools.py +10 -20
- autobyteus/tools/multimedia/image_tools.py +21 -22
- autobyteus/tools/multimedia/media_reader_tool.py +117 -0
- autobyteus/tools/pydantic_schema_converter.py +1 -1
- autobyteus/tools/registry/tool_definition.py +1 -1
- autobyteus/tools/timer.py +1 -1
- autobyteus/tools/tool_meta.py +1 -1
- autobyteus/tools/usage/formatters/default_json_example_formatter.py +1 -1
- autobyteus/tools/usage/formatters/default_xml_example_formatter.py +1 -1
- autobyteus/tools/usage/formatters/default_xml_schema_formatter.py +59 -3
- autobyteus/tools/usage/formatters/gemini_json_example_formatter.py +1 -1
- autobyteus/tools/usage/formatters/google_json_example_formatter.py +1 -1
- autobyteus/tools/usage/formatters/openai_json_example_formatter.py +1 -1
- autobyteus/{tools → utils}/parameter_schema.py +1 -1
- {autobyteus-1.1.7.dist-info → autobyteus-1.1.9.dist-info}/METADATA +2 -2
- {autobyteus-1.1.7.dist-info → autobyteus-1.1.9.dist-info}/RECORD +105 -99
- examples/run_poem_writer.py +1 -1
- autobyteus/task_management/converters/task_plan_converter.py +0 -48
- autobyteus/task_management/task_plan.py +0 -110
- autobyteus/task_management/tools/publish_task_plan.py +0 -101
- {autobyteus-1.1.7.dist-info → autobyteus-1.1.9.dist-info}/WHEEL +0 -0
- {autobyteus-1.1.7.dist-info → autobyteus-1.1.9.dist-info}/licenses/LICENSE +0 -0
- {autobyteus-1.1.7.dist-info → autobyteus-1.1.9.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import base64
|
|
3
1
|
import logging
|
|
4
2
|
import os
|
|
3
|
+
import base64
|
|
5
4
|
import uuid
|
|
6
5
|
import wave
|
|
7
|
-
from typing import Optional, Dict, Any, TYPE_CHECKING
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
import google.generativeai as genai
|
|
6
|
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
|
7
|
+
from google import genai
|
|
8
|
+
from google.genai import types as genai_types
|
|
11
9
|
|
|
12
10
|
from autobyteus.multimedia.audio.base_audio_client import BaseAudioClient
|
|
13
11
|
from autobyteus.multimedia.utils.response_types import SpeechGenerationResponse
|
|
@@ -19,77 +17,31 @@ if TYPE_CHECKING:
|
|
|
19
17
|
logger = logging.getLogger(__name__)
|
|
20
18
|
|
|
21
19
|
|
|
22
|
-
def _save_audio_bytes_to_wav(
|
|
23
|
-
|
|
24
|
-
channels: int = 1,
|
|
25
|
-
rate: int = 24000,
|
|
26
|
-
sample_width: int = 2,
|
|
27
|
-
) -> str:
|
|
28
|
-
"""
|
|
29
|
-
Save raw PCM (s16le) audio bytes to a temporary WAV file and return the file path.
|
|
30
|
-
|
|
31
|
-
Gemini TTS models output mono, 24 kHz, 16-bit PCM by default.
|
|
32
|
-
"""
|
|
20
|
+
def _save_audio_bytes_to_wav(pcm_bytes: bytes, channels=1, rate=24000, sample_width=2) -> str:
|
|
21
|
+
"""Saves PCM audio bytes to a temporary WAV file and returns the path."""
|
|
33
22
|
temp_dir = "/tmp/autobyteus_audio"
|
|
34
23
|
os.makedirs(temp_dir, exist_ok=True)
|
|
35
24
|
file_path = os.path.join(temp_dir, f"{uuid.uuid4()}.wav")
|
|
36
|
-
|
|
25
|
+
|
|
37
26
|
try:
|
|
38
27
|
with wave.open(file_path, "wb") as wf:
|
|
39
28
|
wf.setnchannels(channels)
|
|
40
|
-
wf.setsampwidth(sample_width)
|
|
29
|
+
wf.setsampwidth(sample_width)
|
|
41
30
|
wf.setframerate(rate)
|
|
42
31
|
wf.writeframes(pcm_bytes)
|
|
43
|
-
logger.info("Successfully saved generated audio to
|
|
32
|
+
logger.info(f"Successfully saved generated audio to {file_path}")
|
|
44
33
|
return file_path
|
|
45
34
|
except Exception as e:
|
|
46
|
-
logger.error("Failed to save audio to WAV file at
|
|
47
|
-
raise
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def _extract_inline_audio_bytes(response) -> bytes:
|
|
51
|
-
"""
|
|
52
|
-
Extract inline audio bytes from a google.generativeai response.
|
|
53
|
-
|
|
54
|
-
The legacy SDK returns a Response object with candidates -> content -> parts[0].inline_data.data.
|
|
55
|
-
Depending on version, `.data` can be bytes or base64-encoded str.
|
|
56
|
-
"""
|
|
57
|
-
try:
|
|
58
|
-
# Access the first candidate's first part's inline_data
|
|
59
|
-
part = response.candidates[0].content.parts[0]
|
|
60
|
-
inline = getattr(part, "inline_data", None)
|
|
61
|
-
if not inline or not hasattr(inline, "data"):
|
|
62
|
-
raise ValueError("No inline audio data found in response.")
|
|
63
|
-
data = inline.data
|
|
64
|
-
if isinstance(data, bytes):
|
|
65
|
-
return data
|
|
66
|
-
if isinstance(data, str):
|
|
67
|
-
return base64.b64decode(data)
|
|
68
|
-
raise TypeError(f"Unexpected inline_data.data type: {type(data)}")
|
|
69
|
-
except Exception as e:
|
|
70
|
-
logger.error("Failed to extract audio from response: %s", e)
|
|
35
|
+
logger.error(f"Failed to save audio to WAV file at {file_path}: {e}")
|
|
71
36
|
raise
|
|
72
37
|
|
|
73
38
|
|
|
74
39
|
class GeminiAudioClient(BaseAudioClient):
|
|
75
40
|
"""
|
|
76
|
-
An audio client that uses Google's Gemini models for
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
- Ensure your model value is a TTS-capable model (e.g. "gemini-2.5-flash-preview-tts"
|
|
81
|
-
or "gemini-2.5-pro-preview-tts").
|
|
82
|
-
- Single-speaker is default. For simple usage, provide `voice_name` (e.g. "Kore", "Puck")
|
|
83
|
-
in MultimediaConfig or generation_config.
|
|
84
|
-
- Multi-speaker preview exists in the API; if you want it, pass:
|
|
85
|
-
generation_config = {
|
|
86
|
-
"mode": "multi-speaker",
|
|
87
|
-
"speakers": [
|
|
88
|
-
{"speaker": "Alice", "voice_name": "Kore"},
|
|
89
|
-
{"speaker": "Bob", "voice_name": "Puck"},
|
|
90
|
-
]
|
|
91
|
-
}
|
|
92
|
-
and make sure your prompt contains lines for each named speaker.
|
|
41
|
+
An audio client that uses Google's Gemini models for audio tasks.
|
|
42
|
+
|
|
43
|
+
**Setup Requirements:**
|
|
44
|
+
1. **Authentication:** Set the `GEMINI_API_KEY` environment variable with your API key.
|
|
93
45
|
"""
|
|
94
46
|
|
|
95
47
|
def __init__(self, model: "AudioModel", config: "MultimediaConfig"):
|
|
@@ -99,121 +51,100 @@ class GeminiAudioClient(BaseAudioClient):
|
|
|
99
51
|
raise ValueError("Please set the GEMINI_API_KEY environment variable.")
|
|
100
52
|
|
|
101
53
|
try:
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
self._model = genai.GenerativeModel(self.model.value or "gemini-2.5-flash-preview-tts")
|
|
106
|
-
logger.info("GeminiAudioClient (legacy SDK) configured for model '%s'.", self.model.value)
|
|
54
|
+
self.client = genai.Client()
|
|
55
|
+
self.async_client = self.client.aio
|
|
56
|
+
logger.info(f"GeminiAudioClient initialized for model '{self.model.name}'.")
|
|
107
57
|
except Exception as e:
|
|
108
|
-
logger.error("Failed to configure Gemini client:
|
|
58
|
+
logger.error(f"Failed to configure Gemini client: {e}")
|
|
109
59
|
raise RuntimeError(f"Failed to configure Gemini client: {e}")
|
|
110
60
|
|
|
111
|
-
@staticmethod
|
|
112
|
-
def _build_single_speaker_generation_config(voice_name: str) -> Dict[str, Any]:
|
|
113
|
-
"""
|
|
114
|
-
Build generation_config for single-speaker TTS in the legacy SDK.
|
|
115
|
-
Key bits:
|
|
116
|
-
- response_mime_type => request audio
|
|
117
|
-
- speech_config.voice_config.prebuilt_voice_config.voice_name => set the voice
|
|
118
|
-
"""
|
|
119
|
-
return {
|
|
120
|
-
"response_mime_type": "audio/pcm",
|
|
121
|
-
"speech_config": {
|
|
122
|
-
"voice_config": {
|
|
123
|
-
"prebuilt_voice_config": {
|
|
124
|
-
"voice_name": voice_name,
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
},
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
@staticmethod
|
|
131
|
-
def _build_multi_speaker_generation_config(speakers: List[Dict[str, str]]) -> Dict[str, Any]:
|
|
132
|
-
"""
|
|
133
|
-
Build generation_config for multi-speaker TTS (preview).
|
|
134
|
-
`speakers` = [{"speaker": "...", "voice_name": "..."}, ...]
|
|
135
|
-
"""
|
|
136
|
-
speaker_voice_configs = []
|
|
137
|
-
for s in speakers:
|
|
138
|
-
spk = s.get("speaker")
|
|
139
|
-
vname = s.get("voice_name")
|
|
140
|
-
if not spk or not vname:
|
|
141
|
-
raise ValueError("Each speaker must include 'speaker' and 'voice_name'.")
|
|
142
|
-
speaker_voice_configs.append(
|
|
143
|
-
{
|
|
144
|
-
"speaker": spk,
|
|
145
|
-
"voice_config": {
|
|
146
|
-
"prebuilt_voice_config": {
|
|
147
|
-
"voice_name": vname,
|
|
148
|
-
}
|
|
149
|
-
},
|
|
150
|
-
}
|
|
151
|
-
)
|
|
152
|
-
return {
|
|
153
|
-
"response_mime_type": "audio/pcm",
|
|
154
|
-
"speech_config": {
|
|
155
|
-
"multi_speaker_voice_config": {
|
|
156
|
-
"speaker_voice_configs": speaker_voice_configs
|
|
157
|
-
}
|
|
158
|
-
},
|
|
159
|
-
}
|
|
160
61
|
|
|
161
62
|
async def generate_speech(
|
|
162
63
|
self,
|
|
163
64
|
prompt: str,
|
|
164
|
-
generation_config: Optional[Dict[str, Any]] = None
|
|
65
|
+
generation_config: Optional[Dict[str, Any]] = None,
|
|
66
|
+
**kwargs
|
|
165
67
|
) -> SpeechGenerationResponse:
|
|
166
68
|
"""
|
|
167
|
-
Generates spoken audio from text using a Gemini TTS model
|
|
168
|
-
|
|
169
|
-
Implementation details:
|
|
170
|
-
- We call `GenerativeModel.generate_content(...)` with a `generation_config`
|
|
171
|
-
that asks for AUDIO and sets the voice settings.
|
|
172
|
-
- The legacy SDK call is synchronous; we offload to a worker thread.
|
|
69
|
+
Generates spoken audio from text using a Gemini TTS model, supporting single-speaker,
|
|
70
|
+
multi-speaker, and style-controlled generation.
|
|
173
71
|
"""
|
|
174
72
|
try:
|
|
175
|
-
logger.info("Generating speech with Gemini TTS
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
final_cfg = self.config.to_dict().copy()
|
|
73
|
+
logger.info(f"Generating speech with Gemini TTS model '{self.model.value}'...")
|
|
74
|
+
|
|
75
|
+
final_config = self.config.to_dict().copy()
|
|
179
76
|
if generation_config:
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
#
|
|
183
|
-
style_instructions =
|
|
77
|
+
final_config.update(generation_config)
|
|
78
|
+
|
|
79
|
+
# Handle style instructions by prepending them to the prompt
|
|
80
|
+
style_instructions = final_config.get("style_instructions")
|
|
184
81
|
final_prompt = f"{style_instructions}: {prompt}" if style_instructions else prompt
|
|
185
|
-
logger.debug("Final prompt for TTS
|
|
82
|
+
logger.debug(f"Final prompt for TTS: '{final_prompt[:100]}...'")
|
|
186
83
|
|
|
187
|
-
|
|
188
|
-
mode =
|
|
189
|
-
default_voice = final_cfg.get("voice_name", "Kore")
|
|
84
|
+
speech_config = None
|
|
85
|
+
mode = final_config.get("mode", "single-speaker")
|
|
190
86
|
|
|
87
|
+
# Handle multi-speaker generation
|
|
191
88
|
if mode == "multi-speaker":
|
|
192
|
-
|
|
193
|
-
if not
|
|
194
|
-
raise ValueError(
|
|
195
|
-
|
|
196
|
-
|
|
89
|
+
speaker_mapping_list = final_config.get("speaker_mapping")
|
|
90
|
+
if not speaker_mapping_list or not isinstance(speaker_mapping_list, list):
|
|
91
|
+
raise ValueError("Multi-speaker mode requires a 'speaker_mapping' list in generation_config.")
|
|
92
|
+
|
|
93
|
+
logger.info(f"Configuring multi-speaker TTS with mapping: {speaker_mapping_list}")
|
|
94
|
+
speaker_voice_configs = []
|
|
95
|
+
for mapping_item in speaker_mapping_list:
|
|
96
|
+
speaker = mapping_item.get("speaker")
|
|
97
|
+
voice_name = mapping_item.get("voice")
|
|
98
|
+
if not speaker or not voice_name:
|
|
99
|
+
logger.warning(f"Skipping invalid item in speaker_mapping list: {mapping_item}")
|
|
100
|
+
continue
|
|
101
|
+
|
|
102
|
+
speaker_voice_configs.append(
|
|
103
|
+
genai_types.SpeakerVoiceConfig(
|
|
104
|
+
speaker=speaker,
|
|
105
|
+
voice_config=genai_types.VoiceConfig(
|
|
106
|
+
prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=voice_name)
|
|
107
|
+
)
|
|
108
|
+
)
|
|
197
109
|
)
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
110
|
+
|
|
111
|
+
if not speaker_voice_configs:
|
|
112
|
+
raise ValueError("The 'speaker_mapping' list was empty or contained no valid mappings.")
|
|
201
113
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
final_prompt,
|
|
206
|
-
generation_config=gen_config,
|
|
207
|
-
)
|
|
114
|
+
speech_config = genai_types.SpeechConfig(
|
|
115
|
+
multi_speaker_voice_config=genai_types.MultiSpeakerVoiceConfig(speaker_voice_configs=speaker_voice_configs)
|
|
116
|
+
)
|
|
208
117
|
|
|
209
|
-
|
|
118
|
+
# Handle single-speaker generation (default)
|
|
119
|
+
else:
|
|
120
|
+
voice_name = final_config.get("voice_name", "Kore") # A default voice
|
|
121
|
+
logger.info(f"Configuring single-speaker TTS with voice: '{voice_name}'")
|
|
122
|
+
speech_config = genai_types.SpeechConfig(
|
|
123
|
+
voice_config=genai_types.VoiceConfig(
|
|
124
|
+
prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=voice_name)
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# The google-genai library's TTS endpoint uses a synchronous call.
|
|
129
|
+
resp = self.client.models.generate_content(
|
|
130
|
+
model=self.model.value,
|
|
131
|
+
contents=final_prompt,
|
|
132
|
+
config=genai_types.GenerateContentConfig(
|
|
133
|
+
response_modalities=["AUDIO"],
|
|
134
|
+
speech_config=speech_config
|
|
135
|
+
),
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
audio_b64 = resp.candidates[0].content.parts[0].inline_data.data
|
|
139
|
+
audio_pcm = base64.b64decode(audio_b64)
|
|
140
|
+
|
|
210
141
|
audio_path = _save_audio_bytes_to_wav(audio_pcm)
|
|
211
142
|
|
|
212
143
|
return SpeechGenerationResponse(audio_urls=[audio_path])
|
|
213
144
|
|
|
214
145
|
except Exception as e:
|
|
215
|
-
logger.error("Error during Google Gemini speech generation
|
|
146
|
+
logger.error(f"Error during Google Gemini speech generation: {str(e)}")
|
|
216
147
|
raise ValueError(f"Google Gemini speech generation failed: {str(e)}")
|
|
217
148
|
|
|
218
149
|
async def cleanup(self):
|
|
219
|
-
logger.debug("GeminiAudioClient cleanup called
|
|
150
|
+
logger.debug("GeminiAudioClient cleanup called.")
|
|
@@ -8,6 +8,7 @@ from autobyteus.multimedia.audio.api.gemini_audio_client import GeminiAudioClien
|
|
|
8
8
|
from autobyteus.multimedia.audio.autobyteus_audio_provider import AutobyteusAudioModelProvider
|
|
9
9
|
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
10
10
|
from autobyteus.utils.singleton import SingletonMeta
|
|
11
|
+
from autobyteus.utils.parameter_schema import ParameterSchema, ParameterDefinition, ParameterType
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
|
|
@@ -46,34 +47,58 @@ class AudioClientFactory(metaclass=SingletonMeta):
|
|
|
46
47
|
def _initialize_registry():
|
|
47
48
|
"""Initializes the registry with built-in audio models."""
|
|
48
49
|
|
|
50
|
+
# --- Define a clear schema for speaker mapping items using ParameterSchema ---
|
|
51
|
+
speaker_mapping_item_schema = ParameterSchema(parameters=[
|
|
52
|
+
ParameterDefinition(
|
|
53
|
+
name="speaker",
|
|
54
|
+
param_type=ParameterType.STRING,
|
|
55
|
+
description="The speaker's name as it appears in the prompt (e.g., 'Joe').",
|
|
56
|
+
required=True
|
|
57
|
+
),
|
|
58
|
+
ParameterDefinition(
|
|
59
|
+
name="voice",
|
|
60
|
+
param_type=ParameterType.ENUM,
|
|
61
|
+
description="The voice to assign to this speaker.",
|
|
62
|
+
enum_values=GEMINI_TTS_VOICES,
|
|
63
|
+
required=True
|
|
64
|
+
)
|
|
65
|
+
])
|
|
66
|
+
|
|
49
67
|
# Google Gemini Audio Models
|
|
68
|
+
gemini_tts_schema = ParameterSchema(parameters=[
|
|
69
|
+
ParameterDefinition(
|
|
70
|
+
name="mode",
|
|
71
|
+
param_type=ParameterType.ENUM,
|
|
72
|
+
default_value="single-speaker",
|
|
73
|
+
enum_values=["single-speaker", "multi-speaker"],
|
|
74
|
+
description="The speech generation mode. 'single-speaker' for a consistent voice, or 'multi-speaker' to assign different voices to speakers identified in the prompt."
|
|
75
|
+
),
|
|
76
|
+
ParameterDefinition(
|
|
77
|
+
name="voice_name",
|
|
78
|
+
param_type=ParameterType.ENUM,
|
|
79
|
+
default_value="Kore",
|
|
80
|
+
enum_values=GEMINI_TTS_VOICES,
|
|
81
|
+
description="The voice to use for single-speaker generation."
|
|
82
|
+
),
|
|
83
|
+
ParameterDefinition(
|
|
84
|
+
name="style_instructions",
|
|
85
|
+
param_type=ParameterType.STRING,
|
|
86
|
+
description="Optional instructions on the style of speech, e.g., 'Say this in a dramatic whisper'."
|
|
87
|
+
),
|
|
88
|
+
ParameterDefinition(
|
|
89
|
+
name="speaker_mapping",
|
|
90
|
+
param_type=ParameterType.ARRAY,
|
|
91
|
+
description="Required for multi-speaker mode. A list of objects, each mapping a speaker name from the prompt to a voice name.",
|
|
92
|
+
array_item_schema=speaker_mapping_item_schema
|
|
93
|
+
)
|
|
94
|
+
])
|
|
95
|
+
|
|
50
96
|
gemini_tts_model = AudioModel(
|
|
51
97
|
name="gemini-2.5-flash-tts",
|
|
52
98
|
value="gemini-2.5-flash-preview-tts",
|
|
53
99
|
provider=MultimediaProvider.GOOGLE,
|
|
54
100
|
client_class=GeminiAudioClient,
|
|
55
|
-
parameter_schema=
|
|
56
|
-
"mode": {
|
|
57
|
-
"type": "string",
|
|
58
|
-
"default": "single-speaker",
|
|
59
|
-
"allowed_values": ["single-speaker", "multi-speaker"],
|
|
60
|
-
"description": "The speech generation mode. 'single-speaker' for a consistent voice, or 'multi-speaker' to assign different voices to speakers identified in the prompt."
|
|
61
|
-
},
|
|
62
|
-
"voice_name": {
|
|
63
|
-
"type": "string",
|
|
64
|
-
"default": "Kore",
|
|
65
|
-
"allowed_values": GEMINI_TTS_VOICES,
|
|
66
|
-
"description": "The voice to use for single-speaker generation."
|
|
67
|
-
},
|
|
68
|
-
"style_instructions": {
|
|
69
|
-
"type": "string",
|
|
70
|
-
"description": "Optional instructions on the style of speech, e.g., 'Say this in a dramatic whisper'."
|
|
71
|
-
},
|
|
72
|
-
"speaker_mapping": {
|
|
73
|
-
"type": "object",
|
|
74
|
-
"description": "Required for multi-speaker mode. An object mapping speaker names from the prompt (e.g., 'Joe') to a voice name (e.g., 'Puck')."
|
|
75
|
-
}
|
|
76
|
-
}
|
|
101
|
+
parameter_schema=gemini_tts_schema
|
|
77
102
|
)
|
|
78
103
|
|
|
79
104
|
models_to_register = [
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
import logging
|
|
3
|
-
from typing import TYPE_CHECKING, Type, Optional, Iterator, Dict, Any
|
|
3
|
+
from typing import TYPE_CHECKING, Type, Optional, Iterator, Dict, Any, Union
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
6
|
from autobyteus.multimedia.providers import MultimediaProvider
|
|
7
7
|
from autobyteus.multimedia.runtimes import MultimediaRuntime
|
|
8
8
|
from autobyteus.multimedia.utils.multimedia_config import MultimediaConfig
|
|
9
|
+
from autobyteus.utils.parameter_schema import ParameterSchema
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from autobyteus.multimedia.audio.base_audio_client import BaseAudioClient
|
|
@@ -47,7 +48,7 @@ class AudioModel(metaclass=AudioModelMeta):
|
|
|
47
48
|
value: str,
|
|
48
49
|
provider: MultimediaProvider,
|
|
49
50
|
client_class: Type["BaseAudioClient"],
|
|
50
|
-
parameter_schema: Optional[Dict[str, Any]] = None,
|
|
51
|
+
parameter_schema: Optional[Union[Dict[str, Any], ParameterSchema]] = None,
|
|
51
52
|
runtime: MultimediaRuntime = MultimediaRuntime.API,
|
|
52
53
|
host_url: Optional[str] = None
|
|
53
54
|
):
|
|
@@ -57,13 +58,19 @@ class AudioModel(metaclass=AudioModelMeta):
|
|
|
57
58
|
self.client_class = client_class
|
|
58
59
|
self.runtime = runtime
|
|
59
60
|
self.host_url = host_url
|
|
60
|
-
|
|
61
|
+
|
|
62
|
+
if isinstance(parameter_schema, dict):
|
|
63
|
+
self.parameter_schema = ParameterSchema.from_dict(parameter_schema)
|
|
64
|
+
elif parameter_schema is None:
|
|
65
|
+
self.parameter_schema = ParameterSchema()
|
|
66
|
+
else:
|
|
67
|
+
self.parameter_schema = parameter_schema
|
|
61
68
|
|
|
62
69
|
# Automatically build default_config from the schema's default values
|
|
63
70
|
default_params = {
|
|
64
|
-
|
|
65
|
-
for
|
|
66
|
-
if
|
|
71
|
+
param.name: param.default_value
|
|
72
|
+
for param in self.parameter_schema.parameters
|
|
73
|
+
if param.default_value is not None
|
|
67
74
|
}
|
|
68
75
|
self.default_config = MultimediaConfig(params=default_params)
|
|
69
76
|
|
|
@@ -19,7 +19,9 @@ class AutobyteusAudioModelProvider:
|
|
|
19
19
|
|
|
20
20
|
@staticmethod
|
|
21
21
|
def _get_hosts() -> List[str]:
|
|
22
|
-
"""
|
|
22
|
+
"""
|
|
23
|
+
Gets Autobyteus server hosts from env vars. Skips discovery if no host is configured.
|
|
24
|
+
"""
|
|
23
25
|
hosts_str = os.getenv('AUTOBYTEUS_LLM_SERVER_HOSTS')
|
|
24
26
|
if hosts_str:
|
|
25
27
|
return [host.strip() for host in hosts_str.split(',')]
|
|
@@ -28,7 +30,7 @@ class AutobyteusAudioModelProvider:
|
|
|
28
30
|
if legacy_host:
|
|
29
31
|
return [legacy_host]
|
|
30
32
|
|
|
31
|
-
return [
|
|
33
|
+
return []
|
|
32
34
|
|
|
33
35
|
@staticmethod
|
|
34
36
|
def discover_and_register():
|
|
@@ -37,6 +39,10 @@ class AutobyteusAudioModelProvider:
|
|
|
37
39
|
from autobyteus.multimedia.audio.audio_client_factory import AudioClientFactory
|
|
38
40
|
|
|
39
41
|
hosts = AutobyteusAudioModelProvider._get_hosts()
|
|
42
|
+
if not hosts:
|
|
43
|
+
logger.info("No Autobyteus server hosts configured. Skipping Autobyteus audio model discovery.")
|
|
44
|
+
return
|
|
45
|
+
|
|
40
46
|
total_registered_count = 0
|
|
41
47
|
|
|
42
48
|
for host_url in hosts:
|
|
@@ -20,7 +20,8 @@ class BaseAudioClient(ABC):
|
|
|
20
20
|
async def generate_speech(
|
|
21
21
|
self,
|
|
22
22
|
prompt: str,
|
|
23
|
-
generation_config: Optional[Dict[str, Any]] = None
|
|
23
|
+
generation_config: Optional[Dict[str, Any]] = None,
|
|
24
|
+
**kwargs
|
|
24
25
|
) -> SpeechGenerationResponse:
|
|
25
26
|
"""
|
|
26
27
|
Generates spoken audio from text (Text-to-Speech).
|
|
@@ -29,6 +30,7 @@ class BaseAudioClient(ABC):
|
|
|
29
30
|
prompt (str): The text to be converted to speech.
|
|
30
31
|
generation_config (Optional[Dict[str, Any]]): Provider-specific parameters
|
|
31
32
|
(e.g., voice_name, speaker_mapping).
|
|
33
|
+
**kwargs: Additional keyword arguments for extensibility.
|
|
32
34
|
|
|
33
35
|
Returns:
|
|
34
36
|
SpeechGenerationResponse: An object containing URLs or paths to the generated audio files.
|
|
@@ -27,7 +27,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
27
27
|
self,
|
|
28
28
|
prompt: str,
|
|
29
29
|
input_image_urls: Optional[List[str]] = None,
|
|
30
|
-
generation_config: Optional[Dict[str, Any]] = None
|
|
30
|
+
generation_config: Optional[Dict[str, Any]] = None,
|
|
31
|
+
**kwargs
|
|
31
32
|
) -> ImageGenerationResponse:
|
|
32
33
|
"""
|
|
33
34
|
Generates an image by calling the generate_image endpoint on the remote Autobyteus server.
|
|
@@ -38,7 +39,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
38
39
|
prompt=prompt,
|
|
39
40
|
input_image_urls=input_image_urls,
|
|
40
41
|
mask_url=None, # Not used in pure generation
|
|
41
|
-
generation_config=generation_config
|
|
42
|
+
generation_config=generation_config,
|
|
43
|
+
**kwargs
|
|
42
44
|
)
|
|
43
45
|
|
|
44
46
|
async def edit_image(
|
|
@@ -46,7 +48,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
46
48
|
prompt: str,
|
|
47
49
|
input_image_urls: List[str],
|
|
48
50
|
mask_url: Optional[str] = None,
|
|
49
|
-
generation_config: Optional[Dict[str, Any]] = None
|
|
51
|
+
generation_config: Optional[Dict[str, Any]] = None,
|
|
52
|
+
**kwargs
|
|
50
53
|
) -> ImageGenerationResponse:
|
|
51
54
|
"""
|
|
52
55
|
Edits an image by calling the generate_image endpoint on the remote Autobyteus server.
|
|
@@ -55,7 +58,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
55
58
|
prompt=prompt,
|
|
56
59
|
input_image_urls=input_image_urls,
|
|
57
60
|
mask_url=mask_url,
|
|
58
|
-
generation_config=generation_config
|
|
61
|
+
generation_config=generation_config,
|
|
62
|
+
**kwargs
|
|
59
63
|
)
|
|
60
64
|
|
|
61
65
|
async def _call_remote_generate(
|
|
@@ -63,7 +67,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
63
67
|
prompt: str,
|
|
64
68
|
input_image_urls: Optional[List[str]],
|
|
65
69
|
mask_url: Optional[str],
|
|
66
|
-
generation_config: Optional[Dict[str, Any]]
|
|
70
|
+
generation_config: Optional[Dict[str, Any]],
|
|
71
|
+
**kwargs
|
|
67
72
|
) -> ImageGenerationResponse:
|
|
68
73
|
"""Internal helper to call the remote server."""
|
|
69
74
|
try:
|
|
@@ -72,6 +77,8 @@ class AutobyteusImageClient(BaseImageClient):
|
|
|
72
77
|
# The model name for the remote server is the `value`, not the unique `model_identifier`
|
|
73
78
|
model_name_for_server = self.model.name
|
|
74
79
|
|
|
80
|
+
# Note: The underlying autobyteus_client.generate_image does not currently accept **kwargs.
|
|
81
|
+
# They are accepted here for interface consistency and future-proofing.
|
|
75
82
|
response_data = await self.autobyteus_client.generate_image(
|
|
76
83
|
model_name=model_name_for_server,
|
|
77
84
|
prompt=prompt,
|