agentfield-0.1.22rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentfield/__init__.py +66 -0
- agentfield/agent.py +3569 -0
- agentfield/agent_ai.py +1125 -0
- agentfield/agent_cli.py +386 -0
- agentfield/agent_field_handler.py +494 -0
- agentfield/agent_mcp.py +534 -0
- agentfield/agent_registry.py +29 -0
- agentfield/agent_server.py +1185 -0
- agentfield/agent_utils.py +269 -0
- agentfield/agent_workflow.py +323 -0
- agentfield/async_config.py +278 -0
- agentfield/async_execution_manager.py +1227 -0
- agentfield/client.py +1447 -0
- agentfield/connection_manager.py +280 -0
- agentfield/decorators.py +527 -0
- agentfield/did_manager.py +337 -0
- agentfield/dynamic_skills.py +304 -0
- agentfield/execution_context.py +255 -0
- agentfield/execution_state.py +453 -0
- agentfield/http_connection_manager.py +429 -0
- agentfield/litellm_adapters.py +140 -0
- agentfield/logger.py +249 -0
- agentfield/mcp_client.py +204 -0
- agentfield/mcp_manager.py +340 -0
- agentfield/mcp_stdio_bridge.py +550 -0
- agentfield/memory.py +723 -0
- agentfield/memory_events.py +489 -0
- agentfield/multimodal.py +173 -0
- agentfield/multimodal_response.py +403 -0
- agentfield/pydantic_utils.py +227 -0
- agentfield/rate_limiter.py +280 -0
- agentfield/result_cache.py +441 -0
- agentfield/router.py +190 -0
- agentfield/status.py +70 -0
- agentfield/types.py +710 -0
- agentfield/utils.py +26 -0
- agentfield/vc_generator.py +464 -0
- agentfield/vision.py +198 -0
- agentfield-0.1.22rc2.dist-info/METADATA +102 -0
- agentfield-0.1.22rc2.dist-info/RECORD +42 -0
- agentfield-0.1.22rc2.dist-info/WHEEL +5 -0
- agentfield-0.1.22rc2.dist-info/top_level.txt +1 -0
agentfield/agent_ai.py
ADDED
@@ -0,0 +1,1125 @@
import json
import os
import re
from typing import Any, Dict, List, Literal, Optional, Type, Union

import requests
from agentfield.agent_utils import AgentUtils
from agentfield.logger import log_debug, log_error, log_warn
from agentfield.rate_limiter import StatelessRateLimiter
from httpx import HTTPStatusError
from pydantic import BaseModel

# Expose module-level symbols for patching in tests
try:
    import litellm as litellm  # type: ignore
except Exception:  # pragma: no cover - test environments may not have litellm

    class _LiteLLMStub:
        pass

    litellm = _LiteLLMStub()  # type: ignore

try:
    import openai as openai  # type: ignore
except Exception:  # pragma: no cover - test environments may not have openai

    class _OpenAIStub:
        class OpenAI:
            pass

    openai = _OpenAIStub()  # type: ignore


class AgentAI:
    """AI/LLM Integration functionality for AgentField Agent"""

    def __init__(self, agent_instance):
        """
        Initialize AgentAI with a reference to the main agent instance.

        Args:
            agent_instance: The main Agent instance
        """
        self.agent = agent_instance
        self._initialization_complete = False
        self._rate_limiter = None

    def _get_rate_limiter(self) -> StatelessRateLimiter:
        """
        Get or create the rate limiter instance based on current configuration.

        Returns:
            StatelessRateLimiter: Configured rate limiter instance
        """
        if self._rate_limiter is None:
            config = self.agent.ai_config
            self._rate_limiter = StatelessRateLimiter(
                max_retries=config.rate_limit_max_retries,
                base_delay=config.rate_limit_base_delay,
                max_delay=config.rate_limit_max_delay,
                jitter_factor=config.rate_limit_jitter_factor,
                circuit_breaker_threshold=config.rate_limit_circuit_breaker_threshold,
                circuit_breaker_timeout=config.rate_limit_circuit_breaker_timeout,
            )
        return self._rate_limiter

    async def _ensure_model_limits_cached(self):
        """
        Ensure model limits are cached for the current model configuration.
        This is called once during the first AI call to avoid startup delays.
        """
        if not self._initialization_complete:
            try:
                # Cache limits for the default model
                await self.agent.ai_config.get_model_limits()

                # Cache limits for multimodal models if different
                if self.agent.ai_config.audio_model != self.agent.ai_config.model:
                    await self.agent.ai_config.get_model_limits(
                        self.agent.ai_config.audio_model
                    )

                if self.agent.ai_config.vision_model != self.agent.ai_config.model:
                    await self.agent.ai_config.get_model_limits(
                        self.agent.ai_config.vision_model
                    )

                self._initialization_complete = True

            except Exception as e:
                log_debug(f"Failed to cache model limits: {e}")
                # Continue with fallback defaults
                self._initialization_complete = True

    async def ai(
        self,
        *args: Any,
        system: Optional[str] = None,
        user: Optional[str] = None,
        schema: Optional[Type[BaseModel]] = None,
        model: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        stream: Optional[bool] = None,
        response_format: Optional[Union[Literal["auto", "json", "text"], Dict]] = None,
        context: Optional[Dict] = None,
        memory_scope: Optional[List[str]] = None,
        **kwargs,
    ) -> Any:
        """
        Universal AI method supporting multimodal inputs with intelligent type detection.

        This method provides a flexible interface for interacting with various LLMs,
        supporting text, image, audio, and file inputs. It intelligently detects
        input types and applies a hierarchical configuration system.

        Args:
            *args: Flexible inputs - text, images, audio, files, or mixed content.
                - str: Text content, URLs, or file paths (auto-detected).
                - bytes: Binary data (images, audio, documents).
                - dict: Structured input with explicit keys (e.g., {"image": "url"}).
                - list: Multimodal conversation or content list.

            system (str, optional): System prompt for AI behavior.
            user (str, optional): User message (alternative to positional args).
            schema (Type[BaseModel], optional): Pydantic model for structured output validation.
            model (str, optional): Override default model (e.g., "gpt-4", "claude-3").
            temperature (float, optional): Creativity level (0.0-2.0).
            max_tokens (int, optional): Maximum response length.
            stream (bool, optional): Enable streaming response.
            response_format (str, optional): Desired response format ('auto', 'json', 'text').
            context (Dict, optional): Additional context data to pass to the LLM.
            memory_scope (List[str], optional): Memory scopes to inject (e.g., ['workflow', 'session', 'reasoner']).
            **kwargs: Additional provider-specific parameters to pass to the LLM.

        Returns:
            Any: The AI response - raw text, structured object (if schema), or a stream.

        Examples:
            # Simple text input
            response = await app.ai("Summarize this document.")

            # System and user prompts
            response = await app.ai(
                system="You are a helpful assistant.",
                user="What is the capital of France?"
            )

            # Multimodal input with auto-detection (image URL and text)
            response = await app.ai(
                "Describe this image:",
                "https://example.com/image.jpg"
            )

            # Multimodal input with file path (audio)
            response = await app.ai(
                "Transcribe this audio:",
                "./audio.mp3"
            )

            # Structured output with Pydantic schema
            class SentimentResult(BaseModel):
                sentiment: str
                confidence: float

            result = await app.ai(
                "Analyze the sentiment of 'I love this product!'",
                schema=SentimentResult
            )

            # Override default AI configuration parameters
            response = await app.ai(
                "Generate a creative story.",
                model="gpt-4-turbo",
                temperature=0.9,
                max_tokens=500,
                stream=True
            )

            # Complex multimodal conversation
            response = await app.ai([
                {"role": "system", "content": "You are a visual assistant."},
                {"role": "user", "content": "What do you see here?"},
                "https://example.com/chart.png",
                {"role": "user", "content": "Can you explain the trend?"}
            ])
        """
        # Apply hierarchical configuration: Agent defaults < Method overrides < Runtime overrides
        final_config = self.agent.ai_config.copy(deep=True)

        # Default enable rate limit retry unless explicitly set to False
        if (
            not hasattr(final_config, "enable_rate_limit_retry")
            or final_config.enable_rate_limit_retry is None
        ):
            final_config.enable_rate_limit_retry = True

        # Apply method-level overrides
        if model:
            final_config.model = model
        if temperature is not None:
            final_config.temperature = temperature
        if max_tokens is not None:
            final_config.max_tokens = max_tokens
        if stream is not None:
            final_config.stream = stream
        if response_format is not None:
            if isinstance(response_format, str):
                final_config.response_format = response_format

        # TODO: Integrate memory injection based on memory_scope and self.memory_config
        # For now, just pass context if provided
        if context:
            # This would be where memory data is merged into the context
            pass

        # Prepare messages for LiteLLM
        messages = []

        # If a schema is provided, augment the system prompt with strict schema adherence instructions and schema context
        if schema:
            # Generate a readable JSON schema string using the modern Pydantic API
            try:
                schema_dict = schema.model_json_schema()
                schema_json = json.dumps(schema_dict, indent=2)
            except Exception:
                schema_json = str(schema)
            schema_instruction = (
                "IMPORTANT: You must exactly adhere to the output schema provided below. "
                "Do not add or omit any fields. Output must be valid JSON matching the schema. "
                "If a field is required in the schema, it must be present in the output. "
                "If a field is not in the schema, do NOT include it in the output. "
                "Here is the output schema you must follow:\n"
                f"{schema_json}\n"
                "Repeat: Output ONLY valid JSON matching the schema above. Do not include any extra text or explanation."
            )
            # Merge with any user-provided system prompt
            if system:
                system_prompt = f"{system}\n\n{schema_instruction}"
            else:
                system_prompt = schema_instruction
            messages.append({"role": "system", "content": system_prompt})
        else:
            if system:
                messages.append({"role": "system", "content": system})

        # Handle flexible user input with intelligent processing
        if user:
            messages.append({"role": "user", "content": user})
        elif args:
            processed_content = self._process_multimodal_args(args)
            if processed_content:
                messages.extend(processed_content)

        litellm_module = litellm if hasattr(litellm, "acompletion") else None

        # Ensure model limits are cached (done once per instance)
        await self._ensure_model_limits_cached()

        # Apply prompt trimming using LiteLLM's token-aware utility when available.
        utils_module = getattr(litellm_module, "utils", None) if litellm_module else None
        token_counter = getattr(utils_module, "token_counter", None) if utils_module else None
        trim_messages = getattr(utils_module, "trim_messages", None) if utils_module else None

        if token_counter is None:
            def token_counter(model: str, messages: List[dict]) -> int:
                return len(json.dumps(messages))

        if trim_messages is None:
            def trim_messages(messages: List[dict], model: str, max_tokens: int) -> List[dict]:
                return messages

        # Determine model context length using multiple fallback strategies
        model_context_length = None

        # Strategy 1: Use explicit max_input_tokens from config
        if hasattr(final_config, "max_input_tokens") and final_config.max_input_tokens:
            model_context_length = final_config.max_input_tokens

        # Strategy 3: Use fallback model mappings
        if not model_context_length and hasattr(final_config, "_MODEL_CONTEXT_LIMITS"):
            candidate_limit = final_config._MODEL_CONTEXT_LIMITS.get(final_config.model)
            if candidate_limit:
                model_context_length = candidate_limit

        # Strategy 4: Conservative fallback with warning
        if not model_context_length:
            model_context_length = 10192  # More reasonable than 4096

        # Calculate safe input token limit: context_length - max_output_tokens - buffer
        output_tokens = (
            final_config.max_tokens or 7096
        )  # Default output if not specified
        buffer_tokens = 100  # Small buffer for safety

        safe_input_limit = max(
            1000, model_context_length - output_tokens - buffer_tokens
        )

        # Validate the calculation makes sense
        if safe_input_limit < 1000:
            safe_input_limit = 1000

        # Count actual prompt tokens using LiteLLM's token counter
        try:
            actual_prompt_tokens = token_counter(
                model=final_config.model, messages=messages
            )
        except Exception as e:
            log_debug(f"Could not count prompt tokens, proceeding with trimming: {e}")
            actual_prompt_tokens = (
                safe_input_limit + 1
            )  # Force trimming if we can't count

        # Only trim if necessary based on actual token count
        if actual_prompt_tokens > safe_input_limit:
            trimmed_messages = trim_messages(
                messages, final_config.model, max_tokens=safe_input_limit
            )
            if len(trimmed_messages) != len(messages) or any(
                m1 != m2 for m1, m2 in zip(messages, trimmed_messages)
            ):
                messages = trimmed_messages
        else:
            pass

        # Prepare LiteLLM parameters using the config's method
        # This leverages LiteLLM's standard environment variable handling and smart token management
        litellm_params = final_config.get_litellm_params(
            messages=messages,
            **kwargs,  # Runtime overrides have highest priority
        )

        # Ensure messages are always included in the final params
        litellm_params["messages"] = messages

        if schema:
            # Use LiteLLM's native Pydantic model support for structured outputs
            litellm_params["response_format"] = schema

        # Define the LiteLLM call function for rate limiter
        async def _make_litellm_call():
            if litellm_module is None:
                raise ImportError(
                    "litellm is not installed. Please install it with `pip install litellm`."
                )
            return await litellm_module.acompletion(**litellm_params)

        async def _execute_with_fallbacks():
            # Check for configured fallback models in AI config
            fallback_models = getattr(final_config, "fallback_models", None)
            if not fallback_models and getattr(
                final_config, "final_fallback_model", None
            ):
                # If only a final model is provided, treat it as a fallback list of one
                fallback_models = [final_config.final_fallback_model]

            if fallback_models:
                # Ensure each fallback call has a valid provider
                all_models = [final_config.model] + list(fallback_models)
                last_exception = None
                for m in all_models:
                    try:
                        if "/" not in m:
                            log_debug(
                                f"Skipping model {m} - no provider specified in model name"
                            )
                            raise ValueError(
                                f"Invalid model spec: '{m}'. Must include provider prefix, e.g. 'openai/gpt-4'."
                            )
                        litellm_params["model"] = m
                        return await _make_litellm_call()
                    except Exception as e:
                        log_debug(
                            f"Model {m} failed with {e}, trying next fallback if available..."
                        )
                        last_exception = e
                        continue
                # If all models fail, re-raise the last exception
                if last_exception:
                    raise last_exception
            else:
                # No fallbacks configured, just make the call
                if "/" not in final_config.model:
                    raise ValueError(
                        f"Invalid model spec: '{final_config.model}'. Must include provider prefix, e.g. 'openai/gpt-4'."
                    )
                return await _make_litellm_call()

        if final_config.enable_rate_limit_retry:
            rate_limiter = self._get_rate_limiter()
            try:
                response = await rate_limiter.execute_with_retry(
                    _execute_with_fallbacks
                )
            except Exception as e:
                log_debug(f"LiteLLM call failed after retries: {e}")
                raise
        else:
            try:
                response = await _execute_with_fallbacks()
            except HTTPStatusError as e:
                log_debug(
                    f"LiteLLM HTTP call failed: {e.response.status_code} - {e.response.text}"
                )
                raise
            except requests.exceptions.RequestException as e:
                log_debug(f"LiteLLM network call failed: {e}")
                if e.response is not None:
                    log_debug(f"Response status: {e.response.status_code}")
                    log_debug(f"Response text: {e.response.text}")
                raise
            except Exception as e:
                log_debug(f"LiteLLM call failed: {e}")
                raise

        # Process the response
        if final_config.stream:
            # For streaming, return the generator
            return response
        else:
            # Import multimodal response detection
            from .multimodal_response import detect_multimodal_response

            # Detect and wrap multimodal content
            multimodal_response = detect_multimodal_response(response)

            if schema:
                # For schema responses, try to parse from text content
                try:
                    json_data = json.loads(str(multimodal_response.text))
                    return schema(**json_data)
                except (json.JSONDecodeError, ValueError) as parse_error:
                    log_error(f"Failed to parse JSON response: {parse_error}")
                    log_debug(f"Raw response: {multimodal_response.text}")
                    # Fallback: try to extract JSON from the response
                    json_match = re.search(
                        r"\{.*\}", str(multimodal_response.text), re.DOTALL
                    )
                    if json_match:
                        try:
                            json_data = json.loads(json_match.group())
                            return schema(**json_data)
                        except (json.JSONDecodeError, ValueError):
                            pass
                    raise ValueError(
                        f"Could not parse structured response: {multimodal_response.text}"
                    )

            # Return MultimodalResponse for backward compatibility and enhanced features
            return multimodal_response

    def _process_multimodal_args(self, args: tuple) -> List[Dict[str, Any]]:
        """Process multimodal arguments into LiteLLM-compatible message format"""
        from agentfield.multimodal import Audio, File, Image, Text

        messages = []
        user_content = []

        for arg in args:
            # Handle our multimodal input classes first
            if isinstance(arg, Text):
                user_content.append({"type": "text", "text": arg.text})

            elif isinstance(arg, Image):
                if isinstance(arg.image_url, dict):
                    user_content.append(
                        {"type": "image_url", "image_url": arg.image_url}
                    )
                else:
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg.image_url, "detail": "high"},
                        }
                    )

            elif isinstance(arg, Audio):
                # Handle audio input according to LiteLLM GPT-4o-audio pattern
                user_content.append(
                    {"type": "input_audio", "input_audio": arg.input_audio}
                )

            elif isinstance(arg, File):
                # For now, treat files as text references
                if isinstance(arg.file, dict):
                    file_info = arg.file
                    user_content.append(
                        {
                            "type": "text",
                            "text": f"[File: {file_info.get('url', 'unknown')}]",
                        }
                    )
                else:
                    user_content.append({"type": "text", "text": f"[File: {arg.file}]"})

            else:
                # Fall back to automatic detection for raw inputs
                detected_type = AgentUtils.detect_input_type(arg)

                if detected_type == "text":
                    user_content.append({"type": "text", "text": arg})

                elif detected_type == "image_url":
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg, "detail": "high"},
                        }
                    )

                elif detected_type == "image_file":
                    # Convert file to base64 data URL
                    try:
                        import base64

                        with open(arg, "rb") as f:
                            image_data = base64.b64encode(f.read()).decode()
                        ext = os.path.splitext(arg)[1].lower()
                        mime_type = AgentUtils.get_mime_type(ext)
                        data_url = f"data:{mime_type};base64,{image_data}"
                        user_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": data_url, "detail": "high"},
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not read image file {arg}: {e}")
                        user_content.append(
                            {"type": "text", "text": f"[Image file: {arg}]"}
                        )

                elif detected_type == "audio_file":
                    # Convert audio file to LiteLLM input_audio format
                    try:
                        import base64

                        with open(arg, "rb") as f:
                            audio_data = base64.b64encode(f.read()).decode()

                        # Detect format from extension
                        ext = os.path.splitext(arg)[1].lower().lstrip(".")
                        audio_format = (
                            ext if ext in ["wav", "mp3", "flac", "ogg"] else "wav"
                        )

                        user_content.append(
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": audio_data,
                                    "format": audio_format,
                                },
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not read audio file {arg}: {e}")
                        user_content.append(
                            {
                                "type": "text",
                                "text": f"[Audio file: {os.path.basename(arg)}]",
                            }
                        )

                elif detected_type == "document_file":
                    # For documents, we might need to extract text
                    # For now, just reference the file
                    user_content.append(
                        {
                            "type": "text",
                            "text": f"[Document file: {os.path.basename(arg)}]",
                        }
                    )

                elif detected_type == "image_base64":
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg, "detail": "high"},
                        }
                    )

                elif detected_type == "audio_base64":
                    # Extract format and data from data URL
                    try:
                        if arg.startswith("data:audio/"):
                            # Parse data URL: data:audio/wav;base64,<data>
                            header, data = arg.split(",", 1)
                            format_part = header.split(";")[0].split("/")[1]
                            user_content.append(
                                {
                                    "type": "input_audio",
                                    "input_audio": {
                                        "data": data,
                                        "format": format_part,
                                    },
                                }
                            )
                        else:
                            user_content.append(
                                {"type": "text", "text": "[Audio data provided]"}
                            )
                    except Exception as e:
                        log_warn(f"Could not process audio base64: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Audio data provided]"}
                        )

                elif detected_type == "image_bytes":
                    # Convert bytes to base64 data URL
                    try:
                        import base64

                        image_data = base64.b64encode(arg).decode()
                        # Try to detect image type from bytes
                        if arg.startswith(b"\xff\xd8\xff"):
                            mime_type = "image/jpeg"
                        elif arg.startswith(b"\x89PNG"):
                            mime_type = "image/png"
                        elif arg.startswith(b"GIF8"):
                            mime_type = "image/gif"
                        else:
                            mime_type = "image/png"  # Default

                        data_url = f"data:{mime_type};base64,{image_data}"
                        user_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": data_url, "detail": "high"},
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not process image bytes: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Image data provided]"}
                        )

                elif detected_type == "audio_bytes":
                    # Convert audio bytes to input_audio format
                    try:
                        import base64

                        audio_data = base64.b64encode(arg).decode()
                        # Try to detect format from bytes
                        if arg.startswith(b"RIFF") and b"WAVE" in arg[:12]:
                            audio_format = "wav"
                        elif arg.startswith(b"ID3") or arg.startswith(b"\xff\xfb"):
                            audio_format = "mp3"
                        else:
                            audio_format = "wav"  # Default

                        user_content.append(
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": audio_data,
                                    "format": audio_format,
                                },
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not process audio bytes: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Audio data provided]"}
                        )

                elif detected_type == "structured_input":
                    # Handle dict with explicit keys
                    if "system" in arg:
                        messages.append({"role": "system", "content": arg["system"]})
                    if "user" in arg:
                        user_content.append({"type": "text", "text": arg["user"]})
                    # Handle other structured content
                    for key in [
                        "text",
                        "image",
                        "image_url",
                        "audio",
                    ]:
                        if key in arg:
                            if key == "text":
                                user_content.append({"type": "text", "text": arg[key]})
                            elif key in ["image", "image_url"]:
                                if isinstance(arg[key], dict):
                                    user_content.append(
                                        {"type": "image_url", "image_url": arg[key]}
                                    )
                                else:
                                    user_content.append(
                                        {
                                            "type": "image_url",
                                            "image_url": {
                                                "url": arg[key],
                                                "detail": "high",
                                            },
                                        }
                                    )
                            elif key == "audio":
                                if isinstance(arg[key], dict):
                                    user_content.append(
                                        {"type": "input_audio", "input_audio": arg[key]}
                                    )
                                else:
                                    # Assume it's a file path or URL
                                    user_content.append(
                                        {"type": "text", "text": f"[Audio: {arg[key]}]"}
                                    )

                elif detected_type == "message_dict":
                    # Handle message format dict
                    messages.append(arg)

                elif detected_type == "conversation_list":
                    # Handle list of messages
                    messages.extend(arg)

                elif detected_type == "multimodal_list":
                    # Handle mixed list of content
                    for item in arg:
                        if isinstance(item, str):
                            user_content.append({"type": "text", "text": item})
                        elif isinstance(item, dict):
                            if "role" in item:
                                messages.append(item)
                            else:
                                # Process as structured input
                                sub_messages = self._process_multimodal_args((item,))
                                messages.extend(sub_messages)

                elif detected_type == "dict":
                    # Generic dict - convert to text representation
                    import json

                    user_content.append(
                        {"type": "text", "text": f"Data: {json.dumps(arg, indent=2)}"}
                    )

                else:
                    # Fallback for unknown types
                    user_content.append({"type": "text", "text": str(arg)})

        # Add user content as a message if we have any
        if user_content:
            if len(user_content) == 1 and user_content[0]["type"] == "text":
                # Simplify single text content
                messages.append({"role": "user", "content": user_content[0]["text"]})
            else:
                # Multiple content types
                messages.append({"role": "user", "content": user_content})

        return messages

    async def ai_with_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: Optional[str] = None,
        mode: Optional[str] = None,
        **kwargs,
    ) -> Any:
        """
        AI method optimized for audio output generation.

        Automatically detects the model type and uses the appropriate LiteLLM function:
        - For TTS models (tts-1, tts-1-hd, gpt-4o-mini-tts): Uses litellm.speech()
        - For audio-capable chat models (gpt-4o-audio-preview): Uses litellm.completion() with audio modalities

        Args:
            *args: Input arguments (text prompts, etc.)
            voice: Voice to use for audio generation (alloy, echo, fable, onyx, nova, shimmer)
            format: Audio format (wav, mp3, etc.)
            model: Model to use (defaults to tts-1)
            **kwargs: Additional parameters

        Returns:
            MultimodalResponse with audio content

        Example:
            audio_result = await agent.ai_with_audio("Say hello warmly", voice="alloy")
            audio_result.audio.save("greeting.wav")
        """
        # Use TTS model as default (more reliable than gpt-4o-audio-preview)
        if model is None:
            model = (
                self.agent.ai_config.audio_model
            )  # Use configured audio model (defaults to tts-1)

        # Check if mode="openai_direct" is specified
        if mode == "openai_direct":
            # Use direct OpenAI client with streaming response
            return await self._generate_openai_direct_audio(
                *args,
                voice=voice,
                format=format,
                model=model or "gpt-4o-mini-tts",
                **kwargs,
            )

        # Check if this is a TTS model that needs the speech endpoint
        tts_models = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
        if model in tts_models:
            # Use LiteLLM speech function for TTS models
            return await self._generate_tts_audio(
                *args, voice=voice, format=format, model=model, **kwargs
            )
        else:
            # Use chat completion with audio modalities for other models
            audio_params = {
                "modalities": ["text", "audio"],
                "audio": {"voice": voice, "format": format},
            }
            final_kwargs = {**audio_params, **kwargs}
            return await self.ai(*args, model=model, **final_kwargs)

    async def _generate_tts_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: str = "tts-1",
        **kwargs,
    ) -> Any:
        """
        Generate audio using LiteLLM's speech function for TTS models.
        """
        from agentfield.multimodal_response import (
            AudioOutput,
            MultimodalResponse,
        )

        litellm_module = litellm
        if not hasattr(litellm_module, "aspeech"):
            raise ImportError(
                "litellm is not installed. Please install it with `pip install litellm` to use TTS features."
            )

        # Combine all text inputs
        text_input = " ".join(str(arg) for arg in args if isinstance(arg, str))
        if not text_input:
            text_input = "Hello, this is a test audio message."

        try:
            # Get API configuration
            config = self.agent.ai_config.get_litellm_params()

            # Use LiteLLM speech function
            response = await litellm_module.aspeech(
                model=model,
                input=text_input,
                voice=voice,
                response_format=format,
                api_key=config.get("api_key"),
                **kwargs,
            )

            # Convert binary response to base64 string for AudioOutput
            import base64

            try:
                # Try different methods to get binary content
                if hasattr(response, "content"):
                    binary_content = response.content
                elif hasattr(response, "read"):
                    binary_content = response.read()
                elif hasattr(response, "__iter__"):
                    # For HttpxBinaryResponseContent, iterate to get bytes
                    binary_content = b"".join(response)
                else:
                    # Last resort - convert to string and encode
                    binary_content = str(response).encode("utf-8")

                audio_data = base64.b64encode(binary_content).decode("utf-8")
            except Exception as e:
                log_error(f"Failed to process audio response: {e}")
                # Use a placeholder for now
                audio_data = ""

            # Create AudioOutput directly
            audio_output = AudioOutput(data=audio_data, format=format, url=None)

            # Create MultimodalResponse directly
            return MultimodalResponse(
                text=text_input,
                audio=audio_output,
                images=[],
                files=[],
                raw_response=response,
            )

        except Exception as e:
            # Fallback to text-only MultimodalResponse
            log_error(f"TTS generation failed: {e}")
            return MultimodalResponse(
                text=text_input,
                audio=None,
                images=[],
                files=[],
                raw_response=text_input,
            )

    async def _generate_openai_direct_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: str = "gpt-4o-mini-tts",
        **kwargs,
    ) -> Any:
        """
        Generate audio using OpenAI client directly with streaming response.
        This method supports OpenAI-specific parameters like 'instructions' and 'speed'.

        All kwargs are passed through to OpenAI SDK. The SDK will validate parameters
        and reject unsupported ones.

        Common OpenAI parameters:
        - instructions: Guide the model's speaking style
        - speed: Speech speed (0.25 to 4.0)
        - response_format: Audio format (mp3, opus, aac, flac, wav, pcm)
        """
        import base64
        import tempfile
        from pathlib import Path

        from agentfield.multimodal_response import AudioOutput, MultimodalResponse
        from openai import OpenAI

        # Combine all text inputs
        text_input = " ".join(str(arg) for arg in args if isinstance(arg, str))
        if not text_input:
            text_input = "Hello, this is a test audio message."

        try:
            # Get API configuration
            config = self.agent.ai_config.get_litellm_params()
            api_key = config.get("api_key")

            if not api_key:
                raise ValueError("OpenAI API key not found in configuration")

            # Initialize OpenAI client
            client = OpenAI(api_key=api_key)

            # Prepare base parameters for OpenAI speech API
            speech_params = {
                "model": model,
                "voice": voice,
                "input": text_input,
            }

            # Map format parameter to response_format if not already in kwargs
            if "response_format" not in kwargs and format:
                speech_params["response_format"] = format

            # Pass all kwargs through to OpenAI SDK
            # Let OpenAI SDK handle parameter validation
            speech_params.update(kwargs)

            # Create a temporary file for the audio
            with tempfile.NamedTemporaryFile(
                suffix=f".{format}", delete=False
            ) as temp_file:
                temp_path = Path(temp_file.name)

            try:
                # Use OpenAI streaming response
                with client.audio.speech.with_streaming_response.create(
                    **speech_params
                ) as response:
                    response.stream_to_file(temp_path)

                # Read the audio file and convert to base64
                with open(temp_path, "rb") as audio_file:
                    binary_content = audio_file.read()
                audio_data = base64.b64encode(binary_content).decode("utf-8")

                # Create AudioOutput
                audio_output = AudioOutput(data=audio_data, format=format, url=None)

                # Create MultimodalResponse
                return MultimodalResponse(
                    text=text_input,
                    audio=audio_output,
                    images=[],
                    files=[],
                    raw_response=response,
                )

            finally:
                # Clean up temporary file
                if temp_path.exists():
                    temp_path.unlink()

        except Exception as e:
            # Fallback to text-only MultimodalResponse
            log_error(f"OpenAI direct audio generation failed: {e}")
            return MultimodalResponse(
                text=text_input,
                audio=None,
                images=[],
                files=[],
                raw_response=text_input,
            )

    async def ai_with_vision(
        self,
        prompt: str,
        size: str = "1024x1024",
        quality: str = "standard",
        style: Optional[str] = None,
        model: Optional[str] = None,
        response_format: str = "url",
        **kwargs,
    ) -> Any:
        """
        AI method optimized for image generation.

        Supports both LiteLLM and OpenRouter providers:
        - LiteLLM: Use model names like "dall-e-3", "azure/dall-e-3", "bedrock/stability.stable-diffusion-xl"
        - OpenRouter: Use model names with "openrouter/" prefix like "openrouter/google/gemini-2.5-flash-image-preview"

        Args:
            prompt: Text prompt for image generation
            size: Image size (256x256, 512x512, 1024x1024, 1792x1024, 1024x1792)
            quality: Image quality (standard, hd)
            style: Image style (vivid, natural) for DALL-E 3
            model: Model to use (defaults to dall-e-3)
            response_format: Response format ('url' or 'b64_json'). Defaults to 'url'
            **kwargs: Additional provider-specific parameters

        Returns:
            MultimodalResponse with image content

        Examples:
            # LiteLLM (DALL-E)
            result = await agent.ai_with_vision("A sunset over mountains")
            result.images[0].save("sunset.png")

            # OpenRouter (Gemini)
            result = await agent.ai_with_vision(
                "A futuristic city",
                model="openrouter/google/gemini-2.5-flash-image-preview",
                image_config={"aspect_ratio": "16:9"}
            )

            # Get base64 data directly
            result = await agent.ai_with_vision("A sunset", response_format="b64_json")
        """
        from agentfield import vision

        # Use image generation model if not specified
        if model is None:
            model = "dall-e-3"  # Default image model

        # Route based on model prefix
        if model.startswith("openrouter/"):
            # OpenRouter: Use chat completions API with image modality
            return await vision.generate_image_openrouter(
                prompt=prompt,
                model=model,
                size=size,
                quality=quality,
                style=style,
                response_format=response_format,
                **kwargs,
            )
        else:
            # LiteLLM: Use image generation API
            return await vision.generate_image_litellm(
                prompt=prompt,
                model=model,
                size=size,
                quality=quality,
                style=style,
                response_format=response_format,
                **kwargs,
            )

    async def ai_with_multimodal(
        self,
        *args: Any,
        modalities: Optional[List[str]] = None,
        audio_config: Optional[Dict] = None,
        model: Optional[str] = None,
        **kwargs,
    ) -> Any:
        """
        AI method for explicit multimodal input/output control.

        Args:
            *args: Mixed multimodal inputs
            modalities: List of desired output modalities (["text", "audio", "image"])
            audio_config: Audio configuration if audio modality requested
            model: Model to use
            **kwargs: Additional parameters

        Returns:
            MultimodalResponse with requested modalities

        Example:
            result = await agent.ai_with_multimodal(
                "Describe this image and provide audio narration",
                image_from_url("https://example.com/image.jpg"),
                modalities=["text", "audio"],
                audio_config={"voice": "nova", "format": "wav"}
            )
        """
        multimodal_params = {}

        if modalities:
            multimodal_params["modalities"] = modalities

        if audio_config and "audio" in (modalities or []):
            multimodal_params["audio"] = audio_config

        # Use multimodal-capable model if not specified
        if model is None and modalities and "audio" in modalities:
            model = "gpt-4o-audio-preview"

        # Merge with user kwargs
        final_kwargs = {**multimodal_params, **kwargs}

        return await self.ai(*args, model=model, **final_kwargs)
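
For orientation, here is a minimal usage sketch of the API added in this file, following the calling convention used in its own docstrings. It is not part of the wheel: `app` stands in for an already-constructed AgentField agent whose `ai()` / `ai_with_audio()` methods are backed by the `AgentAI` class above, and the `openai/gpt-4o` model string is illustrative. The provider-prefix requirement comes from `_execute_with_fallbacks`, which rejects bare model names.

# Hypothetical usage sketch -- `app` is assumed to be a constructed AgentField
# agent wired to the AgentAI class shown in this diff; it is not defined here.
from pydantic import BaseModel


class SentimentResult(BaseModel):
    sentiment: str
    confidence: float


async def demo(app) -> None:
    # Free-form call: returns a MultimodalResponse (or a stream when stream=True)
    reply = await app.ai(
        system="You are concise.",
        user="Summarize this changelog in one sentence.",
        model="openai/gpt-4o",  # ai() requires a provider-prefixed model spec
    )
    print(reply.text)

    # Structured call: the JSON reply is parsed and validated into the schema
    verdict = await app.ai("I love this product!", schema=SentimentResult)
    print(verdict.sentiment, verdict.confidence)

    # Audio output via the TTS path (mirrors the ai_with_audio docstring example)
    audio = await app.ai_with_audio("Say hello warmly", voice="alloy")
    audio.audio.save("greeting.wav")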