agentfield-0.1.22rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. agentfield/__init__.py +66 -0
  2. agentfield/agent.py +3569 -0
  3. agentfield/agent_ai.py +1125 -0
  4. agentfield/agent_cli.py +386 -0
  5. agentfield/agent_field_handler.py +494 -0
  6. agentfield/agent_mcp.py +534 -0
  7. agentfield/agent_registry.py +29 -0
  8. agentfield/agent_server.py +1185 -0
  9. agentfield/agent_utils.py +269 -0
  10. agentfield/agent_workflow.py +323 -0
  11. agentfield/async_config.py +278 -0
  12. agentfield/async_execution_manager.py +1227 -0
  13. agentfield/client.py +1447 -0
  14. agentfield/connection_manager.py +280 -0
  15. agentfield/decorators.py +527 -0
  16. agentfield/did_manager.py +337 -0
  17. agentfield/dynamic_skills.py +304 -0
  18. agentfield/execution_context.py +255 -0
  19. agentfield/execution_state.py +453 -0
  20. agentfield/http_connection_manager.py +429 -0
  21. agentfield/litellm_adapters.py +140 -0
  22. agentfield/logger.py +249 -0
  23. agentfield/mcp_client.py +204 -0
  24. agentfield/mcp_manager.py +340 -0
  25. agentfield/mcp_stdio_bridge.py +550 -0
  26. agentfield/memory.py +723 -0
  27. agentfield/memory_events.py +489 -0
  28. agentfield/multimodal.py +173 -0
  29. agentfield/multimodal_response.py +403 -0
  30. agentfield/pydantic_utils.py +227 -0
  31. agentfield/rate_limiter.py +280 -0
  32. agentfield/result_cache.py +441 -0
  33. agentfield/router.py +190 -0
  34. agentfield/status.py +70 -0
  35. agentfield/types.py +710 -0
  36. agentfield/utils.py +26 -0
  37. agentfield/vc_generator.py +464 -0
  38. agentfield/vision.py +198 -0
  39. agentfield-0.1.22rc2.dist-info/METADATA +102 -0
  40. agentfield-0.1.22rc2.dist-info/RECORD +42 -0
  41. agentfield-0.1.22rc2.dist-info/WHEEL +5 -0
  42. agentfield-0.1.22rc2.dist-info/top_level.txt +1 -0
agentfield/agent_ai.py ADDED
@@ -0,0 +1,1125 @@
import json
import os
import re
from typing import Any, Dict, List, Literal, Optional, Type, Union

import requests
from agentfield.agent_utils import AgentUtils
from agentfield.logger import log_debug, log_error, log_warn
from agentfield.rate_limiter import StatelessRateLimiter
from httpx import HTTPStatusError
from pydantic import BaseModel

# Expose module-level symbols for patching in tests
try:
    import litellm as litellm  # type: ignore
except Exception:  # pragma: no cover - test environments may not have litellm

    class _LiteLLMStub:
        pass

    litellm = _LiteLLMStub()  # type: ignore

try:
    import openai as openai  # type: ignore
except Exception:  # pragma: no cover - test environments may not have openai

    class _OpenAIStub:
        class OpenAI:
            pass

    openai = _OpenAIStub()  # type: ignore


class AgentAI:
    """AI/LLM Integration functionality for AgentField Agent"""

    def __init__(self, agent_instance):
        """
        Initialize AgentAI with a reference to the main agent instance.

        Args:
            agent_instance: The main Agent instance
        """
        self.agent = agent_instance
        self._initialization_complete = False
        self._rate_limiter = None

    def _get_rate_limiter(self) -> StatelessRateLimiter:
        """
        Get or create the rate limiter instance based on current configuration.

        Returns:
            StatelessRateLimiter: Configured rate limiter instance
        """
        if self._rate_limiter is None:
            config = self.agent.ai_config
            self._rate_limiter = StatelessRateLimiter(
                max_retries=config.rate_limit_max_retries,
                base_delay=config.rate_limit_base_delay,
                max_delay=config.rate_limit_max_delay,
                jitter_factor=config.rate_limit_jitter_factor,
                circuit_breaker_threshold=config.rate_limit_circuit_breaker_threshold,
                circuit_breaker_timeout=config.rate_limit_circuit_breaker_timeout,
            )
        return self._rate_limiter

    async def _ensure_model_limits_cached(self):
        """
        Ensure model limits are cached for the current model configuration.
        This is called once during the first AI call to avoid startup delays.
        """
        if not self._initialization_complete:
            try:
                # Cache limits for the default model
                await self.agent.ai_config.get_model_limits()

                # Cache limits for multimodal models if different
                if self.agent.ai_config.audio_model != self.agent.ai_config.model:
                    await self.agent.ai_config.get_model_limits(
                        self.agent.ai_config.audio_model
                    )

                if self.agent.ai_config.vision_model != self.agent.ai_config.model:
                    await self.agent.ai_config.get_model_limits(
                        self.agent.ai_config.vision_model
                    )

                self._initialization_complete = True

            except Exception as e:
                log_debug(f"Failed to cache model limits: {e}")
                # Continue with fallback defaults
                self._initialization_complete = True

    async def ai(
        self,
        *args: Any,
        system: Optional[str] = None,
        user: Optional[str] = None,
        schema: Optional[Type[BaseModel]] = None,
        model: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        stream: Optional[bool] = None,
        response_format: Optional[Union[Literal["auto", "json", "text"], Dict]] = None,
        context: Optional[Dict] = None,
        memory_scope: Optional[List[str]] = None,
        **kwargs,
    ) -> Any:
        """
        Universal AI method supporting multimodal inputs with intelligent type detection.

        This method provides a flexible interface for interacting with various LLMs,
        supporting text, image, audio, and file inputs. It intelligently detects
        input types and applies a hierarchical configuration system.

        Args:
            *args: Flexible inputs - text, images, audio, files, or mixed content.
                - str: Text content, URLs, or file paths (auto-detected).
                - bytes: Binary data (images, audio, documents).
                - dict: Structured input with explicit keys (e.g., {"image": "url"}).
                - list: Multimodal conversation or content list.

            system (str, optional): System prompt for AI behavior.
            user (str, optional): User message (alternative to positional args).
            schema (Type[BaseModel], optional): Pydantic model for structured output validation.
            model (str, optional): Override default model (e.g., "gpt-4", "claude-3").
            temperature (float, optional): Creativity level (0.0-2.0).
            max_tokens (int, optional): Maximum response length.
            stream (bool, optional): Enable streaming response.
            response_format (str, optional): Desired response format ('auto', 'json', 'text').
            context (Dict, optional): Additional context data to pass to the LLM.
            memory_scope (List[str], optional): Memory scopes to inject (e.g., ['workflow', 'session', 'reasoner']).
            **kwargs: Additional provider-specific parameters to pass to the LLM.

        Returns:
            Any: The AI response - raw text, structured object (if schema), or a stream.

        Examples:
            # Simple text input
            response = await app.ai("Summarize this document.")

            # System and user prompts
            response = await app.ai(
                system="You are a helpful assistant.",
                user="What is the capital of France?"
            )

            # Multimodal input with auto-detection (image URL and text)
            response = await app.ai(
                "Describe this image:",
                "https://example.com/image.jpg"
            )

            # Multimodal input with file path (audio)
            response = await app.ai(
                "Transcribe this audio:",
                "./audio.mp3"
            )

            # Structured output with Pydantic schema
            class SentimentResult(BaseModel):
                sentiment: str
                confidence: float

            result = await app.ai(
                "Analyze the sentiment of 'I love this product!'",
                schema=SentimentResult
            )

            # Override default AI configuration parameters
            response = await app.ai(
                "Generate a creative story.",
                model="gpt-4-turbo",
                temperature=0.9,
                max_tokens=500,
                stream=True
            )

            # Complex multimodal conversation
            response = await app.ai([
                {"role": "system", "content": "You are a visual assistant."},
                {"role": "user", "content": "What do you see here?"},
                "https://example.com/chart.png",
                {"role": "user", "content": "Can you explain the trend?"}
            ])
        """
        # Apply hierarchical configuration: Agent defaults < Method overrides < Runtime overrides
        final_config = self.agent.ai_config.copy(deep=True)

        # Default enable rate limit retry unless explicitly set to False
        if (
            not hasattr(final_config, "enable_rate_limit_retry")
            or final_config.enable_rate_limit_retry is None
        ):
            final_config.enable_rate_limit_retry = True

        # Apply method-level overrides
        if model:
            final_config.model = model
        if temperature is not None:
            final_config.temperature = temperature
        if max_tokens is not None:
            final_config.max_tokens = max_tokens
        if stream is not None:
            final_config.stream = stream
        if response_format is not None:
            if isinstance(response_format, str):
                final_config.response_format = response_format

        # TODO: Integrate memory injection based on memory_scope and self.memory_config
        # For now, just pass context if provided
        if context:
            # This would be where memory data is merged into the context
            pass

        # Prepare messages for LiteLLM
        messages = []

        # If a schema is provided, augment the system prompt with strict schema
        # adherence instructions and schema context
        if schema:
            # Generate a readable JSON schema string using the modern Pydantic API
            try:
                schema_dict = schema.model_json_schema()
                schema_json = json.dumps(schema_dict, indent=2)
            except Exception:
                schema_json = str(schema)
            schema_instruction = (
                "IMPORTANT: You must exactly adhere to the output schema provided below. "
                "Do not add or omit any fields. Output must be valid JSON matching the schema. "
                "If a field is required in the schema, it must be present in the output. "
                "If a field is not in the schema, do NOT include it in the output. "
                "Here is the output schema you must follow:\n"
                f"{schema_json}\n"
                "Repeat: Output ONLY valid JSON matching the schema above. Do not include any extra text or explanation."
            )
            # Merge with any user-provided system prompt
            if system:
                system_prompt = f"{system}\n\n{schema_instruction}"
            else:
                system_prompt = schema_instruction
            messages.append({"role": "system", "content": system_prompt})
        else:
            if system:
                messages.append({"role": "system", "content": system})

        # Handle flexible user input with intelligent processing
        if user:
            messages.append({"role": "user", "content": user})
        elif args:
            processed_content = self._process_multimodal_args(args)
            if processed_content:
                messages.extend(processed_content)

        litellm_module = litellm if hasattr(litellm, "acompletion") else None

        # Ensure model limits are cached (done once per instance)
        await self._ensure_model_limits_cached()

        # Apply prompt trimming using LiteLLM's token-aware utility when available.
        utils_module = getattr(litellm_module, "utils", None) if litellm_module else None
        token_counter = getattr(utils_module, "token_counter", None) if utils_module else None
        trim_messages = getattr(utils_module, "trim_messages", None) if utils_module else None

        if token_counter is None:
            def token_counter(model: str, messages: List[dict]) -> int:
                return len(json.dumps(messages))

        if trim_messages is None:
            def trim_messages(messages: List[dict], model: str, max_tokens: int) -> List[dict]:
                return messages

        # Determine model context length using multiple fallback strategies
        model_context_length = None

        # Strategy 1: Use explicit max_input_tokens from config
        if hasattr(final_config, "max_input_tokens") and final_config.max_input_tokens:
            model_context_length = final_config.max_input_tokens

        # Strategy 2: Use fallback model mappings
        if not model_context_length and hasattr(final_config, "_MODEL_CONTEXT_LIMITS"):
            candidate_limit = final_config._MODEL_CONTEXT_LIMITS.get(final_config.model)
            if candidate_limit:
                model_context_length = candidate_limit

        # Strategy 3: Conservative fallback
        if not model_context_length:
            model_context_length = 10192  # More reasonable than 4096

        # Calculate safe input token limit: context_length - max_output_tokens - buffer
        output_tokens = (
            final_config.max_tokens or 7096
        )  # Default output if not specified
        buffer_tokens = 100  # Small buffer for safety

        safe_input_limit = max(
            1000, model_context_length - output_tokens - buffer_tokens
        )

        # Validate the calculation makes sense
        if safe_input_limit < 1000:
            safe_input_limit = 1000

        # Count actual prompt tokens using LiteLLM's token counter
        try:
            actual_prompt_tokens = token_counter(
                model=final_config.model, messages=messages
            )
        except Exception as e:
            log_debug(f"Could not count prompt tokens, proceeding with trimming: {e}")
            actual_prompt_tokens = (
                safe_input_limit + 1
            )  # Force trimming if we can't count

        # Only trim if necessary based on actual token count
        if actual_prompt_tokens > safe_input_limit:
            trimmed_messages = trim_messages(
                messages, final_config.model, max_tokens=safe_input_limit
            )
            if len(trimmed_messages) != len(messages) or any(
                m1 != m2 for m1, m2 in zip(messages, trimmed_messages)
            ):
                messages = trimmed_messages
            else:
                pass

        # Prepare LiteLLM parameters using the config's method
        # This leverages LiteLLM's standard environment variable handling and smart token management
        litellm_params = final_config.get_litellm_params(
            messages=messages,
            **kwargs,  # Runtime overrides have highest priority
        )

        # Ensure messages are always included in the final params
        litellm_params["messages"] = messages

        if schema:
            # Use LiteLLM's native Pydantic model support for structured outputs
            litellm_params["response_format"] = schema

        # Define the LiteLLM call function for rate limiter
        async def _make_litellm_call():
            if litellm_module is None:
                raise ImportError(
                    "litellm is not installed. Please install it with `pip install litellm`."
                )
            return await litellm_module.acompletion(**litellm_params)

        async def _execute_with_fallbacks():
            # Check for configured fallback models in AI config
            fallback_models = getattr(final_config, "fallback_models", None)
            if not fallback_models and getattr(
                final_config, "final_fallback_model", None
            ):
                # If only a final model is provided, treat it as a fallback list of one
                fallback_models = [final_config.final_fallback_model]

            if fallback_models:
                # Ensure each fallback call has a valid provider
                all_models = [final_config.model] + list(fallback_models)
                last_exception = None
                for m in all_models:
                    try:
                        if "/" not in m:
                            log_debug(
                                f"Skipping model {m} - no provider specified in model name"
                            )
                            raise ValueError(
                                f"Invalid model spec: '{m}'. Must include provider prefix, e.g. 'openai/gpt-4'."
                            )
                        litellm_params["model"] = m
                        return await _make_litellm_call()
                    except Exception as e:
                        log_debug(
                            f"Model {m} failed with {e}, trying next fallback if available..."
                        )
                        last_exception = e
                        continue
                # If all models fail, re-raise the last exception
                if last_exception:
                    raise last_exception
            else:
                # No fallbacks configured, just make the call
                if "/" not in final_config.model:
                    raise ValueError(
                        f"Invalid model spec: '{final_config.model}'. Must include provider prefix, e.g. 'openai/gpt-4'."
                    )
                return await _make_litellm_call()

        if final_config.enable_rate_limit_retry:
            rate_limiter = self._get_rate_limiter()
            try:
                response = await rate_limiter.execute_with_retry(
                    _execute_with_fallbacks
                )
            except Exception as e:
                log_debug(f"LiteLLM call failed after retries: {e}")
                raise
        else:
            try:
                response = await _execute_with_fallbacks()
            except HTTPStatusError as e:
                log_debug(
                    f"LiteLLM HTTP call failed: {e.response.status_code} - {e.response.text}"
                )
                raise
            except requests.exceptions.RequestException as e:
                log_debug(f"LiteLLM network call failed: {e}")
                if e.response is not None:
                    log_debug(f"Response status: {e.response.status_code}")
                    log_debug(f"Response text: {e.response.text}")
                raise
            except Exception as e:
                log_debug(f"LiteLLM call failed: {e}")
                raise

        # Process the response
        if final_config.stream:
            # For streaming, return the generator
            return response
        else:
            # Import multimodal response detection
            from .multimodal_response import detect_multimodal_response

            # Detect and wrap multimodal content
            multimodal_response = detect_multimodal_response(response)

            if schema:
                # For schema responses, try to parse from text content
                try:
                    json_data = json.loads(str(multimodal_response.text))
                    return schema(**json_data)
                except (json.JSONDecodeError, ValueError) as parse_error:
                    log_error(f"Failed to parse JSON response: {parse_error}")
                    log_debug(f"Raw response: {multimodal_response.text}")
                    # Fallback: try to extract JSON from the response
                    json_match = re.search(
                        r"\{.*\}", str(multimodal_response.text), re.DOTALL
                    )
                    if json_match:
                        try:
                            json_data = json.loads(json_match.group())
                            return schema(**json_data)
                        except (json.JSONDecodeError, ValueError):
                            pass
                    raise ValueError(
                        f"Could not parse structured response: {multimodal_response.text}"
                    )

            # Return MultimodalResponse for backward compatibility and enhanced features
            return multimodal_response

    def _process_multimodal_args(self, args: tuple) -> List[Dict[str, Any]]:
        """Process multimodal arguments into LiteLLM-compatible message format"""
        from agentfield.multimodal import Audio, File, Image, Text

        messages = []
        user_content = []

        for arg in args:
            # Handle our multimodal input classes first
            if isinstance(arg, Text):
                user_content.append({"type": "text", "text": arg.text})

            elif isinstance(arg, Image):
                if isinstance(arg.image_url, dict):
                    user_content.append(
                        {"type": "image_url", "image_url": arg.image_url}
                    )
                else:
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg.image_url, "detail": "high"},
                        }
                    )

            elif isinstance(arg, Audio):
                # Handle audio input according to LiteLLM GPT-4o-audio pattern
                user_content.append(
                    {"type": "input_audio", "input_audio": arg.input_audio}
                )

            elif isinstance(arg, File):
                # For now, treat files as text references
                if isinstance(arg.file, dict):
                    file_info = arg.file
                    user_content.append(
                        {
                            "type": "text",
                            "text": f"[File: {file_info.get('url', 'unknown')}]",
                        }
                    )
                else:
                    user_content.append({"type": "text", "text": f"[File: {arg.file}]"})

            else:
                # Fall back to automatic detection for raw inputs
                detected_type = AgentUtils.detect_input_type(arg)

                if detected_type == "text":
                    user_content.append({"type": "text", "text": arg})

                elif detected_type == "image_url":
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg, "detail": "high"},
                        }
                    )

                elif detected_type == "image_file":
                    # Convert file to base64 data URL
                    try:
                        import base64

                        with open(arg, "rb") as f:
                            image_data = base64.b64encode(f.read()).decode()
                        ext = os.path.splitext(arg)[1].lower()
                        mime_type = AgentUtils.get_mime_type(ext)
                        data_url = f"data:{mime_type};base64,{image_data}"
                        user_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": data_url, "detail": "high"},
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not read image file {arg}: {e}")
                        user_content.append(
                            {"type": "text", "text": f"[Image file: {arg}]"}
                        )

                elif detected_type == "audio_file":
                    # Convert audio file to LiteLLM input_audio format
                    try:
                        import base64

                        with open(arg, "rb") as f:
                            audio_data = base64.b64encode(f.read()).decode()

                        # Detect format from extension
                        ext = os.path.splitext(arg)[1].lower().lstrip(".")
                        audio_format = (
                            ext if ext in ["wav", "mp3", "flac", "ogg"] else "wav"
                        )

                        user_content.append(
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": audio_data,
                                    "format": audio_format,
                                },
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not read audio file {arg}: {e}")
                        user_content.append(
                            {
                                "type": "text",
                                "text": f"[Audio file: {os.path.basename(arg)}]",
                            }
                        )

                elif detected_type == "document_file":
                    # For documents, we might need to extract text
                    # For now, just reference the file
                    user_content.append(
                        {
                            "type": "text",
                            "text": f"[Document file: {os.path.basename(arg)}]",
                        }
                    )

                elif detected_type == "image_base64":
                    user_content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": arg, "detail": "high"},
                        }
                    )

                elif detected_type == "audio_base64":
                    # Extract format and data from data URL
                    try:
                        if arg.startswith("data:audio/"):
                            # Parse data URL: data:audio/wav;base64,<data>
                            header, data = arg.split(",", 1)
                            format_part = header.split(";")[0].split("/")[1]
                            user_content.append(
                                {
                                    "type": "input_audio",
                                    "input_audio": {
                                        "data": data,
                                        "format": format_part,
                                    },
                                }
                            )
                        else:
                            user_content.append(
                                {"type": "text", "text": "[Audio data provided]"}
                            )
                    except Exception as e:
                        log_warn(f"Could not process audio base64: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Audio data provided]"}
                        )

                elif detected_type == "image_bytes":
                    # Convert bytes to base64 data URL
                    try:
                        import base64

                        image_data = base64.b64encode(arg).decode()
                        # Try to detect image type from bytes
                        if arg.startswith(b"\xff\xd8\xff"):
                            mime_type = "image/jpeg"
                        elif arg.startswith(b"\x89PNG"):
                            mime_type = "image/png"
                        elif arg.startswith(b"GIF8"):
                            mime_type = "image/gif"
                        else:
                            mime_type = "image/png"  # Default

                        data_url = f"data:{mime_type};base64,{image_data}"
                        user_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": data_url, "detail": "high"},
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not process image bytes: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Image data provided]"}
                        )

                elif detected_type == "audio_bytes":
                    # Convert audio bytes to input_audio format
                    try:
                        import base64

                        audio_data = base64.b64encode(arg).decode()
                        # Try to detect format from bytes
                        if arg.startswith(b"RIFF") and b"WAVE" in arg[:12]:
                            audio_format = "wav"
                        elif arg.startswith(b"ID3") or arg.startswith(b"\xff\xfb"):
                            audio_format = "mp3"
                        else:
                            audio_format = "wav"  # Default

                        user_content.append(
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": audio_data,
                                    "format": audio_format,
                                },
                            }
                        )
                    except Exception as e:
                        log_warn(f"Could not process audio bytes: {e}")
                        user_content.append(
                            {"type": "text", "text": "[Audio data provided]"}
                        )

                elif detected_type == "structured_input":
                    # Handle dict with explicit keys
                    if "system" in arg:
                        messages.append({"role": "system", "content": arg["system"]})
                    if "user" in arg:
                        user_content.append({"type": "text", "text": arg["user"]})
                    # Handle other structured content
                    for key in [
                        "text",
                        "image",
                        "image_url",
                        "audio",
                    ]:
                        if key in arg:
                            if key == "text":
                                user_content.append({"type": "text", "text": arg[key]})
                            elif key in ["image", "image_url"]:
                                if isinstance(arg[key], dict):
                                    user_content.append(
                                        {"type": "image_url", "image_url": arg[key]}
                                    )
                                else:
                                    user_content.append(
                                        {
                                            "type": "image_url",
                                            "image_url": {
                                                "url": arg[key],
                                                "detail": "high",
                                            },
                                        }
                                    )
                            elif key == "audio":
                                if isinstance(arg[key], dict):
                                    user_content.append(
                                        {"type": "input_audio", "input_audio": arg[key]}
                                    )
                                else:
                                    # Assume it's a file path or URL
                                    user_content.append(
                                        {"type": "text", "text": f"[Audio: {arg[key]}]"}
                                    )

                elif detected_type == "message_dict":
                    # Handle message format dict
                    messages.append(arg)

                elif detected_type == "conversation_list":
                    # Handle list of messages
                    messages.extend(arg)

                elif detected_type == "multimodal_list":
                    # Handle mixed list of content
                    for item in arg:
                        if isinstance(item, str):
                            user_content.append({"type": "text", "text": item})
                        elif isinstance(item, dict):
                            if "role" in item:
                                messages.append(item)
                            else:
                                # Process as structured input
                                sub_messages = self._process_multimodal_args((item,))
                                messages.extend(sub_messages)

                elif detected_type == "dict":
                    # Generic dict - convert to text representation
                    import json

                    user_content.append(
                        {"type": "text", "text": f"Data: {json.dumps(arg, indent=2)}"}
                    )

                else:
                    # Fallback for unknown types
                    user_content.append({"type": "text", "text": str(arg)})

        # Add user content as a message if we have any
        if user_content:
            if len(user_content) == 1 and user_content[0]["type"] == "text":
                # Simplify single text content
                messages.append({"role": "user", "content": user_content[0]["text"]})
            else:
                # Multiple content types
                messages.append({"role": "user", "content": user_content})

        return messages

    async def ai_with_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: Optional[str] = None,
        mode: Optional[str] = None,
        **kwargs,
    ) -> Any:
        """
        AI method optimized for audio output generation.

        Automatically detects the model type and uses the appropriate LiteLLM function:
        - For TTS models (tts-1, tts-1-hd, gpt-4o-mini-tts): Uses litellm.speech()
        - For audio-capable chat models (gpt-4o-audio-preview): Uses litellm.completion() with audio modalities

        Args:
            *args: Input arguments (text prompts, etc.)
            voice: Voice to use for audio generation (alloy, echo, fable, onyx, nova, shimmer)
            format: Audio format (wav, mp3, etc.)
            model: Model to use (defaults to tts-1)
            mode: Set to "openai_direct" to bypass LiteLLM and call the OpenAI client directly
            **kwargs: Additional parameters

        Returns:
            MultimodalResponse with audio content

        Example:
            audio_result = await agent.ai_with_audio("Say hello warmly", voice="alloy")
            audio_result.audio.save("greeting.wav")
        """
        # Use TTS model as default (more reliable than gpt-4o-audio-preview)
        if model is None:
            model = (
                self.agent.ai_config.audio_model
            )  # Use configured audio model (defaults to tts-1)

        # Check if mode="openai_direct" is specified
        if mode == "openai_direct":
            # Use direct OpenAI client with streaming response
            return await self._generate_openai_direct_audio(
                *args,
                voice=voice,
                format=format,
                model=model or "gpt-4o-mini-tts",
                **kwargs,
            )

        # Check if this is a TTS model that needs the speech endpoint
        tts_models = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
        if model in tts_models:
            # Use LiteLLM speech function for TTS models
            return await self._generate_tts_audio(
                *args, voice=voice, format=format, model=model, **kwargs
            )
        else:
            # Use chat completion with audio modalities for other models
            audio_params = {
                "modalities": ["text", "audio"],
                "audio": {"voice": voice, "format": format},
            }
            final_kwargs = {**audio_params, **kwargs}
            return await self.ai(*args, model=model, **final_kwargs)

    async def _generate_tts_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: str = "tts-1",
        **kwargs,
    ) -> Any:
        """
        Generate audio using LiteLLM's speech function for TTS models.
        """
        from agentfield.multimodal_response import (
            AudioOutput,
            MultimodalResponse,
        )

        litellm_module = litellm
        if not hasattr(litellm_module, "aspeech"):
            raise ImportError(
                "litellm is not installed. Please install it with `pip install litellm` to use TTS features."
            )

        # Combine all text inputs
        text_input = " ".join(str(arg) for arg in args if isinstance(arg, str))
        if not text_input:
            text_input = "Hello, this is a test audio message."

        try:
            # Get API configuration
            config = self.agent.ai_config.get_litellm_params()

            # Use LiteLLM speech function
            response = await litellm_module.aspeech(
                model=model,
                input=text_input,
                voice=voice,
                response_format=format,
                api_key=config.get("api_key"),
                **kwargs,
            )

            # Convert binary response to base64 string for AudioOutput
            import base64

            try:
                # Try different methods to get binary content
                if hasattr(response, "content"):
                    binary_content = response.content
                elif hasattr(response, "read"):
                    binary_content = response.read()
                elif hasattr(response, "__iter__"):
                    # For HttpxBinaryResponseContent, iterate to get bytes
                    binary_content = b"".join(response)
                else:
                    # Last resort - convert to string and encode
                    binary_content = str(response).encode("utf-8")

                audio_data = base64.b64encode(binary_content).decode("utf-8")
            except Exception as e:
                log_error(f"Failed to process audio response: {e}")
                # Use a placeholder for now
                audio_data = ""

            # Create AudioOutput directly
            audio_output = AudioOutput(data=audio_data, format=format, url=None)

            # Create MultimodalResponse directly
            return MultimodalResponse(
                text=text_input,
                audio=audio_output,
                images=[],
                files=[],
                raw_response=response,
            )

        except Exception as e:
            # Fallback to text-only MultimodalResponse
            log_error(f"TTS generation failed: {e}")
            return MultimodalResponse(
                text=text_input,
                audio=None,
                images=[],
                files=[],
                raw_response=text_input,
            )

    async def _generate_openai_direct_audio(
        self,
        *args: Any,
        voice: str = "alloy",
        format: str = "wav",
        model: str = "gpt-4o-mini-tts",
        **kwargs,
    ) -> Any:
        """
        Generate audio using the OpenAI client directly with a streaming response.
        This method supports OpenAI-specific parameters like 'instructions' and 'speed'.

        All kwargs are passed through to the OpenAI SDK. The SDK will validate parameters
        and reject unsupported ones.

        Common OpenAI parameters:
        - instructions: Guide the model's speaking style
        - speed: Speech speed (0.25 to 4.0)
        - response_format: Audio format (mp3, opus, aac, flac, wav, pcm)
        """
        import base64
        import tempfile
        from pathlib import Path

        from agentfield.multimodal_response import AudioOutput, MultimodalResponse
        from openai import OpenAI

        # Combine all text inputs
        text_input = " ".join(str(arg) for arg in args if isinstance(arg, str))
        if not text_input:
            text_input = "Hello, this is a test audio message."

        try:
            # Get API configuration
            config = self.agent.ai_config.get_litellm_params()
            api_key = config.get("api_key")

            if not api_key:
                raise ValueError("OpenAI API key not found in configuration")

            # Initialize OpenAI client
            client = OpenAI(api_key=api_key)

            # Prepare base parameters for OpenAI speech API
            speech_params = {
                "model": model,
                "voice": voice,
                "input": text_input,
            }

            # Map format parameter to response_format if not already in kwargs
            if "response_format" not in kwargs and format:
                speech_params["response_format"] = format

            # Pass all kwargs through to OpenAI SDK
            # Let OpenAI SDK handle parameter validation
            speech_params.update(kwargs)

            # Create a temporary file for the audio
            with tempfile.NamedTemporaryFile(
                suffix=f".{format}", delete=False
            ) as temp_file:
                temp_path = Path(temp_file.name)

            try:
                # Use OpenAI streaming response
                with client.audio.speech.with_streaming_response.create(
                    **speech_params
                ) as response:
                    response.stream_to_file(temp_path)

                # Read the audio file and convert to base64
                with open(temp_path, "rb") as audio_file:
                    binary_content = audio_file.read()
                    audio_data = base64.b64encode(binary_content).decode("utf-8")

                # Create AudioOutput
                audio_output = AudioOutput(data=audio_data, format=format, url=None)

                # Create MultimodalResponse
                return MultimodalResponse(
                    text=text_input,
                    audio=audio_output,
                    images=[],
                    files=[],
                    raw_response=response,
                )

            finally:
                # Clean up temporary file
                if temp_path.exists():
                    temp_path.unlink()

        except Exception as e:
            # Fallback to text-only MultimodalResponse
            log_error(f"OpenAI direct audio generation failed: {e}")
            return MultimodalResponse(
                text=text_input,
                audio=None,
                images=[],
                files=[],
                raw_response=text_input,
            )

    async def ai_with_vision(
        self,
        prompt: str,
        size: str = "1024x1024",
        quality: str = "standard",
        style: Optional[str] = None,
        model: Optional[str] = None,
        response_format: str = "url",
        **kwargs,
    ) -> Any:
        """
        AI method optimized for image generation.

        Supports both LiteLLM and OpenRouter providers:
        - LiteLLM: Use model names like "dall-e-3", "azure/dall-e-3", "bedrock/stability.stable-diffusion-xl"
        - OpenRouter: Use model names with "openrouter/" prefix like "openrouter/google/gemini-2.5-flash-image-preview"

        Args:
            prompt: Text prompt for image generation
            size: Image size (256x256, 512x512, 1024x1024, 1792x1024, 1024x1792)
            quality: Image quality (standard, hd)
            style: Image style (vivid, natural) for DALL-E 3
            model: Model to use (defaults to dall-e-3)
            response_format: Response format ('url' or 'b64_json'). Defaults to 'url'
            **kwargs: Additional provider-specific parameters

        Returns:
            MultimodalResponse with image content

        Examples:
            # LiteLLM (DALL-E)
            result = await agent.ai_with_vision("A sunset over mountains")
            result.images[0].save("sunset.png")

            # OpenRouter (Gemini)
            result = await agent.ai_with_vision(
                "A futuristic city",
                model="openrouter/google/gemini-2.5-flash-image-preview",
                image_config={"aspect_ratio": "16:9"}
            )

            # Get base64 data directly
            result = await agent.ai_with_vision("A sunset", response_format="b64_json")
        """
        from agentfield import vision

        # Use image generation model if not specified
        if model is None:
            model = "dall-e-3"  # Default image model

        # Route based on model prefix
        if model.startswith("openrouter/"):
            # OpenRouter: Use chat completions API with image modality
            return await vision.generate_image_openrouter(
                prompt=prompt,
                model=model,
                size=size,
                quality=quality,
                style=style,
                response_format=response_format,
                **kwargs,
            )
        else:
            # LiteLLM: Use image generation API
            return await vision.generate_image_litellm(
                prompt=prompt,
                model=model,
                size=size,
                quality=quality,
                style=style,
                response_format=response_format,
                **kwargs,
            )

    async def ai_with_multimodal(
        self,
        *args: Any,
        modalities: Optional[List[str]] = None,
        audio_config: Optional[Dict] = None,
        model: Optional[str] = None,
        **kwargs,
    ) -> Any:
        """
        AI method for explicit multimodal input/output control.

        Args:
            *args: Mixed multimodal inputs
            modalities: List of desired output modalities (["text", "audio", "image"])
            audio_config: Audio configuration if audio modality requested
            model: Model to use
            **kwargs: Additional parameters

        Returns:
            MultimodalResponse with requested modalities

        Example:
            result = await agent.ai_with_multimodal(
                "Describe this image and provide audio narration",
                image_from_url("https://example.com/image.jpg"),
                modalities=["text", "audio"],
                audio_config={"voice": "nova", "format": "wav"}
            )
        """
        multimodal_params = {}

        if modalities:
            multimodal_params["modalities"] = modalities

        if audio_config and "audio" in (modalities or []):
            multimodal_params["audio"] = audio_config

        # Use multimodal-capable model if not specified
        if model is None and modalities and "audio" in modalities:
            model = "gpt-4o-audio-preview"

        # Merge with user kwargs
        final_kwargs = {**multimodal_params, **kwargs}

        return await self.ai(*args, model=model, **final_kwargs)
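
For orientation, the sketch below shows roughly how the methods defined in agent_ai.py might be driven from application code. It is an illustrative example, not part of the published wheel: it assumes the surrounding agent object exposes these AgentAI methods directly, mirroring the docstring examples above (app.ai(...), agent.ai_with_audio(...), agent.ai_with_vision(...)), and SentimentResult is a hypothetical schema.

# Hypothetical usage sketch (not shipped in the wheel); assumes `agent` exposes the
# AgentAI methods shown above, as the docstring examples in agent_ai.py suggest.
from pydantic import BaseModel


class SentimentResult(BaseModel):
    # hypothetical structured-output schema
    sentiment: str
    confidence: float


async def demo(agent):
    # Plain text call; ai() expects a provider-prefixed model name ("provider/model")
    reply = await agent.ai("Summarize this document.", model="openai/gpt-4o")

    # Structured output: the schema is passed to LiteLLM as response_format and the
    # JSON reply is validated into a SentimentResult instance
    sentiment = await agent.ai(
        "Analyze the sentiment of 'I love this product!'",
        schema=SentimentResult,
    )

    # Text-to-speech via the TTS path (model defaults to the configured audio model)
    speech = await agent.ai_with_audio("Say hello warmly", voice="alloy", format="wav")

    # Image generation routed to DALL-E, or to OpenRouter when the model name
    # carries an "openrouter/" prefix
    picture = await agent.ai_with_vision("A sunset over mountains")

    return reply, sentiment, speech, picture
    # Run under an event loop, e.g. asyncio.run(demo(agent)).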