autobyteus 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. autobyteus/agent/context/agent_config.py +6 -1
  2. autobyteus/agent/handlers/llm_user_message_ready_event_handler.py +30 -7
  3. autobyteus/agent/handlers/user_input_message_event_handler.py +22 -25
  4. autobyteus/agent/message/__init__.py +7 -5
  5. autobyteus/agent/message/agent_input_user_message.py +6 -16
  6. autobyteus/agent/message/context_file.py +24 -24
  7. autobyteus/agent/message/context_file_type.py +29 -8
  8. autobyteus/agent/message/multimodal_message_builder.py +47 -0
  9. autobyteus/agent/streaming/stream_event_payloads.py +23 -4
  10. autobyteus/agent/system_prompt_processor/tool_manifest_injector_processor.py +6 -2
  11. autobyteus/agent/tool_invocation.py +2 -1
  12. autobyteus/agent_team/bootstrap_steps/agent_configuration_preparation_step.py +9 -2
  13. autobyteus/agent_team/context/agent_team_config.py +1 -0
  14. autobyteus/llm/api/autobyteus_llm.py +33 -33
  15. autobyteus/llm/api/bedrock_llm.py +13 -5
  16. autobyteus/llm/api/claude_llm.py +13 -27
  17. autobyteus/llm/api/gemini_llm.py +108 -42
  18. autobyteus/llm/api/groq_llm.py +4 -3
  19. autobyteus/llm/api/mistral_llm.py +97 -51
  20. autobyteus/llm/api/nvidia_llm.py +6 -5
  21. autobyteus/llm/api/ollama_llm.py +37 -12
  22. autobyteus/llm/api/openai_compatible_llm.py +91 -91
  23. autobyteus/llm/autobyteus_provider.py +1 -1
  24. autobyteus/llm/base_llm.py +42 -139
  25. autobyteus/llm/extensions/base_extension.py +6 -6
  26. autobyteus/llm/extensions/token_usage_tracking_extension.py +3 -2
  27. autobyteus/llm/llm_factory.py +106 -4
  28. autobyteus/llm/token_counter/token_counter_factory.py +1 -1
  29. autobyteus/llm/user_message.py +43 -35
  30. autobyteus/llm/utils/llm_config.py +34 -18
  31. autobyteus/llm/utils/media_payload_formatter.py +99 -0
  32. autobyteus/llm/utils/messages.py +32 -25
  33. autobyteus/llm/utils/response_types.py +9 -3
  34. autobyteus/llm/utils/token_usage.py +6 -5
  35. autobyteus/multimedia/__init__.py +31 -0
  36. autobyteus/multimedia/audio/__init__.py +11 -0
  37. autobyteus/multimedia/audio/api/__init__.py +4 -0
  38. autobyteus/multimedia/audio/api/autobyteus_audio_client.py +59 -0
  39. autobyteus/multimedia/audio/api/gemini_audio_client.py +219 -0
  40. autobyteus/multimedia/audio/audio_client_factory.py +120 -0
  41. autobyteus/multimedia/audio/audio_model.py +96 -0
  42. autobyteus/multimedia/audio/autobyteus_audio_provider.py +108 -0
  43. autobyteus/multimedia/audio/base_audio_client.py +40 -0
  44. autobyteus/multimedia/image/__init__.py +11 -0
  45. autobyteus/multimedia/image/api/__init__.py +9 -0
  46. autobyteus/multimedia/image/api/autobyteus_image_client.py +97 -0
  47. autobyteus/multimedia/image/api/gemini_image_client.py +188 -0
  48. autobyteus/multimedia/image/api/openai_image_client.py +142 -0
  49. autobyteus/multimedia/image/autobyteus_image_provider.py +109 -0
  50. autobyteus/multimedia/image/base_image_client.py +67 -0
  51. autobyteus/multimedia/image/image_client_factory.py +118 -0
  52. autobyteus/multimedia/image/image_model.py +96 -0
  53. autobyteus/multimedia/providers.py +5 -0
  54. autobyteus/multimedia/runtimes.py +8 -0
  55. autobyteus/multimedia/utils/__init__.py +10 -0
  56. autobyteus/multimedia/utils/api_utils.py +19 -0
  57. autobyteus/multimedia/utils/multimedia_config.py +29 -0
  58. autobyteus/multimedia/utils/response_types.py +13 -0
  59. autobyteus/tools/__init__.py +3 -0
  60. autobyteus/tools/multimedia/__init__.py +8 -0
  61. autobyteus/tools/multimedia/audio_tools.py +116 -0
  62. autobyteus/tools/multimedia/image_tools.py +186 -0
  63. autobyteus/tools/tool_category.py +1 -0
  64. autobyteus/tools/usage/parsers/provider_aware_tool_usage_parser.py +5 -2
  65. autobyteus/tools/usage/providers/tool_manifest_provider.py +5 -3
  66. autobyteus/tools/usage/registries/tool_formatting_registry.py +9 -2
  67. autobyteus/tools/usage/registries/tool_usage_parser_registry.py +9 -2
  68. {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/METADATA +9 -9
  69. {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/RECORD +73 -45
  70. examples/run_browser_agent.py +1 -1
  71. autobyteus/llm/utils/image_payload_formatter.py +0 -89
  72. {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/WHEEL +0 -0
  73. {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/licenses/LICENSE +0 -0
  74. {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/top_level.txt +0 -0
autobyteus/agent/context/agent_config.py (+6 -1)

@@ -37,6 +37,7 @@ class AgentConfig:
  system_prompt: Optional[str] = None,
  tools: Optional[List['BaseTool']] = None,
  auto_execute_tools: bool = True,
+ use_xml_tool_format: bool = False,
  input_processors: Optional[List['BaseAgentUserInputMessageProcessor']] = None,
  llm_response_processors: Optional[List['BaseLLMResponseProcessor']] = None,
  system_prompt_processors: Optional[List['BaseSystemPromptProcessor']] = None,
@@ -57,6 +58,8 @@ class AgentConfig:
  llm_instance's config will be used as the base.
  tools: An optional list of pre-initialized tool instances (subclasses of BaseTool).
  auto_execute_tools: If True, the agent will execute tools without approval.
+ use_xml_tool_format: If True, forces the agent to use XML format for tool
+ definitions and parsing, overriding provider defaults.
  input_processors: A list of input processor instances.
  llm_response_processors: A list of LLM response processor instances.
  system_prompt_processors: A list of system prompt processor instances.
@@ -74,6 +77,7 @@ class AgentConfig:
  self.tools = tools or []
  self.workspace = workspace
  self.auto_execute_tools = auto_execute_tools
+ self.use_xml_tool_format = use_xml_tool_format
  self.input_processors = input_processors or []
  self.llm_response_processors = llm_response_processors if llm_response_processors is not None else list(self.DEFAULT_LLM_RESPONSE_PROCESSORS)
  self.system_prompt_processors = system_prompt_processors if system_prompt_processors is not None else list(self.DEFAULT_SYSTEM_PROMPT_PROCESSORS)
@@ -81,7 +85,7 @@ class AgentConfig:
  self.phase_hooks = phase_hooks or []
  self.initial_custom_data = initial_custom_data

- logger.debug(f"AgentConfig created for name '{self.name}', role '{self.role}'.")
+ logger.debug(f"AgentConfig created for name '{self.name}', role '{self.role}'. XML tool format override: {self.use_xml_tool_format}")

  def copy(self) -> 'AgentConfig':
  """
@@ -98,6 +102,7 @@ class AgentConfig:
  system_prompt=self.system_prompt,
  tools=self.tools.copy(), # Shallow copy the list, but reference the original tool instances
  auto_execute_tools=self.auto_execute_tools,
+ use_xml_tool_format=self.use_xml_tool_format,
  input_processors=self.input_processors.copy(), # Shallow copy the list
  llm_response_processors=self.llm_response_processors.copy(), # Shallow copy the list
  system_prompt_processors=self.system_prompt_processors.copy(), # Shallow copy the list
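The new use_xml_tool_format flag is a plain constructor argument, so opting an agent into XML tool manifests is a one-line change. A minimal sketch, assuming the name and role keyword arguments implied by the logging above (other constructor parameters are omitted and the import path follows the file location):

    from autobyteus.agent.context.agent_config import AgentConfig

    config = AgentConfig(
        name="demo_agent",             # assumed keyword; not shown in this diff hunk
        role="assistant",              # assumed keyword; not shown in this diff hunk
        use_xml_tool_format=True,      # new in 1.1.6: force XML tool definitions and parsing
    )

    # copy() now carries the override along with the other settings.
    assert config.copy().use_xml_tool_format is True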
autobyteus/agent/handlers/llm_user_message_ready_event_handler.py (+30 -7)

@@ -1,7 +1,7 @@
  # file: autobyteus/autobyteus/agent/handlers/llm_user_message_ready_event_handler.py
  import logging
  import traceback
- from typing import TYPE_CHECKING, cast, Optional
+ from typing import TYPE_CHECKING, cast, Optional, List

  from autobyteus.agent.handlers.base_event_handler import AgentEventHandler
  from autobyteus.agent.events import LLMUserMessageReadyEvent, LLMCompleteResponseReceivedEvent
@@ -53,6 +53,9 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
  complete_response_text = ""
  complete_reasoning_text = ""
  token_usage: Optional[TokenUsage] = None
+ complete_image_urls: List[str] = []
+ complete_audio_urls: List[str] = []
+ complete_video_urls: List[str] = []

  notifier: Optional['AgentExternalEventNotifier'] = None
  if context.phase_manager:
@@ -72,9 +75,19 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
  if chunk_response.reasoning:
  complete_reasoning_text += chunk_response.reasoning

- if chunk_response.is_complete and chunk_response.usage:
- token_usage = chunk_response.usage
- logger.debug(f"Agent '{agent_id}' received final chunk with token usage: {token_usage}")
+ if chunk_response.is_complete:
+ if chunk_response.usage:
+ token_usage = chunk_response.usage
+ logger.debug(f"Agent '{agent_id}' received final chunk with token usage: {token_usage}")
+ if chunk_response.image_urls:
+ complete_image_urls.extend(chunk_response.image_urls)
+ logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.image_urls)} image URLs.")
+ if chunk_response.audio_urls:
+ complete_audio_urls.extend(chunk_response.audio_urls)
+ logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.audio_urls)} audio URLs.")
+ if chunk_response.video_urls:
+ complete_video_urls.extend(chunk_response.video_urls)
+ logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.video_urls)} video URLs.")

  if notifier:
  try:
@@ -121,20 +134,30 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
  logger.info(f"Agent '{agent_id}' enqueued LLMCompleteResponseReceivedEvent with error details from LLMUserMessageReadyEventHandler.")
  return

- # Add message to history with reasoning
+ # Add message to history with reasoning and multimodal data
  history_entry = {"role": "assistant", "content": complete_response_text}
  if complete_reasoning_text:
  history_entry["reasoning"] = complete_reasoning_text
+ if complete_image_urls:
+ history_entry["image_urls"] = complete_image_urls
+ if complete_audio_urls:
+ history_entry["audio_urls"] = complete_audio_urls
+ if complete_video_urls:
+ history_entry["video_urls"] = complete_video_urls
  context.state.add_message_to_history(history_entry)

- # Create complete response with reasoning
+ # Create complete response with reasoning and multimodal data
  complete_response_obj = CompleteResponse(
  content=complete_response_text,
  reasoning=complete_reasoning_text,
- usage=token_usage
+ usage=token_usage,
+ image_urls=complete_image_urls,
+ audio_urls=complete_audio_urls,
+ video_urls=complete_video_urls
  )
  llm_complete_event = LLMCompleteResponseReceivedEvent(
  complete_response=complete_response_obj
  )
  await context.input_event_queues.enqueue_internal_system_event(llm_complete_event)
  logger.info(f"Agent '{agent_id}' enqueued LLMCompleteResponseReceivedEvent from LLMUserMessageReadyEventHandler.")
+
autobyteus/agent/handlers/user_input_message_event_handler.py (+22 -25)

@@ -3,14 +3,14 @@ import logging
  from typing import TYPE_CHECKING

  from autobyteus.agent.handlers.base_event_handler import AgentEventHandler
- from autobyteus.agent.events import UserMessageReceivedEvent, LLMUserMessageReadyEvent
- from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
+ from autobyteus.agent.events import UserMessageReceivedEvent, LLMUserMessageReadyEvent
+ from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
  from autobyteus.agent.input_processor import BaseAgentUserInputMessageProcessor
- from autobyteus.llm.user_message import LLMUserMessage
+ from autobyteus.agent.message.multimodal_message_builder import build_llm_user_message


  if TYPE_CHECKING:
- from autobyteus.agent.context import AgentContext
+ from autobyteus.agent.context import AgentContext
  from autobyteus.agent.events.notifiers import AgentExternalEventNotifier

  logger = logging.getLogger(__name__)
@@ -18,24 +18,23 @@ logger = logging.getLogger(__name__)
  class UserInputMessageEventHandler(AgentEventHandler):
  """
  Handles UserMessageReceivedEvents by first applying any configured
- AgentUserInputMessageProcessors (provided as instances) to the AgentInputUserMessage,
- then converting the processed message into an LLMUserMessage, and finally
+ AgentUserInputMessageProcessors, then using the multimodal_message_builder
+ to convert the processed message into an LLMUserMessage, and finally
  enqueuing an LLMUserMessageReadyEvent for further processing by the LLM.
- It also checks for metadata to emit special notifications for system-generated tasks.
  """

  def __init__(self):
  logger.info("UserInputMessageEventHandler initialized.")

  async def handle(self,
- event: UserMessageReceivedEvent,
+ event: UserMessageReceivedEvent,
  context: 'AgentContext') -> None:
- if not isinstance(event, UserMessageReceivedEvent):
+ if not isinstance(event, UserMessageReceivedEvent):
  logger.warning(f"UserInputMessageEventHandler received non-UserMessageReceivedEvent: {type(event)}. Skipping.")
  return

- original_agent_input_user_msg: AgentInputUserMessage = event.agent_input_user_message
-
+ original_agent_input_user_msg: AgentInputUserMessage = event.agent_input_user_message
+
  # --- NEW LOGIC: Check metadata for system-generated tasks and notify TUI ---
  if original_agent_input_user_msg.metadata.get('source') == 'system_task_notifier':
  if context.phase_manager:
@@ -47,11 +46,11 @@ class UserInputMessageEventHandler(AgentEventHandler):
  notifier.notify_agent_data_system_task_notification_received(notification_data)
  logger.info(f"Agent '{context.agent_id}' emitted system task notification for TUI.")
  # --- END NEW LOGIC ---
-
- processed_agent_input_user_msg: AgentInputUserMessage = original_agent_input_user_msg
-
- logger.info(f"Agent '{context.agent_id}' handling UserMessageReceivedEvent: '{original_agent_input_user_msg.content}'")
-
+
+ processed_agent_input_user_msg: AgentInputUserMessage = original_agent_input_user_msg
+
+ logger.info(f"Agent '{context.agent_id}' handling UserMessageReceivedEvent: '{original_agent_input_user_msg.content}'")
+
  processor_instances = context.config.input_processors
  if processor_instances:
  processor_names = [p.get_name() for p in processor_instances]
@@ -62,14 +61,14 @@ class UserInputMessageEventHandler(AgentEventHandler):
  if not isinstance(processor_instance, BaseAgentUserInputMessageProcessor):
  logger.error(f"Agent '{context.agent_id}': Invalid input processor type in config: {type(processor_instance)}. Skipping.")
  continue
-
+
  processor_name_for_log = processor_instance.get_name()
  logger.debug(f"Agent '{context.agent_id}': Applying input processor '{processor_name_for_log}'.")
  msg_before_this_processor = processed_agent_input_user_msg
  # Pass the original event to the processor
  processed_agent_input_user_msg = await processor_instance.process(
- message=msg_before_this_processor,
- context=context,
+ message=msg_before_this_processor,
+ context=context,
  triggering_event=event
  )
  logger.info(f"Agent '{context.agent_id}': Input processor '{processor_name_for_log}' applied successfully.")
@@ -81,12 +80,10 @@ class UserInputMessageEventHandler(AgentEventHandler):
  else:
  logger.debug(f"Agent '{context.agent_id}': No input processors configured in agent config.")

- llm_user_message = LLMUserMessage(
- content=processed_agent_input_user_msg.content,
- image_urls=processed_agent_input_user_msg.image_urls
- )
+ # --- Refactored: Use the dedicated builder ---
+ llm_user_message = build_llm_user_message(processed_agent_input_user_msg)

- llm_user_message_ready_event = LLMUserMessageReadyEvent(llm_user_message=llm_user_message)
+ llm_user_message_ready_event = LLMUserMessageReadyEvent(llm_user_message=llm_user_message)
  await context.input_event_queues.enqueue_internal_system_event(llm_user_message_ready_event)
-
+
  logger.info(f"Agent '{context.agent_id}' processed AgentInputUserMessage and enqueued LLMUserMessageReadyEvent.")
autobyteus/agent/message/__init__.py (+7 -5)

@@ -9,12 +9,14 @@ from .agent_input_user_message import AgentInputUserMessage
  from .send_message_to import SendMessageTo
  from .context_file import ContextFile
  from .context_file_type import ContextFileType
+ from .multimodal_message_builder import build_llm_user_message

  __all__ = [
- "InterAgentMessage",
- "InterAgentMessageType",
- "AgentInputUserMessage",
+ "InterAgentMessage",
+ "InterAgentMessageType",
+ "AgentInputUserMessage",
  "SendMessageTo",
- "ContextFile",
- "ContextFileType",
+ "ContextFile",
+ "ContextFileType",
+ "build_llm_user_message",
  ]
autobyteus/agent/message/agent_input_user_message.py (+6 -16)

@@ -8,21 +8,18 @@ from .context_file import ContextFile # Import the new ContextFile dataclass
  logger = logging.getLogger(__name__)

  @dataclass
- class AgentInputUserMessage:
+ class AgentInputUserMessage:
  """
  Represents a message received from an external user interacting with the agent system.
- This is a simple dataclass. It includes support for a list of ContextFile objects,
- allowing users to provide various documents as context.
+ This is a simple dataclass. It includes support for a list of ContextFile objects,
+ allowing users to provide various documents and media as context via a single list.
  """
  content: str
- image_urls: Optional[List[str]] = field(default=None) # Basic list of strings
  context_files: Optional[List[ContextFile]] = field(default=None)
  metadata: Dict[str, Any] = field(default_factory=dict)

  def __post_init__(self):
  # Basic type validation that dataclasses don't do automatically for mutable defaults or complex types
- if self.image_urls is not None and not (isinstance(self.image_urls, list) and all(isinstance(url, str) for url in self.image_urls)):
- raise TypeError("AgentInputUserMessage 'image_urls' must be a list of strings if provided.")
  if self.context_files is not None and not (isinstance(self.context_files, list) and all(isinstance(cf, ContextFile) for cf in self.context_files)):
  raise TypeError("AgentInputUserMessage 'context_files' must be a list of ContextFile objects if provided.")
  if not isinstance(self.metadata, dict): # Should be caught by default_factory, but good practice
@@ -34,7 +31,7 @@ class AgentInputUserMessage:
  num_context_files = len(self.context_files) if self.context_files else 0
  logger.debug(
  f"AgentInputUserMessage initialized. Content: '{self.content[:50]}...', "
- f"Image URLs: {self.image_urls}, Num ContextFiles: {num_context_files}, "
+ f"Num ContextFiles: {num_context_files}, "
  f"Metadata keys: {list(self.metadata.keys())}"
  )

@@ -47,7 +44,6 @@ class AgentInputUserMessage:

  return {
  "content": self.content,
- "image_urls": self.image_urls,
  "context_files": context_files_dict_list,
  "metadata": self.metadata,
  }
@@ -59,31 +55,25 @@ class AgentInputUserMessage:
  if not isinstance(content, str): # Ensure content is string
  raise ValueError("AgentInputUserMessage 'content' in dictionary must be a string.")

- image_urls = data.get("image_urls")
- if image_urls is not None and not (isinstance(image_urls, list) and all(isinstance(url, str) for url in image_urls)):
- raise ValueError("AgentInputUserMessage 'image_urls' in dictionary must be a list of strings if provided.")
-
  context_files_data = data.get("context_files")
  context_files_list: Optional[List[ContextFile]] = None
  if context_files_data is not None:
  if not isinstance(context_files_data, list):
  raise ValueError("AgentInputUserMessage 'context_files' in dictionary must be a list if provided.")
  context_files_list = [ContextFile.from_dict(cf_data) for cf_data in context_files_data]
-
+
  metadata = data.get("metadata", {})
  if not isinstance(metadata, dict):
  raise ValueError("AgentInputUserMessage 'metadata' in dictionary must be a dict if provided.")

  return cls(
  content=content,
- image_urls=image_urls,
  context_files=context_files_list,
  metadata=metadata
  )

  def __repr__(self) -> str:
  content_preview = f"{self.content[:100]}..." if len(self.content) > 100 else self.content
- images_repr = f", image_urls={self.image_urls}" if self.image_urls else ""

  if self.context_files:
  context_repr = f", context_files=[{len(self.context_files)} ContextFile(s)]"
@@ -93,4 +83,4 @@ class AgentInputUserMessage:
  meta_repr = f", metadata_keys={list(self.metadata.keys())}" if self.metadata else ""

  return (f"AgentInputUserMessage(content='{content_preview}'"
- f"{images_repr}{context_repr}{meta_repr})")
+ f"{context_repr}{meta_repr})")
autobyteus/agent/message/context_file.py (+24 -24)

@@ -3,6 +3,7 @@ import os
  import logging
  from typing import Optional, Dict, Any
  from dataclasses import dataclass, field
+ from urllib.parse import urlparse

  from .context_file_type import ContextFileType

@@ -12,10 +13,9 @@ logger = logging.getLogger(__name__)
  class ContextFile:
  """
  Represents a single context file provided to an agent.
- This is a simple dataclass, deferring path validation and file access
- to input processors.
+ The 'uri' can be a local file path or a network URL.
  """
- path: str
+ uri: str
  file_type: ContextFileType = ContextFileType.UNKNOWN
  file_name: Optional[str] = None
  metadata: Dict[str, Any] = field(default_factory=dict)
@@ -25,33 +25,33 @@ class ContextFile:
  Called after the dataclass's __init__ method.
  Used here to infer file_name and file_type if not provided or UNKNOWN.
  """
- if self.file_name is None and self.path:
+ if not isinstance(self.uri, str) or not self.uri:
+ raise TypeError(f"ContextFile uri must be a non-empty string, got {type(self.uri)}")
+
+ if self.file_name is None:
  try:
- self.file_name = os.path.basename(self.path)
+ # Use urlparse to correctly handle both URLs and local paths
+ parsed_path = urlparse(self.uri).path
+ self.file_name = os.path.basename(parsed_path)
  except Exception as e:
- logger.warning(f"Could not determine basename for path '{self.path}': {e}")
+ logger.warning(f"Could not determine basename for uri '{self.uri}': {e}")
  self.file_name = "unknown_file"

- if self.file_type == ContextFileType.UNKNOWN and self.path:
- inferred_type = ContextFileType.from_path(self.path)
+ if self.file_type == ContextFileType.UNKNOWN:
+ inferred_type = ContextFileType.from_path(self.uri)
  if inferred_type != ContextFileType.UNKNOWN:
  self.file_type = inferred_type
- logger.debug(f"Inferred file type for '{self.path}' as {self.file_type.value}")
+ logger.debug(f"Inferred file type for '{self.uri}' as {self.file_type.value}")
  else:
- logger.debug(f"Could not infer specific file type for '{self.path}', remaining UNKNOWN.")
-
- # Ensure path is a string
- if not isinstance(self.path, str):
- # This ideally should be caught by type hints earlier, but as a runtime safeguard:
- raise TypeError(f"ContextFile path must be a string, got {type(self.path)}")
-
+ logger.debug(f"Could not infer specific file type for '{self.uri}', remaining UNKNOWN.")
+
  if logger.isEnabledFor(logging.DEBUG):
- logger.debug(f"ContextFile initialized: path='{self.path}', type='{self.file_type.value}', name='{self.file_name}'")
+ logger.debug(f"ContextFile initialized: uri='{self.uri}', type='{self.file_type.value}', name='{self.file_name}'")

  def to_dict(self) -> Dict[str, Any]:
  """Serializes the ContextFile to a dictionary."""
  return {
- "path": self.path,
+ "uri": self.uri,
  "file_type": self.file_type.value, # Serialize enum to its value
  "file_name": self.file_name,
  "metadata": self.metadata,
@@ -60,23 +60,23 @@ class ContextFile:
  @classmethod
  def from_dict(cls, data: Dict[str, Any]) -> 'ContextFile':
  """Deserializes a ContextFile from a dictionary."""
- if not isinstance(data.get("path"), str):
- raise ValueError("ContextFile 'path' in dictionary must be a string.")
-
+ if not isinstance(data.get("uri"), str):
+ raise ValueError("ContextFile 'uri' in dictionary must be a string.")
+
  file_type_str = data.get("file_type", ContextFileType.UNKNOWN.value)
  try:
  file_type = ContextFileType(file_type_str)
  except ValueError:
  logger.warning(f"Invalid file_type string '{file_type_str}' in ContextFile data. Defaulting to UNKNOWN.")
  file_type = ContextFileType.UNKNOWN
-
+
  return cls(
- path=data["path"],
+ uri=data["uri"],
  file_type=file_type,
  file_name=data.get("file_name"),
  metadata=data.get("metadata", {})
  )

  def __repr__(self) -> str:
- return (f"ContextFile(path='{self.path}', file_name='{self.file_name}', "
+ return (f"ContextFile(uri='{self.uri}', file_name='{self.file_name}', "
  f"file_type='{self.file_type.value}', metadata_keys={list(self.metadata.keys())})")
autobyteus/agent/message/context_file_type.py (+29 -8)

@@ -1,5 +1,6 @@
  from enum import Enum
  import os
+ from urllib.parse import urlparse

  class ContextFileType(str, Enum):
  """
@@ -23,19 +24,25 @@ class ContextFileType(str, Enum):
  UNKNOWN = "unknown" # Fallback for unrecognized types

  @classmethod
- def from_path(cls, file_path: str) -> 'ContextFileType':
+ def from_path(cls, uri: str) -> 'ContextFileType':
  """
- Infers the ContextFileType from a file path based on its extension.
+ Infers the ContextFileType from a file path or URL based on its extension.
  """
- if not file_path or not isinstance(file_path, str):
+ if not uri or not isinstance(uri, str):
  return cls.UNKNOWN
-
- _, extension = os.path.splitext(file_path.lower())
-
+
+ try:
+ # Parse the URI to handle both file paths and URLs gracefully
+ parsed_path = urlparse(uri).path
+ _, extension = os.path.splitext(parsed_path.lower())
+ except Exception:
+ # Fallback for malformed URIs
+ _, extension = os.path.splitext(uri.lower())
+
  if extension == ".txt":
  return cls.TEXT
  elif extension == ".md":
- return cls.MARKDOWN
+ return cls.MARKDOWN
  elif extension == ".pdf":
  return cls.PDF
  elif extension == ".docx":
@@ -61,9 +68,23 @@ class ContextFileType(str, Enum):
  elif extension in [".mp4", ".mov", ".avi", ".mkv", ".webm"]:
  return cls.VIDEO
  elif extension in [".png", ".jpg", ".jpeg", ".gif", ".webp"]:
- return cls.IMAGE
+ return cls.IMAGE
  else:
  return cls.UNKNOWN

+ @classmethod
+ def get_readable_text_types(cls) -> list['ContextFileType']:
+ """Returns a list of file types that can be read as plain text for context."""
+ return [
+ cls.TEXT,
+ cls.MARKDOWN,
+ cls.JSON,
+ cls.XML,
+ cls.HTML,
+ cls.PYTHON,
+ cls.JAVASCRIPT,
+ cls.CSV,
+ ]
+
  def __str__(self) -> str:
  return self.value
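from_path now accepts URLs as well as plain paths, and the new get_readable_text_types helper exposes the text-like types in one list. A small sketch using only the extension mappings visible in this diff (the example URLs are illustrative):

    from autobyteus.agent.message.context_file_type import ContextFileType

    assert ContextFileType.from_path("notes/summary.md") is ContextFileType.MARKDOWN
    # Query strings no longer confuse the extension check.
    assert ContextFileType.from_path("https://cdn.example.com/clip.mp4?token=abc") is ContextFileType.VIDEO

    # Types an input processor could read verbatim into the prompt.
    print(ContextFileType.get_readable_text_types())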
autobyteus/agent/message/multimodal_message_builder.py (+47 -0, new file)

@@ -0,0 +1,47 @@
+ # file: autobyteus/autobyteus/agent/message/multimodal_message_builder.py
+ import logging
+
+ from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
+ from autobyteus.agent.message.context_file_type import ContextFileType
+ from autobyteus.llm.user_message import LLMUserMessage
+
+ logger = logging.getLogger(__name__)
+
+ def build_llm_user_message(agent_input_user_message: AgentInputUserMessage) -> LLMUserMessage:
+ """
+ Builds an LLMUserMessage from an AgentInputUserMessage by categorizing its context files.
+
+ This function iterates through the context files, sorting URIs for images, audio, and video
+ into the appropriate fields of the LLMUserMessage. It ignores other file types for now.
+
+ Args:
+ agent_input_user_message: The user input message containing content and context files.
+
+ Returns:
+ An LLMUserMessage ready to be sent to the LLM.
+ """
+ image_urls = []
+ audio_urls = []
+ video_urls = []
+
+ if agent_input_user_message.context_files:
+ for context_file in agent_input_user_message.context_files:
+ file_type = context_file.file_type
+ if file_type == ContextFileType.IMAGE:
+ image_urls.append(context_file.uri)
+ elif file_type == ContextFileType.AUDIO:
+ audio_urls.append(context_file.uri)
+ elif file_type == ContextFileType.VIDEO:
+ video_urls.append(context_file.uri)
+ else:
+ logger.debug(f"Ignoring non-media context file of type '{file_type.value}' during LLM message build: {context_file.uri}")
+
+ llm_user_message = LLMUserMessage(
+ content=agent_input_user_message.content,
+ image_urls=image_urls if image_urls else None,
+ audio_urls=audio_urls if audio_urls else None,
+ video_urls=video_urls if video_urls else None
+ )
+
+ logger.info(f"Built LLMUserMessage with {len(image_urls)} images, {len(audio_urls)} audio, {len(video_urls)} video files.")
+ return llm_user_message
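A usage sketch for the new builder, relying on the type inference shown earlier in this diff. The .mp3-to-AUDIO mapping is an assumption, since the audio extension list is not part of the hunks shown here; the other mappings are visible above.

    from autobyteus.agent.message import AgentInputUserMessage, ContextFile, build_llm_user_message

    msg = AgentInputUserMessage(
        content="Describe the attached screenshot and summarize the report.",
        context_files=[
            ContextFile(uri="/tmp/screenshot.png"),           # inferred as IMAGE
            ContextFile(uri="https://example.com/memo.mp3"),  # assumed to infer as AUDIO
            ContextFile(uri="/tmp/report.pdf"),               # non-media: logged and skipped
        ],
    )

    llm_msg = build_llm_user_message(msg)
    # llm_msg.image_urls == ["/tmp/screenshot.png"]
    # llm_msg.audio_urls == ["https://example.com/memo.mp3"]
    # llm_msg.video_urls is None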
autobyteus/agent/streaming/stream_event_payloads.py (+23 -4)

@@ -20,12 +20,18 @@ class AssistantChunkData(BaseStreamPayload):
  reasoning: Optional[str] = None
  is_complete: bool
  usage: Optional[TokenUsage] = None
+ image_urls: Optional[List[str]] = None
+ audio_urls: Optional[List[str]] = None
+ video_urls: Optional[List[str]] = None


  class AssistantCompleteResponseData(BaseStreamPayload):
  content: str
  reasoning: Optional[str] = None
  usage: Optional[TokenUsage] = None
+ image_urls: Optional[List[str]] = None
+ audio_urls: Optional[List[str]] = None
+ video_urls: Optional[List[str]] = None

  class ToolInteractionLogEntryData(BaseStreamPayload):
  log_entry: str
@@ -102,14 +108,20 @@ def create_assistant_chunk_data(chunk_obj: Any) -> AssistantChunkData:
  content=str(getattr(chunk_obj, 'content', '')),
  reasoning=getattr(chunk_obj, 'reasoning', None),
  is_complete=bool(getattr(chunk_obj, 'is_complete', False)),
- usage=parsed_usage
+ usage=parsed_usage,
+ image_urls=getattr(chunk_obj, 'image_urls', None),
+ audio_urls=getattr(chunk_obj, 'audio_urls', None),
+ video_urls=getattr(chunk_obj, 'video_urls', None)
  )
  elif isinstance(chunk_obj, dict):
  return AssistantChunkData(
  content=str(chunk_obj.get('content', '')),
  reasoning=chunk_obj.get('reasoning', None),
  is_complete=bool(chunk_obj.get('is_complete', False)),
- usage=parsed_usage
+ usage=parsed_usage,
+ image_urls=chunk_obj.get('image_urls', None),
+ audio_urls=chunk_obj.get('audio_urls', None),
+ video_urls=chunk_obj.get('video_urls', None)
  )
  raise ValueError(f"Cannot create AssistantChunkData from {type(chunk_obj)}")

@@ -136,13 +148,19 @@ def create_assistant_complete_response_data(complete_resp_obj: Any) -> AssistantCompleteResponseData:
  return AssistantCompleteResponseData(
  content=str(getattr(complete_resp_obj, 'content', '')),
  reasoning=getattr(complete_resp_obj, 'reasoning', None),
- usage=parsed_usage
+ usage=parsed_usage,
+ image_urls=getattr(complete_resp_obj, 'image_urls', None),
+ audio_urls=getattr(complete_resp_obj, 'audio_urls', None),
+ video_urls=getattr(complete_resp_obj, 'video_urls', None)
  )
  elif isinstance(complete_resp_obj, dict):
  return AssistantCompleteResponseData(
  content=str(complete_resp_obj.get('content', '')),
  reasoning=complete_resp_obj.get('reasoning', None),
- usage=parsed_usage
+ usage=parsed_usage,
+ image_urls=complete_resp_obj.get('image_urls', None),
+ audio_urls=complete_resp_obj.get('audio_urls', None),
+ video_urls=complete_resp_obj.get('video_urls', None)
  )
  raise ValueError(f"Cannot create AssistantCompleteResponseData from {type(complete_resp_obj)}")

@@ -177,3 +195,4 @@ def create_system_task_notification_data(notification_data_dict: Any) -> SystemTaskNotificationData:
  if isinstance(notification_data_dict, dict):
  return SystemTaskNotificationData(**notification_data_dict)
  raise ValueError(f"Cannot create SystemTaskNotificationData from {type(notification_data_dict)}")
+
autobyteus/agent/system_prompt_processor/tool_manifest_injector_processor.py (+6 -2)

@@ -47,6 +47,9 @@ class ToolManifestInjectorProcessor(BaseSystemPromptProcessor):
  llm_provider = None
  if context.llm_instance and context.llm_instance.model:
  llm_provider = context.llm_instance.model.provider
+
+ # Retrieve the override flag from the agent's configuration.
+ use_xml_tool_format = context.config.use_xml_tool_format

  # Generate the manifest string for the 'tools' variable.
  tools_manifest: str
@@ -59,10 +62,11 @@ class ToolManifestInjectorProcessor(BaseSystemPromptProcessor):
  ]

  try:
- # Delegate manifest generation to the provider, which now handles all format logic.
+ # Delegate manifest generation to the provider, passing the override flag.
  tools_manifest = self._manifest_provider.provide(
  tool_definitions=tool_definitions,
- provider=llm_provider
+ provider=llm_provider,
+ use_xml_tool_format=use_xml_tool_format
  )
  except Exception as e:
  logger.exception(f"An unexpected error occurred during tool manifest generation for agent '{agent_id}': {e}")
autobyteus/agent/tool_invocation.py (+2 -1)

@@ -33,7 +33,8 @@ class ToolInvocation:
  """
  # Create a canonical representation of the arguments
  # sort_keys=True ensures that the order of keys doesn't change the hash
- canonical_args = json.dumps(arguments, sort_keys=True, separators=(',', ':'))
+ # ensure_ascii=False is critical for cross-language compatibility with JS
+ canonical_args = json.dumps(arguments, sort_keys=True, separators=(',', ':'), ensure_ascii=False)

  # Create a string to hash
  hash_string = f"{name}:{canonical_args}"