autobyteus-1.1.5-py3-none-any.whl → autobyteus-1.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autobyteus/agent/context/agent_config.py +6 -1
- autobyteus/agent/handlers/llm_user_message_ready_event_handler.py +30 -7
- autobyteus/agent/handlers/user_input_message_event_handler.py +22 -25
- autobyteus/agent/message/__init__.py +7 -5
- autobyteus/agent/message/agent_input_user_message.py +6 -16
- autobyteus/agent/message/context_file.py +24 -24
- autobyteus/agent/message/context_file_type.py +29 -8
- autobyteus/agent/message/multimodal_message_builder.py +47 -0
- autobyteus/agent/streaming/stream_event_payloads.py +23 -4
- autobyteus/agent/system_prompt_processor/tool_manifest_injector_processor.py +6 -2
- autobyteus/agent/tool_invocation.py +2 -1
- autobyteus/agent_team/bootstrap_steps/agent_configuration_preparation_step.py +9 -2
- autobyteus/agent_team/context/agent_team_config.py +1 -0
- autobyteus/llm/api/autobyteus_llm.py +33 -33
- autobyteus/llm/api/bedrock_llm.py +13 -5
- autobyteus/llm/api/claude_llm.py +13 -27
- autobyteus/llm/api/gemini_llm.py +108 -42
- autobyteus/llm/api/groq_llm.py +4 -3
- autobyteus/llm/api/mistral_llm.py +97 -51
- autobyteus/llm/api/nvidia_llm.py +6 -5
- autobyteus/llm/api/ollama_llm.py +37 -12
- autobyteus/llm/api/openai_compatible_llm.py +91 -91
- autobyteus/llm/autobyteus_provider.py +1 -1
- autobyteus/llm/base_llm.py +42 -139
- autobyteus/llm/extensions/base_extension.py +6 -6
- autobyteus/llm/extensions/token_usage_tracking_extension.py +3 -2
- autobyteus/llm/llm_factory.py +106 -4
- autobyteus/llm/token_counter/token_counter_factory.py +1 -1
- autobyteus/llm/user_message.py +43 -35
- autobyteus/llm/utils/llm_config.py +34 -18
- autobyteus/llm/utils/media_payload_formatter.py +99 -0
- autobyteus/llm/utils/messages.py +32 -25
- autobyteus/llm/utils/response_types.py +9 -3
- autobyteus/llm/utils/token_usage.py +6 -5
- autobyteus/multimedia/__init__.py +31 -0
- autobyteus/multimedia/audio/__init__.py +11 -0
- autobyteus/multimedia/audio/api/__init__.py +4 -0
- autobyteus/multimedia/audio/api/autobyteus_audio_client.py +59 -0
- autobyteus/multimedia/audio/api/gemini_audio_client.py +219 -0
- autobyteus/multimedia/audio/audio_client_factory.py +120 -0
- autobyteus/multimedia/audio/audio_model.py +96 -0
- autobyteus/multimedia/audio/autobyteus_audio_provider.py +108 -0
- autobyteus/multimedia/audio/base_audio_client.py +40 -0
- autobyteus/multimedia/image/__init__.py +11 -0
- autobyteus/multimedia/image/api/__init__.py +9 -0
- autobyteus/multimedia/image/api/autobyteus_image_client.py +97 -0
- autobyteus/multimedia/image/api/gemini_image_client.py +188 -0
- autobyteus/multimedia/image/api/openai_image_client.py +142 -0
- autobyteus/multimedia/image/autobyteus_image_provider.py +109 -0
- autobyteus/multimedia/image/base_image_client.py +67 -0
- autobyteus/multimedia/image/image_client_factory.py +118 -0
- autobyteus/multimedia/image/image_model.py +96 -0
- autobyteus/multimedia/providers.py +5 -0
- autobyteus/multimedia/runtimes.py +8 -0
- autobyteus/multimedia/utils/__init__.py +10 -0
- autobyteus/multimedia/utils/api_utils.py +19 -0
- autobyteus/multimedia/utils/multimedia_config.py +29 -0
- autobyteus/multimedia/utils/response_types.py +13 -0
- autobyteus/tools/__init__.py +3 -0
- autobyteus/tools/multimedia/__init__.py +8 -0
- autobyteus/tools/multimedia/audio_tools.py +116 -0
- autobyteus/tools/multimedia/image_tools.py +186 -0
- autobyteus/tools/tool_category.py +1 -0
- autobyteus/tools/usage/parsers/provider_aware_tool_usage_parser.py +5 -2
- autobyteus/tools/usage/providers/tool_manifest_provider.py +5 -3
- autobyteus/tools/usage/registries/tool_formatting_registry.py +9 -2
- autobyteus/tools/usage/registries/tool_usage_parser_registry.py +9 -2
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/METADATA +9 -9
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/RECORD +73 -45
- examples/run_browser_agent.py +1 -1
- autobyteus/llm/utils/image_payload_formatter.py +0 -89
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/WHEEL +0 -0
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {autobyteus-1.1.5.dist-info → autobyteus-1.1.6.dist-info}/top_level.txt +0 -0
|
@@ -37,6 +37,7 @@ class AgentConfig:
|
|
|
37
37
|
system_prompt: Optional[str] = None,
|
|
38
38
|
tools: Optional[List['BaseTool']] = None,
|
|
39
39
|
auto_execute_tools: bool = True,
|
|
40
|
+
use_xml_tool_format: bool = False,
|
|
40
41
|
input_processors: Optional[List['BaseAgentUserInputMessageProcessor']] = None,
|
|
41
42
|
llm_response_processors: Optional[List['BaseLLMResponseProcessor']] = None,
|
|
42
43
|
system_prompt_processors: Optional[List['BaseSystemPromptProcessor']] = None,
|
|
@@ -57,6 +58,8 @@ class AgentConfig:
|
|
|
57
58
|
llm_instance's config will be used as the base.
|
|
58
59
|
tools: An optional list of pre-initialized tool instances (subclasses of BaseTool).
|
|
59
60
|
auto_execute_tools: If True, the agent will execute tools without approval.
|
|
61
|
+
use_xml_tool_format: If True, forces the agent to use XML format for tool
|
|
62
|
+
definitions and parsing, overriding provider defaults.
|
|
60
63
|
input_processors: A list of input processor instances.
|
|
61
64
|
llm_response_processors: A list of LLM response processor instances.
|
|
62
65
|
system_prompt_processors: A list of system prompt processor instances.
|
|
@@ -74,6 +77,7 @@ class AgentConfig:
|
|
|
74
77
|
self.tools = tools or []
|
|
75
78
|
self.workspace = workspace
|
|
76
79
|
self.auto_execute_tools = auto_execute_tools
|
|
80
|
+
self.use_xml_tool_format = use_xml_tool_format
|
|
77
81
|
self.input_processors = input_processors or []
|
|
78
82
|
self.llm_response_processors = llm_response_processors if llm_response_processors is not None else list(self.DEFAULT_LLM_RESPONSE_PROCESSORS)
|
|
79
83
|
self.system_prompt_processors = system_prompt_processors if system_prompt_processors is not None else list(self.DEFAULT_SYSTEM_PROMPT_PROCESSORS)
|
|
@@ -81,7 +85,7 @@ class AgentConfig:
|
|
|
81
85
|
self.phase_hooks = phase_hooks or []
|
|
82
86
|
self.initial_custom_data = initial_custom_data
|
|
83
87
|
|
|
84
|
-
logger.debug(f"AgentConfig created for name '{self.name}', role '{self.role}'.")
|
|
88
|
+
logger.debug(f"AgentConfig created for name '{self.name}', role '{self.role}'. XML tool format override: {self.use_xml_tool_format}")
|
|
85
89
|
|
|
86
90
|
def copy(self) -> 'AgentConfig':
|
|
87
91
|
"""
|
|
@@ -98,6 +102,7 @@ class AgentConfig:
|
|
|
98
102
|
system_prompt=self.system_prompt,
|
|
99
103
|
tools=self.tools.copy(), # Shallow copy the list, but reference the original tool instances
|
|
100
104
|
auto_execute_tools=self.auto_execute_tools,
|
|
105
|
+
use_xml_tool_format=self.use_xml_tool_format,
|
|
101
106
|
input_processors=self.input_processors.copy(), # Shallow copy the list
|
|
102
107
|
llm_response_processors=self.llm_response_processors.copy(), # Shallow copy the list
|
|
103
108
|
system_prompt_processors=self.system_prompt_processors.copy(), # Shallow copy the list
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# file: autobyteus/autobyteus/agent/handlers/llm_user_message_ready_event_handler.py
|
|
2
2
|
import logging
|
|
3
3
|
import traceback
|
|
4
|
-
from typing import TYPE_CHECKING, cast, Optional
|
|
4
|
+
from typing import TYPE_CHECKING, cast, Optional, List
|
|
5
5
|
|
|
6
6
|
from autobyteus.agent.handlers.base_event_handler import AgentEventHandler
|
|
7
7
|
from autobyteus.agent.events import LLMUserMessageReadyEvent, LLMCompleteResponseReceivedEvent
|
|
@@ -53,6 +53,9 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
|
|
|
53
53
|
complete_response_text = ""
|
|
54
54
|
complete_reasoning_text = ""
|
|
55
55
|
token_usage: Optional[TokenUsage] = None
|
|
56
|
+
complete_image_urls: List[str] = []
|
|
57
|
+
complete_audio_urls: List[str] = []
|
|
58
|
+
complete_video_urls: List[str] = []
|
|
56
59
|
|
|
57
60
|
notifier: Optional['AgentExternalEventNotifier'] = None
|
|
58
61
|
if context.phase_manager:
|
|
@@ -72,9 +75,19 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
|
|
|
72
75
|
if chunk_response.reasoning:
|
|
73
76
|
complete_reasoning_text += chunk_response.reasoning
|
|
74
77
|
|
|
75
|
-
if chunk_response.is_complete
|
|
76
|
-
|
|
77
|
-
|
|
78
|
+
if chunk_response.is_complete:
|
|
79
|
+
if chunk_response.usage:
|
|
80
|
+
token_usage = chunk_response.usage
|
|
81
|
+
logger.debug(f"Agent '{agent_id}' received final chunk with token usage: {token_usage}")
|
|
82
|
+
if chunk_response.image_urls:
|
|
83
|
+
complete_image_urls.extend(chunk_response.image_urls)
|
|
84
|
+
logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.image_urls)} image URLs.")
|
|
85
|
+
if chunk_response.audio_urls:
|
|
86
|
+
complete_audio_urls.extend(chunk_response.audio_urls)
|
|
87
|
+
logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.audio_urls)} audio URLs.")
|
|
88
|
+
if chunk_response.video_urls:
|
|
89
|
+
complete_video_urls.extend(chunk_response.video_urls)
|
|
90
|
+
logger.debug(f"Agent '{agent_id}' received final chunk with {len(chunk_response.video_urls)} video URLs.")
|
|
78
91
|
|
|
79
92
|
if notifier:
|
|
80
93
|
try:
|
|
@@ -121,20 +134,30 @@ class LLMUserMessageReadyEventHandler(AgentEventHandler):
|
|
|
121
134
|
logger.info(f"Agent '{agent_id}' enqueued LLMCompleteResponseReceivedEvent with error details from LLMUserMessageReadyEventHandler.")
|
|
122
135
|
return
|
|
123
136
|
|
|
124
|
-
# Add message to history with reasoning
|
|
137
|
+
# Add message to history with reasoning and multimodal data
|
|
125
138
|
history_entry = {"role": "assistant", "content": complete_response_text}
|
|
126
139
|
if complete_reasoning_text:
|
|
127
140
|
history_entry["reasoning"] = complete_reasoning_text
|
|
141
|
+
if complete_image_urls:
|
|
142
|
+
history_entry["image_urls"] = complete_image_urls
|
|
143
|
+
if complete_audio_urls:
|
|
144
|
+
history_entry["audio_urls"] = complete_audio_urls
|
|
145
|
+
if complete_video_urls:
|
|
146
|
+
history_entry["video_urls"] = complete_video_urls
|
|
128
147
|
context.state.add_message_to_history(history_entry)
|
|
129
148
|
|
|
130
|
-
# Create complete response with reasoning
|
|
149
|
+
# Create complete response with reasoning and multimodal data
|
|
131
150
|
complete_response_obj = CompleteResponse(
|
|
132
151
|
content=complete_response_text,
|
|
133
152
|
reasoning=complete_reasoning_text,
|
|
134
|
-
usage=token_usage
|
|
153
|
+
usage=token_usage,
|
|
154
|
+
image_urls=complete_image_urls,
|
|
155
|
+
audio_urls=complete_audio_urls,
|
|
156
|
+
video_urls=complete_video_urls
|
|
135
157
|
)
|
|
136
158
|
llm_complete_event = LLMCompleteResponseReceivedEvent(
|
|
137
159
|
complete_response=complete_response_obj
|
|
138
160
|
)
|
|
139
161
|
await context.input_event_queues.enqueue_internal_system_event(llm_complete_event)
|
|
140
162
|
logger.info(f"Agent '{agent_id}' enqueued LLMCompleteResponseReceivedEvent from LLMUserMessageReadyEventHandler.")
|
|
163
|
+
|
|
@@ -3,14 +3,14 @@ import logging
|
|
|
3
3
|
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
5
|
from autobyteus.agent.handlers.base_event_handler import AgentEventHandler
|
|
6
|
-
from autobyteus.agent.events import UserMessageReceivedEvent, LLMUserMessageReadyEvent
|
|
7
|
-
from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
|
|
6
|
+
from autobyteus.agent.events import UserMessageReceivedEvent, LLMUserMessageReadyEvent
|
|
7
|
+
from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
|
|
8
8
|
from autobyteus.agent.input_processor import BaseAgentUserInputMessageProcessor
|
|
9
|
-
from autobyteus.
|
|
9
|
+
from autobyteus.agent.message.multimodal_message_builder import build_llm_user_message
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
|
-
from autobyteus.agent.context import AgentContext
|
|
13
|
+
from autobyteus.agent.context import AgentContext
|
|
14
14
|
from autobyteus.agent.events.notifiers import AgentExternalEventNotifier
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
@@ -18,24 +18,23 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
class UserInputMessageEventHandler(AgentEventHandler):
|
|
19
19
|
"""
|
|
20
20
|
Handles UserMessageReceivedEvents by first applying any configured
|
|
21
|
-
AgentUserInputMessageProcessors
|
|
22
|
-
|
|
21
|
+
AgentUserInputMessageProcessors, then using the multimodal_message_builder
|
|
22
|
+
to convert the processed message into an LLMUserMessage, and finally
|
|
23
23
|
enqueuing an LLMUserMessageReadyEvent for further processing by the LLM.
|
|
24
|
-
It also checks for metadata to emit special notifications for system-generated tasks.
|
|
25
24
|
"""
|
|
26
25
|
|
|
27
26
|
def __init__(self):
|
|
28
27
|
logger.info("UserInputMessageEventHandler initialized.")
|
|
29
28
|
|
|
30
29
|
async def handle(self,
|
|
31
|
-
event: UserMessageReceivedEvent,
|
|
30
|
+
event: UserMessageReceivedEvent,
|
|
32
31
|
context: 'AgentContext') -> None:
|
|
33
|
-
if not isinstance(event, UserMessageReceivedEvent):
|
|
32
|
+
if not isinstance(event, UserMessageReceivedEvent):
|
|
34
33
|
logger.warning(f"UserInputMessageEventHandler received non-UserMessageReceivedEvent: {type(event)}. Skipping.")
|
|
35
34
|
return
|
|
36
35
|
|
|
37
|
-
original_agent_input_user_msg: AgentInputUserMessage = event.agent_input_user_message
|
|
38
|
-
|
|
36
|
+
original_agent_input_user_msg: AgentInputUserMessage = event.agent_input_user_message
|
|
37
|
+
|
|
39
38
|
# --- NEW LOGIC: Check metadata for system-generated tasks and notify TUI ---
|
|
40
39
|
if original_agent_input_user_msg.metadata.get('source') == 'system_task_notifier':
|
|
41
40
|
if context.phase_manager:
|
|
@@ -47,11 +46,11 @@ class UserInputMessageEventHandler(AgentEventHandler):
|
|
|
47
46
|
notifier.notify_agent_data_system_task_notification_received(notification_data)
|
|
48
47
|
logger.info(f"Agent '{context.agent_id}' emitted system task notification for TUI.")
|
|
49
48
|
# --- END NEW LOGIC ---
|
|
50
|
-
|
|
51
|
-
processed_agent_input_user_msg: AgentInputUserMessage = original_agent_input_user_msg
|
|
52
|
-
|
|
53
|
-
logger.info(f"Agent '{context.agent_id}' handling UserMessageReceivedEvent: '{original_agent_input_user_msg.content}'")
|
|
54
|
-
|
|
49
|
+
|
|
50
|
+
processed_agent_input_user_msg: AgentInputUserMessage = original_agent_input_user_msg
|
|
51
|
+
|
|
52
|
+
logger.info(f"Agent '{context.agent_id}' handling UserMessageReceivedEvent: '{original_agent_input_user_msg.content}'")
|
|
53
|
+
|
|
55
54
|
processor_instances = context.config.input_processors
|
|
56
55
|
if processor_instances:
|
|
57
56
|
processor_names = [p.get_name() for p in processor_instances]
|
|
@@ -62,14 +61,14 @@ class UserInputMessageEventHandler(AgentEventHandler):
|
|
|
62
61
|
if not isinstance(processor_instance, BaseAgentUserInputMessageProcessor):
|
|
63
62
|
logger.error(f"Agent '{context.agent_id}': Invalid input processor type in config: {type(processor_instance)}. Skipping.")
|
|
64
63
|
continue
|
|
65
|
-
|
|
64
|
+
|
|
66
65
|
processor_name_for_log = processor_instance.get_name()
|
|
67
66
|
logger.debug(f"Agent '{context.agent_id}': Applying input processor '{processor_name_for_log}'.")
|
|
68
67
|
msg_before_this_processor = processed_agent_input_user_msg
|
|
69
68
|
# Pass the original event to the processor
|
|
70
69
|
processed_agent_input_user_msg = await processor_instance.process(
|
|
71
|
-
message=msg_before_this_processor,
|
|
72
|
-
context=context,
|
|
70
|
+
message=msg_before_this_processor,
|
|
71
|
+
context=context,
|
|
73
72
|
triggering_event=event
|
|
74
73
|
)
|
|
75
74
|
logger.info(f"Agent '{context.agent_id}': Input processor '{processor_name_for_log}' applied successfully.")
|
|
@@ -81,12 +80,10 @@ class UserInputMessageEventHandler(AgentEventHandler):
|
|
|
81
80
|
else:
|
|
82
81
|
logger.debug(f"Agent '{context.agent_id}': No input processors configured in agent config.")
|
|
83
82
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
image_urls=processed_agent_input_user_msg.image_urls
|
|
87
|
-
)
|
|
83
|
+
# --- Refactored: Use the dedicated builder ---
|
|
84
|
+
llm_user_message = build_llm_user_message(processed_agent_input_user_msg)
|
|
88
85
|
|
|
89
|
-
llm_user_message_ready_event = LLMUserMessageReadyEvent(llm_user_message=llm_user_message)
|
|
86
|
+
llm_user_message_ready_event = LLMUserMessageReadyEvent(llm_user_message=llm_user_message)
|
|
90
87
|
await context.input_event_queues.enqueue_internal_system_event(llm_user_message_ready_event)
|
|
91
|
-
|
|
88
|
+
|
|
92
89
|
logger.info(f"Agent '{context.agent_id}' processed AgentInputUserMessage and enqueued LLMUserMessageReadyEvent.")
|
|
@@ -9,12 +9,14 @@ from .agent_input_user_message import AgentInputUserMessage
|
|
|
9
9
|
from .send_message_to import SendMessageTo
|
|
10
10
|
from .context_file import ContextFile
|
|
11
11
|
from .context_file_type import ContextFileType
|
|
12
|
+
from .multimodal_message_builder import build_llm_user_message
|
|
12
13
|
|
|
13
14
|
__all__ = [
|
|
14
|
-
"InterAgentMessage",
|
|
15
|
-
"InterAgentMessageType",
|
|
16
|
-
"AgentInputUserMessage",
|
|
15
|
+
"InterAgentMessage",
|
|
16
|
+
"InterAgentMessageType",
|
|
17
|
+
"AgentInputUserMessage",
|
|
17
18
|
"SendMessageTo",
|
|
18
|
-
"ContextFile",
|
|
19
|
-
"ContextFileType",
|
|
19
|
+
"ContextFile",
|
|
20
|
+
"ContextFileType",
|
|
21
|
+
"build_llm_user_message",
|
|
20
22
|
]
|
|
@@ -8,21 +8,18 @@ from .context_file import ContextFile # Import the new ContextFile dataclass
|
|
|
8
8
|
logger = logging.getLogger(__name__)
|
|
9
9
|
|
|
10
10
|
@dataclass
|
|
11
|
-
class AgentInputUserMessage:
|
|
11
|
+
class AgentInputUserMessage:
|
|
12
12
|
"""
|
|
13
13
|
Represents a message received from an external user interacting with the agent system.
|
|
14
|
-
This is a simple dataclass. It includes support for a list of ContextFile objects,
|
|
15
|
-
allowing users to provide various documents as context.
|
|
14
|
+
This is a simple dataclass. It includes support for a list of ContextFile objects,
|
|
15
|
+
allowing users to provide various documents and media as context via a single list.
|
|
16
16
|
"""
|
|
17
17
|
content: str
|
|
18
|
-
image_urls: Optional[List[str]] = field(default=None) # Basic list of strings
|
|
19
18
|
context_files: Optional[List[ContextFile]] = field(default=None)
|
|
20
19
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
21
20
|
|
|
22
21
|
def __post_init__(self):
|
|
23
22
|
# Basic type validation that dataclasses don't do automatically for mutable defaults or complex types
|
|
24
|
-
if self.image_urls is not None and not (isinstance(self.image_urls, list) and all(isinstance(url, str) for url in self.image_urls)):
|
|
25
|
-
raise TypeError("AgentInputUserMessage 'image_urls' must be a list of strings if provided.")
|
|
26
23
|
if self.context_files is not None and not (isinstance(self.context_files, list) and all(isinstance(cf, ContextFile) for cf in self.context_files)):
|
|
27
24
|
raise TypeError("AgentInputUserMessage 'context_files' must be a list of ContextFile objects if provided.")
|
|
28
25
|
if not isinstance(self.metadata, dict): # Should be caught by default_factory, but good practice
|
|
@@ -34,7 +31,7 @@ class AgentInputUserMessage:
|
|
|
34
31
|
num_context_files = len(self.context_files) if self.context_files else 0
|
|
35
32
|
logger.debug(
|
|
36
33
|
f"AgentInputUserMessage initialized. Content: '{self.content[:50]}...', "
|
|
37
|
-
f"
|
|
34
|
+
f"Num ContextFiles: {num_context_files}, "
|
|
38
35
|
f"Metadata keys: {list(self.metadata.keys())}"
|
|
39
36
|
)
|
|
40
37
|
|
|
@@ -47,7 +44,6 @@ class AgentInputUserMessage:
|
|
|
47
44
|
|
|
48
45
|
return {
|
|
49
46
|
"content": self.content,
|
|
50
|
-
"image_urls": self.image_urls,
|
|
51
47
|
"context_files": context_files_dict_list,
|
|
52
48
|
"metadata": self.metadata,
|
|
53
49
|
}
|
|
@@ -59,31 +55,25 @@ class AgentInputUserMessage:
|
|
|
59
55
|
if not isinstance(content, str): # Ensure content is string
|
|
60
56
|
raise ValueError("AgentInputUserMessage 'content' in dictionary must be a string.")
|
|
61
57
|
|
|
62
|
-
image_urls = data.get("image_urls")
|
|
63
|
-
if image_urls is not None and not (isinstance(image_urls, list) and all(isinstance(url, str) for url in image_urls)):
|
|
64
|
-
raise ValueError("AgentInputUserMessage 'image_urls' in dictionary must be a list of strings if provided.")
|
|
65
|
-
|
|
66
58
|
context_files_data = data.get("context_files")
|
|
67
59
|
context_files_list: Optional[List[ContextFile]] = None
|
|
68
60
|
if context_files_data is not None:
|
|
69
61
|
if not isinstance(context_files_data, list):
|
|
70
62
|
raise ValueError("AgentInputUserMessage 'context_files' in dictionary must be a list if provided.")
|
|
71
63
|
context_files_list = [ContextFile.from_dict(cf_data) for cf_data in context_files_data]
|
|
72
|
-
|
|
64
|
+
|
|
73
65
|
metadata = data.get("metadata", {})
|
|
74
66
|
if not isinstance(metadata, dict):
|
|
75
67
|
raise ValueError("AgentInputUserMessage 'metadata' in dictionary must be a dict if provided.")
|
|
76
68
|
|
|
77
69
|
return cls(
|
|
78
70
|
content=content,
|
|
79
|
-
image_urls=image_urls,
|
|
80
71
|
context_files=context_files_list,
|
|
81
72
|
metadata=metadata
|
|
82
73
|
)
|
|
83
74
|
|
|
84
75
|
def __repr__(self) -> str:
|
|
85
76
|
content_preview = f"{self.content[:100]}..." if len(self.content) > 100 else self.content
|
|
86
|
-
images_repr = f", image_urls={self.image_urls}" if self.image_urls else ""
|
|
87
77
|
|
|
88
78
|
if self.context_files:
|
|
89
79
|
context_repr = f", context_files=[{len(self.context_files)} ContextFile(s)]"
|
|
@@ -93,4 +83,4 @@ class AgentInputUserMessage:
|
|
|
93
83
|
meta_repr = f", metadata_keys={list(self.metadata.keys())}" if self.metadata else ""
|
|
94
84
|
|
|
95
85
|
return (f"AgentInputUserMessage(content='{content_preview}'"
|
|
96
|
-
f"{
|
|
86
|
+
f"{context_repr}{meta_repr})")
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Optional, Dict, Any
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
|
+
from urllib.parse import urlparse
|
|
6
7
|
|
|
7
8
|
from .context_file_type import ContextFileType
|
|
8
9
|
|
|
@@ -12,10 +13,9 @@ logger = logging.getLogger(__name__)
|
|
|
12
13
|
class ContextFile:
|
|
13
14
|
"""
|
|
14
15
|
Represents a single context file provided to an agent.
|
|
15
|
-
|
|
16
|
-
to input processors.
|
|
16
|
+
The 'uri' can be a local file path or a network URL.
|
|
17
17
|
"""
|
|
18
|
-
|
|
18
|
+
uri: str
|
|
19
19
|
file_type: ContextFileType = ContextFileType.UNKNOWN
|
|
20
20
|
file_name: Optional[str] = None
|
|
21
21
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
@@ -25,33 +25,33 @@ class ContextFile:
|
|
|
25
25
|
Called after the dataclass's __init__ method.
|
|
26
26
|
Used here to infer file_name and file_type if not provided or UNKNOWN.
|
|
27
27
|
"""
|
|
28
|
-
if self.
|
|
28
|
+
if not isinstance(self.uri, str) or not self.uri:
|
|
29
|
+
raise TypeError(f"ContextFile uri must be a non-empty string, got {type(self.uri)}")
|
|
30
|
+
|
|
31
|
+
if self.file_name is None:
|
|
29
32
|
try:
|
|
30
|
-
|
|
33
|
+
# Use urlparse to correctly handle both URLs and local paths
|
|
34
|
+
parsed_path = urlparse(self.uri).path
|
|
35
|
+
self.file_name = os.path.basename(parsed_path)
|
|
31
36
|
except Exception as e:
|
|
32
|
-
logger.warning(f"Could not determine basename for
|
|
37
|
+
logger.warning(f"Could not determine basename for uri '{self.uri}': {e}")
|
|
33
38
|
self.file_name = "unknown_file"
|
|
34
39
|
|
|
35
|
-
if self.file_type == ContextFileType.UNKNOWN
|
|
36
|
-
inferred_type = ContextFileType.from_path(self.
|
|
40
|
+
if self.file_type == ContextFileType.UNKNOWN:
|
|
41
|
+
inferred_type = ContextFileType.from_path(self.uri)
|
|
37
42
|
if inferred_type != ContextFileType.UNKNOWN:
|
|
38
43
|
self.file_type = inferred_type
|
|
39
|
-
logger.debug(f"Inferred file type for '{self.
|
|
44
|
+
logger.debug(f"Inferred file type for '{self.uri}' as {self.file_type.value}")
|
|
40
45
|
else:
|
|
41
|
-
logger.debug(f"Could not infer specific file type for '{self.
|
|
42
|
-
|
|
43
|
-
# Ensure path is a string
|
|
44
|
-
if not isinstance(self.path, str):
|
|
45
|
-
# This ideally should be caught by type hints earlier, but as a runtime safeguard:
|
|
46
|
-
raise TypeError(f"ContextFile path must be a string, got {type(self.path)}")
|
|
47
|
-
|
|
46
|
+
logger.debug(f"Could not infer specific file type for '{self.uri}', remaining UNKNOWN.")
|
|
47
|
+
|
|
48
48
|
if logger.isEnabledFor(logging.DEBUG):
|
|
49
|
-
logger.debug(f"ContextFile initialized:
|
|
49
|
+
logger.debug(f"ContextFile initialized: uri='{self.uri}', type='{self.file_type.value}', name='{self.file_name}'")
|
|
50
50
|
|
|
51
51
|
def to_dict(self) -> Dict[str, Any]:
|
|
52
52
|
"""Serializes the ContextFile to a dictionary."""
|
|
53
53
|
return {
|
|
54
|
-
"
|
|
54
|
+
"uri": self.uri,
|
|
55
55
|
"file_type": self.file_type.value, # Serialize enum to its value
|
|
56
56
|
"file_name": self.file_name,
|
|
57
57
|
"metadata": self.metadata,
|
|
@@ -60,23 +60,23 @@ class ContextFile:
|
|
|
60
60
|
@classmethod
|
|
61
61
|
def from_dict(cls, data: Dict[str, Any]) -> 'ContextFile':
|
|
62
62
|
"""Deserializes a ContextFile from a dictionary."""
|
|
63
|
-
if not isinstance(data.get("
|
|
64
|
-
raise ValueError("ContextFile '
|
|
65
|
-
|
|
63
|
+
if not isinstance(data.get("uri"), str):
|
|
64
|
+
raise ValueError("ContextFile 'uri' in dictionary must be a string.")
|
|
65
|
+
|
|
66
66
|
file_type_str = data.get("file_type", ContextFileType.UNKNOWN.value)
|
|
67
67
|
try:
|
|
68
68
|
file_type = ContextFileType(file_type_str)
|
|
69
69
|
except ValueError:
|
|
70
70
|
logger.warning(f"Invalid file_type string '{file_type_str}' in ContextFile data. Defaulting to UNKNOWN.")
|
|
71
71
|
file_type = ContextFileType.UNKNOWN
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
return cls(
|
|
74
|
-
|
|
74
|
+
uri=data["uri"],
|
|
75
75
|
file_type=file_type,
|
|
76
76
|
file_name=data.get("file_name"),
|
|
77
77
|
metadata=data.get("metadata", {})
|
|
78
78
|
)
|
|
79
79
|
|
|
80
80
|
def __repr__(self) -> str:
|
|
81
|
-
return (f"ContextFile(
|
|
81
|
+
return (f"ContextFile(uri='{self.uri}', file_name='{self.file_name}', "
|
|
82
82
|
f"file_type='{self.file_type.value}', metadata_keys={list(self.metadata.keys())})")
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
2
|
import os
|
|
3
|
+
from urllib.parse import urlparse
|
|
3
4
|
|
|
4
5
|
class ContextFileType(str, Enum):
|
|
5
6
|
"""
|
|
@@ -23,19 +24,25 @@ class ContextFileType(str, Enum):
|
|
|
23
24
|
UNKNOWN = "unknown" # Fallback for unrecognized types
|
|
24
25
|
|
|
25
26
|
@classmethod
|
|
26
|
-
def from_path(cls,
|
|
27
|
+
def from_path(cls, uri: str) -> 'ContextFileType':
|
|
27
28
|
"""
|
|
28
|
-
Infers the ContextFileType from a file path based on its extension.
|
|
29
|
+
Infers the ContextFileType from a file path or URL based on its extension.
|
|
29
30
|
"""
|
|
30
|
-
if not
|
|
31
|
+
if not uri or not isinstance(uri, str):
|
|
31
32
|
return cls.UNKNOWN
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
# Parse the URI to handle both file paths and URLs gracefully
|
|
36
|
+
parsed_path = urlparse(uri).path
|
|
37
|
+
_, extension = os.path.splitext(parsed_path.lower())
|
|
38
|
+
except Exception:
|
|
39
|
+
# Fallback for malformed URIs
|
|
40
|
+
_, extension = os.path.splitext(uri.lower())
|
|
41
|
+
|
|
35
42
|
if extension == ".txt":
|
|
36
43
|
return cls.TEXT
|
|
37
44
|
elif extension == ".md":
|
|
38
|
-
return cls.MARKDOWN
|
|
45
|
+
return cls.MARKDOWN
|
|
39
46
|
elif extension == ".pdf":
|
|
40
47
|
return cls.PDF
|
|
41
48
|
elif extension == ".docx":
|
|
@@ -61,9 +68,23 @@ class ContextFileType(str, Enum):
|
|
|
61
68
|
elif extension in [".mp4", ".mov", ".avi", ".mkv", ".webm"]:
|
|
62
69
|
return cls.VIDEO
|
|
63
70
|
elif extension in [".png", ".jpg", ".jpeg", ".gif", ".webp"]:
|
|
64
|
-
return cls.IMAGE
|
|
71
|
+
return cls.IMAGE
|
|
65
72
|
else:
|
|
66
73
|
return cls.UNKNOWN
|
|
67
74
|
|
|
75
|
+
@classmethod
|
|
76
|
+
def get_readable_text_types(cls) -> list['ContextFileType']:
|
|
77
|
+
"""Returns a list of file types that can be read as plain text for context."""
|
|
78
|
+
return [
|
|
79
|
+
cls.TEXT,
|
|
80
|
+
cls.MARKDOWN,
|
|
81
|
+
cls.JSON,
|
|
82
|
+
cls.XML,
|
|
83
|
+
cls.HTML,
|
|
84
|
+
cls.PYTHON,
|
|
85
|
+
cls.JAVASCRIPT,
|
|
86
|
+
cls.CSV,
|
|
87
|
+
]
|
|
88
|
+
|
|
68
89
|
def __str__(self) -> str:
|
|
69
90
|
return self.value
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# file: autobyteus/autobyteus/agent/message/multimodal_message_builder.py
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from autobyteus.agent.message.agent_input_user_message import AgentInputUserMessage
|
|
5
|
+
from autobyteus.agent.message.context_file_type import ContextFileType
|
|
6
|
+
from autobyteus.llm.user_message import LLMUserMessage
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
def build_llm_user_message(agent_input_user_message: AgentInputUserMessage) -> LLMUserMessage:
|
|
11
|
+
"""
|
|
12
|
+
Builds an LLMUserMessage from an AgentInputUserMessage by categorizing its context files.
|
|
13
|
+
|
|
14
|
+
This function iterates through the context files, sorting URIs for images, audio, and video
|
|
15
|
+
into the appropriate fields of the LLMUserMessage. It ignores other file types for now.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
agent_input_user_message: The user input message containing content and context files.
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
An LLMUserMessage ready to be sent to the LLM.
|
|
22
|
+
"""
|
|
23
|
+
image_urls = []
|
|
24
|
+
audio_urls = []
|
|
25
|
+
video_urls = []
|
|
26
|
+
|
|
27
|
+
if agent_input_user_message.context_files:
|
|
28
|
+
for context_file in agent_input_user_message.context_files:
|
|
29
|
+
file_type = context_file.file_type
|
|
30
|
+
if file_type == ContextFileType.IMAGE:
|
|
31
|
+
image_urls.append(context_file.uri)
|
|
32
|
+
elif file_type == ContextFileType.AUDIO:
|
|
33
|
+
audio_urls.append(context_file.uri)
|
|
34
|
+
elif file_type == ContextFileType.VIDEO:
|
|
35
|
+
video_urls.append(context_file.uri)
|
|
36
|
+
else:
|
|
37
|
+
logger.debug(f"Ignoring non-media context file of type '{file_type.value}' during LLM message build: {context_file.uri}")
|
|
38
|
+
|
|
39
|
+
llm_user_message = LLMUserMessage(
|
|
40
|
+
content=agent_input_user_message.content,
|
|
41
|
+
image_urls=image_urls if image_urls else None,
|
|
42
|
+
audio_urls=audio_urls if audio_urls else None,
|
|
43
|
+
video_urls=video_urls if video_urls else None
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
logger.info(f"Built LLMUserMessage with {len(image_urls)} images, {len(audio_urls)} audio, {len(video_urls)} video files.")
|
|
47
|
+
return llm_user_message
|
|
@@ -20,12 +20,18 @@ class AssistantChunkData(BaseStreamPayload):
|
|
|
20
20
|
reasoning: Optional[str] = None
|
|
21
21
|
is_complete: bool
|
|
22
22
|
usage: Optional[TokenUsage] = None
|
|
23
|
+
image_urls: Optional[List[str]] = None
|
|
24
|
+
audio_urls: Optional[List[str]] = None
|
|
25
|
+
video_urls: Optional[List[str]] = None
|
|
23
26
|
|
|
24
27
|
|
|
25
28
|
class AssistantCompleteResponseData(BaseStreamPayload):
|
|
26
29
|
content: str
|
|
27
30
|
reasoning: Optional[str] = None
|
|
28
31
|
usage: Optional[TokenUsage] = None
|
|
32
|
+
image_urls: Optional[List[str]] = None
|
|
33
|
+
audio_urls: Optional[List[str]] = None
|
|
34
|
+
video_urls: Optional[List[str]] = None
|
|
29
35
|
|
|
30
36
|
class ToolInteractionLogEntryData(BaseStreamPayload):
|
|
31
37
|
log_entry: str
|
|
@@ -102,14 +108,20 @@ def create_assistant_chunk_data(chunk_obj: Any) -> AssistantChunkData:
|
|
|
102
108
|
content=str(getattr(chunk_obj, 'content', '')),
|
|
103
109
|
reasoning=getattr(chunk_obj, 'reasoning', None),
|
|
104
110
|
is_complete=bool(getattr(chunk_obj, 'is_complete', False)),
|
|
105
|
-
usage=parsed_usage
|
|
111
|
+
usage=parsed_usage,
|
|
112
|
+
image_urls=getattr(chunk_obj, 'image_urls', None),
|
|
113
|
+
audio_urls=getattr(chunk_obj, 'audio_urls', None),
|
|
114
|
+
video_urls=getattr(chunk_obj, 'video_urls', None)
|
|
106
115
|
)
|
|
107
116
|
elif isinstance(chunk_obj, dict):
|
|
108
117
|
return AssistantChunkData(
|
|
109
118
|
content=str(chunk_obj.get('content', '')),
|
|
110
119
|
reasoning=chunk_obj.get('reasoning', None),
|
|
111
120
|
is_complete=bool(chunk_obj.get('is_complete', False)),
|
|
112
|
-
usage=parsed_usage
|
|
121
|
+
usage=parsed_usage,
|
|
122
|
+
image_urls=chunk_obj.get('image_urls', None),
|
|
123
|
+
audio_urls=chunk_obj.get('audio_urls', None),
|
|
124
|
+
video_urls=chunk_obj.get('video_urls', None)
|
|
113
125
|
)
|
|
114
126
|
raise ValueError(f"Cannot create AssistantChunkData from {type(chunk_obj)}")
|
|
115
127
|
|
|
@@ -136,13 +148,19 @@ def create_assistant_complete_response_data(complete_resp_obj: Any) -> Assistant
|
|
|
136
148
|
return AssistantCompleteResponseData(
|
|
137
149
|
content=str(getattr(complete_resp_obj, 'content', '')),
|
|
138
150
|
reasoning=getattr(complete_resp_obj, 'reasoning', None),
|
|
139
|
-
usage=parsed_usage
|
|
151
|
+
usage=parsed_usage,
|
|
152
|
+
image_urls=getattr(complete_resp_obj, 'image_urls', None),
|
|
153
|
+
audio_urls=getattr(complete_resp_obj, 'audio_urls', None),
|
|
154
|
+
video_urls=getattr(complete_resp_obj, 'video_urls', None)
|
|
140
155
|
)
|
|
141
156
|
elif isinstance(complete_resp_obj, dict):
|
|
142
157
|
return AssistantCompleteResponseData(
|
|
143
158
|
content=str(complete_resp_obj.get('content', '')),
|
|
144
159
|
reasoning=complete_resp_obj.get('reasoning', None),
|
|
145
|
-
usage=parsed_usage
|
|
160
|
+
usage=parsed_usage,
|
|
161
|
+
image_urls=complete_resp_obj.get('image_urls', None),
|
|
162
|
+
audio_urls=complete_resp_obj.get('audio_urls', None),
|
|
163
|
+
video_urls=complete_resp_obj.get('video_urls', None)
|
|
146
164
|
)
|
|
147
165
|
raise ValueError(f"Cannot create AssistantCompleteResponseData from {type(complete_resp_obj)}")
|
|
148
166
|
|
|
@@ -177,3 +195,4 @@ def create_system_task_notification_data(notification_data_dict: Any) -> SystemT
|
|
|
177
195
|
if isinstance(notification_data_dict, dict):
|
|
178
196
|
return SystemTaskNotificationData(**notification_data_dict)
|
|
179
197
|
raise ValueError(f"Cannot create SystemTaskNotificationData from {type(notification_data_dict)}")
|
|
198
|
+
|
|
@@ -47,6 +47,9 @@ class ToolManifestInjectorProcessor(BaseSystemPromptProcessor):
|
|
|
47
47
|
llm_provider = None
|
|
48
48
|
if context.llm_instance and context.llm_instance.model:
|
|
49
49
|
llm_provider = context.llm_instance.model.provider
|
|
50
|
+
|
|
51
|
+
# Retrieve the override flag from the agent's configuration.
|
|
52
|
+
use_xml_tool_format = context.config.use_xml_tool_format
|
|
50
53
|
|
|
51
54
|
# Generate the manifest string for the 'tools' variable.
|
|
52
55
|
tools_manifest: str
|
|
@@ -59,10 +62,11 @@ class ToolManifestInjectorProcessor(BaseSystemPromptProcessor):
|
|
|
59
62
|
]
|
|
60
63
|
|
|
61
64
|
try:
|
|
62
|
-
# Delegate manifest generation to the provider,
|
|
65
|
+
# Delegate manifest generation to the provider, passing the override flag.
|
|
63
66
|
tools_manifest = self._manifest_provider.provide(
|
|
64
67
|
tool_definitions=tool_definitions,
|
|
65
|
-
provider=llm_provider
|
|
68
|
+
provider=llm_provider,
|
|
69
|
+
use_xml_tool_format=use_xml_tool_format
|
|
66
70
|
)
|
|
67
71
|
except Exception as e:
|
|
68
72
|
logger.exception(f"An unexpected error occurred during tool manifest generation for agent '{agent_id}': {e}")
|
|
@@ -33,7 +33,8 @@ class ToolInvocation:
|
|
|
33
33
|
"""
|
|
34
34
|
# Create a canonical representation of the arguments
|
|
35
35
|
# sort_keys=True ensures that the order of keys doesn't change the hash
|
|
36
|
-
|
|
36
|
+
# ensure_ascii=False is critical for cross-language compatibility with JS
|
|
37
|
+
canonical_args = json.dumps(arguments, sort_keys=True, separators=(',', ':'), ensure_ascii=False)
|
|
37
38
|
|
|
38
39
|
# Create a string to hash
|
|
39
40
|
hash_string = f"{name}:{canonical_args}"
|