cua-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (65)
  1. agent/README.md +63 -0
  2. agent/__init__.py +10 -0
  3. agent/core/README.md +101 -0
  4. agent/core/__init__.py +34 -0
  5. agent/core/agent.py +284 -0
  6. agent/core/base_agent.py +164 -0
  7. agent/core/callbacks.py +147 -0
  8. agent/core/computer_agent.py +69 -0
  9. agent/core/experiment.py +222 -0
  10. agent/core/factory.py +102 -0
  11. agent/core/loop.py +244 -0
  12. agent/core/messages.py +230 -0
  13. agent/core/tools/__init__.py +21 -0
  14. agent/core/tools/base.py +74 -0
  15. agent/core/tools/bash.py +52 -0
  16. agent/core/tools/collection.py +46 -0
  17. agent/core/tools/computer.py +113 -0
  18. agent/core/tools/edit.py +67 -0
  19. agent/core/tools/manager.py +56 -0
  20. agent/providers/__init__.py +4 -0
  21. agent/providers/anthropic/__init__.py +6 -0
  22. agent/providers/anthropic/api/client.py +222 -0
  23. agent/providers/anthropic/api/logging.py +150 -0
  24. agent/providers/anthropic/callbacks/manager.py +55 -0
  25. agent/providers/anthropic/loop.py +521 -0
  26. agent/providers/anthropic/messages/manager.py +110 -0
  27. agent/providers/anthropic/prompts.py +20 -0
  28. agent/providers/anthropic/tools/__init__.py +33 -0
  29. agent/providers/anthropic/tools/base.py +88 -0
  30. agent/providers/anthropic/tools/bash.py +163 -0
  31. agent/providers/anthropic/tools/collection.py +34 -0
  32. agent/providers/anthropic/tools/computer.py +550 -0
  33. agent/providers/anthropic/tools/edit.py +326 -0
  34. agent/providers/anthropic/tools/manager.py +54 -0
  35. agent/providers/anthropic/tools/run.py +42 -0
  36. agent/providers/anthropic/types.py +16 -0
  37. agent/providers/omni/__init__.py +27 -0
  38. agent/providers/omni/callbacks.py +78 -0
  39. agent/providers/omni/clients/anthropic.py +99 -0
  40. agent/providers/omni/clients/base.py +44 -0
  41. agent/providers/omni/clients/groq.py +101 -0
  42. agent/providers/omni/clients/openai.py +159 -0
  43. agent/providers/omni/clients/utils.py +25 -0
  44. agent/providers/omni/experiment.py +273 -0
  45. agent/providers/omni/image_utils.py +106 -0
  46. agent/providers/omni/loop.py +961 -0
  47. agent/providers/omni/messages.py +168 -0
  48. agent/providers/omni/parser.py +252 -0
  49. agent/providers/omni/prompts.py +78 -0
  50. agent/providers/omni/tool_manager.py +91 -0
  51. agent/providers/omni/tools/__init__.py +13 -0
  52. agent/providers/omni/tools/bash.py +69 -0
  53. agent/providers/omni/tools/computer.py +216 -0
  54. agent/providers/omni/tools/manager.py +83 -0
  55. agent/providers/omni/types.py +30 -0
  56. agent/providers/omni/utils.py +155 -0
  57. agent/providers/omni/visualization.py +130 -0
  58. agent/types/__init__.py +26 -0
  59. agent/types/base.py +52 -0
  60. agent/types/messages.py +36 -0
  61. agent/types/tools.py +32 -0
  62. cua_agent-0.1.0.dist-info/METADATA +44 -0
  63. cua_agent-0.1.0.dist-info/RECORD +65 -0
  64. cua_agent-0.1.0.dist-info/WHEEL +4 -0
  65. cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
agent/providers/omni/messages.py
@@ -0,0 +1,168 @@
+ """Omni message manager implementation."""
+
+ import base64
+ from typing import Any, Dict, List, Optional
+ from io import BytesIO
+ from PIL import Image
+
+ from ...core.messages import BaseMessageManager, ImageRetentionConfig
+
+
+ class OmniMessageManager(BaseMessageManager):
+     """Message manager for multi-provider support."""
+
+     def __init__(self, config: Optional[ImageRetentionConfig] = None):
+         """Initialize the message manager.
+
+         Args:
+             config: Optional configuration for image retention
+         """
+         super().__init__(config)
+         self.messages: List[Dict[str, Any]] = []
+         self.config = config
+
+     def add_user_message(self, content: str, images: Optional[List[bytes]] = None) -> None:
+         """Add a user message to the history.
+
+         Args:
+             content: Message content
+             images: Optional list of image data
+         """
+         # Add images if present
+         if images:
+             # Initialize with proper typing for mixed content
+             message_content: List[Dict[str, Any]] = [{"type": "text", "text": content}]
+
+             # Add each image
+             for img in images:
+                 message_content.append(
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{base64.b64encode(img).decode()}"
+                         },
+                     }
+                 )
+
+             message = {"role": "user", "content": message_content}
+         else:
+             # Simple text message
+             message = {"role": "user", "content": content}
+
+         self.messages.append(message)
+
+         # Apply retention policy
+         if self.config and self.config.num_images_to_keep:
+             self._apply_image_retention_policy()
+
+     def add_assistant_message(self, content: str) -> None:
+         """Add an assistant message to the history.
+
+         Args:
+             content: Message content
+         """
+         self.messages.append({"role": "assistant", "content": content})
+
+     def add_system_message(self, content: str) -> None:
+         """Add a system message to the history.
+
+         Args:
+             content: Message content
+         """
+         self.messages.append({"role": "system", "content": content})
+
+     def _apply_image_retention_policy(self) -> None:
+         """Apply image retention policy to message history."""
+         if not self.config or not self.config.num_images_to_keep:
+             return
+
+         # Count images from newest to oldest
+         image_count = 0
+         for message in reversed(self.messages):
+             if message["role"] != "user":
+                 continue
+
+             # Handle multimodal messages
+             if isinstance(message["content"], list):
+                 new_content = []
+                 for item in message["content"]:
+                     if item["type"] == "text":
+                         new_content.append(item)
+                     elif item["type"] == "image_url":
+                         if image_count < self.config.num_images_to_keep:
+                             new_content.append(item)
+                             image_count += 1
+                 message["content"] = new_content
+
+     def get_formatted_messages(self, provider: str) -> List[Dict[str, Any]]:
+         """Get messages formatted for specific provider.
+
+         Args:
+             provider: Provider name to format messages for
+
+         Returns:
+             List of formatted messages
+         """
+         if provider == "anthropic":
+             return self._format_for_anthropic()
+         elif provider == "openai":
+             return self._format_for_openai()
+         elif provider == "groq":
+             return self._format_for_groq()
+         elif provider == "qwen":
+             return self._format_for_qwen()
+         else:
+             raise ValueError(f"Unsupported provider: {provider}")
+
+     def _format_for_anthropic(self) -> List[Dict[str, Any]]:
+         """Format messages for Anthropic API."""
+         formatted = []
+         for msg in self.messages:
+             formatted_msg = {"role": msg["role"]}
+
+             # Handle multimodal content
+             if isinstance(msg["content"], list):
+                 formatted_msg["content"] = []
+                 for item in msg["content"]:
+                     if item["type"] == "text":
+                         formatted_msg["content"].append({"type": "text", "text": item["text"]})
+                     elif item["type"] == "image_url":
+                         formatted_msg["content"].append(
+                             {
+                                 "type": "image",
+                                 "source": {
+                                     "type": "base64",
+                                     "media_type": "image/png",
+                                     "data": item["image_url"]["url"].split(",")[1],
+                                 },
+                             }
+                         )
+             else:
+                 formatted_msg["content"] = msg["content"]
+
+             formatted.append(formatted_msg)
+         return formatted
+
+     def _format_for_openai(self) -> List[Dict[str, Any]]:
+         """Format messages for OpenAI API."""
+         # OpenAI already uses the same format
+         return self.messages
+
+     def _format_for_groq(self) -> List[Dict[str, Any]]:
+         """Format messages for Groq API."""
+         # Groq uses OpenAI-compatible format
+         return self.messages
+
+     def _format_for_qwen(self) -> List[Dict[str, Any]]:
+         """Format messages for Qwen API."""
+         formatted = []
+         for msg in self.messages:
+             if isinstance(msg["content"], list):
+                 # Convert multimodal content to text-only
+                 text_content = next(
+                     (item["text"] for item in msg["content"] if item["type"] == "text"), ""
+                 )
+                 formatted.append({"role": msg["role"], "content": text_content})
+             else:
+                 formatted.append(msg)
+         return formatted
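For context, a minimal usage sketch of the manager above. It is not part of the package; the ImageRetentionConfig keyword num_images_to_keep is an assumption inferred from the attribute this file reads in _apply_image_retention_policy.

import asyncio

from agent.providers.omni.messages import OmniMessageManager
from agent.core.messages import ImageRetentionConfig

# Assumption: ImageRetentionConfig takes num_images_to_keep, matching the
# attribute checked in _apply_image_retention_policy above.
manager = OmniMessageManager(config=ImageRetentionConfig(num_images_to_keep=2))

with open("screenshot.png", "rb") as f:
    manager.add_user_message("What is on screen?", images=[f.read()])
manager.add_assistant_message("I can see a browser window.")

# Anthropic gets base64 "image" source blocks; OpenAI/Groq reuse the
# stored image_url format unchanged.
anthropic_msgs = manager.get_formatted_messages("anthropic")
openai_msgs = manager.get_formatted_messages("openai")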
agent/providers/omni/parser.py
@@ -0,0 +1,252 @@
+ """Parser implementation for the Omni provider."""
+
+ import logging
+ from typing import Any, Dict, List, Optional, Tuple
+ import base64
+ from PIL import Image
+ from io import BytesIO
+ import json
+ import torch
+
+ # Import from the SOM package
+ from som import OmniParser as OmniDetectParser
+ from som.models import ParseResult, BoundingBox, UIElement, ImageData, ParserMetadata
+
+ logger = logging.getLogger(__name__)
+
+
+ class OmniParser:
+     """Parser for handling responses from multiple providers."""
+
+     # Class-level shared OmniDetectParser instance
+     _shared_parser = None
+
+     def __init__(self, force_device: Optional[str] = None):
+         """Initialize the OmniParser.
+
+         Args:
+             force_device: Optional device to force for detection (cpu/cuda/mps)
+         """
+         self.response_buffer = []
+
+         # Use shared parser if available, otherwise create a new one
+         if OmniParser._shared_parser is None:
+             logger.info("Initializing shared OmniDetectParser...")
+
+             # Determine the best device to use
+             device = force_device
+             if not device:
+                 if torch.cuda.is_available():
+                     device = "cuda"
+                 elif (
+                     hasattr(torch, "backends")
+                     and hasattr(torch.backends, "mps")
+                     and torch.backends.mps.is_available()
+                 ):
+                     device = "mps"
+                 else:
+                     device = "cpu"
+
+             logger.info(f"Using device: {device} for OmniDetectParser")
+             self.detect_parser = OmniDetectParser(force_device=device)
+
+             # Preload the detection model to avoid repeated loading
+             try:
+                 # Access the detector to trigger model loading
+                 detector = self.detect_parser.detector
+                 if detector.model is None:
+                     logger.info("Preloading detection model...")
+                     detector.load_model()
+                     logger.info("Detection model preloaded successfully")
+             except Exception as e:
+                 logger.error(f"Error preloading detection model: {str(e)}")
+
+             # Store as shared instance
+             OmniParser._shared_parser = self.detect_parser
+         else:
+             logger.info("Using existing shared OmniDetectParser")
+             self.detect_parser = OmniParser._shared_parser
+
+     async def parse_screen(self, computer: Any) -> ParseResult:
+         """Parse a screenshot and extract screen information.
+
+         Args:
+             computer: Computer instance
+
+         Returns:
+             ParseResult with screen elements and image data
+         """
+         try:
+             # Get screenshot from computer
+             logger.info("Taking screenshot...")
+             screenshot = await computer.screenshot()
+
+             # Log screenshot info
+             logger.info(f"Screenshot type: {type(screenshot)}")
+             logger.info(f"Screenshot is bytes: {isinstance(screenshot, bytes)}")
+             logger.info(f"Screenshot is str: {isinstance(screenshot, str)}")
+             logger.info(f"Screenshot length: {len(screenshot) if screenshot else 0}")
+
+             # If screenshot is a string (likely base64), convert it to bytes
+             if isinstance(screenshot, str):
+                 try:
+                     screenshot = base64.b64decode(screenshot)
+                     logger.info("Successfully converted base64 string to bytes")
+                     logger.info(f"Decoded bytes length: {len(screenshot)}")
+                 except Exception as e:
+                     logger.error(f"Error decoding base64: {str(e)}")
+                     logger.error(f"First 100 chars of screenshot string: {screenshot[:100]}")
+
+             # Pass screenshot to OmniDetectParser
+             logger.info("Passing screenshot to OmniDetectParser...")
+             parse_result = self.detect_parser.parse(
+                 screenshot_data=screenshot, box_threshold=0.3, iou_threshold=0.1, use_ocr=True
+             )
+             logger.info("Screenshot parsed successfully")
+             logger.info(f"Parse result has {len(parse_result.elements)} elements")
+
+             # Log element IDs for debugging
+             for i, elem in enumerate(parse_result.elements):
+                 logger.info(
+                     f"Element {i+1} (ID: {elem.id}): {elem.type} with confidence {elem.confidence:.3f}"
+                 )
+
+             return parse_result
+
+         except Exception as e:
+             logger.error(f"Error parsing screen: {str(e)}")
+             import traceback
+
+             logger.error(traceback.format_exc())
+
+             # Create a minimal valid result for error cases
+             return ParseResult(
+                 elements=[],
+                 annotated_image_base64="",
+                 parsed_content_list=[f"Error: {str(e)}"],
+                 metadata=ParserMetadata(
+                     image_size=(0, 0),
+                     num_icons=0,
+                     num_text=0,
+                     device="cpu",
+                     ocr_enabled=False,
+                     latency=0.0,
+                 ),
+             )
+
+     def parse_tool_call(self, response: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+         """Parse a tool call from the response.
+
+         Args:
+             response: Response from the provider
+
+         Returns:
+             Parsed tool call or None if no tool call found
+         """
+         try:
+             # Handle Anthropic format
+             if "tool_calls" in response:
+                 tool_call = response["tool_calls"][0]
+                 return {
+                     "name": tool_call["function"]["name"],
+                     "arguments": tool_call["function"]["arguments"],
+                 }
+
+             # Handle OpenAI format
+             if "function_call" in response:
+                 return {
+                     "name": response["function_call"]["name"],
+                     "arguments": response["function_call"]["arguments"],
+                 }
+
+             # Handle Groq format (OpenAI-compatible)
+             if "choices" in response and response["choices"]:
+                 choice = response["choices"][0]
+                 if "function_call" in choice:
+                     return {
+                         "name": choice["function_call"]["name"],
+                         "arguments": choice["function_call"]["arguments"],
+                     }
+
+             return None
+
+         except Exception as e:
+             logger.error(f"Error parsing tool call: {str(e)}")
+             return None
+
+     def parse_response(self, response: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
+         """Parse a response from any provider.
+
+         Args:
+             response: Response from the provider
+
+         Returns:
+             Tuple of (content, metadata)
+         """
+         try:
+             content = ""
+             metadata = {}
+
+             # Handle Anthropic format
+             if "content" in response and isinstance(response["content"], list):
+                 for item in response["content"]:
+                     if item["type"] == "text":
+                         content += item["text"]
+
+             # Handle OpenAI format
+             elif "choices" in response and response["choices"]:
+                 content = response["choices"][0]["message"]["content"]
+
+             # Handle direct content
+             elif isinstance(response.get("content"), str):
+                 content = response["content"]
+
+             # Extract metadata if present
+             if "metadata" in response:
+                 metadata = response["metadata"]
+
+             return content, metadata
+
+         except Exception as e:
+             logger.error(f"Error parsing response: {str(e)}")
+             return str(e), {"error": True}
+
+     def format_for_provider(
+         self, messages: List[Dict[str, Any]], provider: str
+     ) -> List[Dict[str, Any]]:
+         """Format messages for a specific provider.
+
+         Args:
+             messages: List of messages to format
+             provider: Provider to format for
+
+         Returns:
+             Formatted messages
+         """
+         try:
+             formatted = []
+
+             for msg in messages:
+                 formatted_msg = {"role": msg["role"]}
+
+                 # Handle content formatting
+                 if isinstance(msg["content"], list):
+                     # For providers that support multimodal
+                     if provider in ["anthropic", "openai"]:
+                         formatted_msg["content"] = msg["content"]
+                     else:
+                         # Extract text only for other providers
+                         text_content = next(
+                             (item["text"] for item in msg["content"] if item["type"] == "text"), ""
+                         )
+                         formatted_msg["content"] = text_content
+                 else:
+                     formatted_msg["content"] = msg["content"]
+
+                 formatted.append(formatted_msg)
+
+             return formatted
+
+         except Exception as e:
+             logger.error(f"Error formatting messages: {str(e)}")
+             return messages  # Return original messages on error
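To illustrate the dict shapes the parsing helpers above accept, a hypothetical round trip; the payloads below are fabricated for the example, and constructing OmniParser assumes the som and torch dependencies are installed (force_device="cpu" avoids GPU detection).

from agent.providers.omni.parser import OmniParser

parser = OmniParser(force_device="cpu")

# Fabricated OpenAI-style chat completion: hits the "choices" branch
# of parse_response.
response = {
    "choices": [{"message": {"content": '{"Action": "left_click", "Box ID": 4}'}}]
}
content, metadata = parser.parse_response(response)

# Fabricated legacy function-call payload: hits the "function_call"
# branch of parse_tool_call.
call = parser.parse_tool_call(
    {"function_call": {"name": "computer", "arguments": '{"action": "left_click"}'}}
)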
agent/providers/omni/prompts.py
@@ -0,0 +1,78 @@
+ """Prompts for the Omni agent."""
+
+ SYSTEM_PROMPT = """
+ You are using a macOS device.
+ You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
+
+ You may be given a history of plans and actions; this is the response from the previous loop.
+ You should carefully consider your plan based on the task, screenshot, and history of actions.
+
+ Your available "Next Action" options only include:
+ - type_text: types a string of text.
+ - left_click: move mouse to box id and left clicks.
+ - right_click: move mouse to box id and right clicks.
+ - double_click: move mouse to box id and double clicks.
+ - move_cursor: move mouse to box id.
+ - scroll_up: scrolls the screen up to view previous content.
+ - scroll_down: scrolls the screen down, when the desired button is not visible, or you need to see more content.
+ - hotkey: press a sequence of keys.
+ - wait: waits for 1 second for the device to load or respond.
+
+ Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if the action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task.
+
+ Output format:
+ {
+     "Explanation": str, # describe what is in the current screen, taking into account the history, then describe your step-by-step thoughts on how to achieve the task, choosing one action from the available actions at a time.
+     "Action": "action_type, action description" | "None", # one action at a time, described briefly and precisely.
+     "Box ID": n,
+     "Value": "xxx" # only provide the value field if the action is type, else don't include the value key
+ }
+
+ One Example:
+ {
+     "Explanation": "The current screen shows the google result of amazon; in the previous action I have searched amazon on google. Then I need to click on the first search result to go to amazon.com.",
+     "Action": "left_click",
+     "Box ID": 4
+ }
+
+ Another Example:
+ {
+     "Explanation": "The current screen shows the front page of amazon. There is no previous action. Therefore I need to type 'Apple watch' in the search bar.",
+     "Action": "type_text",
+     "Box ID": 2,
+     "Value": "Apple watch"
+ }
+
+ Another Example:
+ {
+     "Explanation": "I am starting a Spotlight search to find the Safari browser.",
+     "Action": "hotkey",
+     "Value": "command+space"
+ }
+
+ IMPORTANT NOTES:
+ 1. You should only give a single action at a time.
+ 2. The Box ID is the id of the element you should operate on; it is a number. Its background color corresponds to the color of the bounding box of the element.
+ 3. You should give an analysis of the current screen, reflect on what has been done by looking at the history, then describe your step-by-step thoughts on how to achieve the task.
+ 4. Attach the next action prediction in the "Action" field.
+ 5. For starting applications, always use the "hotkey" action with command+space to open a Spotlight search.
+ 6. When the task is completed, don't take additional actions. You should say "Action": "None" in the JSON field.
+ 7. If the task involves buying multiple products or navigating through multiple pages, you should break it into subgoals and complete each subgoal one by one in the order given by the instructions.
+ 8. Avoid choosing the same action/element multiple times in a row; if this happens, reflect on what may have gone wrong and predict a different action.
+ 9. Reflect on whether the element is clickable or not, for example whether it is a hyperlink, a button, or normal text.
+ 10. If you are prompted with a login page or a captcha page, or you think the next action needs the user's permission, you should say "Action": "None" in the JSON field.
+ """
+
+ # SYSTEM_PROMPT1 = """You are an AI assistant helping users interact with their computer.
+ # Analyze the screen information and respond with JSON containing:
+ # {
+ #     "Box ID": "Numeric ID of the relevant UI element",
+ #     "Action": "One of: left_click, right_click, double_click, move_cursor, drag_to, type_text, press_key, hotkey, scroll_down, scroll_up, wait",
+ #     "Value": "Text to type, key to press",
+ #     "Explanation": "Why this action was chosen"
+ # }
+
+ # Notes:
+ # - For starting applications, use the "hotkey" action with command+space for starting a Spotlight search.
+ # - Each UI element is highlighted with a colored bounding box, and its Box ID appears nearby in the same color for easy identification.
+ # """
agent/providers/omni/tool_manager.py
@@ -0,0 +1,91 @@
+ # """Omni tool manager implementation."""
+
+ # from typing import Dict, List, Type, Any
+
+ # from computer import Computer
+ # from ...core.tools import BaseToolManager, BashTool, EditTool
+
+ # class OmniToolManager(BaseToolManager):
+ #     """Tool manager for multi-provider support."""
+
+ #     def __init__(self, computer: Computer):
+ #         """Initialize Omni tool manager.
+
+ #         Args:
+ #             computer: Computer instance for tools
+ #         """
+ #         super().__init__(computer)
+
+ #     def get_anthropic_tools(self) -> List[Dict[str, Any]]:
+ #         """Get tools formatted for Anthropic API.
+
+ #         Returns:
+ #             List of tool parameters in Anthropic format
+ #         """
+ #         tools: List[Dict[str, Any]] = []
+
+ #         # Map base tools to Anthropic format
+ #         for tool in self.tools.values():
+ #             if isinstance(tool, BashTool):
+ #                 tools.append({
+ #                     "type": "bash_20241022",
+ #                     "name": tool.name
+ #                 })
+ #             elif isinstance(tool, EditTool):
+ #                 tools.append({
+ #                     "type": "text_editor_20241022",
+ #                     "name": "str_replace_editor"
+ #                 })
+
+ #         return tools
+
+ #     def get_openai_tools(self) -> List[Dict]:
+ #         """Get tools formatted for OpenAI API.
+
+ #         Returns:
+ #             List of tool parameters in OpenAI format
+ #         """
+ #         tools = []
+
+ #         # Map base tools to OpenAI format
+ #         for tool in self.tools.values():
+ #             tools.append({
+ #                 "type": "function",
+ #                 "function": tool.get_schema()
+ #             })
+
+ #         return tools
+
+ #     def get_groq_tools(self) -> List[Dict]:
+ #         """Get tools formatted for Groq API.
+
+ #         Returns:
+ #             List of tool parameters in Groq format
+ #         """
+ #         tools = []
+
+ #         # Map base tools to Groq format
+ #         for tool in self.tools.values():
+ #             tools.append({
+ #                 "type": "function",
+ #                 "function": tool.get_schema()
+ #             })
+
+ #         return tools
+
+ #     def get_qwen_tools(self) -> List[Dict]:
+ #         """Get tools formatted for Qwen API.
+
+ #         Returns:
+ #             List of tool parameters in Qwen format
+ #         """
+ #         tools = []
+
+ #         # Map base tools to Qwen format
+ #         for tool in self.tools.values():
+ #             tools.append({
+ #                 "type": "function",
+ #                 "function": tool.get_schema()
+ #             })
+
+ #         return tools
agent/providers/omni/tools/__init__.py
@@ -0,0 +1,13 @@
+ """Omni provider tools - compatible with multiple LLM providers."""
+
+ from .bash import OmniBashTool
+ from .computer import OmniComputerTool
+ from .edit import OmniEditTool
+ from .manager import OmniToolManager
+
+ __all__ = [
+     "OmniBashTool",
+     "OmniComputerTool",
+     "OmniEditTool",
+     "OmniToolManager",
+ ]
agent/providers/omni/tools/bash.py
@@ -0,0 +1,69 @@
+ """Provider-agnostic implementation of the BashTool."""
+
+ import logging
+ from typing import Any, Dict
+
+ from computer.computer import Computer
+
+ from ....core.tools.bash import BaseBashTool
+ from ....core.tools import ToolResult
+
+
+ class OmniBashTool(BaseBashTool):
+     """A provider-agnostic implementation of the bash tool."""
+
+     name = "bash"
+     logger = logging.getLogger(__name__)
+
+     def __init__(self, computer: Computer):
+         """Initialize the BashTool.
+
+         Args:
+             computer: Computer instance, may be used for related operations
+         """
+         super().__init__(computer)
+
+     def to_params(self) -> Dict[str, Any]:
+         """Convert tool to provider-agnostic parameters.
+
+         Returns:
+             Dictionary with tool parameters
+         """
+         return {
+             "name": self.name,
+             "description": "A tool that allows the agent to run bash commands",
+             "parameters": {
+                 "command": {"type": "string", "description": "The bash command to execute"},
+                 "restart": {
+                     "type": "boolean",
+                     "description": "Whether to restart the bash session",
+                 },
+             },
+         }
+
+     async def __call__(self, **kwargs) -> ToolResult:
+         """Execute the bash tool with the provided arguments.
+
+         Args:
+             command: The bash command to execute
+             restart: Whether to restart the bash session
+
+         Returns:
+             ToolResult with the command output
+         """
+         command = kwargs.get("command")
+         restart = kwargs.get("restart", False)
+
+         if not command:
+             return ToolResult(error="Command is required")
+
+         self.logger.info(f"Executing bash command: {command}")
+         exit_code, stdout, stderr = await self.run_command(command)
+
+         output = stdout
+         error = None
+
+         if exit_code != 0:
+             error = f"Command exited with code {exit_code}: {stderr}"
+
+         return ToolResult(output=output, error=error)
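To close, a hypothetical async usage sketch of the tool above. The Computer constructor arguments and the behavior of run_command are defined elsewhere in this package, so treat those details as assumptions.

import asyncio

from computer.computer import Computer
from agent.providers.omni.tools.bash import OmniBashTool

async def main() -> None:
    computer = Computer()  # assumption: a no-argument constructor suffices here
    tool = OmniBashTool(computer)

    # The provider-agnostic schema an agent loop would hand to an LLM:
    print(tool.to_params())

    # __call__ is async and returns a ToolResult with output and/or error set:
    result = await tool(command="echo hello")
    print(result.output, result.error)

asyncio.run(main())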