cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/providers/omni/parser.py +0 -307
@@ -1,307 +0,0 @@
1
- """Parser implementation for the Omni provider."""
2
-
3
- import logging
4
- from typing import Any, Dict, List, Optional, Tuple
5
- import base64
6
- import torch
7
-
8
- # Import from the SOM package
9
- from som import OmniParser as OmniDetectParser
10
- from som.models import ParseResult, ParserMetadata
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class OmniParser:
16
- """Parser for handling responses from multiple providers."""
17
-
18
- # Class-level shared OmniDetectParser instance
19
- _shared_parser = None
20
-
21
- def __init__(self, force_device: Optional[str] = None):
22
- """Initialize the OmniParser.
23
-
24
- Args:
25
- force_device: Optional device to force for detection (cpu/cuda/mps)
26
- """
27
- self.response_buffer = []
28
-
29
- # Use shared parser if available, otherwise create a new one
30
- if OmniParser._shared_parser is None:
31
- logger.info("Initializing shared OmniDetectParser...")
32
-
33
- # Determine the best device to use
34
- device = force_device
35
- if not device:
36
- if torch.cuda.is_available():
37
- device = "cuda"
38
- elif (
39
- hasattr(torch, "backends")
40
- and hasattr(torch.backends, "mps")
41
- and torch.backends.mps.is_available()
42
- ):
43
- device = "mps"
44
- else:
45
- device = "cpu"
46
-
47
- logger.info(f"Using device: {device} for OmniDetectParser")
48
- self.detect_parser = OmniDetectParser(force_device=device)
49
-
50
- # Preload the detection model to avoid repeated loading
51
- try:
52
- # Access the detector to trigger model loading
53
- detector = self.detect_parser.detector
54
- if detector.model is None:
55
- logger.info("Preloading detection model...")
56
- detector.load_model()
57
- logger.info("Detection model preloaded successfully")
58
- except Exception as e:
59
- logger.error(f"Error preloading detection model: {str(e)}")
60
-
61
- # Store as shared instance
62
- OmniParser._shared_parser = self.detect_parser
63
- else:
64
- logger.info("Using existing shared OmniDetectParser")
65
- self.detect_parser = OmniParser._shared_parser
66
-
67
- async def parse_screen(self, computer: Any) -> ParseResult:
68
- """Parse a screenshot and extract screen information.
69
-
70
- Args:
71
- computer: Computer instance
72
-
73
- Returns:
74
- ParseResult with screen elements and image data
75
- """
76
- try:
77
- # Get screenshot from computer
78
- logger.info("Taking screenshot...")
79
- screenshot = await computer.interface.screenshot()
80
-
81
- # Log screenshot info
82
- logger.info(f"Screenshot type: {type(screenshot)}")
83
- logger.info(f"Screenshot is bytes: {isinstance(screenshot, bytes)}")
84
- logger.info(f"Screenshot is str: {isinstance(screenshot, str)}")
85
- logger.info(f"Screenshot length: {len(screenshot) if screenshot else 0}")
86
-
87
- # If screenshot is a string (likely base64), convert it to bytes
88
- if isinstance(screenshot, str):
89
- try:
90
- screenshot = base64.b64decode(screenshot)
91
- logger.info("Successfully converted base64 string to bytes")
92
- logger.info(f"Decoded bytes length: {len(screenshot)}")
93
- except Exception as e:
94
- logger.error(f"Error decoding base64: {str(e)}")
95
- logger.error(f"First 100 chars of screenshot string: {screenshot[:100]}")
96
-
97
- # Pass screenshot to OmniDetectParser
98
- logger.info("Passing screenshot to OmniDetectParser...")
99
- parse_result = self.detect_parser.parse(
100
- screenshot_data=screenshot, box_threshold=0.3, iou_threshold=0.1, use_ocr=True
101
- )
102
- logger.info("Screenshot parsed successfully")
103
- logger.info(f"Parse result has {len(parse_result.elements)} elements")
104
-
105
- # Log element IDs for debugging
106
- for i, elem in enumerate(parse_result.elements):
107
- logger.info(
108
- f"Element {i+1} (ID: {elem.id}): {elem.type} with confidence {elem.confidence:.3f}"
109
- )
110
-
111
- return parse_result
112
-
113
- except Exception as e:
114
- logger.error(f"Error parsing screen: {str(e)}")
115
- import traceback
116
-
117
- logger.error(traceback.format_exc())
118
-
119
- # Create a minimal valid result for error cases
120
- return ParseResult(
121
- elements=[],
122
- screen_info=None,
123
- annotated_image_base64="",
124
- parsed_content_list=[{"error": str(e)}],
125
- metadata=ParserMetadata(
126
- image_size=(0, 0),
127
- num_icons=0,
128
- num_text=0,
129
- device="cpu",
130
- ocr_enabled=False,
131
- latency=0.0,
132
- ),
133
- )
134
-
135
- def parse_tool_call(self, response: Dict[str, Any]) -> Optional[Dict[str, Any]]:
136
- """Parse a tool call from the response.
137
-
138
- Args:
139
- response: Response from the provider
140
-
141
- Returns:
142
- Parsed tool call or None if no tool call found
143
- """
144
- try:
145
- # Handle Anthropic format
146
- if "tool_calls" in response:
147
- tool_call = response["tool_calls"][0]
148
- return {
149
- "name": tool_call["function"]["name"],
150
- "arguments": tool_call["function"]["arguments"],
151
- }
152
-
153
- # Handle OpenAI format
154
- if "function_call" in response:
155
- return {
156
- "name": response["function_call"]["name"],
157
- "arguments": response["function_call"]["arguments"],
158
- }
159
-
160
- # Handle Groq format (OpenAI-compatible)
161
- if "choices" in response and response["choices"]:
162
- choice = response["choices"][0]
163
- if "function_call" in choice:
164
- return {
165
- "name": choice["function_call"]["name"],
166
- "arguments": choice["function_call"]["arguments"],
167
- }
168
-
169
- return None
170
-
171
- except Exception as e:
172
- logger.error(f"Error parsing tool call: {str(e)}")
173
- return None
174
-
175
- def parse_response(self, response: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
176
- """Parse a response from any provider.
177
-
178
- Args:
179
- response: Response from the provider
180
-
181
- Returns:
182
- Tuple of (content, metadata)
183
- """
184
- try:
185
- content = ""
186
- metadata = {}
187
-
188
- # Handle Anthropic format
189
- if "content" in response and isinstance(response["content"], list):
190
- for item in response["content"]:
191
- if item["type"] == "text":
192
- content += item["text"]
193
-
194
- # Handle OpenAI format
195
- elif "choices" in response and response["choices"]:
196
- content = response["choices"][0]["message"]["content"]
197
-
198
- # Handle direct content
199
- elif isinstance(response.get("content"), str):
200
- content = response["content"]
201
-
202
- # Extract metadata if present
203
- if "metadata" in response:
204
- metadata = response["metadata"]
205
-
206
- return content, metadata
207
-
208
- except Exception as e:
209
- logger.error(f"Error parsing response: {str(e)}")
210
- return str(e), {"error": True}
211
-
212
- def format_for_provider(
213
- self, messages: List[Dict[str, Any]], provider: str
214
- ) -> List[Dict[str, Any]]:
215
- """Format messages for a specific provider.
216
-
217
- Args:
218
- messages: List of messages to format
219
- provider: Provider to format for
220
-
221
- Returns:
222
- Formatted messages
223
- """
224
- try:
225
- formatted = []
226
-
227
- for msg in messages:
228
- formatted_msg = {"role": msg["role"]}
229
-
230
- # Handle content formatting
231
- if isinstance(msg["content"], list):
232
- # For providers that support multimodal
233
- if provider in ["anthropic", "openai"]:
234
- formatted_msg["content"] = msg["content"]
235
- else:
236
- # Extract text only for other providers
237
- text_content = next(
238
- (item["text"] for item in msg["content"] if item["type"] == "text"), ""
239
- )
240
- formatted_msg["content"] = text_content
241
- else:
242
- formatted_msg["content"] = msg["content"]
243
-
244
- formatted.append(formatted_msg)
245
-
246
- return formatted
247
-
248
- except Exception as e:
249
- logger.error(f"Error formatting messages: {str(e)}")
250
- return messages # Return original messages on error
251
-
252
- async def calculate_click_coordinates(
253
- self, box_id: int, parsed_screen: ParseResult
254
- ) -> Tuple[int, int]:
255
- """Calculate click coordinates based on box ID.
256
-
257
- Args:
258
- box_id: The ID of the box to click
259
- parsed_screen: The parsed screen information
260
-
261
- Returns:
262
- Tuple of (x, y) coordinates
263
-
264
- Raises:
265
- ValueError: If box_id is invalid or missing from parsed screen
266
- """
267
- # First try to use structured elements data
268
- logger.info(f"Elements count: {len(parsed_screen.elements)}")
269
-
270
- # Try to find element with matching ID
271
- for element in parsed_screen.elements:
272
- if element.id == box_id:
273
- logger.info(f"Found element with ID {box_id}: {element}")
274
- bbox = element.bbox
275
-
276
- # Get screen dimensions from the metadata if available, or fallback
277
- width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
278
- height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
279
- logger.info(f"Screen dimensions: width={width}, height={height}")
280
-
281
- # Create a dictionary from the element's bbox for calculate_element_center
282
- bbox_dict = {"x1": bbox.x1, "y1": bbox.y1, "x2": bbox.x2, "y2": bbox.y2}
283
- from ...core.visualization import calculate_element_center
284
-
285
- center_x, center_y = calculate_element_center(bbox_dict, width, height)
286
- logger.info(f"Calculated center: ({center_x}, {center_y})")
287
-
288
- # Validate coordinates - if they're (0,0) or unreasonably small,
289
- # use a default position in the center of the screen
290
- if center_x == 0 and center_y == 0:
291
- logger.warning("Got (0,0) coordinates, using fallback position")
292
- center_x = width // 2
293
- center_y = height // 2
294
- logger.info(f"Using fallback center: ({center_x}, {center_y})")
295
-
296
- return center_x, center_y
297
-
298
- # If we couldn't find the box, use center of screen
299
- logger.error(
300
- f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
301
- )
302
-
303
- # Use center of screen as fallback
304
- width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
305
- height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
306
- logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
307
- return width // 2, height // 2
agent/providers/omni/prompts.py +0 -64
@@ -1,64 +0,0 @@
1
- """Prompts for the Omni agent."""
2
-
3
- SYSTEM_PROMPT = """
4
- You are using a macOS device.
5
- You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
6
-
7
- You may be given some history plan and actions, this is the response from the previous loop.
8
- You should carefully consider your plan base on the task, screenshot, and history actions.
9
-
10
- Your available "Next Action" only include:
11
- - type_text: types a string of text.
12
- - left_click: move mouse to box id and left clicks.
13
- - right_click: move mouse to box id and right clicks.
14
- - double_click: move mouse to box id and double clicks.
15
- - move_cursor: move mouse to box id.
16
- - scroll_up: scrolls the screen up to view previous content.
17
- - scroll_down: scrolls the screen down, when the desired button is not visible, or you need to see more content.
18
- - hotkey: press a sequence of keys.
19
- - wait: waits for 1 second for the device to load or respond.
20
-
21
- Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task.
22
-
23
- Output format:
24
- {
25
- "Explanation": str, # describe what is in the current screen, taking into account the history, then describe your step-by-step thoughts on how to achieve the task, choose one action from available actions at a time.
26
- "Action": "action_type, action description" | "None" # one action at a time, describe it in short and precisely.
27
- "Box ID": n,
28
- "Value": "xxx" # only provide value field if the action is type, else don't include value key
29
- }
30
-
31
- One Example:
32
- {
33
- "Explanation": "The current screen shows google result of amazon, in previous action I have searched amazon on google. Then I need to click on the first search results to go to amazon.com.",
34
- "Action": "left_click",
35
- "Box ID": 4
36
- }
37
-
38
- Another Example:
39
- {
40
- "Explanation": "The current screen shows the front page of amazon. There is no previous action. Therefore I need to type "Apple watch" in the search bar.",
41
- "Action": "type_text",
42
- "Box ID": 2,
43
- "Value": "Apple watch"
44
- }
45
-
46
- Another Example:
47
- {
48
- "Explanation": "I am starting a Spotlight search to find the Safari browser.",
49
- "Action": "hotkey",
50
- "Value": "command+space"
51
- }
52
-
53
- IMPORTANT NOTES:
54
- 1. You should only give a single action at a time.
55
- 2. The Box ID is the id of the element you should operate on, it is a number. Its background color corresponds to the color of the bounding box of the element.
56
- 3. You should give an analysis to the current screen, and reflect on what has been done by looking at the history, then describe your step-by-step thoughts on how to achieve the task.
57
- 4. Attach the next action prediction in the "Action" field.
58
- 5. For starting applications, always use the "hotkey" action with command+space for starting a Spotlight search.
59
- 6. When the task is completed, don't complete additional actions. You should say "Action": "None" in the json field.
60
- 7. The tasks involve buying multiple products or navigating through multiple pages. You should break it into subgoals and complete each subgoal one by one in the order of the instructions.
61
- 8. Avoid choosing the same action/elements multiple times in a row, if it happens, reflect to yourself, what may have gone wrong, and predict a different action.
62
- 9. Reflect whether the element is clickable or not, for example reflect if it is an hyperlink or a button or a normal text.
63
- 10. If you are prompted with login information page or captcha page, or you think it need user's permission to do the next action, you should say "Action": "None" in the json field.
64
- """
agent/providers/omni/tools/__init__.py +0 -30
@@ -1,30 +0,0 @@
1
- """Omni provider tools - compatible with multiple LLM providers."""
2
-
3
- from ....core.tools import BaseTool, ToolResult, ToolError, ToolFailure, CLIResult
4
- from .base import BaseOmniTool
5
- from .computer import ComputerTool
6
- from .bash import BashTool
7
- from .manager import ToolManager
8
-
9
- # Re-export the tools with Omni-specific names for backward compatibility
10
- OmniToolResult = ToolResult
11
- OmniToolError = ToolError
12
- OmniToolFailure = ToolFailure
13
- OmniCLIResult = CLIResult
14
-
15
- # We'll export specific tools once implemented
16
- __all__ = [
17
- "BaseTool",
18
- "BaseOmniTool",
19
- "ToolResult",
20
- "ToolError",
21
- "ToolFailure",
22
- "CLIResult",
23
- "OmniToolResult",
24
- "OmniToolError",
25
- "OmniToolFailure",
26
- "OmniCLIResult",
27
- "ComputerTool",
28
- "BashTool",
29
- "ToolManager",
30
- ]
agent/providers/omni/tools/base.py +0 -29
@@ -1,29 +0,0 @@
1
- """Omni-specific tool base classes."""
2
-
3
- from abc import ABCMeta, abstractmethod
4
- from typing import Any, Dict
5
-
6
- from ....core.tools.base import BaseTool
7
-
8
-
9
- class BaseOmniTool(BaseTool, metaclass=ABCMeta):
10
- """Abstract base class for Omni provider tools."""
11
-
12
- def __init__(self):
13
- """Initialize the base Omni tool."""
14
- # No specific initialization needed yet, but included for future extensibility
15
- pass
16
-
17
- @abstractmethod
18
- async def __call__(self, **kwargs) -> Any:
19
- """Executes the tool with the given arguments."""
20
- ...
21
-
22
- @abstractmethod
23
- def to_params(self) -> Dict[str, Any]:
24
- """Convert tool to Omni provider-specific API parameters.
25
-
26
- Returns:
27
- Dictionary with tool parameters for the specific API
28
- """
29
- raise NotImplementedError
agent/providers/omni/tools/bash.py +0 -74
@@ -1,74 +0,0 @@
1
- """Bash tool for Omni provider."""
2
-
3
- import logging
4
- from typing import Any, Dict
5
-
6
- from computer import Computer
7
- from ....core.tools import ToolResult, ToolError
8
- from .base import BaseOmniTool
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class BashTool(BaseOmniTool):
14
- """Tool for executing bash commands."""
15
-
16
- name = "bash"
17
- description = "Execute bash commands on the system"
18
-
19
- def __init__(self, computer: Computer):
20
- """Initialize the bash tool.
21
-
22
- Args:
23
- computer: Computer instance
24
- """
25
- super().__init__()
26
- self.computer = computer
27
-
28
- def to_params(self) -> Dict[str, Any]:
29
- """Convert tool to API parameters.
30
-
31
- Returns:
32
- Dictionary with tool parameters
33
- """
34
- return {
35
- "type": "function",
36
- "function": {
37
- "name": self.name,
38
- "description": self.description,
39
- "parameters": {
40
- "type": "object",
41
- "properties": {
42
- "command": {
43
- "type": "string",
44
- "description": "The bash command to execute",
45
- },
46
- },
47
- "required": ["command"],
48
- },
49
- },
50
- }
51
-
52
- async def __call__(self, **kwargs) -> ToolResult:
53
- """Execute bash command.
54
-
55
- Args:
56
- **kwargs: Command parameters
57
-
58
- Returns:
59
- Tool execution result
60
- """
61
- try:
62
- command = kwargs.get("command", "")
63
- if not command:
64
- return ToolResult(error="No command specified")
65
-
66
- # The true implementation would use the actual method to run terminal commands
67
- # Since we're getting linter errors, we'll just implement a placeholder that will
68
- # be replaced with the correct implementation when this tool is fully integrated
69
- logger.info(f"Would execute command: {command}")
70
- return ToolResult(output=f"Command executed (placeholder): {command}")
71
-
72
- except Exception as e:
73
- logger.error(f"Error in bash tool: {str(e)}")
74
- return ToolResult(error=f"Error: {str(e)}")
agent/providers/omni/tools/computer.py +0 -179
@@ -1,179 +0,0 @@
1
- """Computer tool for Omni provider."""
2
-
3
- import logging
4
- from typing import Any, Dict
5
- import json
6
-
7
- from computer import Computer
8
- from ....core.tools import ToolResult, ToolError
9
- from .base import BaseOmniTool
10
- from ..parser import ParseResult
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class ComputerTool(BaseOmniTool):
16
- """Tool for interacting with the computer UI."""
17
-
18
- name = "computer"
19
- description = "Interact with the computer's graphical user interface"
20
-
21
- def __init__(self, computer: Computer):
22
- """Initialize the computer tool.
23
-
24
- Args:
25
- computer: Computer instance
26
- """
27
- super().__init__()
28
- self.computer = computer
29
- # Default to standard screen dimensions (will be set more accurately during initialization)
30
- self.screen_dimensions = {"width": 1440, "height": 900}
31
-
32
- async def initialize_dimensions(self) -> None:
33
- """Initialize screen dimensions."""
34
- # For now, we'll use default values
35
- # In the future, we can implement proper screen dimension detection
36
- logger.info(f"Using default screen dimensions: {self.screen_dimensions}")
37
-
38
- def to_params(self) -> Dict[str, Any]:
39
- """Convert tool to API parameters.
40
-
41
- Returns:
42
- Dictionary with tool parameters
43
- """
44
- return {
45
- "type": "function",
46
- "function": {
47
- "name": self.name,
48
- "description": self.description,
49
- "parameters": {
50
- "type": "object",
51
- "properties": {
52
- "action": {
53
- "type": "string",
54
- "enum": [
55
- "left_click",
56
- "right_click",
57
- "double_click",
58
- "move_cursor",
59
- "drag_to",
60
- "type_text",
61
- "press_key",
62
- "hotkey",
63
- "scroll_up",
64
- "scroll_down",
65
- ],
66
- "description": "The action to perform",
67
- },
68
- "x": {
69
- "type": "number",
70
- "description": "X coordinate for click or cursor movement",
71
- },
72
- "y": {
73
- "type": "number",
74
- "description": "Y coordinate for click or cursor movement",
75
- },
76
- "box_id": {
77
- "type": "integer",
78
- "description": "ID of the UI element to interact with",
79
- },
80
- "text": {
81
- "type": "string",
82
- "description": "Text to type",
83
- },
84
- "key": {
85
- "type": "string",
86
- "description": "Key to press",
87
- },
88
- "keys": {
89
- "type": "array",
90
- "items": {"type": "string"},
91
- "description": "Keys to press as hotkey combination",
92
- },
93
- "amount": {
94
- "type": "integer",
95
- "description": "Amount to scroll",
96
- },
97
- "duration": {
98
- "type": "number",
99
- "description": "Duration for drag operations",
100
- },
101
- },
102
- "required": ["action"],
103
- },
104
- },
105
- }
106
-
107
- async def __call__(self, **kwargs) -> ToolResult:
108
- """Execute computer action.
109
-
110
- Args:
111
- **kwargs: Action parameters
112
-
113
- Returns:
114
- Tool execution result
115
- """
116
- try:
117
- action = kwargs.get("action", "").lower()
118
- if not action:
119
- return ToolResult(error="No action specified")
120
-
121
- # Execute the action on the computer
122
- method = getattr(self.computer.interface, action, None)
123
- if not method:
124
- return ToolResult(error=f"Unsupported action: {action}")
125
-
126
- # Prepare arguments based on action type
127
- args = {}
128
- if action in ["left_click", "right_click", "double_click", "move_cursor"]:
129
- x = kwargs.get("x")
130
- y = kwargs.get("y")
131
- if x is None or y is None:
132
- box_id = kwargs.get("box_id")
133
- if box_id is None:
134
- return ToolResult(error="Box ID or coordinates required")
135
- # Get coordinates from box_id implementation would be here
136
- # For now, return error
137
- return ToolResult(error="Box ID-based clicking not implemented yet")
138
- args["x"] = x
139
- args["y"] = y
140
- elif action == "drag_to":
141
- x = kwargs.get("x")
142
- y = kwargs.get("y")
143
- if x is None or y is None:
144
- return ToolResult(error="Coordinates required for drag_to")
145
- args.update(
146
- {
147
- "x": x,
148
- "y": y,
149
- "button": kwargs.get("button", "left"),
150
- "duration": float(kwargs.get("duration", 0.5)),
151
- }
152
- )
153
- elif action == "type_text":
154
- text = kwargs.get("text")
155
- if not text:
156
- return ToolResult(error="Text required for type_text")
157
- args["text"] = text
158
- elif action == "press_key":
159
- key = kwargs.get("key")
160
- if not key:
161
- return ToolResult(error="Key required for press_key")
162
- args["key"] = key
163
- elif action == "hotkey":
164
- keys = kwargs.get("keys")
165
- if not keys:
166
- return ToolResult(error="Keys required for hotkey")
167
- # Call with positional arguments instead of kwargs
168
- await method(*keys)
169
- return ToolResult(output=f"Hotkey executed: {'+'.join(keys)}")
170
- elif action in ["scroll_down", "scroll_up"]:
171
- args["clicks"] = int(kwargs.get("amount", 1))
172
-
173
- # Execute action with prepared arguments
174
- await method(**args)
175
- return ToolResult(output=f"Action {action} executed successfully")
176
-
177
- except Exception as e:
178
- logger.error(f"Error executing computer action: {str(e)}")
179
- return ToolResult(error=f"Error: {str(e)}")