cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +3 -4
- agent/core/__init__.py +3 -10
- agent/core/computer_agent.py +207 -32
- agent/core/experiment.py +20 -3
- agent/core/loop.py +78 -120
- agent/core/messages.py +279 -125
- agent/core/telemetry.py +44 -32
- agent/core/types.py +35 -0
- agent/core/visualization.py +197 -0
- agent/providers/anthropic/api/client.py +142 -1
- agent/providers/anthropic/api_handler.py +140 -0
- agent/providers/anthropic/callbacks/__init__.py +5 -0
- agent/providers/anthropic/loop.py +224 -209
- agent/providers/anthropic/messages/manager.py +3 -1
- agent/providers/anthropic/response_handler.py +229 -0
- agent/providers/anthropic/tools/base.py +1 -1
- agent/providers/anthropic/tools/bash.py +0 -97
- agent/providers/anthropic/tools/collection.py +2 -2
- agent/providers/anthropic/tools/computer.py +34 -24
- agent/providers/anthropic/tools/manager.py +2 -2
- agent/providers/anthropic/utils.py +370 -0
- agent/providers/omni/__init__.py +1 -20
- agent/providers/omni/api_handler.py +42 -0
- agent/providers/omni/clients/anthropic.py +4 -0
- agent/providers/omni/image_utils.py +0 -72
- agent/providers/omni/loop.py +497 -607
- agent/providers/omni/parser.py +60 -5
- agent/providers/omni/tools/__init__.py +25 -8
- agent/providers/omni/tools/base.py +29 -0
- agent/providers/omni/tools/bash.py +43 -38
- agent/providers/omni/tools/computer.py +144 -181
- agent/providers/omni/tools/manager.py +26 -48
- agent/providers/omni/types.py +0 -4
- agent/providers/omni/utils.py +225 -144
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
- cua_agent-0.1.17.dist-info/RECORD +63 -0
- agent/core/agent.py +0 -252
- agent/core/base_agent.py +0 -164
- agent/core/factory.py +0 -102
- agent/providers/omni/callbacks.py +0 -78
- agent/providers/omni/clients/groq.py +0 -101
- agent/providers/omni/experiment.py +0 -273
- agent/providers/omni/messages.py +0 -171
- agent/providers/omni/tool_manager.py +0 -91
- agent/providers/omni/visualization.py +0 -130
- agent/types/__init__.py +0 -26
- agent/types/base.py +0 -53
- agent/types/messages.py +0 -36
- cua_agent-0.1.5.dist-info/RECORD +0 -67
- /agent/{types → core}/tools.py +0 -0
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
agent/providers/omni/parser.py
CHANGED
|
@@ -3,14 +3,11 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
import base64
|
|
6
|
-
from PIL import Image
|
|
7
|
-
from io import BytesIO
|
|
8
|
-
import json
|
|
9
6
|
import torch
|
|
10
7
|
|
|
11
8
|
# Import from the SOM package
|
|
12
9
|
from som import OmniParser as OmniDetectParser
|
|
13
|
-
from som.models import ParseResult,
|
|
10
|
+
from som.models import ParseResult, ParserMetadata
|
|
14
11
|
|
|
15
12
|
logger = logging.getLogger(__name__)
|
|
16
13
|
|
|
@@ -122,8 +119,9 @@ class OmniParser:
|
|
|
122
119
|
# Create a minimal valid result for error cases
|
|
123
120
|
return ParseResult(
|
|
124
121
|
elements=[],
|
|
122
|
+
screen_info=None,
|
|
125
123
|
annotated_image_base64="",
|
|
126
|
-
parsed_content_list=[
|
|
124
|
+
parsed_content_list=[{"error": str(e)}],
|
|
127
125
|
metadata=ParserMetadata(
|
|
128
126
|
image_size=(0, 0),
|
|
129
127
|
num_icons=0,
|
|
@@ -250,3 +248,60 @@ class OmniParser:
|
|
|
250
248
|
except Exception as e:
|
|
251
249
|
logger.error(f"Error formatting messages: {str(e)}")
|
|
252
250
|
return messages # Return original messages on error
|
|
251
|
+
|
|
252
|
+
async def calculate_click_coordinates(
|
|
253
|
+
self, box_id: int, parsed_screen: ParseResult
|
|
254
|
+
) -> Tuple[int, int]:
|
|
255
|
+
"""Calculate click coordinates based on box ID.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
box_id: The ID of the box to click
|
|
259
|
+
parsed_screen: The parsed screen information
|
|
260
|
+
|
|
261
|
+
Returns:
|
|
262
|
+
Tuple of (x, y) coordinates
|
|
263
|
+
|
|
264
|
+
Raises:
|
|
265
|
+
ValueError: If box_id is invalid or missing from parsed screen
|
|
266
|
+
"""
|
|
267
|
+
# First try to use structured elements data
|
|
268
|
+
logger.info(f"Elements count: {len(parsed_screen.elements)}")
|
|
269
|
+
|
|
270
|
+
# Try to find element with matching ID
|
|
271
|
+
for element in parsed_screen.elements:
|
|
272
|
+
if element.id == box_id:
|
|
273
|
+
logger.info(f"Found element with ID {box_id}: {element}")
|
|
274
|
+
bbox = element.bbox
|
|
275
|
+
|
|
276
|
+
# Get screen dimensions from the metadata if available, or fallback
|
|
277
|
+
width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
|
|
278
|
+
height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
|
|
279
|
+
logger.info(f"Screen dimensions: width={width}, height={height}")
|
|
280
|
+
|
|
281
|
+
# Create a dictionary from the element's bbox for calculate_element_center
|
|
282
|
+
bbox_dict = {"x1": bbox.x1, "y1": bbox.y1, "x2": bbox.x2, "y2": bbox.y2}
|
|
283
|
+
from ...core.visualization import calculate_element_center
|
|
284
|
+
|
|
285
|
+
center_x, center_y = calculate_element_center(bbox_dict, width, height)
|
|
286
|
+
logger.info(f"Calculated center: ({center_x}, {center_y})")
|
|
287
|
+
|
|
288
|
+
# Validate coordinates - if they're (0,0) or unreasonably small,
|
|
289
|
+
# use a default position in the center of the screen
|
|
290
|
+
if center_x == 0 and center_y == 0:
|
|
291
|
+
logger.warning("Got (0,0) coordinates, using fallback position")
|
|
292
|
+
center_x = width // 2
|
|
293
|
+
center_y = height // 2
|
|
294
|
+
logger.info(f"Using fallback center: ({center_x}, {center_y})")
|
|
295
|
+
|
|
296
|
+
return center_x, center_y
|
|
297
|
+
|
|
298
|
+
# If we couldn't find the box, use center of screen
|
|
299
|
+
logger.error(
|
|
300
|
+
f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
# Use center of screen as fallback
|
|
304
|
+
width = parsed_screen.metadata.width if parsed_screen.metadata else 1920
|
|
305
|
+
height = parsed_screen.metadata.height if parsed_screen.metadata else 1080
|
|
306
|
+
logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
|
|
307
|
+
return width // 2, height // 2
|
|
@@ -1,13 +1,30 @@
|
|
|
1
1
|
"""Omni provider tools - compatible with multiple LLM providers."""
|
|
2
2
|
|
|
3
|
-
from .
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
3
|
+
from ....core.tools import BaseTool, ToolResult, ToolError, ToolFailure, CLIResult
|
|
4
|
+
from .base import BaseOmniTool
|
|
5
|
+
from .computer import ComputerTool
|
|
6
|
+
from .bash import BashTool
|
|
7
|
+
from .manager import ToolManager
|
|
7
8
|
|
|
9
|
+
# Re-export the tools with Omni-specific names for backward compatibility
|
|
10
|
+
OmniToolResult = ToolResult
|
|
11
|
+
OmniToolError = ToolError
|
|
12
|
+
OmniToolFailure = ToolFailure
|
|
13
|
+
OmniCLIResult = CLIResult
|
|
14
|
+
|
|
15
|
+
# We'll export specific tools once implemented
|
|
8
16
|
__all__ = [
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
17
|
+
"BaseTool",
|
|
18
|
+
"BaseOmniTool",
|
|
19
|
+
"ToolResult",
|
|
20
|
+
"ToolError",
|
|
21
|
+
"ToolFailure",
|
|
22
|
+
"CLIResult",
|
|
23
|
+
"OmniToolResult",
|
|
24
|
+
"OmniToolError",
|
|
25
|
+
"OmniToolFailure",
|
|
26
|
+
"OmniCLIResult",
|
|
27
|
+
"ComputerTool",
|
|
28
|
+
"BashTool",
|
|
29
|
+
"ToolManager",
|
|
13
30
|
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Omni-specific tool base classes."""
|
|
2
|
+
|
|
3
|
+
from abc import ABCMeta, abstractmethod
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from ....core.tools.base import BaseTool
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseOmniTool(BaseTool, metaclass=ABCMeta):
|
|
10
|
+
"""Abstract base class for Omni provider tools."""
|
|
11
|
+
|
|
12
|
+
def __init__(self):
|
|
13
|
+
"""Initialize the base Omni tool."""
|
|
14
|
+
# No specific initialization needed yet, but included for future extensibility
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
@abstractmethod
|
|
18
|
+
async def __call__(self, **kwargs) -> Any:
|
|
19
|
+
"""Executes the tool with the given arguments."""
|
|
20
|
+
...
|
|
21
|
+
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def to_params(self) -> Dict[str, Any]:
|
|
24
|
+
"""Convert tool to Omni provider-specific API parameters.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
Dictionary with tool parameters for the specific API
|
|
28
|
+
"""
|
|
29
|
+
raise NotImplementedError
|
|
@@ -1,69 +1,74 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Bash tool for Omni provider."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Any, Dict
|
|
5
5
|
|
|
6
|
-
from computer
|
|
6
|
+
from computer import Computer
|
|
7
|
+
from ....core.tools import ToolResult, ToolError
|
|
8
|
+
from .base import BaseOmniTool
|
|
7
9
|
|
|
8
|
-
|
|
9
|
-
from ....core.tools import ToolResult
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
class
|
|
13
|
-
"""
|
|
13
|
+
class BashTool(BaseOmniTool):
|
|
14
|
+
"""Tool for executing bash commands."""
|
|
14
15
|
|
|
15
16
|
name = "bash"
|
|
16
|
-
|
|
17
|
+
description = "Execute bash commands on the system"
|
|
17
18
|
|
|
18
19
|
def __init__(self, computer: Computer):
|
|
19
|
-
"""Initialize the
|
|
20
|
+
"""Initialize the bash tool.
|
|
20
21
|
|
|
21
22
|
Args:
|
|
22
|
-
computer: Computer instance
|
|
23
|
+
computer: Computer instance
|
|
23
24
|
"""
|
|
24
|
-
super().__init__(
|
|
25
|
+
super().__init__()
|
|
26
|
+
self.computer = computer
|
|
25
27
|
|
|
26
28
|
def to_params(self) -> Dict[str, Any]:
|
|
27
|
-
"""Convert tool to
|
|
29
|
+
"""Convert tool to API parameters.
|
|
28
30
|
|
|
29
31
|
Returns:
|
|
30
32
|
Dictionary with tool parameters
|
|
31
33
|
"""
|
|
32
34
|
return {
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
"
|
|
37
|
-
"
|
|
38
|
-
"type": "
|
|
39
|
-
"
|
|
35
|
+
"type": "function",
|
|
36
|
+
"function": {
|
|
37
|
+
"name": self.name,
|
|
38
|
+
"description": self.description,
|
|
39
|
+
"parameters": {
|
|
40
|
+
"type": "object",
|
|
41
|
+
"properties": {
|
|
42
|
+
"command": {
|
|
43
|
+
"type": "string",
|
|
44
|
+
"description": "The bash command to execute",
|
|
45
|
+
},
|
|
46
|
+
},
|
|
47
|
+
"required": ["command"],
|
|
40
48
|
},
|
|
41
49
|
},
|
|
42
50
|
}
|
|
43
51
|
|
|
44
52
|
async def __call__(self, **kwargs) -> ToolResult:
|
|
45
|
-
"""Execute
|
|
53
|
+
"""Execute bash command.
|
|
46
54
|
|
|
47
55
|
Args:
|
|
48
|
-
|
|
49
|
-
restart: Whether to restart the bash session
|
|
56
|
+
**kwargs: Command parameters
|
|
50
57
|
|
|
51
58
|
Returns:
|
|
52
|
-
|
|
59
|
+
Tool execution result
|
|
53
60
|
"""
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
error
|
|
68
|
-
|
|
69
|
-
return ToolResult(output=output, error=error)
|
|
61
|
+
try:
|
|
62
|
+
command = kwargs.get("command", "")
|
|
63
|
+
if not command:
|
|
64
|
+
return ToolResult(error="No command specified")
|
|
65
|
+
|
|
66
|
+
# The true implementation would use the actual method to run terminal commands
|
|
67
|
+
# Since we're getting linter errors, we'll just implement a placeholder that will
|
|
68
|
+
# be replaced with the correct implementation when this tool is fully integrated
|
|
69
|
+
logger.info(f"Would execute command: {command}")
|
|
70
|
+
return ToolResult(output=f"Command executed (placeholder): {command}")
|
|
71
|
+
|
|
72
|
+
except Exception as e:
|
|
73
|
+
logger.error(f"Error in bash tool: {str(e)}")
|
|
74
|
+
return ToolResult(error=f"Error: {str(e)}")
|
|
@@ -1,216 +1,179 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Computer tool for Omni provider."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
import base64
|
|
5
|
-
import io
|
|
6
4
|
from typing import Any, Dict
|
|
5
|
+
import json
|
|
7
6
|
|
|
8
|
-
from
|
|
9
|
-
from computer.computer import Computer
|
|
10
|
-
|
|
11
|
-
from ....core.tools.computer import BaseComputerTool
|
|
7
|
+
from computer import Computer
|
|
12
8
|
from ....core.tools import ToolResult, ToolError
|
|
9
|
+
from .base import BaseOmniTool
|
|
10
|
+
from ..parser import ParseResult
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
class
|
|
16
|
-
"""
|
|
15
|
+
class ComputerTool(BaseOmniTool):
|
|
16
|
+
"""Tool for interacting with the computer UI."""
|
|
17
17
|
|
|
18
18
|
name = "computer"
|
|
19
|
-
|
|
19
|
+
description = "Interact with the computer's graphical user interface"
|
|
20
20
|
|
|
21
21
|
def __init__(self, computer: Computer):
|
|
22
|
-
"""Initialize the
|
|
22
|
+
"""Initialize the computer tool.
|
|
23
23
|
|
|
24
24
|
Args:
|
|
25
|
-
computer: Computer instance
|
|
25
|
+
computer: Computer instance
|
|
26
26
|
"""
|
|
27
|
-
super().__init__(
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
self.
|
|
31
|
-
|
|
27
|
+
super().__init__()
|
|
28
|
+
self.computer = computer
|
|
29
|
+
# Default to standard screen dimensions (will be set more accurately during initialization)
|
|
30
|
+
self.screen_dimensions = {"width": 1440, "height": 900}
|
|
31
|
+
|
|
32
|
+
async def initialize_dimensions(self) -> None:
|
|
33
|
+
"""Initialize screen dimensions."""
|
|
34
|
+
# For now, we'll use default values
|
|
35
|
+
# In the future, we can implement proper screen dimension detection
|
|
36
|
+
logger.info(f"Using default screen dimensions: {self.screen_dimensions}")
|
|
32
37
|
|
|
33
38
|
def to_params(self) -> Dict[str, Any]:
|
|
34
|
-
"""Convert tool to
|
|
39
|
+
"""Convert tool to API parameters.
|
|
35
40
|
|
|
36
41
|
Returns:
|
|
37
42
|
Dictionary with tool parameters
|
|
38
43
|
"""
|
|
39
44
|
return {
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
|
|
43
|
-
"
|
|
44
|
-
|
|
45
|
-
"
|
|
46
|
-
|
|
47
|
-
"
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
45
|
+
"type": "function",
|
|
46
|
+
"function": {
|
|
47
|
+
"name": self.name,
|
|
48
|
+
"description": self.description,
|
|
49
|
+
"parameters": {
|
|
50
|
+
"type": "object",
|
|
51
|
+
"properties": {
|
|
52
|
+
"action": {
|
|
53
|
+
"type": "string",
|
|
54
|
+
"enum": [
|
|
55
|
+
"left_click",
|
|
56
|
+
"right_click",
|
|
57
|
+
"double_click",
|
|
58
|
+
"move_cursor",
|
|
59
|
+
"drag_to",
|
|
60
|
+
"type_text",
|
|
61
|
+
"press_key",
|
|
62
|
+
"hotkey",
|
|
63
|
+
"scroll_up",
|
|
64
|
+
"scroll_down",
|
|
65
|
+
],
|
|
66
|
+
"description": "The action to perform",
|
|
67
|
+
},
|
|
68
|
+
"x": {
|
|
69
|
+
"type": "number",
|
|
70
|
+
"description": "X coordinate for click or cursor movement",
|
|
71
|
+
},
|
|
72
|
+
"y": {
|
|
73
|
+
"type": "number",
|
|
74
|
+
"description": "Y coordinate for click or cursor movement",
|
|
75
|
+
},
|
|
76
|
+
"box_id": {
|
|
77
|
+
"type": "integer",
|
|
78
|
+
"description": "ID of the UI element to interact with",
|
|
79
|
+
},
|
|
80
|
+
"text": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"description": "Text to type",
|
|
83
|
+
},
|
|
84
|
+
"key": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"description": "Key to press",
|
|
87
|
+
},
|
|
88
|
+
"keys": {
|
|
89
|
+
"type": "array",
|
|
90
|
+
"items": {"type": "string"},
|
|
91
|
+
"description": "Keys to press as hotkey combination",
|
|
92
|
+
},
|
|
93
|
+
"amount": {
|
|
94
|
+
"type": "integer",
|
|
95
|
+
"description": "Amount to scroll",
|
|
96
|
+
},
|
|
97
|
+
"duration": {
|
|
98
|
+
"type": "number",
|
|
99
|
+
"description": "Duration for drag operations",
|
|
100
|
+
},
|
|
101
|
+
},
|
|
102
|
+
"required": ["action"],
|
|
77
103
|
},
|
|
78
104
|
},
|
|
79
|
-
**self.options,
|
|
80
105
|
}
|
|
81
106
|
|
|
82
107
|
async def __call__(self, **kwargs) -> ToolResult:
|
|
83
|
-
"""Execute
|
|
108
|
+
"""Execute computer action.
|
|
84
109
|
|
|
85
110
|
Args:
|
|
86
|
-
|
|
87
|
-
text: Text to type or key to press (for key/type actions)
|
|
88
|
-
coordinate: X,Y coordinates (for mouse actions)
|
|
89
|
-
direction: Direction to scroll (for scroll action)
|
|
90
|
-
amount: Amount to scroll (for scroll action)
|
|
111
|
+
**kwargs: Action parameters
|
|
91
112
|
|
|
92
113
|
Returns:
|
|
93
|
-
|
|
114
|
+
Tool execution result
|
|
94
115
|
"""
|
|
95
|
-
# Ensure dimensions are initialized
|
|
96
|
-
if self.width is None or self.height is None:
|
|
97
|
-
await self.initialize_dimensions()
|
|
98
|
-
|
|
99
|
-
action = kwargs.get("action")
|
|
100
|
-
text = kwargs.get("text")
|
|
101
|
-
coordinate = kwargs.get("coordinate")
|
|
102
|
-
direction = kwargs.get("direction", "down")
|
|
103
|
-
amount = kwargs.get("amount", 10)
|
|
104
|
-
|
|
105
|
-
self.logger.info(f"Executing computer action: {action}")
|
|
106
|
-
|
|
107
116
|
try:
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
screenshot = await self.computer.interface.screenshot()
|
|
144
|
-
screenshot = await self.resize_screenshot_if_needed(screenshot)
|
|
145
|
-
return ToolResult(
|
|
146
|
-
output=f"Performed double click at ({x}, {y})",
|
|
147
|
-
base64_image=base64.b64encode(screenshot).decode(),
|
|
148
|
-
)
|
|
149
|
-
elif action == "mouse_move" and coordinate:
|
|
150
|
-
x, y = coordinate
|
|
151
|
-
self.logger.info(f"Moving cursor to ({x}, {y})")
|
|
152
|
-
await self.computer.interface.move_cursor(x, y)
|
|
153
|
-
|
|
154
|
-
# Take screenshot after action
|
|
155
|
-
screenshot = await self.computer.interface.screenshot()
|
|
156
|
-
screenshot = await self.resize_screenshot_if_needed(screenshot)
|
|
157
|
-
return ToolResult(
|
|
158
|
-
output=f"Moved cursor to ({x}, {y})",
|
|
159
|
-
base64_image=base64.b64encode(screenshot).decode(),
|
|
117
|
+
action = kwargs.get("action", "").lower()
|
|
118
|
+
if not action:
|
|
119
|
+
return ToolResult(error="No action specified")
|
|
120
|
+
|
|
121
|
+
# Execute the action on the computer
|
|
122
|
+
method = getattr(self.computer.interface, action, None)
|
|
123
|
+
if not method:
|
|
124
|
+
return ToolResult(error=f"Unsupported action: {action}")
|
|
125
|
+
|
|
126
|
+
# Prepare arguments based on action type
|
|
127
|
+
args = {}
|
|
128
|
+
if action in ["left_click", "right_click", "double_click", "move_cursor"]:
|
|
129
|
+
x = kwargs.get("x")
|
|
130
|
+
y = kwargs.get("y")
|
|
131
|
+
if x is None or y is None:
|
|
132
|
+
box_id = kwargs.get("box_id")
|
|
133
|
+
if box_id is None:
|
|
134
|
+
return ToolResult(error="Box ID or coordinates required")
|
|
135
|
+
# Get coordinates from box_id implementation would be here
|
|
136
|
+
# For now, return error
|
|
137
|
+
return ToolResult(error="Box ID-based clicking not implemented yet")
|
|
138
|
+
args["x"] = x
|
|
139
|
+
args["y"] = y
|
|
140
|
+
elif action == "drag_to":
|
|
141
|
+
x = kwargs.get("x")
|
|
142
|
+
y = kwargs.get("y")
|
|
143
|
+
if x is None or y is None:
|
|
144
|
+
return ToolResult(error="Coordinates required for drag_to")
|
|
145
|
+
args.update(
|
|
146
|
+
{
|
|
147
|
+
"x": x,
|
|
148
|
+
"y": y,
|
|
149
|
+
"button": kwargs.get("button", "left"),
|
|
150
|
+
"duration": float(kwargs.get("duration", 0.5)),
|
|
151
|
+
}
|
|
160
152
|
)
|
|
161
|
-
elif action == "
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
#
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
screenshot = await self.resize_screenshot_if_needed(screenshot)
|
|
185
|
-
return ToolResult(
|
|
186
|
-
output=f"Pressed key: {text}",
|
|
187
|
-
base64_image=base64.b64encode(screenshot).decode(),
|
|
188
|
-
)
|
|
189
|
-
elif action == "cursor_position":
|
|
190
|
-
pos = await self.computer.interface.get_cursor_position()
|
|
191
|
-
return ToolResult(output=f"X={int(pos[0])},Y={int(pos[1])}")
|
|
192
|
-
elif action == "scroll":
|
|
193
|
-
if direction == "down":
|
|
194
|
-
self.logger.info(f"Scrolling down, amount: {amount}")
|
|
195
|
-
for _ in range(amount):
|
|
196
|
-
await self.computer.interface.hotkey("fn", "down")
|
|
197
|
-
else:
|
|
198
|
-
self.logger.info(f"Scrolling up, amount: {amount}")
|
|
199
|
-
for _ in range(amount):
|
|
200
|
-
await self.computer.interface.hotkey("fn", "up")
|
|
201
|
-
|
|
202
|
-
# Take screenshot after action
|
|
203
|
-
screenshot = await self.computer.interface.screenshot()
|
|
204
|
-
screenshot = await self.resize_screenshot_if_needed(screenshot)
|
|
205
|
-
return ToolResult(
|
|
206
|
-
output=f"Scrolled {direction} by {amount} steps",
|
|
207
|
-
base64_image=base64.b64encode(screenshot).decode(),
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
# Default to screenshot for unimplemented actions
|
|
211
|
-
self.logger.warning(f"Action {action} not fully implemented, taking screenshot")
|
|
212
|
-
return await self.screenshot()
|
|
153
|
+
elif action == "type_text":
|
|
154
|
+
text = kwargs.get("text")
|
|
155
|
+
if not text:
|
|
156
|
+
return ToolResult(error="Text required for type_text")
|
|
157
|
+
args["text"] = text
|
|
158
|
+
elif action == "press_key":
|
|
159
|
+
key = kwargs.get("key")
|
|
160
|
+
if not key:
|
|
161
|
+
return ToolResult(error="Key required for press_key")
|
|
162
|
+
args["key"] = key
|
|
163
|
+
elif action == "hotkey":
|
|
164
|
+
keys = kwargs.get("keys")
|
|
165
|
+
if not keys:
|
|
166
|
+
return ToolResult(error="Keys required for hotkey")
|
|
167
|
+
# Call with positional arguments instead of kwargs
|
|
168
|
+
await method(*keys)
|
|
169
|
+
return ToolResult(output=f"Hotkey executed: {'+'.join(keys)}")
|
|
170
|
+
elif action in ["scroll_down", "scroll_up"]:
|
|
171
|
+
args["clicks"] = int(kwargs.get("amount", 1))
|
|
172
|
+
|
|
173
|
+
# Execute action with prepared arguments
|
|
174
|
+
await method(**args)
|
|
175
|
+
return ToolResult(output=f"Action {action} executed successfully")
|
|
213
176
|
|
|
214
177
|
except Exception as e:
|
|
215
|
-
|
|
216
|
-
return ToolResult(error=f"
|
|
178
|
+
logger.error(f"Error executing computer action: {str(e)}")
|
|
179
|
+
return ToolResult(error=f"Error: {str(e)}")
|