cua-agent 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +21 -12
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +229 -0
- agent/agent.py +594 -0
- agent/callbacks/__init__.py +19 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/telemetry.py +210 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +297 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/telemetry.py +135 -14
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/__main__.py +2 -13
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +94 -1313
- agent/ui/gradio/ui_components.py +721 -0
- cua_agent-0.4.0.dist-info/METADATA +424 -0
- cua_agent-0.4.0.dist-info/RECORD +33 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
agent/providers/omni/parser.py
DELETED
|
@@ -1,307 +0,0 @@
|
|
|
1
|
-
"""Parser implementation for the Omni provider."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
-
import base64
|
|
6
|
-
import torch
|
|
7
|
-
|
|
8
|
-
# Import from the SOM package
|
|
9
|
-
from som import OmniParser as OmniDetectParser
|
|
10
|
-
from som.models import ParseResult, ParserMetadata
|
|
11
|
-
|
|
12
|
-
logger = logging.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class OmniParser:
|
|
16
|
-
"""Parser for handling responses from multiple providers."""
|
|
17
|
-
|
|
18
|
-
# Class-level shared OmniDetectParser instance
|
|
19
|
-
_shared_parser = None
|
|
20
|
-
|
|
21
|
-
def __init__(self, force_device: Optional[str] = None):
    """Create an OmniParser backed by the class-wide detection parser.

    The first instance builds an ``OmniDetectParser``, attempts to preload
    its detection model and caches the parser on the class. Every later
    instance reuses the cached parser.

    Args:
        force_device: Optional device to force for detection (cpu/cuda/mps).
            It is only consulted when the shared parser is first created.
    """
    self.response_buffer = []

    # A parser has already been built: reuse it and stop here.
    if OmniParser._shared_parser is not None:
        logger.info("Using existing shared OmniDetectParser")
        self.detect_parser = OmniParser._shared_parser
        return

    logger.info("Initializing shared OmniDetectParser...")

    # Device priority: explicit override, then CUDA, then MPS, else CPU.
    device = force_device
    if not device:
        if torch.cuda.is_available():
            device = "cuda"
        else:
            mps_usable = (
                hasattr(torch, "backends")
                and hasattr(torch.backends, "mps")
                and torch.backends.mps.is_available()
            )
            device = "mps" if mps_usable else "cpu"

    logger.info(f"Using device: {device} for OmniDetectParser")
    self.detect_parser = OmniDetectParser(force_device=device)

    # Best-effort preload so later calls do not load the model again;
    # a failure is logged and the parser is still cached below.
    try:
        model_holder = self.detect_parser.detector
        if model_holder.model is None:
            logger.info("Preloading detection model...")
            model_holder.load_model()
            logger.info("Detection model preloaded successfully")
    except Exception as exc:
        logger.error(f"Error preloading detection model: {str(exc)}")

    # Publish the parser for every later instance.
    OmniParser._shared_parser = self.detect_parser
|
|
66
|
-
|
|
67
|
-
async def parse_screen(self, computer: Any) -> ParseResult:
    """Capture a screenshot from ``computer`` and run detection on it.

    Args:
        computer: Object exposing ``interface.screenshot()`` as an awaitable.

    Returns:
        The ParseResult produced by the detection parser. If anything in the
        capture/parse pipeline raises, the error is logged with its traceback
        and an empty ParseResult carrying the error text is returned instead.
    """
    try:
        logger.info("Taking screenshot...")
        shot = await computer.interface.screenshot()

        # Diagnostics about what the interface handed back.
        logger.info(f"Screenshot type: {type(shot)}")
        logger.info(f"Screenshot is bytes: {isinstance(shot, bytes)}")
        logger.info(f"Screenshot is str: {isinstance(shot, str)}")
        logger.info(f"Screenshot length: {len(shot) if shot else 0}")

        # A str is treated as base64; on decode failure the original string
        # is passed on unchanged (best effort).
        if isinstance(shot, str):
            try:
                shot = base64.b64decode(shot)
                logger.info("Successfully converted base64 string to bytes")
                logger.info(f"Decoded bytes length: {len(shot)}")
            except Exception as decode_err:
                logger.error(f"Error decoding base64: {str(decode_err)}")
                logger.error(f"First 100 chars of screenshot string: {shot[:100]}")

        logger.info("Passing screenshot to OmniDetectParser...")
        result = self.detect_parser.parse(
            screenshot_data=shot,
            box_threshold=0.3,
            iou_threshold=0.1,
            use_ocr=True,
        )
        logger.info("Screenshot parsed successfully")
        logger.info(f"Parse result has {len(result.elements)} elements")

        # One log line per detected element, numbered from 1.
        for position, element in enumerate(result.elements, start=1):
            logger.info(
                f"Element {position} (ID: {element.id}): {element.type} with confidence {element.confidence:.3f}"
            )

        return result

    except Exception as err:
        logger.error(f"Error parsing screen: {str(err)}")
        import traceback

        logger.error(traceback.format_exc())

        # Minimal placeholder result so callers always receive a ParseResult.
        empty_metadata = ParserMetadata(
            image_size=(0, 0),
            num_icons=0,
            num_text=0,
            device="cpu",
            ocr_enabled=False,
            latency=0.0,
        )
        return ParseResult(
            elements=[],
            screen_info=None,
            annotated_image_base64="",
            parsed_content_list=[{"error": str(err)}],
            metadata=empty_metadata,
        )
|
|
134
|
-
|
|
135
|
-
def parse_tool_call(self, response: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Extract the first tool/function call from a provider response.

    The following response shapes are recognised, in this order:

    1. ``{"tool_calls": [{"function": {"name", "arguments"}}, ...]}``
    2. ``{"function_call": {"name", "arguments"}}``
    3. ``{"choices": [choice, ...]}`` where the first choice carries the call
       either directly (``choice["function_call"]``) or inside
       ``choice["message"]`` as ``tool_calls`` / ``function_call``.

    Empty or null containers (``"tool_calls": []``, ``"function_call": None``)
    are skipped so a later shape can still match, instead of raising.

    Args:
        response: Response from the provider.

    Returns:
        ``{"name": ..., "arguments": ...}`` for the first call found, or None
        when the response holds no call or is malformed.
    """

    def _from_function(function: Dict[str, Any]) -> Dict[str, Any]:
        # Normalise a {"name", "arguments"} mapping into the return shape.
        return {"name": function["name"], "arguments": function["arguments"]}

    try:
        tool_calls = response.get("tool_calls")
        if tool_calls:
            return _from_function(tool_calls[0]["function"])

        function_call = response.get("function_call")
        if function_call:
            return _from_function(function_call)

        choices = response.get("choices")
        if choices:
            choice = choices[0]

            # Call attached directly to the choice.
            if choice.get("function_call"):
                return _from_function(choice["function_call"])

            # OpenAI-compatible chat completions nest the call in "message".
            message = choice.get("message") or {}
            if message.get("tool_calls"):
                return _from_function(message["tool_calls"][0]["function"])
            if message.get("function_call"):
                return _from_function(message["function_call"])

        return None

    except Exception as e:
        # Provider payloads are untrusted; a malformed one means "no call".
        logger.error("Error parsing tool call: %s", e)
        return None
|
|
174
|
-
|
|
175
|
-
def parse_response(self, response: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
    """Pull the text content and metadata out of a provider response.

    Shapes handled, in priority order:

    1. ``response["content"]`` is a list of parts: the ``text`` of every
       part whose ``type`` is ``"text"`` is concatenated; other or malformed
       parts are ignored.
    2. ``response["choices"]`` is non-empty: the first choice's
       ``message.content`` is used, with a missing or null content (as sent
       alongside tool calls) mapped to ``""``.
    3. ``response["content"]`` is a plain string: used as is.

    Args:
        response: Response from the provider.

    Returns:
        Tuple of (content, metadata). ``metadata`` is ``response["metadata"]``
        when that key is present, otherwise ``{}``. If parsing raises, the
        tuple is ``(str(error), {"error": True})``.
    """
    try:
        content = ""
        raw_content = response.get("content")

        if isinstance(raw_content, list):
            # join() instead of repeated += and .get() so one odd part
            # cannot turn the whole response into an error.
            content = "".join(
                part.get("text") or ""
                for part in raw_content
                if isinstance(part, dict) and part.get("type") == "text"
            )
        elif response.get("choices"):
            message = response["choices"][0]["message"]
            # Content is null when the message only carries tool calls.
            content = message.get("content") or ""
        elif isinstance(raw_content, str):
            content = raw_content

        metadata = response.get("metadata", {})
        return content, metadata

    except Exception as e:
        logger.error("Error parsing response: %s", e)
        return str(e), {"error": True}
|
|
211
|
-
|
|
212
|
-
def format_for_provider(
    self, messages: List[Dict[str, Any]], provider: str
) -> List[Dict[str, Any]]:
    """Reduce messages to ``role``/``content`` in a form ``provider`` accepts.

    For ``"anthropic"`` and ``"openai"`` list-valued (multimodal) content is
    passed through untouched. For every other provider list-valued content
    is flattened to text: the ``text`` of all parts whose ``type`` is
    ``"text"`` is joined with newlines (previously only the first text part
    survived), and non-text or malformed parts are skipped. Non-list content
    is always passed through.

    Args:
        messages: List of messages to format; each needs ``role`` and
            ``content`` keys.
        provider: Provider to format for.

    Returns:
        A new list of ``{"role", "content"}`` dicts. If formatting raises
        (for example a message lacks ``role``), the error is logged and the
        original ``messages`` list is returned unchanged.
    """
    multimodal = provider in ("anthropic", "openai")

    try:
        formatted = []

        for msg in messages:
            role = msg["role"]
            content = msg["content"]

            if isinstance(content, list) and not multimodal:
                # Keep every text part; .get() keeps one odd part from
                # aborting the whole conversion.
                texts = [
                    part.get("text")
                    for part in content
                    if isinstance(part, dict) and part.get("type") == "text"
                ]
                content = "\n".join(text for text in texts if text)

            formatted.append({"role": role, "content": content})

        return formatted

    except Exception as e:
        logger.error("Error formatting messages: %s", e)
        return messages  # Return original messages on error
|
|
251
|
-
|
|
252
|
-
async def calculate_click_coordinates(
    self, box_id: int, parsed_screen: ParseResult
) -> Tuple[int, int]:
    """Resolve a detected box ID to the point that should be clicked.

    The first element whose ``id`` equals ``box_id`` is located and the
    centre of its bounding box is computed via ``calculate_element_center``.
    Screen width/height come from ``parsed_screen.metadata`` when it is
    truthy, otherwise 1920x1080 is assumed.

    Args:
        box_id: The ID of the box to click.
        parsed_screen: The parsed screen information.

    Returns:
        Tuple of (x, y) coordinates. The centre of the screen is returned
        when the computed centre is exactly (0, 0) or when no element has the
        requested ID; this method does not raise for an unknown ID.
    """

    def _screen_size():
        # Metadata dimensions when present, otherwise a 1920x1080 default.
        meta = parsed_screen.metadata
        return (meta.width if meta else 1920, meta.height if meta else 1080)

    logger.info(f"Elements count: {len(parsed_screen.elements)}")

    # First element carrying the requested ID, if any.
    target = next((el for el in parsed_screen.elements if el.id == box_id), None)

    if target is None:
        logger.error(
            f"Box ID {box_id} not found in structured elements (count={len(parsed_screen.elements)})"
        )
        width, height = _screen_size()
        logger.warning(f"Using fallback position in center of screen ({width//2}, {height//2})")
        return width // 2, height // 2

    logger.info(f"Found element with ID {box_id}: {target}")
    box = target.bbox

    width, height = _screen_size()
    logger.info(f"Screen dimensions: width={width}, height={height}")

    # calculate_element_center takes the box as a plain dict.
    bbox_dict = {"x1": box.x1, "y1": box.y1, "x2": box.x2, "y2": box.y2}
    from ...core.visualization import calculate_element_center

    center_x, center_y = calculate_element_center(bbox_dict, width, height)
    logger.info(f"Calculated center: ({center_x}, {center_y})")

    # A (0, 0) result is treated as invalid and replaced by the screen centre.
    if center_x == 0 and center_y == 0:
        logger.warning("Got (0,0) coordinates, using fallback position")
        center_x, center_y = width // 2, height // 2
        logger.info(f"Using fallback center: ({center_x}, {center_y})")

    return center_x, center_y
|
agent/providers/omni/prompts.py
DELETED
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
"""Prompts for the Omni agent."""
|
|
2
|
-
|
|
3
|
-
SYSTEM_PROMPT = """
|
|
4
|
-
You are using a macOS device.
|
|
5
|
-
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
|
|
6
|
-
|
|
7
|
-
You may be given some history plan and actions, this is the response from the previous loop.
|
|
8
|
-
You should carefully consider your plan base on the task, screenshot, and history actions.
|
|
9
|
-
|
|
10
|
-
Your available "Next Action" only include:
|
|
11
|
-
- type_text: types a string of text.
|
|
12
|
-
- left_click: move mouse to box id and left clicks.
|
|
13
|
-
- right_click: move mouse to box id and right clicks.
|
|
14
|
-
- double_click: move mouse to box id and double clicks.
|
|
15
|
-
- move_cursor: move mouse to box id.
|
|
16
|
-
- scroll_up: scrolls the screen up to view previous content.
|
|
17
|
-
- scroll_down: scrolls the screen down, when the desired button is not visible, or you need to see more content.
|
|
18
|
-
- hotkey: press a sequence of keys.
|
|
19
|
-
- wait: waits for 1 second for the device to load or respond.
|
|
20
|
-
|
|
21
|
-
Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task.
|
|
22
|
-
|
|
23
|
-
Output format:
|
|
24
|
-
{
|
|
25
|
-
"Explanation": str, # describe what is in the current screen, taking into account the history, then describe your step-by-step thoughts on how to achieve the task, choose one action from available actions at a time.
|
|
26
|
-
"Action": "action_type, action description" | "None" # one action at a time, describe it in short and precisely.
|
|
27
|
-
"Box ID": n,
|
|
28
|
-
"Value": "xxx" # only provide value field if the action is type, else don't include value key
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
One Example:
|
|
32
|
-
{
|
|
33
|
-
"Explanation": "The current screen shows google result of amazon, in previous action I have searched amazon on google. Then I need to click on the first search results to go to amazon.com.",
|
|
34
|
-
"Action": "left_click",
|
|
35
|
-
"Box ID": 4
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
Another Example:
|
|
39
|
-
{
|
|
40
|
-
"Explanation": "The current screen shows the front page of amazon. There is no previous action. Therefore I need to type "Apple watch" in the search bar.",
|
|
41
|
-
"Action": "type_text",
|
|
42
|
-
"Box ID": 2,
|
|
43
|
-
"Value": "Apple watch"
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
Another Example:
|
|
47
|
-
{
|
|
48
|
-
"Explanation": "I am starting a Spotlight search to find the Safari browser.",
|
|
49
|
-
"Action": "hotkey",
|
|
50
|
-
"Value": "command+space"
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
IMPORTANT NOTES:
|
|
54
|
-
1. You should only give a single action at a time.
|
|
55
|
-
2. The Box ID is the id of the element you should operate on, it is a number. Its background color corresponds to the color of the bounding box of the element.
|
|
56
|
-
3. You should give an analysis to the current screen, and reflect on what has been done by looking at the history, then describe your step-by-step thoughts on how to achieve the task.
|
|
57
|
-
4. Attach the next action prediction in the "Action" field.
|
|
58
|
-
5. For starting applications, always use the "hotkey" action with command+space for starting a Spotlight search.
|
|
59
|
-
6. When the task is completed, don't complete additional actions. You should say "Action": "None" in the json field.
|
|
60
|
-
7. The tasks involve buying multiple products or navigating through multiple pages. You should break it into subgoals and complete each subgoal one by one in the order of the instructions.
|
|
61
|
-
8. Avoid choosing the same action/elements multiple times in a row, if it happens, reflect to yourself, what may have gone wrong, and predict a different action.
|
|
62
|
-
9. Reflect whether the element is clickable or not, for example reflect if it is an hyperlink or a button or a normal text.
|
|
63
|
-
10. If you are prompted with login information page or captcha page, or you think it need user's permission to do the next action, you should say "Action": "None" in the json field.
|
|
64
|
-
"""
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
"""Omni provider tools - compatible with multiple LLM providers."""
|
|
2
|
-
|
|
3
|
-
from ....core.tools import BaseTool, ToolResult, ToolError, ToolFailure, CLIResult
|
|
4
|
-
from .base import BaseOmniTool
|
|
5
|
-
from .computer import ComputerTool
|
|
6
|
-
from .bash import BashTool
|
|
7
|
-
from .manager import ToolManager
|
|
8
|
-
|
|
9
|
-
# Re-export the tools with Omni-specific names for backward compatibility
|
|
10
|
-
OmniToolResult = ToolResult
|
|
11
|
-
OmniToolError = ToolError
|
|
12
|
-
OmniToolFailure = ToolFailure
|
|
13
|
-
OmniCLIResult = CLIResult
|
|
14
|
-
|
|
15
|
-
# We'll export specific tools once implemented
|
|
16
|
-
__all__ = [
|
|
17
|
-
"BaseTool",
|
|
18
|
-
"BaseOmniTool",
|
|
19
|
-
"ToolResult",
|
|
20
|
-
"ToolError",
|
|
21
|
-
"ToolFailure",
|
|
22
|
-
"CLIResult",
|
|
23
|
-
"OmniToolResult",
|
|
24
|
-
"OmniToolError",
|
|
25
|
-
"OmniToolFailure",
|
|
26
|
-
"OmniCLIResult",
|
|
27
|
-
"ComputerTool",
|
|
28
|
-
"BashTool",
|
|
29
|
-
"ToolManager",
|
|
30
|
-
]
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
"""Omni-specific tool base classes."""
|
|
2
|
-
|
|
3
|
-
from abc import ABCMeta, abstractmethod
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
-
|
|
6
|
-
from ....core.tools.base import BaseTool
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BaseOmniTool(BaseTool, metaclass=ABCMeta):
    """Abstract contract shared by the Omni provider tools.

    Concrete tools must implement ``__call__`` (run the tool) and
    ``to_params`` (describe the tool to the provider API).
    """

    def __init__(self):
        """Initialize the base Omni tool; there is currently no state to set up."""

    @abstractmethod
    async def __call__(self, **kwargs) -> Any:
        """Run the tool with the supplied keyword arguments."""

    @abstractmethod
    def to_params(self) -> Dict[str, Any]:
        """Describe this tool as Omni provider-specific API parameters.

        Returns:
            Dictionary with tool parameters for the specific API
        """
        raise NotImplementedError
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
"""Bash tool for Omni provider."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
-
|
|
6
|
-
from computer import Computer
|
|
7
|
-
from ....core.tools import ToolResult, ToolError
|
|
8
|
-
from .base import BaseOmniTool
|
|
9
|
-
|
|
10
|
-
logger = logging.getLogger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class BashTool(BaseOmniTool):
    """Omni tool exposing a ``bash`` function to the model.

    NOTE: ``__call__`` is a placeholder. It logs the command and reports it
    as executed, but nothing is run on ``computer``.
    """

    name = "bash"
    description = "Execute bash commands on the system"

    def __init__(self, computer: Computer):
        """Store the computer this tool is bound to.

        Args:
            computer: Computer instance
        """
        super().__init__()
        self.computer = computer

    def to_params(self) -> Dict[str, Any]:
        """Build the function-call schema advertised for this tool.

        Returns:
            Dictionary with tool parameters: a ``function`` entry taking one
            required string argument, ``command``.
        """
        command_schema = {
            "type": "string",
            "description": "The bash command to execute",
        }
        parameters = {
            "type": "object",
            "properties": {"command": command_schema},
            "required": ["command"],
        }
        function_spec = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    async def __call__(self, **kwargs) -> ToolResult:
        """Handle a bash tool invocation (placeholder implementation).

        Args:
            **kwargs: Command parameters; ``command`` holds the command text.

        Returns:
            A ToolResult with an error when ``command`` is missing or empty,
            otherwise a ToolResult whose output echoes the command. Any
            exception is logged and returned as an error ToolResult.
        """
        try:
            command = kwargs.get("command", "")
            if command:
                # Placeholder: the command is only logged, never executed.
                logger.info(f"Would execute command: {command}")
                return ToolResult(output=f"Command executed (placeholder): {command}")
            return ToolResult(error="No command specified")

        except Exception as exc:
            logger.error(f"Error in bash tool: {str(exc)}")
            return ToolResult(error=f"Error: {str(exc)}")
|
|
@@ -1,179 +0,0 @@
|
|
|
1
|
-
"""Computer tool for Omni provider."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
-
import json
|
|
6
|
-
|
|
7
|
-
from computer import Computer
|
|
8
|
-
from ....core.tools import ToolResult, ToolError
|
|
9
|
-
from .base import BaseOmniTool
|
|
10
|
-
from ..parser import ParseResult
|
|
11
|
-
|
|
12
|
-
logger = logging.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class ComputerTool(BaseOmniTool):
    """Tool for interacting with the computer UI.

    The requested ``action`` is dispatched to the method of the same name on
    ``computer.interface``. Only the actions advertised in the schema are
    accepted, so a model-supplied action name cannot reach arbitrary
    attributes of the interface.
    """

    name = "computer"
    description = "Interact with the computer's graphical user interface"

    # Actions advertised by to_params(); also the allow-list used by __call__.
    _SUPPORTED_ACTIONS = (
        "left_click",
        "right_click",
        "double_click",
        "move_cursor",
        "drag_to",
        "type_text",
        "press_key",
        "hotkey",
        "scroll_up",
        "scroll_down",
    )
    # Actions that take a single (x, y) target.
    _POINTER_ACTIONS = ("left_click", "right_click", "double_click", "move_cursor")

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance
        """
        super().__init__()
        self.computer = computer
        # Default to standard screen dimensions (will be set more accurately during initialization)
        self.screen_dimensions = {"width": 1440, "height": 900}

    async def initialize_dimensions(self) -> None:
        """Initialize screen dimensions.

        Currently this only logs the defaults set in ``__init__``; no
        detection is performed.
        """
        logger.info(f"Using default screen dimensions: {self.screen_dimensions}")

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with tool parameters: a ``function`` entry whose only
            required argument is ``action``.
        """
        return {
            "type": "function",
            "function": {
                "name": self.name,
                "description": self.description,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "action": {
                            "type": "string",
                            "enum": list(self._SUPPORTED_ACTIONS),
                            "description": "The action to perform",
                        },
                        "x": {
                            "type": "number",
                            "description": "X coordinate for click or cursor movement",
                        },
                        "y": {
                            "type": "number",
                            "description": "Y coordinate for click or cursor movement",
                        },
                        "box_id": {
                            "type": "integer",
                            "description": "ID of the UI element to interact with",
                        },
                        "text": {
                            "type": "string",
                            "description": "Text to type",
                        },
                        "key": {
                            "type": "string",
                            "description": "Key to press",
                        },
                        "keys": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "Keys to press as hotkey combination",
                        },
                        "amount": {
                            "type": "integer",
                            "description": "Amount to scroll",
                        },
                        "duration": {
                            "type": "number",
                            "description": "Duration for drag operations",
                        },
                    },
                    "required": ["action"],
                },
            },
        }

    async def __call__(self, **kwargs) -> ToolResult:
        """Execute computer action.

        Args:
            **kwargs: Action parameters. ``action`` selects the operation;
                the other keys (``x``, ``y``, ``box_id``, ``text``, ``key``,
                ``keys``, ``amount``, ``duration``, ``button``) are read
                according to the action.

        Returns:
            Tool execution result. Missing or invalid parameters, an action
            outside the advertised set, and any exception raised while
            executing are all reported as an error ToolResult rather than
            raised.
        """
        try:
            # Tolerate an explicit None instead of failing on .lower().
            action = str(kwargs.get("action") or "").lower()
            if not action:
                return ToolResult(error="No action specified")

            # Reject anything not advertised before touching the interface:
            # the action name comes from the model and is used with getattr.
            if action not in self._SUPPORTED_ACTIONS:
                return ToolResult(error=f"Unsupported action: {action}")

            method = getattr(self.computer.interface, action, None)
            if not method:
                return ToolResult(error=f"Unsupported action: {action}")

            # Prepare arguments based on action type
            args = {}
            if action in self._POINTER_ACTIONS:
                x = kwargs.get("x")
                y = kwargs.get("y")
                if x is None or y is None:
                    if kwargs.get("box_id") is None:
                        return ToolResult(error="Box ID or coordinates required")
                    # Resolving a box_id to coordinates is not implemented.
                    return ToolResult(error="Box ID-based clicking not implemented yet")
                args["x"] = x
                args["y"] = y
            elif action == "drag_to":
                x = kwargs.get("x")
                y = kwargs.get("y")
                if x is None or y is None:
                    return ToolResult(error="Coordinates required for drag_to")
                args.update(
                    {
                        "x": x,
                        "y": y,
                        "button": kwargs.get("button", "left"),
                        "duration": float(kwargs.get("duration", 0.5)),
                    }
                )
            elif action == "type_text":
                text = kwargs.get("text")
                if not text:
                    return ToolResult(error="Text required for type_text")
                args["text"] = text
            elif action == "press_key":
                key = kwargs.get("key")
                if not key:
                    return ToolResult(error="Key required for press_key")
                args["key"] = key
            elif action == "hotkey":
                keys = kwargs.get("keys")
                # A combination given as one string ("command+space") must be
                # split; unpacking the string would send one key per character.
                if isinstance(keys, str):
                    keys = [part.strip() for part in keys.split("+") if part.strip()]
                if not keys:
                    return ToolResult(error="Keys required for hotkey")
                # Call with positional arguments instead of kwargs
                await method(*keys)
                return ToolResult(output=f"Hotkey executed: {'+'.join(keys)}")
            elif action in ("scroll_down", "scroll_up"):
                args["clicks"] = int(kwargs.get("amount", 1))

            # Execute action with prepared arguments
            await method(**args)
            return ToolResult(output=f"Action {action} executed successfully")

        except Exception as e:
            logger.error(f"Error executing computer action: {str(e)}")
            return ToolResult(error=f"Error: {str(e)}")
|