cua-agent: cua_agent-0.1.5-py3-none-any.whl → cua_agent-0.1.6-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +2 -4
- agent/core/__init__.py +3 -5
- agent/core/computer_agent.py +213 -31
- agent/core/experiment.py +20 -3
- agent/core/loop.py +12 -8
- agent/core/telemetry.py +44 -32
- agent/providers/anthropic/loop.py +44 -15
- agent/providers/anthropic/messages/manager.py +3 -1
- agent/providers/anthropic/tools/base.py +1 -1
- agent/providers/anthropic/tools/collection.py +2 -2
- agent/providers/anthropic/tools/computer.py +34 -24
- agent/providers/anthropic/tools/manager.py +2 -2
- agent/providers/omni/experiment.py +5 -2
- agent/providers/omni/loop.py +12 -6
- agent/providers/omni/parser.py +2 -1
- agent/providers/omni/tools/__init__.py +0 -1
- agent/providers/omni/tools/computer.py +3 -2
- agent/providers/omni/tools/manager.py +1 -3
- agent/providers/omni/utils.py +4 -2
- agent/types/__init__.py +1 -4
- agent/types/base.py +0 -12
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.6.dist-info}/METADATA +1 -1
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.6.dist-info}/RECORD +25 -28
- agent/core/agent.py +0 -252
- agent/core/base_agent.py +0 -164
- agent/core/factory.py +0 -102
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.6.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.5.dist-info → cua_agent-0.1.6.dist-info}/entry_points.txt +0 -0
|
@@ -17,6 +17,7 @@ from anthropic.types.beta import (
|
|
|
17
17
|
BetaTextBlock,
|
|
18
18
|
BetaTextBlockParam,
|
|
19
19
|
BetaToolUseBlockParam,
|
|
20
|
+
BetaContentBlockParam,
|
|
20
21
|
)
|
|
21
22
|
|
|
22
23
|
# Computer
|
|
@@ -24,12 +25,12 @@ from computer import Computer
|
|
|
24
25
|
|
|
25
26
|
# Base imports
|
|
26
27
|
from ...core.loop import BaseLoop
|
|
27
|
-
from ...core.messages import ImageRetentionConfig
|
|
28
|
+
from ...core.messages import ImageRetentionConfig as CoreImageRetentionConfig
|
|
28
29
|
|
|
29
30
|
# Anthropic provider-specific imports
|
|
30
31
|
from .api.client import AnthropicClientFactory, BaseAnthropicClient
|
|
31
32
|
from .tools.manager import ToolManager
|
|
32
|
-
from .messages.manager import MessageManager
|
|
33
|
+
from .messages.manager import MessageManager, ImageRetentionConfig
|
|
33
34
|
from .callbacks.manager import CallbackManager
|
|
34
35
|
from .prompts import SYSTEM_PROMPT
|
|
35
36
|
from .types import LLMProvider
|
|
@@ -48,8 +49,8 @@ class AnthropicLoop(BaseLoop):
|
|
|
48
49
|
def __init__(
|
|
49
50
|
self,
|
|
50
51
|
api_key: str,
|
|
52
|
+
computer: Computer,
|
|
51
53
|
model: str = "claude-3-7-sonnet-20250219", # Fixed model
|
|
52
|
-
computer: Optional[Computer] = None,
|
|
53
54
|
only_n_most_recent_images: Optional[int] = 2,
|
|
54
55
|
base_dir: Optional[str] = "trajectories",
|
|
55
56
|
max_retries: int = 3,
|
|
@@ -69,7 +70,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
69
70
|
retry_delay: Delay between retries in seconds
|
|
70
71
|
save_trajectory: Whether to save trajectory data
|
|
71
72
|
"""
|
|
72
|
-
# Initialize base class
|
|
73
|
+
# Initialize base class with core config
|
|
73
74
|
super().__init__(
|
|
74
75
|
computer=computer,
|
|
75
76
|
model=model,
|
|
@@ -93,8 +94,8 @@ class AnthropicLoop(BaseLoop):
|
|
|
93
94
|
self.message_manager = None
|
|
94
95
|
self.callback_manager = None
|
|
95
96
|
|
|
96
|
-
# Configure image retention
|
|
97
|
-
self.image_retention_config = ImageRetentionConfig(
|
|
97
|
+
# Configure image retention with core config
|
|
98
|
+
self.image_retention_config = CoreImageRetentionConfig(
|
|
98
99
|
num_images_to_keep=only_n_most_recent_images
|
|
99
100
|
)
|
|
100
101
|
|
|
@@ -113,7 +114,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
113
114
|
|
|
114
115
|
# Initialize message manager
|
|
115
116
|
self.message_manager = MessageManager(
|
|
116
|
-
ImageRetentionConfig(
|
|
117
|
+
image_retention_config=ImageRetentionConfig(
|
|
117
118
|
num_images_to_keep=self.only_n_most_recent_images, enable_caching=True
|
|
118
119
|
)
|
|
119
120
|
)
|
|
@@ -250,6 +251,10 @@ class AnthropicLoop(BaseLoop):
|
|
|
250
251
|
await self._process_screen(parsed_screen, self.message_history)
|
|
251
252
|
|
|
252
253
|
# Prepare messages and make API call
|
|
254
|
+
if self.message_manager is None:
|
|
255
|
+
raise RuntimeError(
|
|
256
|
+
"Message manager not initialized. Call initialize_client() first."
|
|
257
|
+
)
|
|
253
258
|
prepared_messages = self.message_manager.prepare_messages(
|
|
254
259
|
cast(List[BetaMessageParam], self.message_history.copy())
|
|
255
260
|
)
|
|
@@ -257,7 +262,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
257
262
|
# Create new turn directory for this API call
|
|
258
263
|
self._create_turn_dir()
|
|
259
264
|
|
|
260
|
-
#
|
|
265
|
+
# Use _make_api_call instead of direct client call to ensure logging
|
|
261
266
|
response = await self._make_api_call(prepared_messages)
|
|
262
267
|
|
|
263
268
|
# Handle the response
|
|
@@ -287,6 +292,11 @@ class AnthropicLoop(BaseLoop):
|
|
|
287
292
|
Returns:
|
|
288
293
|
API response
|
|
289
294
|
"""
|
|
295
|
+
if self.client is None:
|
|
296
|
+
raise RuntimeError("Client not initialized. Call initialize_client() first.")
|
|
297
|
+
if self.tool_manager is None:
|
|
298
|
+
raise RuntimeError("Tool manager not initialized. Call initialize_client() first.")
|
|
299
|
+
|
|
290
300
|
last_error = None
|
|
291
301
|
|
|
292
302
|
for attempt in range(self.max_retries):
|
|
@@ -297,6 +307,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
297
307
|
"max_tokens": self.max_tokens,
|
|
298
308
|
"system": SYSTEM_PROMPT,
|
|
299
309
|
}
|
|
310
|
+
# Let ExperimentManager handle sanitization
|
|
300
311
|
self._log_api_call("request", request_data)
|
|
301
312
|
|
|
302
313
|
# Setup betas and system
|
|
@@ -320,7 +331,7 @@ class AnthropicLoop(BaseLoop):
|
|
|
320
331
|
betas=betas,
|
|
321
332
|
)
|
|
322
333
|
|
|
323
|
-
#
|
|
334
|
+
# Let ExperimentManager handle sanitization
|
|
324
335
|
self._log_api_call("response", request_data, response)
|
|
325
336
|
|
|
326
337
|
return response
|
|
@@ -365,25 +376,38 @@ class AnthropicLoop(BaseLoop):
|
|
|
365
376
|
}
|
|
366
377
|
)
|
|
367
378
|
|
|
379
|
+
if self.callback_manager is None:
|
|
380
|
+
raise RuntimeError(
|
|
381
|
+
"Callback manager not initialized. Call initialize_client() first."
|
|
382
|
+
)
|
|
383
|
+
|
|
368
384
|
# Handle tool use blocks and collect results
|
|
369
385
|
tool_result_content = []
|
|
370
386
|
for content_block in response_params:
|
|
371
387
|
# Notify callback of content
|
|
372
|
-
self.callback_manager.on_content(content_block)
|
|
388
|
+
self.callback_manager.on_content(cast(BetaContentBlockParam, content_block))
|
|
373
389
|
|
|
374
390
|
# Handle tool use
|
|
375
391
|
if content_block.get("type") == "tool_use":
|
|
392
|
+
if self.tool_manager is None:
|
|
393
|
+
raise RuntimeError(
|
|
394
|
+
"Tool manager not initialized. Call initialize_client() first."
|
|
395
|
+
)
|
|
376
396
|
result = await self.tool_manager.execute_tool(
|
|
377
397
|
name=content_block["name"],
|
|
378
398
|
tool_input=cast(Dict[str, Any], content_block["input"]),
|
|
379
399
|
)
|
|
380
400
|
|
|
381
401
|
# Create tool result and add to content
|
|
382
|
-
tool_result = self._make_tool_result(
|
|
402
|
+
tool_result = self._make_tool_result(
|
|
403
|
+
cast(ToolResult, result), content_block["id"]
|
|
404
|
+
)
|
|
383
405
|
tool_result_content.append(tool_result)
|
|
384
406
|
|
|
385
407
|
# Notify callback of tool result
|
|
386
|
-
self.callback_manager.on_tool_result(
|
|
408
|
+
self.callback_manager.on_tool_result(
|
|
409
|
+
cast(ToolResult, result), content_block["id"]
|
|
410
|
+
)
|
|
387
411
|
|
|
388
412
|
# If no tool results, we're done
|
|
389
413
|
if not tool_result_content:
|
|
@@ -495,13 +519,13 @@ class AnthropicLoop(BaseLoop):
|
|
|
495
519
|
result_text = f"<s>{result.system}</s>\n{result_text}"
|
|
496
520
|
return result_text
|
|
497
521
|
|
|
498
|
-
def _handle_content(self, content:
|
|
522
|
+
def _handle_content(self, content: BetaContentBlockParam) -> None:
|
|
499
523
|
"""Handle content updates from the assistant."""
|
|
500
524
|
if content.get("type") == "text":
|
|
501
|
-
|
|
525
|
+
text_content = cast(BetaTextBlockParam, content)
|
|
526
|
+
text = text_content["text"]
|
|
502
527
|
if text == "<DONE>":
|
|
503
528
|
return
|
|
504
|
-
|
|
505
529
|
logger.info(f"Assistant: {text}")
|
|
506
530
|
|
|
507
531
|
def _handle_tool_result(self, result: ToolResult, tool_id: str) -> None:
|
|
@@ -517,5 +541,10 @@ class AnthropicLoop(BaseLoop):
|
|
|
517
541
|
"""Handle API interactions."""
|
|
518
542
|
if error:
|
|
519
543
|
logger.error(f"API error: {error}")
|
|
544
|
+
self._log_api_call("error", request, error=error)
|
|
520
545
|
else:
|
|
521
546
|
logger.debug(f"API request: {request}")
|
|
547
|
+
if response:
|
|
548
|
+
self._log_api_call("response", request, response)
|
|
549
|
+
else:
|
|
550
|
+
self._log_api_call("request", request)
|
|
@@ -90,7 +90,9 @@ class MessageManager:
|
|
|
90
90
|
blocks_with_cache_control += 1
|
|
91
91
|
# Add cache control to the last content block only
|
|
92
92
|
if content and len(content) > 0:
|
|
93
|
-
content[-1]["cache_control"] =
|
|
93
|
+
content[-1]["cache_control"] = BetaCacheControlEphemeralParam(
|
|
94
|
+
type="ephemeral"
|
|
95
|
+
)
|
|
94
96
|
else:
|
|
95
97
|
# Remove any existing cache control
|
|
96
98
|
if content and len(content) > 0:
|
|
@@ -6,7 +6,7 @@ from typing import Any, Dict
|
|
|
6
6
|
|
|
7
7
|
from anthropic.types.beta import BetaToolUnionParam
|
|
8
8
|
|
|
9
|
-
from ....core.tools.base import BaseTool
|
|
9
|
+
from ....core.tools.base import BaseTool
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class BaseAnthropicTool(BaseTool, metaclass=ABCMeta):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Collection classes for managing multiple tools."""
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
from anthropic.types.beta import BetaToolUnionParam
|
|
6
6
|
|
|
@@ -22,7 +22,7 @@ class ToolCollection:
|
|
|
22
22
|
def to_params(
|
|
23
23
|
self,
|
|
24
24
|
) -> list[BetaToolUnionParam]:
|
|
25
|
-
return [tool.to_params() for tool in self.tools]
|
|
25
|
+
return cast(list[BetaToolUnionParam], [tool.to_params() for tool in self.tools])
|
|
26
26
|
|
|
27
27
|
async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
|
|
28
28
|
tool = self.tool_map.get(name)
|
|
@@ -61,9 +61,9 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
61
61
|
|
|
62
62
|
name: Literal["computer"] = "computer"
|
|
63
63
|
api_type: Literal["computer_20250124"] = "computer_20250124"
|
|
64
|
-
width: int | None
|
|
65
|
-
height: int | None
|
|
66
|
-
display_num: int | None
|
|
64
|
+
width: int | None = None
|
|
65
|
+
height: int | None = None
|
|
66
|
+
display_num: int | None = None
|
|
67
67
|
computer: Computer # The CUA Computer instance
|
|
68
68
|
logger = logging.getLogger(__name__)
|
|
69
69
|
|
|
@@ -106,6 +106,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
106
106
|
display_size = await self.computer.interface.get_screen_size()
|
|
107
107
|
self.width = display_size["width"]
|
|
108
108
|
self.height = display_size["height"]
|
|
109
|
+
assert isinstance(self.width, int) and isinstance(self.height, int)
|
|
109
110
|
self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
|
|
110
111
|
|
|
111
112
|
async def __call__(
|
|
@@ -120,6 +121,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
120
121
|
# Ensure dimensions are initialized
|
|
121
122
|
if self.width is None or self.height is None:
|
|
122
123
|
await self.initialize_dimensions()
|
|
124
|
+
if self.width is None or self.height is None:
|
|
125
|
+
raise ToolError("Failed to initialize screen dimensions")
|
|
123
126
|
except Exception as e:
|
|
124
127
|
raise ToolError(f"Failed to initialize dimensions: {e}")
|
|
125
128
|
|
|
@@ -147,7 +150,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
147
150
|
self.logger.info(
|
|
148
151
|
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
|
|
149
152
|
)
|
|
150
|
-
|
|
153
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
154
|
+
raise ToolError("Screen dimensions must be integers")
|
|
155
|
+
size = (int(self.width), int(self.height))
|
|
156
|
+
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
151
157
|
|
|
152
158
|
self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
|
|
153
159
|
|
|
@@ -160,15 +166,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
160
166
|
await self.computer.interface.move_cursor(x, y)
|
|
161
167
|
# Then perform drag operation - check if drag_to exists or we need to use other methods
|
|
162
168
|
try:
|
|
163
|
-
|
|
164
|
-
await self.computer.interface.drag_to(x, y)
|
|
165
|
-
else:
|
|
166
|
-
# Alternative approach: press mouse down, move, release
|
|
167
|
-
await self.computer.interface.mouse_down()
|
|
168
|
-
await asyncio.sleep(0.2)
|
|
169
|
-
await self.computer.interface.move_cursor(x, y)
|
|
170
|
-
await asyncio.sleep(0.2)
|
|
171
|
-
await self.computer.interface.mouse_up()
|
|
169
|
+
await self.computer.interface.drag_to(x, y)
|
|
172
170
|
except Exception as e:
|
|
173
171
|
self.logger.error(f"Error during drag operation: {str(e)}")
|
|
174
172
|
raise ToolError(f"Failed to perform drag: {str(e)}")
|
|
@@ -214,9 +212,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
214
212
|
self.logger.info(
|
|
215
213
|
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
|
|
216
214
|
)
|
|
217
|
-
|
|
218
|
-
(
|
|
219
|
-
)
|
|
215
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
216
|
+
raise ToolError("Screen dimensions must be integers")
|
|
217
|
+
size = (int(self.width), int(self.height))
|
|
218
|
+
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
220
219
|
# Save the scaled image back to bytes
|
|
221
220
|
buffer = io.BytesIO()
|
|
222
221
|
pre_img.save(buffer, format="PNG")
|
|
@@ -275,9 +274,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
275
274
|
self.logger.info(
|
|
276
275
|
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
277
276
|
)
|
|
278
|
-
|
|
279
|
-
(
|
|
280
|
-
)
|
|
277
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
278
|
+
raise ToolError("Screen dimensions must be integers")
|
|
279
|
+
size = (int(self.width), int(self.height))
|
|
280
|
+
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
281
281
|
|
|
282
282
|
# Perform the click action
|
|
283
283
|
if action == "left_click":
|
|
@@ -335,7 +335,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
335
335
|
self.logger.info(
|
|
336
336
|
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
|
|
337
337
|
)
|
|
338
|
-
|
|
338
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
339
|
+
raise ToolError("Screen dimensions must be integers")
|
|
340
|
+
size = (int(self.width), int(self.height))
|
|
341
|
+
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
|
|
339
342
|
|
|
340
343
|
if action == "key":
|
|
341
344
|
# Special handling for page up/down on macOS
|
|
@@ -365,7 +368,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
365
368
|
# Handle single key press
|
|
366
369
|
self.logger.info(f"Pressing key: {text}")
|
|
367
370
|
try:
|
|
368
|
-
await self.computer.interface.
|
|
371
|
+
await self.computer.interface.press_key(text)
|
|
369
372
|
output_text = text
|
|
370
373
|
except ValueError as e:
|
|
371
374
|
raise ToolError(f"Invalid key: {text}. {str(e)}")
|
|
@@ -442,7 +445,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
442
445
|
self.logger.info(
|
|
443
446
|
f"Scaling image from {img.size} to {self.width}x{self.height}"
|
|
444
447
|
)
|
|
445
|
-
|
|
448
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
449
|
+
raise ToolError("Screen dimensions must be integers")
|
|
450
|
+
size = (int(self.width), int(self.height))
|
|
451
|
+
img = img.resize(size, Image.Resampling.LANCZOS)
|
|
446
452
|
buffer = io.BytesIO()
|
|
447
453
|
img.save(buffer, format="PNG")
|
|
448
454
|
screenshot = buffer.getvalue()
|
|
@@ -451,7 +457,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
451
457
|
|
|
452
458
|
elif action == "cursor_position":
|
|
453
459
|
pos = await self.computer.interface.get_cursor_position()
|
|
454
|
-
|
|
460
|
+
x, y = pos # Unpack the tuple
|
|
461
|
+
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
455
462
|
|
|
456
463
|
except Exception as e:
|
|
457
464
|
self.logger.error(f"Error during {action} action: {str(e)}")
|
|
@@ -517,7 +524,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
|
|
|
517
524
|
# Scale image if needed
|
|
518
525
|
if img.size != (self.width, self.height):
|
|
519
526
|
self.logger.info(f"Scaling image from {img.size} to {self.width}x{self.height}")
|
|
520
|
-
|
|
527
|
+
if not isinstance(self.width, int) or not isinstance(self.height, int):
|
|
528
|
+
raise ToolError("Screen dimensions must be integers")
|
|
529
|
+
size = (int(self.width), int(self.height))
|
|
530
|
+
img = img.resize(size, Image.Resampling.LANCZOS)
|
|
521
531
|
buffer = io.BytesIO()
|
|
522
532
|
img.save(buffer, format="PNG")
|
|
523
533
|
screenshot = buffer.getvalue()
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any, Dict, List
|
|
1
|
+
from typing import Any, Dict, List, cast
|
|
2
2
|
from anthropic.types.beta import BetaToolUnionParam
|
|
3
3
|
from computer.computer import Computer
|
|
4
4
|
|
|
@@ -37,7 +37,7 @@ class ToolManager(BaseToolManager):
|
|
|
37
37
|
"""Get tool parameters for Anthropic API calls."""
|
|
38
38
|
if self.tools is None:
|
|
39
39
|
raise RuntimeError("Tools not initialized. Call initialize() first.")
|
|
40
|
-
return self.tools.to_params()
|
|
40
|
+
return cast(List[BetaToolUnionParam], self.tools.to_params())
|
|
41
41
|
|
|
42
42
|
async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
|
|
43
43
|
"""Execute a tool with the given input.
|
|
@@ -126,15 +126,18 @@ class ExperimentManager:
|
|
|
126
126
|
# Since we no longer want to use the images/ folder, we'll skip this functionality
|
|
127
127
|
return
|
|
128
128
|
|
|
129
|
-
def save_screenshot(self, img_base64: str, action_type: str = "") ->
|
|
129
|
+
def save_screenshot(self, img_base64: str, action_type: str = "") -> Optional[str]:
|
|
130
130
|
"""Save a screenshot to the experiment directory.
|
|
131
131
|
|
|
132
132
|
Args:
|
|
133
133
|
img_base64: Base64 encoded screenshot
|
|
134
134
|
action_type: Type of action that triggered the screenshot
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Optional[str]: Path to the saved screenshot, or None if saving failed
|
|
135
138
|
"""
|
|
136
139
|
if not self.current_turn_dir:
|
|
137
|
-
return
|
|
140
|
+
return None
|
|
138
141
|
|
|
139
142
|
try:
|
|
140
143
|
# Increment screenshot counter
|
agent/providers/omni/loop.py
CHANGED
|
@@ -13,6 +13,7 @@ import asyncio
|
|
|
13
13
|
from httpx import ConnectError, ReadTimeout
|
|
14
14
|
import shutil
|
|
15
15
|
import copy
|
|
16
|
+
from typing import cast
|
|
16
17
|
|
|
17
18
|
from .parser import OmniParser, ParseResult, ParserMetadata, UIElement
|
|
18
19
|
from ...core.loop import BaseLoop
|
|
@@ -182,8 +183,6 @@ class OmniLoop(BaseLoop):
|
|
|
182
183
|
|
|
183
184
|
if self.provider == LLMProvider.OPENAI:
|
|
184
185
|
self.client = OpenAIClient(api_key=self.api_key, model=self.model)
|
|
185
|
-
elif self.provider == LLMProvider.GROQ:
|
|
186
|
-
self.client = GroqClient(api_key=self.api_key, model=self.model)
|
|
187
186
|
elif self.provider == LLMProvider.ANTHROPIC:
|
|
188
187
|
self.client = AnthropicClient(
|
|
189
188
|
api_key=self.api_key,
|
|
@@ -329,10 +328,15 @@ class OmniLoop(BaseLoop):
|
|
|
329
328
|
raise RuntimeError(error_message)
|
|
330
329
|
|
|
331
330
|
async def _handle_response(
|
|
332
|
-
self, response: Any, messages: List[Dict[str, Any]], parsed_screen:
|
|
331
|
+
self, response: Any, messages: List[Dict[str, Any]], parsed_screen: ParseResult
|
|
333
332
|
) -> Tuple[bool, bool]:
|
|
334
333
|
"""Handle API response.
|
|
335
334
|
|
|
335
|
+
Args:
|
|
336
|
+
response: API response
|
|
337
|
+
messages: List of messages to update
|
|
338
|
+
parsed_screen: Current parsed screen information
|
|
339
|
+
|
|
336
340
|
Returns:
|
|
337
341
|
Tuple of (should_continue, action_screenshot_saved)
|
|
338
342
|
"""
|
|
@@ -394,7 +398,9 @@ class OmniLoop(BaseLoop):
|
|
|
394
398
|
|
|
395
399
|
try:
|
|
396
400
|
# Execute action with current parsed screen info
|
|
397
|
-
await self._execute_action(
|
|
401
|
+
await self._execute_action(
|
|
402
|
+
parsed_content, cast(ParseResult, parsed_screen)
|
|
403
|
+
)
|
|
398
404
|
action_screenshot_saved = True
|
|
399
405
|
except Exception as e:
|
|
400
406
|
logger.error(f"Error executing action: {str(e)}")
|
|
@@ -463,7 +469,7 @@ class OmniLoop(BaseLoop):
|
|
|
463
469
|
|
|
464
470
|
try:
|
|
465
471
|
# Execute action with current parsed screen info
|
|
466
|
-
await self._execute_action(parsed_content, parsed_screen)
|
|
472
|
+
await self._execute_action(parsed_content, cast(ParseResult, parsed_screen))
|
|
467
473
|
action_screenshot_saved = True
|
|
468
474
|
except Exception as e:
|
|
469
475
|
logger.error(f"Error executing action: {str(e)}")
|
|
@@ -488,7 +494,7 @@ class OmniLoop(BaseLoop):
|
|
|
488
494
|
|
|
489
495
|
try:
|
|
490
496
|
# Execute action with current parsed screen info
|
|
491
|
-
await self._execute_action(content, parsed_screen)
|
|
497
|
+
await self._execute_action(content, cast(ParseResult, parsed_screen))
|
|
492
498
|
action_screenshot_saved = True
|
|
493
499
|
except Exception as e:
|
|
494
500
|
logger.error(f"Error executing action: {str(e)}")
|
agent/providers/omni/parser.py
CHANGED
|
@@ -122,8 +122,9 @@ class OmniParser:
|
|
|
122
122
|
# Create a minimal valid result for error cases
|
|
123
123
|
return ParseResult(
|
|
124
124
|
elements=[],
|
|
125
|
+
screen_info=None,
|
|
125
126
|
annotated_image_base64="",
|
|
126
|
-
parsed_content_list=[
|
|
127
|
+
parsed_content_list=[{"error": str(e)}],
|
|
127
128
|
metadata=ParserMetadata(
|
|
128
129
|
image_size=(0, 0),
|
|
129
130
|
num_icons=0,
|
|
@@ -177,7 +177,7 @@ class OmniComputerTool(BaseComputerTool):
|
|
|
177
177
|
keys = text.split("+")
|
|
178
178
|
await self.computer.interface.hotkey(*keys)
|
|
179
179
|
else:
|
|
180
|
-
await self.computer.interface.
|
|
180
|
+
await self.computer.interface.press_key(text)
|
|
181
181
|
|
|
182
182
|
# Take screenshot after action
|
|
183
183
|
screenshot = await self.computer.interface.screenshot()
|
|
@@ -188,7 +188,8 @@ class OmniComputerTool(BaseComputerTool):
|
|
|
188
188
|
)
|
|
189
189
|
elif action == "cursor_position":
|
|
190
190
|
pos = await self.computer.interface.get_cursor_position()
|
|
191
|
-
|
|
191
|
+
x, y = pos
|
|
192
|
+
return ToolResult(output=f"X={int(x)},Y={int(y)}")
|
|
192
193
|
elif action == "scroll":
|
|
193
194
|
if direction == "down":
|
|
194
195
|
self.logger.info(f"Scrolling down, amount: {amount}")
|
|
@@ -10,7 +10,6 @@ from ....core.tools.collection import ToolCollection
|
|
|
10
10
|
|
|
11
11
|
from .bash import OmniBashTool
|
|
12
12
|
from .computer import OmniComputerTool
|
|
13
|
-
from .edit import OmniEditTool
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
class ProviderType(Enum):
|
|
@@ -35,11 +34,10 @@ class OmniToolManager(BaseToolManager):
|
|
|
35
34
|
# Initialize tools
|
|
36
35
|
self.computer_tool = OmniComputerTool(self.computer)
|
|
37
36
|
self.bash_tool = OmniBashTool(self.computer)
|
|
38
|
-
self.edit_tool = OmniEditTool(self.computer)
|
|
39
37
|
|
|
40
38
|
def _initialize_tools(self) -> ToolCollection:
|
|
41
39
|
"""Initialize all available tools."""
|
|
42
|
-
return ToolCollection(self.computer_tool, self.bash_tool, self.edit_tool)
|
|
40
|
+
return ToolCollection(self.computer_tool, self.bash_tool)
|
|
43
41
|
|
|
44
42
|
async def _initialize_tools_specific(self) -> None:
|
|
45
43
|
"""Initialize provider-specific tool requirements."""
|
agent/providers/omni/utils.py
CHANGED
|
@@ -96,7 +96,7 @@ def compress_image_base64(
|
|
|
96
96
|
# Resize image
|
|
97
97
|
new_width = int(img.width * scale_factor)
|
|
98
98
|
new_height = int(img.height * scale_factor)
|
|
99
|
-
current_img = img.resize((new_width, new_height), Image.LANCZOS)
|
|
99
|
+
current_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
|
100
100
|
|
|
101
101
|
# Try with reduced size and quality
|
|
102
102
|
buffer = io.BytesIO()
|
|
@@ -130,7 +130,9 @@ def compress_image_base64(
|
|
|
130
130
|
|
|
131
131
|
# Last resort: Use minimum quality and size
|
|
132
132
|
buffer = io.BytesIO()
|
|
133
|
-
smallest_img = img.resize(
|
|
133
|
+
smallest_img = img.resize(
|
|
134
|
+
(int(img.width * 0.5), int(img.height * 0.5)), Image.Resampling.LANCZOS
|
|
135
|
+
)
|
|
134
136
|
# Convert to RGB if necessary
|
|
135
137
|
if smallest_img.mode in ("RGBA", "LA") or (
|
|
136
138
|
smallest_img.mode == "P" and "transparency" in smallest_img.info
|
agent/types/__init__.py
CHANGED
|
@@ -1,23 +1,20 @@
|
|
|
1
1
|
"""Type definitions for the agent package."""
|
|
2
2
|
|
|
3
|
-
from .base import Provider, HostConfig, TaskResult, Annotation
|
|
3
|
+
from .base import HostConfig, TaskResult, Annotation
|
|
4
4
|
from .messages import Message, Request, Response, StepMessage, DisengageMessage
|
|
5
5
|
from .tools import ToolInvocation, ToolInvocationState, ClientAttachment, ToolResult
|
|
6
6
|
|
|
7
7
|
__all__ = [
|
|
8
8
|
# Base types
|
|
9
|
-
"Provider",
|
|
10
9
|
"HostConfig",
|
|
11
10
|
"TaskResult",
|
|
12
11
|
"Annotation",
|
|
13
|
-
|
|
14
12
|
# Message types
|
|
15
13
|
"Message",
|
|
16
14
|
"Request",
|
|
17
15
|
"Response",
|
|
18
16
|
"StepMessage",
|
|
19
17
|
"DisengageMessage",
|
|
20
|
-
|
|
21
18
|
# Tool types
|
|
22
19
|
"ToolInvocation",
|
|
23
20
|
"ToolInvocationState",
|
agent/types/base.py
CHANGED
|
@@ -5,17 +5,6 @@ from typing import Dict, Any
|
|
|
5
5
|
from pydantic import BaseModel, ConfigDict
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class Provider(str, Enum):
|
|
9
|
-
"""Available AI providers."""
|
|
10
|
-
|
|
11
|
-
UNKNOWN = "unknown" # Default provider for base class
|
|
12
|
-
ANTHROPIC = "anthropic"
|
|
13
|
-
OPENAI = "openai"
|
|
14
|
-
OLLAMA = "ollama"
|
|
15
|
-
OMNI = "omni"
|
|
16
|
-
GROQ = "groq"
|
|
17
|
-
|
|
18
|
-
|
|
19
8
|
class HostConfig(BaseModel):
|
|
20
9
|
"""Host configuration."""
|
|
21
10
|
|
|
@@ -48,6 +37,5 @@ class AgentLoop(Enum):
|
|
|
48
37
|
"""Enumeration of available loop types."""
|
|
49
38
|
|
|
50
39
|
ANTHROPIC = auto() # Anthropic implementation
|
|
51
|
-
OPENAI = auto() # OpenAI implementation
|
|
52
40
|
OMNI = auto() # OmniLoop implementation
|
|
53
41
|
# Add more loop types as needed
|