cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (52)
  1. agent/__init__.py +3 -4
  2. agent/core/__init__.py +3 -10
  3. agent/core/computer_agent.py +207 -32
  4. agent/core/experiment.py +20 -3
  5. agent/core/loop.py +78 -120
  6. agent/core/messages.py +279 -125
  7. agent/core/telemetry.py +44 -32
  8. agent/core/types.py +35 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +224 -209
  14. agent/providers/anthropic/messages/manager.py +3 -1
  15. agent/providers/anthropic/response_handler.py +229 -0
  16. agent/providers/anthropic/tools/base.py +1 -1
  17. agent/providers/anthropic/tools/bash.py +0 -97
  18. agent/providers/anthropic/tools/collection.py +2 -2
  19. agent/providers/anthropic/tools/computer.py +34 -24
  20. agent/providers/anthropic/tools/manager.py +2 -2
  21. agent/providers/anthropic/utils.py +370 -0
  22. agent/providers/omni/__init__.py +1 -20
  23. agent/providers/omni/api_handler.py +42 -0
  24. agent/providers/omni/clients/anthropic.py +4 -0
  25. agent/providers/omni/image_utils.py +0 -72
  26. agent/providers/omni/loop.py +497 -607
  27. agent/providers/omni/parser.py +60 -5
  28. agent/providers/omni/tools/__init__.py +25 -8
  29. agent/providers/omni/tools/base.py +29 -0
  30. agent/providers/omni/tools/bash.py +43 -38
  31. agent/providers/omni/tools/computer.py +144 -181
  32. agent/providers/omni/tools/manager.py +26 -48
  33. agent/providers/omni/types.py +0 -4
  34. agent/providers/omni/utils.py +225 -144
  35. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
  36. cua_agent-0.1.17.dist-info/RECORD +63 -0
  37. agent/core/agent.py +0 -252
  38. agent/core/base_agent.py +0 -164
  39. agent/core/factory.py +0 -102
  40. agent/providers/omni/callbacks.py +0 -78
  41. agent/providers/omni/clients/groq.py +0 -101
  42. agent/providers/omni/experiment.py +0 -273
  43. agent/providers/omni/messages.py +0 -171
  44. agent/providers/omni/tool_manager.py +0 -91
  45. agent/providers/omni/visualization.py +0 -130
  46. agent/types/__init__.py +0 -26
  47. agent/types/base.py +0 -53
  48. agent/types/messages.py +0 -36
  49. cua_agent-0.1.5.dist-info/RECORD +0 -67
  50. /agent/{types → core}/tools.py +0 -0
  51. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
  52. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,229 @@
1
+ """Response and tool handling for Anthropic provider."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Tuple, cast
5
+
6
+ from anthropic.types.beta import (
7
+ BetaMessage,
8
+ BetaMessageParam,
9
+ BetaTextBlock,
10
+ BetaTextBlockParam,
11
+ BetaToolUseBlockParam,
12
+ BetaContentBlockParam,
13
+ )
14
+
15
+ from .tools import ToolResult
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class AnthropicResponseHandler:
    """Handles Anthropic API responses and tool execution results.

    The handler converts an API response into message parameters, runs every
    ``tool_use`` block through the parent loop's tool manager, reports content
    and tool results to the loop's callback manager, and builds the follow-up
    messages (assistant turn plus an optional user turn of tool results).
    """

    def __init__(self, loop):
        """Initialize the response handler.

        Args:
            loop: Reference to the parent loop instance that provides context.
                This class reads ``loop.callback_manager`` and
                ``loop.tool_manager`` from it.
        """
        self.loop = loop

    @staticmethod
    def _tool_use_ids(blocks: List[Any]) -> set:
        """Return the IDs of all dict-shaped ``tool_use`` blocks in ``blocks``."""
        return {
            block["id"]
            for block in blocks
            if isinstance(block, dict) and block.get("type") == "tool_use" and "id" in block
        }

    async def handle_response(
        self, response: BetaMessage, messages: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], bool]:
        """Handle the Anthropic API response.

        Args:
            response: API response
            messages: List of messages for context

        Returns:
            Tuple containing:
            - List of new messages to be added
            - Boolean indicating if the loop should continue (True only when
              at least one tool result was produced)

        Any exception raised while handling the response is logged and
        reported as an assistant message of the form ``"Error: ..."``; the
        loop is then told to stop.
        """
        # Bound before the try block so the error path can always append to it.
        new_messages: List[Dict[str, Any]] = []
        try:
            # Convert response to parameter format
            response_params = self.response_to_params(response)

            # IDs announced by this response, plus every ID already present in
            # earlier assistant turns; used to validate tool results below.
            current_tool_use_ids = self._tool_use_ids(response_params)
            existing_tool_use_ids = set(current_tool_use_ids)
            for msg in messages:
                content = msg.get("content")
                if msg.get("role") == "assistant" and isinstance(content, list):
                    existing_tool_use_ids |= self._tool_use_ids(content)

            logger.info("Existing tool_use IDs in conversation: %s", existing_tool_use_ids)
            logger.info("New tool_use IDs in current response: %s", current_tool_use_ids)

            # Create assistant message
            new_messages.append({"role": "assistant", "content": response_params})

            callback_manager = self.loop.callback_manager
            if callback_manager is None:
                raise RuntimeError(
                    "Callback manager not initialized. Call initialize_client() first."
                )

            # Handle tool use blocks and collect results
            tool_result_content = []
            for content_block in response_params:
                # Notify callback of content
                callback_manager.on_content(cast(BetaContentBlockParam, content_block))

                if content_block.get("type") != "tool_use":
                    continue

                if self.loop.tool_manager is None:
                    raise RuntimeError(
                        "Tool manager not initialized. Call initialize_client() first."
                    )

                # Validate the ID *before* running the tool so that a malformed
                # block (missing or unknown ID) never triggers tool side effects.
                tool_use_id = content_block["id"]
                if tool_use_id not in existing_tool_use_ids:
                    logger.warning(
                        "Tool use ID %s not found in previous messages. Skipping tool result.",
                        tool_use_id,
                    )
                    continue

                # Execute the tool
                result = await self.loop.tool_manager.execute_tool(
                    name=content_block["name"],
                    tool_input=cast(Dict[str, Any], content_block["input"]),
                )

                # Create tool result and add to content
                tool_result_content.append(
                    self.make_tool_result(cast(ToolResult, result), tool_use_id)
                )

                # Notify callback of tool result
                callback_manager.on_tool_result(cast(ToolResult, result), tool_use_id)

            # If no tool results, we're done
            if not tool_result_content:
                # Signal completion
                callback_manager.on_content({"type": "text", "text": "<DONE>"})
                return new_messages, False

            # Add tool results as user message
            new_messages.append({"content": tool_result_content, "role": "user"})
            return new_messages, True

        except Exception as e:
            # logger.exception keeps the traceback, which str(e) alone discards.
            logger.exception("Error handling response: %s", e)
            new_messages.append(
                {
                    "role": "assistant",
                    "content": f"Error: {str(e)}",
                }
            )
            return new_messages, False

    def response_to_params(
        self,
        response: BetaMessage,
    ) -> List[Dict[str, Any]]:
        """Convert API response to message parameters.

        Text blocks are reduced to ``{"type": "text", "text": ...}``; every
        other block is serialized with its ``model_dump()``.

        Args:
            response: API response message

        Returns:
            List of content blocks
        """
        return [
            {"type": "text", "text": block.text}
            if isinstance(block, BetaTextBlock)
            else cast(Dict[str, Any], block.model_dump())
            for block in response.content
        ]

    def make_tool_result(self, result: ToolResult, tool_use_id: str) -> Dict[str, Any]:
        """Convert a tool result to API format.

        Args:
            result: Tool execution result
            tool_use_id: ID of the tool use

        Returns:
            A ``tool_result`` block. Pre-built ``result.content`` is passed
            through unchanged; otherwise the content is the error text (when
            ``result.error`` is set) or the output text and/or PNG screenshot.
        """
        # Pre-formatted content wins over everything else.
        if result.content:
            return {
                "type": "tool_result",
                "content": result.content,
                "tool_use_id": tool_use_id,
                "is_error": bool(result.error),
            }

        is_error = bool(result.error)
        tool_result_content: List[Dict[str, Any]] = []

        if is_error:
            # An error replaces output and image: only the error text is sent.
            tool_result_content.append(
                {
                    "type": "text",
                    "text": self.maybe_prepend_system_tool_result(result, result.error),
                }
            )
        else:
            if result.output:
                tool_result_content.append(
                    {
                        "type": "text",
                        "text": self.maybe_prepend_system_tool_result(result, result.output),
                    }
                )
            if result.base64_image:
                tool_result_content.append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": result.base64_image,
                        },
                    }
                )

        return {
            "type": "tool_result",
            "content": tool_result_content,
            "tool_use_id": tool_use_id,
            "is_error": is_error,
        }

    def maybe_prepend_system_tool_result(self, result: ToolResult, result_text: str) -> str:
        """Prepend system information to tool result if available.

        Args:
            result: Tool execution result
            result_text: Text to prepend to

        Returns:
            ``result_text`` unchanged when ``result.system`` is empty,
            otherwise the system text wrapped in ``<s>...</s>`` on its own
            line followed by ``result_text``.
        """
        if not result.system:
            return result_text
        return f"<s>{result.system}</s>\n{result_text}"
@@ -6,7 +6,7 @@ from typing import Any, Dict
6
6
 
7
7
  from anthropic.types.beta import BetaToolUnionParam
8
8
 
9
- from ....core.tools.base import BaseTool, ToolError, ToolResult, ToolFailure, CLIResult
9
+ from ....core.tools.base import BaseTool
10
10
 
11
11
 
12
12
  class BaseAnthropicTool(BaseTool, metaclass=ABCMeta):
@@ -7,102 +7,6 @@ from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
7
7
  from ....core.tools.bash import BaseBashTool
8
8
 
9
9
 
10
- class _BashSession:
11
- """A session of a bash shell."""
12
-
13
- _started: bool
14
- _process: asyncio.subprocess.Process
15
-
16
- command: str = "/bin/bash"
17
- _output_delay: float = 0.2 # seconds
18
- _timeout: float = 120.0 # seconds
19
- _sentinel: str = "<<exit>>"
20
-
21
- def __init__(self):
22
- self._started = False
23
- self._timed_out = False
24
-
25
- async def start(self):
26
- if self._started:
27
- return
28
-
29
- self._process = await asyncio.create_subprocess_shell(
30
- self.command,
31
- preexec_fn=os.setsid,
32
- shell=True,
33
- bufsize=0,
34
- stdin=asyncio.subprocess.PIPE,
35
- stdout=asyncio.subprocess.PIPE,
36
- stderr=asyncio.subprocess.PIPE,
37
- )
38
-
39
- self._started = True
40
-
41
- def stop(self):
42
- """Terminate the bash shell."""
43
- if not self._started:
44
- raise ToolError("Session has not started.")
45
- if self._process.returncode is not None:
46
- return
47
- self._process.terminate()
48
-
49
- async def run(self, command: str):
50
- """Execute a command in the bash shell."""
51
- if not self._started:
52
- raise ToolError("Session has not started.")
53
- if self._process.returncode is not None:
54
- return ToolResult(
55
- system="tool must be restarted",
56
- error=f"bash has exited with returncode {self._process.returncode}",
57
- )
58
- if self._timed_out:
59
- raise ToolError(
60
- f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
61
- )
62
-
63
- # we know these are not None because we created the process with PIPEs
64
- assert self._process.stdin
65
- assert self._process.stdout
66
- assert self._process.stderr
67
-
68
- # send command to the process
69
- self._process.stdin.write(command.encode() + f"; echo '{self._sentinel}'\n".encode())
70
- await self._process.stdin.drain()
71
-
72
- # read output from the process, until the sentinel is found
73
- try:
74
- async with asyncio.timeout(self._timeout):
75
- while True:
76
- await asyncio.sleep(self._output_delay)
77
- # if we read directly from stdout/stderr, it will wait forever for
78
- # EOF. use the StreamReader buffer directly instead.
79
- output = (
80
- self._process.stdout._buffer.decode()
81
- ) # pyright: ignore[reportAttributeAccessIssue]
82
- if self._sentinel in output:
83
- # strip the sentinel and break
84
- output = output[: output.index(self._sentinel)]
85
- break
86
- except asyncio.TimeoutError:
87
- self._timed_out = True
88
- raise ToolError(
89
- f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
90
- ) from None
91
-
92
- if output.endswith("\n"):
93
- output = output[:-1]
94
-
95
- error = self._process.stderr._buffer.decode() # pyright: ignore[reportAttributeAccessIssue]
96
- if error.endswith("\n"):
97
- error = error[:-1]
98
-
99
- # clear the buffers so that the next output can be read correctly
100
- self._process.stdout._buffer.clear() # pyright: ignore[reportAttributeAccessIssue]
101
- self._process.stderr._buffer.clear() # pyright: ignore[reportAttributeAccessIssue]
102
-
103
- return CLIResult(output=output, error=error)
104
-
105
-
106
10
  class BashTool(BaseBashTool, BaseAnthropicTool):
107
11
  """
108
12
  A tool that allows the agent to run bash commands.
@@ -124,7 +28,6 @@ class BashTool(BaseBashTool, BaseAnthropicTool):
124
28
  # Then initialize the Anthropic tool
125
29
  BaseAnthropicTool.__init__(self)
126
30
  # Initialize bash session
127
- self._session = _BashSession()
128
31
 
129
32
  async def __call__(self, command: str | None = None, restart: bool = False, **kwargs):
130
33
  """Execute a bash command.
@@ -1,6 +1,6 @@
1
1
  """Collection classes for managing multiple tools."""
2
2
 
3
- from typing import Any
3
+ from typing import Any, cast
4
4
 
5
5
  from anthropic.types.beta import BetaToolUnionParam
6
6
 
@@ -22,7 +22,7 @@ class ToolCollection:
22
22
  def to_params(
23
23
  self,
24
24
  ) -> list[BetaToolUnionParam]:
25
- return [tool.to_params() for tool in self.tools]
25
+ return cast(list[BetaToolUnionParam], [tool.to_params() for tool in self.tools])
26
26
 
27
27
  async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
28
28
  tool = self.tool_map.get(name)
@@ -61,9 +61,9 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
61
61
 
62
62
  name: Literal["computer"] = "computer"
63
63
  api_type: Literal["computer_20250124"] = "computer_20250124"
64
- width: int | None
65
- height: int | None
66
- display_num: int | None
64
+ width: int | None = None
65
+ height: int | None = None
66
+ display_num: int | None = None
67
67
  computer: Computer # The CUA Computer instance
68
68
  logger = logging.getLogger(__name__)
69
69
 
@@ -106,6 +106,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
106
106
  display_size = await self.computer.interface.get_screen_size()
107
107
  self.width = display_size["width"]
108
108
  self.height = display_size["height"]
109
+ assert isinstance(self.width, int) and isinstance(self.height, int)
109
110
  self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
110
111
 
111
112
  async def __call__(
@@ -120,6 +121,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
120
121
  # Ensure dimensions are initialized
121
122
  if self.width is None or self.height is None:
122
123
  await self.initialize_dimensions()
124
+ if self.width is None or self.height is None:
125
+ raise ToolError("Failed to initialize screen dimensions")
123
126
  except Exception as e:
124
127
  raise ToolError(f"Failed to initialize dimensions: {e}")
125
128
 
@@ -147,7 +150,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
147
150
  self.logger.info(
148
151
  f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
149
152
  )
150
- pre_img = pre_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
153
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
154
+ raise ToolError("Screen dimensions must be integers")
155
+ size = (int(self.width), int(self.height))
156
+ pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
151
157
 
152
158
  self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
153
159
 
@@ -160,15 +166,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
160
166
  await self.computer.interface.move_cursor(x, y)
161
167
  # Then perform drag operation - check if drag_to exists or we need to use other methods
162
168
  try:
163
- if hasattr(self.computer.interface, "drag_to"):
164
- await self.computer.interface.drag_to(x, y)
165
- else:
166
- # Alternative approach: press mouse down, move, release
167
- await self.computer.interface.mouse_down()
168
- await asyncio.sleep(0.2)
169
- await self.computer.interface.move_cursor(x, y)
170
- await asyncio.sleep(0.2)
171
- await self.computer.interface.mouse_up()
169
+ await self.computer.interface.drag_to(x, y)
172
170
  except Exception as e:
173
171
  self.logger.error(f"Error during drag operation: {str(e)}")
174
172
  raise ToolError(f"Failed to perform drag: {str(e)}")
@@ -214,9 +212,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
214
212
  self.logger.info(
215
213
  f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
216
214
  )
217
- pre_img = pre_img.resize(
218
- (self.width, self.height), Image.Resampling.LANCZOS
219
- )
215
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
216
+ raise ToolError("Screen dimensions must be integers")
217
+ size = (int(self.width), int(self.height))
218
+ pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
220
219
  # Save the scaled image back to bytes
221
220
  buffer = io.BytesIO()
222
221
  pre_img.save(buffer, format="PNG")
@@ -275,9 +274,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
275
274
  self.logger.info(
276
275
  f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
277
276
  )
278
- pre_img = pre_img.resize(
279
- (self.width, self.height), Image.Resampling.LANCZOS
280
- )
277
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
278
+ raise ToolError("Screen dimensions must be integers")
279
+ size = (int(self.width), int(self.height))
280
+ pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
281
281
 
282
282
  # Perform the click action
283
283
  if action == "left_click":
@@ -335,7 +335,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
335
335
  self.logger.info(
336
336
  f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
337
337
  )
338
- pre_img = pre_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
338
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
339
+ raise ToolError("Screen dimensions must be integers")
340
+ size = (int(self.width), int(self.height))
341
+ pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
339
342
 
340
343
  if action == "key":
341
344
  # Special handling for page up/down on macOS
@@ -365,7 +368,7 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
365
368
  # Handle single key press
366
369
  self.logger.info(f"Pressing key: {text}")
367
370
  try:
368
- await self.computer.interface.press(text)
371
+ await self.computer.interface.press_key(text)
369
372
  output_text = text
370
373
  except ValueError as e:
371
374
  raise ToolError(f"Invalid key: {text}. {str(e)}")
@@ -442,7 +445,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
442
445
  self.logger.info(
443
446
  f"Scaling image from {img.size} to {self.width}x{self.height}"
444
447
  )
445
- img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
448
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
449
+ raise ToolError("Screen dimensions must be integers")
450
+ size = (int(self.width), int(self.height))
451
+ img = img.resize(size, Image.Resampling.LANCZOS)
446
452
  buffer = io.BytesIO()
447
453
  img.save(buffer, format="PNG")
448
454
  screenshot = buffer.getvalue()
@@ -451,7 +457,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
451
457
 
452
458
  elif action == "cursor_position":
453
459
  pos = await self.computer.interface.get_cursor_position()
454
- return ToolResult(output=f"X={int(pos[0])},Y={int(pos[1])}")
460
+ x, y = pos # Unpack the tuple
461
+ return ToolResult(output=f"X={int(x)},Y={int(y)}")
455
462
 
456
463
  except Exception as e:
457
464
  self.logger.error(f"Error during {action} action: {str(e)}")
@@ -517,7 +524,10 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
517
524
  # Scale image if needed
518
525
  if img.size != (self.width, self.height):
519
526
  self.logger.info(f"Scaling image from {img.size} to {self.width}x{self.height}")
520
- img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
527
+ if not isinstance(self.width, int) or not isinstance(self.height, int):
528
+ raise ToolError("Screen dimensions must be integers")
529
+ size = (int(self.width), int(self.height))
530
+ img = img.resize(size, Image.Resampling.LANCZOS)
521
531
  buffer = io.BytesIO()
522
532
  img.save(buffer, format="PNG")
523
533
  screenshot = buffer.getvalue()
@@ -1,4 +1,4 @@
1
- from typing import Any, Dict, List
1
+ from typing import Any, Dict, List, cast
2
2
  from anthropic.types.beta import BetaToolUnionParam
3
3
  from computer.computer import Computer
4
4
 
@@ -37,7 +37,7 @@ class ToolManager(BaseToolManager):
37
37
  """Get tool parameters for Anthropic API calls."""
38
38
  if self.tools is None:
39
39
  raise RuntimeError("Tools not initialized. Call initialize() first.")
40
- return self.tools.to_params()
40
+ return cast(List[BetaToolUnionParam], self.tools.to_params())
41
41
 
42
42
  async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
43
43
  """Execute a tool with the given input.