cua-agent 0.1.6__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (57)
  1. agent/__init__.py +3 -2
  2. agent/core/__init__.py +1 -6
  3. agent/core/{computer_agent.py → agent.py} +31 -76
  4. agent/core/{loop.py → base.py} +68 -127
  5. agent/core/factory.py +104 -0
  6. agent/core/messages.py +279 -125
  7. agent/core/provider_config.py +15 -0
  8. agent/core/types.py +45 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +207 -221
  14. agent/providers/anthropic/response_handler.py +226 -0
  15. agent/providers/anthropic/tools/bash.py +0 -97
  16. agent/providers/anthropic/utils.py +368 -0
  17. agent/providers/omni/__init__.py +1 -20
  18. agent/providers/omni/api_handler.py +42 -0
  19. agent/providers/omni/clients/anthropic.py +4 -0
  20. agent/providers/omni/image_utils.py +0 -72
  21. agent/providers/omni/loop.py +491 -607
  22. agent/providers/omni/parser.py +58 -4
  23. agent/providers/omni/tools/__init__.py +25 -7
  24. agent/providers/omni/tools/base.py +29 -0
  25. agent/providers/omni/tools/bash.py +43 -38
  26. agent/providers/omni/tools/computer.py +144 -182
  27. agent/providers/omni/tools/manager.py +25 -45
  28. agent/providers/omni/types.py +1 -3
  29. agent/providers/omni/utils.py +224 -145
  30. agent/providers/openai/__init__.py +6 -0
  31. agent/providers/openai/api_handler.py +453 -0
  32. agent/providers/openai/loop.py +440 -0
  33. agent/providers/openai/response_handler.py +205 -0
  34. agent/providers/openai/tools/__init__.py +15 -0
  35. agent/providers/openai/tools/base.py +79 -0
  36. agent/providers/openai/tools/computer.py +319 -0
  37. agent/providers/openai/tools/manager.py +106 -0
  38. agent/providers/openai/types.py +36 -0
  39. agent/providers/openai/utils.py +98 -0
  40. cua_agent-0.1.18.dist-info/METADATA +165 -0
  41. cua_agent-0.1.18.dist-info/RECORD +73 -0
  42. agent/README.md +0 -63
  43. agent/providers/anthropic/messages/manager.py +0 -112
  44. agent/providers/omni/callbacks.py +0 -78
  45. agent/providers/omni/clients/groq.py +0 -101
  46. agent/providers/omni/experiment.py +0 -276
  47. agent/providers/omni/messages.py +0 -171
  48. agent/providers/omni/tool_manager.py +0 -91
  49. agent/providers/omni/visualization.py +0 -130
  50. agent/types/__init__.py +0 -23
  51. agent/types/base.py +0 -41
  52. agent/types/messages.py +0 -36
  53. cua_agent-0.1.6.dist-info/METADATA +0 -120
  54. cua_agent-0.1.6.dist-info/RECORD +0 -64
  55. /agent/{types → core}/tools.py +0 -0
  56. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
  57. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,368 @@
1
+ """Utility functions for Anthropic message handling."""
2
+
3
+ import logging
4
+ import re
5
+ from typing import Any, Dict, List, Optional, Tuple, cast
6
+ from anthropic.types.beta import BetaMessage
7
+ from ..omni.parser import ParseResult
8
+ from ...core.types import AgentResponse
9
+ from datetime import datetime
10
+
11
+ # Configure module logger
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def to_anthropic_format(
    messages: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], str]:
    """Convert standard OpenAI format messages to Anthropic format.

    Args:
        messages: List of messages in OpenAI format

    Returns:
        Tuple containing (anthropic_messages, system_content)
    """
    result: List[Dict[str, Any]] = []
    system_content = ""

    # Track tool_use ids seen in the most recent assistant message so a
    # following user message's tool_result blocks can be validated against
    # them (Anthropic rejects tool_result blocks with unknown ids).
    previous_assistant_tool_use_ids: set = set()

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", "")

        if role == "system":
            # Anthropic takes the system prompt as a separate argument, so
            # collect it here instead of emitting a message.
            if isinstance(content, str):
                system_content += content + "\n"
            elif isinstance(content, list):
                # Robustness: some callers pass structured content blocks for
                # the system message; concatenate their text parts.
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "text":
                        system_content += item.get("text", "") + "\n"
            continue

        if role == "assistant":
            # Reset and re-collect tool_use ids for the next user message.
            previous_assistant_tool_use_ids = set()
            if isinstance(content, list):
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "tool_use" and "id" in item:
                        previous_assistant_tool_use_ids.add(item["id"])

        if role in ["user", "assistant"]:
            anthropic_msg: Dict[str, Any] = {"role": role}

            # Convert content based on type
            if isinstance(content, str):
                # Simple text content
                anthropic_msg["content"] = [{"type": "text", "text": content}]
            elif isinstance(content, list):
                # Convert complex content block-by-block
                anthropic_content = []
                for item in content:
                    item_type = item.get("type", "")

                    if item_type == "text":
                        anthropic_content.append({"type": "text", "text": item.get("text", "")})
                    elif item_type == "image_url":
                        # Convert OpenAI image format to Anthropic
                        image_url = item.get("image_url", {}).get("url", "")
                        if image_url.startswith("data:"):
                            # Extract base64 payload and media type from the data URL
                            match = re.match(r"data:(.+);base64,(.+)", image_url)
                            if match:
                                media_type, data = match.groups()
                                anthropic_content.append(
                                    {
                                        "type": "image",
                                        "source": {
                                            "type": "base64",
                                            "media_type": media_type,
                                            "data": data,
                                        },
                                    }
                                )
                        else:
                            # Regular URL
                            anthropic_content.append(
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "url",
                                        "url": image_url,
                                    },
                                }
                            )
                    elif item_type == "tool_use":
                        # Always include tool_use blocks
                        anthropic_content.append(item)
                    elif item_type == "tool_result":
                        tool_use_id = item.get("tool_use_id")

                        # Only include tool_result if it references a tool_use
                        # from the immediately preceding assistant message.
                        if (
                            role == "user"
                            and tool_use_id
                            and tool_use_id in previous_assistant_tool_use_ids
                        ):
                            anthropic_content.append(item)
                        else:
                            # Orphaned tool_result: downgrade to plain text so
                            # the information survives without violating the API.
                            content_text = "Tool Result: "
                            inner = item.get("content")
                            if isinstance(inner, list):
                                for content_item in inner:
                                    if (
                                        isinstance(content_item, dict)
                                        and content_item.get("type") == "text"
                                    ):
                                        content_text += content_item.get("text", "")
                            elif isinstance(inner, str):
                                content_text += inner
                            anthropic_content.append({"type": "text", "text": content_text})

                anthropic_msg["content"] = anthropic_content
            else:
                # Bug fix: previously a non-str/non-list content left the
                # message without a "content" key, producing an invalid
                # Anthropic message. Coerce to a text block instead.
                anthropic_msg["content"] = [{"type": "text", "text": str(content)}]

            result.append(anthropic_msg)

    return result, system_content
126
+
127
+
128
def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Anthropic format messages to standard OpenAI format.

    Args:
        messages: List of messages in Anthropic format

    Returns:
        List of messages in OpenAI format
    """
    result: List[Dict[str, Any]] = []

    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", [])

        # Only user/assistant roles are representable; others are dropped.
        if role not in ("user", "assistant"):
            continue

        openai_msg: Dict[str, Any] = {"role": role}

        # Robustness fix: Anthropic also permits plain string content; the
        # previous implementation crashed on it (content[0].get on a str).
        if isinstance(content, str):
            openai_msg["content"] = content
        elif len(content) == 1 and content[0].get("type") == "text":
            # Simple case: a single text block collapses to a bare string
            openai_msg["content"] = content[0].get("text", "")
        else:
            # Complex case: multiple blocks or non-text
            openai_content = []
            for item in content:
                item_type = item.get("type", "")

                if item_type == "text":
                    openai_content.append({"type": "text", "text": item.get("text", "")})
                elif item_type == "image":
                    # Convert Anthropic image to OpenAI format
                    source = item.get("source", {})
                    if source.get("type") == "base64":
                        media_type = source.get("media_type", "image/png")
                        data = source.get("data", "")
                        openai_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{media_type};base64,{data}"},
                            }
                        )
                    else:
                        # URL source
                        openai_content.append(
                            {
                                "type": "image_url",
                                "image_url": {"url": source.get("url", "")},
                            }
                        )
                elif item_type in ["tool_use", "tool_result"]:
                    # Pass through tool-related content unchanged
                    openai_content.append(item)

            openai_msg["content"] = openai_content

        result.append(openai_msg)

    return result
186
+
187
+
188
async def to_agent_response_format(
    response: BetaMessage,
    messages: List[Dict[str, Any]],
    parsed_screen: Optional[ParseResult] = None,
    parser: Optional[Any] = None,
    model: Optional[str] = None,
) -> AgentResponse:
    """Convert an Anthropic response to the standard agent response format.

    Args:
        response: The Anthropic API response (BetaMessage)
        messages: List of messages in standard format
        parsed_screen: Optional pre-parsed screen information (unused here)
        parser: Optional parser instance for coordinate calculation (unused here)
        model: Optional model name

    Returns:
        A response formatted according to the standard agent response format
    """
    # Create unique IDs for this response. Timestamp plus the object's id()
    # keeps them unique within a single process run.
    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
    reasoning_id = f"rs_{response_id}"
    action_id = f"cu_{response_id}"
    call_id = f"call_{response_id}"

    # Extract content and reasoning from Anthropic response
    content = []
    reasoning_text = None
    action_details = None

    for block in response.content:
        if block.type == "text":
            # Use the first text block as reasoning
            if reasoning_text is None:
                reasoning_text = block.text
            content.append({"type": "text", "text": block.text})
        elif block.type == "tool_use" and block.name == "computer":
            try:
                input_dict = cast(Dict[str, Any], block.input)
                action = input_dict.get("action", "").lower()

                # Extract coordinates from coordinate list if provided;
                # (100, 100) is the fallback for missing/malformed input.
                coordinates = input_dict.get("coordinate", [100, 100])
                x, y = coordinates if len(coordinates) == 2 else (100, 100)

                # Map the Anthropic computer-use action vocabulary onto the
                # standard action schema. Later tool_use blocks overwrite
                # earlier ones — only the last recognized action survives.
                if action == "screenshot":
                    action_details = {
                        "type": "screenshot",
                    }
                elif action in ["click", "left_click", "right_click", "double_click"]:
                    action_details = {
                        "type": "click",
                        "button": "left" if action in ["click", "left_click"] else "right",
                        "double": action == "double_click",
                        "x": x,
                        "y": y,
                    }
                elif action == "type":
                    action_details = {
                        "type": "type",
                        "text": input_dict.get("text", ""),
                    }
                elif action == "key":
                    # A single key press is expressed as a one-element hotkey.
                    action_details = {
                        "type": "hotkey",
                        "keys": [input_dict.get("text", "")],
                    }
                elif action == "scroll":
                    # Positive delta_y scrolls down, negative scrolls up.
                    scroll_amount = input_dict.get("scroll_amount", 1)
                    scroll_direction = input_dict.get("scroll_direction", "down")
                    delta_y = scroll_amount if scroll_direction == "down" else -scroll_amount
                    action_details = {
                        "type": "scroll",
                        "x": x,
                        "y": y,
                        "delta_x": 0,
                        "delta_y": delta_y,
                    }
                elif action == "move":
                    action_details = {
                        "type": "move",
                        "x": x,
                        "y": y,
                    }
            except Exception as e:
                # Best effort: a malformed tool_use block is logged and the
                # response falls back to the "none" action below.
                logger.error(f"Error extracting action details: {str(e)}")

    # Create output items with reasoning
    output_items = []
    if reasoning_text:
        output_items.append(
            {
                "type": "reasoning",
                "id": reasoning_id,
                "summary": [
                    {
                        "type": "summary_text",
                        "text": reasoning_text,
                    }
                ],
            }
        )

    # Add computer_call item with extracted or default action
    computer_call = {
        "type": "computer_call",
        "id": action_id,
        "call_id": call_id,
        "action": action_details or {"type": "none", "description": "No action specified"},
        "pending_safety_checks": [],
        "status": "completed",
    }
    output_items.append(computer_call)

    # Create the standard response format (OpenAI-Responses-style envelope).
    # NOTE(review): usage counts are hard-coded to zero and display size to
    # 1024x768 / "mac" — presumably placeholders; confirm against consumers.
    standard_response = {
        "id": response_id,
        "object": "response",
        "created_at": int(datetime.now().timestamp()),
        "status": "completed",
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "max_output_tokens": None,
        "model": model or "anthropic-default",
        "output": output_items,
        "parallel_tool_calls": True,
        "previous_response_id": None,
        "reasoning": {"effort": "medium", "generate_summary": "concise"},
        "store": True,
        "temperature": 1.0,
        "text": {"format": {"type": "text"}},
        "tool_choice": "auto",
        "tools": [
            {
                "type": "computer_use_preview",
                "display_height": 768,
                "display_width": 1024,
                "environment": "mac",
            }
        ],
        "top_p": 1.0,
        "truncation": "auto",
        "usage": {
            "input_tokens": 0,
            "input_tokens_details": {"cached_tokens": 0},
            "output_tokens": 0,
            "output_tokens_details": {"reasoning_tokens": 0},
            "total_tokens": 0,
        },
        "user": None,
        "metadata": {},
        "response": {
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": content,
                        "tool_calls": [],
                    },
                    "finish_reason": response.stop_reason or "stop",
                }
            ]
        },
    }

    # Mirror any tool_use blocks into the chat-completions-style tool_calls
    # list inside the nested "response" key.
    tool_calls = []
    for block in response.content:
        if hasattr(block, "type") and block.type == "tool_use":
            tool_calls.append(
                {
                    "id": f"call_{block.id}",
                    "type": "function",
                    "function": {"name": block.name, "arguments": block.input},
                }
            )
    if tool_calls:
        standard_response["response"]["choices"][0]["message"]["tool_calls"] = tool_calls

    return cast(AgentResponse, standard_response)
@@ -1,27 +1,8 @@
1
1
  """Omni provider implementation."""
2
2
 
3
- # The OmniComputerAgent has been replaced by the unified ComputerAgent
4
- # which can be found in agent.core.agent
5
3
  from .types import LLMProvider
6
- from .experiment import ExperimentManager
7
- from .visualization import visualize_click, visualize_scroll, calculate_element_center
8
4
  from .image_utils import (
9
5
  decode_base64_image,
10
- encode_image_base64,
11
- clean_base64_data,
12
- extract_base64_from_text,
13
- get_image_dimensions,
14
6
  )
15
7
 
16
- __all__ = [
17
- "LLMProvider",
18
- "ExperimentManager",
19
- "visualize_click",
20
- "visualize_scroll",
21
- "calculate_element_center",
22
- "decode_base64_image",
23
- "encode_image_base64",
24
- "clean_base64_data",
25
- "extract_base64_from_text",
26
- "get_image_dimensions",
27
- ]
8
+ __all__ = ["LLMProvider", "decode_base64_image"]
@@ -0,0 +1,42 @@
1
+ """API handling for Omni provider."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List
5
+
6
+ from .prompts import SYSTEM_PROMPT
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class OmniAPIHandler:
    """Handler for Omni API calls, delegating to the owning loop."""

    def __init__(self, loop):
        """Store a reference to the parent loop.

        Args:
            loop: Parent loop instance
        """
        self.loop = loop

    async def make_api_call(
        self, messages: List[Dict[str, Any]], system_prompt: str = SYSTEM_PROMPT
    ) -> Any:
        """Make an API call to the appropriate provider.

        Args:
            messages: List of messages in standard OpenAI format
            system_prompt: System prompt to use

        Returns:
            API response

        Raises:
            RuntimeError: If the loop has no usable _make_api_call method.
        """
        delegate = self.loop._make_api_call
        if not delegate:
            raise RuntimeError("Loop does not have _make_api_call method")

        try:
            # Forward the standard-format messages to the loop's provider call.
            return await delegate(messages=messages, system_prompt=system_prompt)
        except Exception as e:
            logger.error(f"Error making API call: {str(e)}")
            raise
+ raise
@@ -44,6 +44,10 @@ class AnthropicClient(BaseOmniClient):
44
44
  anthropic_messages = []
45
45
 
46
46
  for message in messages:
47
+ # Skip messages with empty content
48
+ if not message.get("content"):
49
+ continue
50
+
47
51
  if message["role"] == "user":
48
52
  anthropic_messages.append({"role": "user", "content": message["content"]})
49
53
  elif message["role"] == "assistant":
@@ -32,75 +32,3 @@ def decode_base64_image(img_base64: str) -> Optional[Image.Image]:
32
32
  except Exception as e:
33
33
  logger.error(f"Error decoding base64 image: {str(e)}")
34
34
  return None
35
-
36
-
37
def encode_image_base64(img: Image.Image, format: str = "PNG") -> str:
    """Serialize a PIL Image into a base64-encoded string.

    Args:
        img: PIL Image to encode
        format: Image format (PNG, JPEG, etc.)

    Returns:
        Base64 encoded image string
    """
    try:
        buffer = BytesIO()
        img.save(buffer, format=format)
        raw = buffer.getvalue()
        return base64.b64encode(raw).decode("utf-8")
    except Exception as e:
        logger.error(f"Error encoding image to base64: {str(e)}")
        return ""
54
-
55
-
56
def clean_base64_data(img_base64: str) -> str:
    """Strip a data-URL prefix from base64 image data, if one is present.

    Args:
        img_base64: Base64 encoded image, may include data URL prefix

    Returns:
        Clean base64 string without prefix
    """
    if not img_base64.startswith("data:image"):
        return img_base64
    # Everything after the first comma is the raw base64 payload.
    return img_base64.split(",")[1]
68
-
69
-
70
def extract_base64_from_text(text: str) -> Optional[str]:
    """Pull base64 image data out of a text string.

    Args:
        text: Text potentially containing base64 image data

    Returns:
        Base64 string or None if not found
    """
    # Prefer an explicit data-URL; fall back to a long-run-of-base64 heuristic.
    patterns = (
        r"data:image/[^;]+;base64,([a-zA-Z0-9+/=]+)",
        r"([a-zA-Z0-9+/=]{100,})",
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1)
    return None
92
-
93
-
94
def get_image_dimensions(img_base64: str) -> Tuple[int, int]:
    """Measure a base64-encoded image.

    Args:
        img_base64: Base64 encoded image

    Returns:
        Tuple of (width, height) or (0, 0) if decoding fails
    """
    decoded = decode_base64_image(img_base64)
    # decode_base64_image returns None on failure; report (0, 0) then.
    return decoded.size if decoded else (0, 0)