cua-agent 0.1.5__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (52)
  1. agent/__init__.py +3 -4
  2. agent/core/__init__.py +3 -10
  3. agent/core/computer_agent.py +207 -32
  4. agent/core/experiment.py +20 -3
  5. agent/core/loop.py +78 -120
  6. agent/core/messages.py +279 -125
  7. agent/core/telemetry.py +44 -32
  8. agent/core/types.py +35 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +224 -209
  14. agent/providers/anthropic/messages/manager.py +3 -1
  15. agent/providers/anthropic/response_handler.py +229 -0
  16. agent/providers/anthropic/tools/base.py +1 -1
  17. agent/providers/anthropic/tools/bash.py +0 -97
  18. agent/providers/anthropic/tools/collection.py +2 -2
  19. agent/providers/anthropic/tools/computer.py +34 -24
  20. agent/providers/anthropic/tools/manager.py +2 -2
  21. agent/providers/anthropic/utils.py +370 -0
  22. agent/providers/omni/__init__.py +1 -20
  23. agent/providers/omni/api_handler.py +42 -0
  24. agent/providers/omni/clients/anthropic.py +4 -0
  25. agent/providers/omni/image_utils.py +0 -72
  26. agent/providers/omni/loop.py +497 -607
  27. agent/providers/omni/parser.py +60 -5
  28. agent/providers/omni/tools/__init__.py +25 -8
  29. agent/providers/omni/tools/base.py +29 -0
  30. agent/providers/omni/tools/bash.py +43 -38
  31. agent/providers/omni/tools/computer.py +144 -181
  32. agent/providers/omni/tools/manager.py +26 -48
  33. agent/providers/omni/types.py +0 -4
  34. agent/providers/omni/utils.py +225 -144
  35. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
  36. cua_agent-0.1.17.dist-info/RECORD +63 -0
  37. agent/core/agent.py +0 -252
  38. agent/core/base_agent.py +0 -164
  39. agent/core/factory.py +0 -102
  40. agent/providers/omni/callbacks.py +0 -78
  41. agent/providers/omni/clients/groq.py +0 -101
  42. agent/providers/omni/experiment.py +0 -273
  43. agent/providers/omni/messages.py +0 -171
  44. agent/providers/omni/tool_manager.py +0 -91
  45. agent/providers/omni/visualization.py +0 -130
  46. agent/types/__init__.py +0 -26
  47. agent/types/base.py +0 -53
  48. agent/types/messages.py +0 -36
  49. cua_agent-0.1.5.dist-info/RECORD +0 -67
  50. /agent/{types → core}/tools.py +0 -0
  51. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
  52. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
agent/providers/anthropic/utils.py
@@ -0,0 +1,370 @@
+"""Utility functions for Anthropic message handling."""
+
+import time
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple, cast
+from anthropic.types.beta import BetaMessage, BetaMessageParam, BetaTextBlock
+from ..omni.parser import ParseResult
+from ...core.types import AgentResponse
+from datetime import datetime
+import json
+
+# Configure module logger
+logger = logging.getLogger(__name__)
+
+
+def to_anthropic_format(
+    messages: List[Dict[str, Any]],
+) -> Tuple[List[Dict[str, Any]], str]:
+    """Convert standard OpenAI format messages to Anthropic format.
+
+    Args:
+        messages: List of messages in OpenAI format
+
+    Returns:
+        Tuple containing (anthropic_messages, system_content)
+    """
+    result = []
+    system_content = ""
+
+    # Process messages in order to maintain conversation flow
+    previous_assistant_tool_use_ids = set()  # Track tool_use_ids in the previous assistant message
+
+    for i, msg in enumerate(messages):
+        role = msg.get("role", "")
+        content = msg.get("content", "")
+
+        if role == "system":
+            # Collect system messages for later use
+            system_content += content + "\n"
+            continue
+
+        if role == "assistant":
+            # Track tool_use_ids in this assistant message for the next user message
+            previous_assistant_tool_use_ids = set()
+            if isinstance(content, list):
+                for item in content:
+                    if isinstance(item, dict) and item.get("type") == "tool_use" and "id" in item:
+                        previous_assistant_tool_use_ids.add(item["id"])
+
+        if role in ["user", "assistant"]:
+            anthropic_msg = {"role": role}
+
+            # Convert content based on type
+            if isinstance(content, str):
+                # Simple text content
+                anthropic_msg["content"] = [{"type": "text", "text": content}]
+            elif isinstance(content, list):
+                # Convert complex content
+                anthropic_content = []
+                for item in content:
+                    item_type = item.get("type", "")
+
+                    if item_type == "text":
+                        anthropic_content.append({"type": "text", "text": item.get("text", "")})
+                    elif item_type == "image_url":
+                        # Convert OpenAI image format to Anthropic
+                        image_url = item.get("image_url", {}).get("url", "")
+                        if image_url.startswith("data:"):
+                            # Extract base64 data and media type
+                            match = re.match(r"data:(.+);base64,(.+)", image_url)
+                            if match:
+                                media_type, data = match.groups()
+                                anthropic_content.append(
+                                    {
+                                        "type": "image",
+                                        "source": {
+                                            "type": "base64",
+                                            "media_type": media_type,
+                                            "data": data,
+                                        },
+                                    }
+                                )
+                        else:
+                            # Regular URL
+                            anthropic_content.append(
+                                {
+                                    "type": "image",
+                                    "source": {
+                                        "type": "url",
+                                        "url": image_url,
+                                    },
+                                }
+                            )
+                    elif item_type == "tool_use":
+                        # Always include tool_use blocks
+                        anthropic_content.append(item)
+                    elif item_type == "tool_result":
+                        # Check if this is a user message AND if the tool_use_id exists in the previous assistant message
+                        tool_use_id = item.get("tool_use_id")
+
+                        # Only include tool_result if it references a tool_use from the immediately preceding assistant message
+                        if (
+                            role == "user"
+                            and tool_use_id
+                            and tool_use_id in previous_assistant_tool_use_ids
+                        ):
+                            anthropic_content.append(item)
+                        else:
+                            content_text = "Tool Result: "
+                            if "content" in item:
+                                if isinstance(item["content"], list):
+                                    for content_item in item["content"]:
+                                        if (
+                                            isinstance(content_item, dict)
+                                            and content_item.get("type") == "text"
+                                        ):
+                                            content_text += content_item.get("text", "")
+                                elif isinstance(item["content"], str):
+                                    content_text += item["content"]
+                            anthropic_content.append({"type": "text", "text": content_text})
+
+                anthropic_msg["content"] = anthropic_content
+
+            result.append(anthropic_msg)
+
+    return result, system_content
+
+
+def from_anthropic_format(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Convert Anthropic format messages to standard OpenAI format.
+
+    Args:
+        messages: List of messages in Anthropic format
+
+    Returns:
+        List of messages in OpenAI format
+    """
+    result = []
+
+    for msg in messages:
+        role = msg.get("role", "")
+        content = msg.get("content", [])
+
+        if role in ["user", "assistant"]:
+            openai_msg = {"role": role}
+
+            # Simple case: single text block
+            if len(content) == 1 and content[0].get("type") == "text":
+                openai_msg["content"] = content[0].get("text", "")
+            else:
+                # Complex case: multiple blocks or non-text
+                openai_content = []
+                for item in content:
+                    item_type = item.get("type", "")
+
+                    if item_type == "text":
+                        openai_content.append({"type": "text", "text": item.get("text", "")})
+                    elif item_type == "image":
+                        # Convert Anthropic image to OpenAI format
+                        source = item.get("source", {})
+                        if source.get("type") == "base64":
+                            media_type = source.get("media_type", "image/png")
+                            data = source.get("data", "")
+                            openai_content.append(
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:{media_type};base64,{data}"},
+                                }
+                            )
+                        else:
+                            # URL
+                            openai_content.append(
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": source.get("url", "")},
+                                }
+                            )
+                    elif item_type in ["tool_use", "tool_result"]:
+                        # Pass through tool-related content
+                        openai_content.append(item)
+
+                openai_msg["content"] = openai_content
+
+            result.append(openai_msg)
+
+    return result
+
+
+async def to_agent_response_format(
+    response: BetaMessage,
+    messages: List[Dict[str, Any]],
+    parsed_screen: Optional[ParseResult] = None,
+    parser: Optional[Any] = None,
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert an Anthropic response to the standard agent response format.
+
+    Args:
+        response: The Anthropic API response (BetaMessage)
+        messages: List of messages in standard format
+        parsed_screen: Optional pre-parsed screen information
+        parser: Optional parser instance for coordinate calculation
+        model: Optional model name
+
+    Returns:
+        A response formatted according to the standard agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Extract content and reasoning from Anthropic response
+    content = []
+    reasoning_text = None
+    action_details = None
+
+    for block in response.content:
+        if block.type == "text":
+            # Use the first text block as reasoning
+            if reasoning_text is None:
+                reasoning_text = block.text
+            content.append({"type": "text", "text": block.text})
+        elif block.type == "tool_use" and block.name == "computer":
+            try:
+                input_dict = cast(Dict[str, Any], block.input)
+                action = input_dict.get("action", "").lower()
+
+                # Extract coordinates from coordinate list if provided
+                coordinates = input_dict.get("coordinate", [100, 100])
+                x, y = coordinates if len(coordinates) == 2 else (100, 100)
+
+                if action == "screenshot":
+                    action_details = {
+                        "type": "screenshot",
+                    }
+                elif action in ["click", "left_click", "right_click", "double_click"]:
+                    action_details = {
+                        "type": "click",
+                        "button": "left" if action in ["click", "left_click"] else "right",
+                        "double": action == "double_click",
+                        "x": x,
+                        "y": y,
+                    }
+                elif action == "type":
+                    action_details = {
+                        "type": "type",
+                        "text": input_dict.get("text", ""),
+                    }
+                elif action == "key":
+                    action_details = {
+                        "type": "hotkey",
+                        "keys": [input_dict.get("text", "")],
+                    }
+                elif action == "scroll":
+                    scroll_amount = input_dict.get("scroll_amount", 1)
+                    scroll_direction = input_dict.get("scroll_direction", "down")
+                    delta_y = scroll_amount if scroll_direction == "down" else -scroll_amount
+                    action_details = {
+                        "type": "scroll",
+                        "x": x,
+                        "y": y,
+                        "delta_x": 0,
+                        "delta_y": delta_y,
+                    }
+                elif action == "move":
+                    action_details = {
+                        "type": "move",
+                        "x": x,
+                        "y": y,
+                    }
+            except Exception as e:
+                logger.error(f"Error extracting action details: {str(e)}")
+
+    # Create output items with reasoning
+    output_items = []
+    if reasoning_text:
+        output_items.append(
+            {
+                "type": "reasoning",
+                "id": reasoning_id,
+                "summary": [
+                    {
+                        "type": "summary_text",
+                        "text": reasoning_text,
+                    }
+                ],
+            }
+        )
+
+    # Add computer_call item with extracted or default action
+    computer_call = {
+        "type": "computer_call",
+        "id": action_id,
+        "call_id": call_id,
+        "action": action_details or {"type": "none", "description": "No action specified"},
+        "pending_safety_checks": [],
+        "status": "completed",
+    }
+    output_items.append(computer_call)
+
+    # Create the standard response format
+    standard_response = {
+        "id": response_id,
+        "object": "response",
+        "created_at": int(datetime.now().timestamp()),
+        "status": "completed",
+        "error": None,
+        "incomplete_details": None,
+        "instructions": None,
+        "max_output_tokens": None,
+        "model": model or "anthropic-default",
+        "output": output_items,
+        "parallel_tool_calls": True,
+        "previous_response_id": None,
+        "reasoning": {"effort": "medium", "generate_summary": "concise"},
+        "store": True,
+        "temperature": 1.0,
+        "text": {"format": {"type": "text"}},
+        "tool_choice": "auto",
+        "tools": [
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        "top_p": 1.0,
+        "truncation": "auto",
+        "usage": {
+            "input_tokens": 0,
+            "input_tokens_details": {"cached_tokens": 0},
+            "output_tokens": 0,
+            "output_tokens_details": {"reasoning_tokens": 0},
+            "total_tokens": 0,
+        },
+        "user": None,
+        "metadata": {},
+        "response": {
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": content,
+                        "tool_calls": [],
+                    },
+                    "finish_reason": response.stop_reason or "stop",
+                }
+            ]
+        },
+    }
+
+    # Add tool calls if present
+    tool_calls = []
+    for block in response.content:
+        if hasattr(block, "type") and block.type == "tool_use":
+            tool_calls.append(
+                {
+                    "id": f"call_{block.id}",
+                    "type": "function",
+                    "function": {"name": block.name, "arguments": block.input},
+                }
+            )
+    if tool_calls:
+        standard_response["response"]["choices"][0]["message"]["tool_calls"] = tool_calls
+
+    return cast(AgentResponse, standard_response)
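Note: a minimal usage sketch (not from the package) of the two converters added above. The message payloads are illustrative, and the import path assumes the installed cua-agent wheel:

    from agent.providers.anthropic.utils import from_anthropic_format, to_anthropic_format

    openai_messages = [
        {"role": "system", "content": "You are a computer-use agent."},
        {"role": "user", "content": "Take a screenshot."},
    ]

    # System messages are hoisted out of the list and returned separately,
    # since the Anthropic API takes the system prompt as a top-level parameter.
    anthropic_messages, system_content = to_anthropic_format(openai_messages)
    assert system_content.strip() == "You are a computer-use agent."
    assert anthropic_messages == [
        {"role": "user", "content": [{"type": "text", "text": "Take a screenshot."}]}
    ]

    # Converting back collapses a single text block to a plain string.
    assert from_anthropic_format(anthropic_messages)[0]["content"] == "Take a screenshot."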
agent/providers/omni/__init__.py
@@ -1,27 +1,8 @@
 """Omni provider implementation."""
 
-# The OmniComputerAgent has been replaced by the unified ComputerAgent
-# which can be found in agent.core.agent
 from .types import LLMProvider
-from .experiment import ExperimentManager
-from .visualization import visualize_click, visualize_scroll, calculate_element_center
 from .image_utils import (
     decode_base64_image,
-    encode_image_base64,
-    clean_base64_data,
-    extract_base64_from_text,
-    get_image_dimensions,
 )
 
-__all__ = [
-    "LLMProvider",
-    "ExperimentManager",
-    "visualize_click",
-    "visualize_scroll",
-    "calculate_element_center",
-    "decode_base64_image",
-    "encode_image_base64",
-    "clean_base64_data",
-    "extract_base64_from_text",
-    "get_image_dimensions",
-]
+__all__ = ["LLMProvider", "decode_base64_image"]
agent/providers/omni/api_handler.py
@@ -0,0 +1,42 @@
+"""API handling for Omni provider."""
+
+import logging
+from typing import Any, Dict, List
+
+from .prompts import SYSTEM_PROMPT
+
+logger = logging.getLogger(__name__)
+
+
+class OmniAPIHandler:
+    """Handler for Omni API calls."""
+
+    def __init__(self, loop):
+        """Initialize the API handler.
+
+        Args:
+            loop: Parent loop instance
+        """
+        self.loop = loop
+
+    async def make_api_call(
+        self, messages: List[Dict[str, Any]], system_prompt: str = SYSTEM_PROMPT
+    ) -> Any:
+        """Make an API call to the appropriate provider.
+
+        Args:
+            messages: List of messages in standard OpenAI format
+            system_prompt: System prompt to use
+
+        Returns:
+            API response
+        """
+        if not self.loop._make_api_call:
+            raise RuntimeError("Loop does not have _make_api_call method")
+
+        try:
+            # Use the loop's _make_api_call method with standard messages
+            return await self.loop._make_api_call(messages=messages, system_prompt=system_prompt)
+        except Exception as e:
+            logger.error(f"Error making API call: {str(e)}")
+            raise
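Note: OmniAPIHandler is a thin delegate around the parent loop's _make_api_call. A sketch with a stub loop standing in for the real provider loop (StubLoop is hypothetical, for illustration only):

    import asyncio

    from agent.providers.omni.api_handler import OmniAPIHandler

    class StubLoop:
        async def _make_api_call(self, messages, system_prompt):
            # A real loop would dispatch to its configured LLM provider here.
            return {"messages": messages, "system": system_prompt}

    handler = OmniAPIHandler(loop=StubLoop())
    response = asyncio.run(
        handler.make_api_call(messages=[{"role": "user", "content": "hi"}])
    )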
agent/providers/omni/clients/anthropic.py
@@ -44,6 +44,10 @@ class AnthropicClient(BaseOmniClient):
         anthropic_messages = []
 
         for message in messages:
+            # Skip messages with empty content
+            if not message.get("content"):
+                continue
+
             if message["role"] == "user":
                 anthropic_messages.append({"role": "user", "content": message["content"]})
             elif message["role"] == "assistant":
agent/providers/omni/image_utils.py
@@ -32,75 +32,3 @@ def decode_base64_image(img_base64: str) -> Optional[Image.Image]:
     except Exception as e:
         logger.error(f"Error decoding base64 image: {str(e)}")
         return None
-
-
-def encode_image_base64(img: Image.Image, format: str = "PNG") -> str:
-    """Encode a PIL Image to base64.
-
-    Args:
-        img: PIL Image to encode
-        format: Image format (PNG, JPEG, etc.)
-
-    Returns:
-        Base64 encoded image string
-    """
-    try:
-        buffered = BytesIO()
-        img.save(buffered, format=format)
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
-    except Exception as e:
-        logger.error(f"Error encoding image to base64: {str(e)}")
-        return ""
-
-
-def clean_base64_data(img_base64: str) -> str:
-    """Clean base64 image data by removing data URL prefix.
-
-    Args:
-        img_base64: Base64 encoded image, may include data URL prefix
-
-    Returns:
-        Clean base64 string without prefix
-    """
-    if img_base64.startswith("data:image"):
-        return img_base64.split(",")[1]
-    return img_base64
-
-
-def extract_base64_from_text(text: str) -> Optional[str]:
-    """Extract base64 image data from a text string.
-
-    Args:
-        text: Text potentially containing base64 image data
-
-    Returns:
-        Base64 string or None if not found
-    """
-    # Look for data URL pattern
-    data_url_pattern = r"data:image/[^;]+;base64,([a-zA-Z0-9+/=]+)"
-    match = re.search(data_url_pattern, text)
-    if match:
-        return match.group(1)
-
-    # Look for plain base64 pattern (basic heuristic)
-    base64_pattern = r"([a-zA-Z0-9+/=]{100,})"
-    match = re.search(base64_pattern, text)
-    if match:
-        return match.group(1)
-
-    return None
-
-
-def get_image_dimensions(img_base64: str) -> Tuple[int, int]:
-    """Get the dimensions of a base64 encoded image.
-
-    Args:
-        img_base64: Base64 encoded image
-
-    Returns:
-        Tuple of (width, height) or (0, 0) if decoding fails
-    """
-    img = decode_base64_image(img_base64)
-    if img:
-        return img.size
-    return (0, 0)