cua-agent: cua_agent-0.1.5-py3-none-any.whl → cua_agent-0.1.17-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic (further details are linked from the original diff page).

Files changed (52)
  1. agent/__init__.py +3 -4
  2. agent/core/__init__.py +3 -10
  3. agent/core/computer_agent.py +207 -32
  4. agent/core/experiment.py +20 -3
  5. agent/core/loop.py +78 -120
  6. agent/core/messages.py +279 -125
  7. agent/core/telemetry.py +44 -32
  8. agent/core/types.py +35 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +224 -209
  14. agent/providers/anthropic/messages/manager.py +3 -1
  15. agent/providers/anthropic/response_handler.py +229 -0
  16. agent/providers/anthropic/tools/base.py +1 -1
  17. agent/providers/anthropic/tools/bash.py +0 -97
  18. agent/providers/anthropic/tools/collection.py +2 -2
  19. agent/providers/anthropic/tools/computer.py +34 -24
  20. agent/providers/anthropic/tools/manager.py +2 -2
  21. agent/providers/anthropic/utils.py +370 -0
  22. agent/providers/omni/__init__.py +1 -20
  23. agent/providers/omni/api_handler.py +42 -0
  24. agent/providers/omni/clients/anthropic.py +4 -0
  25. agent/providers/omni/image_utils.py +0 -72
  26. agent/providers/omni/loop.py +497 -607
  27. agent/providers/omni/parser.py +60 -5
  28. agent/providers/omni/tools/__init__.py +25 -8
  29. agent/providers/omni/tools/base.py +29 -0
  30. agent/providers/omni/tools/bash.py +43 -38
  31. agent/providers/omni/tools/computer.py +144 -181
  32. agent/providers/omni/tools/manager.py +26 -48
  33. agent/providers/omni/types.py +0 -4
  34. agent/providers/omni/utils.py +225 -144
  35. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/METADATA +6 -36
  36. cua_agent-0.1.17.dist-info/RECORD +63 -0
  37. agent/core/agent.py +0 -252
  38. agent/core/base_agent.py +0 -164
  39. agent/core/factory.py +0 -102
  40. agent/providers/omni/callbacks.py +0 -78
  41. agent/providers/omni/clients/groq.py +0 -101
  42. agent/providers/omni/experiment.py +0 -273
  43. agent/providers/omni/messages.py +0 -171
  44. agent/providers/omni/tool_manager.py +0 -91
  45. agent/providers/omni/visualization.py +0 -130
  46. agent/types/__init__.py +0 -26
  47. agent/types/base.py +0 -53
  48. agent/types/messages.py +0 -36
  49. cua_agent-0.1.5.dist-info/RECORD +0 -67
  50. agent/{types → core}/tools.py +0 -0
  51. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/WHEEL +0 -0
  52. {cua_agent-0.1.5.dist-info → cua_agent-0.1.17.dist-info}/entry_points.txt +0 -0
agent/core/messages.py CHANGED
@@ -1,12 +1,11 @@
1
1
  """Message handling utilities for agent."""
2
2
 
3
- import base64
4
- from datetime import datetime
5
- from io import BytesIO
6
3
  import logging
7
- from typing import Any, Dict, List, Optional, Union
8
- from PIL import Image
4
+ import json
5
+ from typing import Any, Dict, List, Optional, Union, Tuple
9
6
  from dataclasses import dataclass
7
+ import re
8
+ from ..providers.omni.parser import ParseResult
10
9
 
11
10
  logger = logging.getLogger(__name__)
12
11
 
@@ -123,123 +122,278 @@ class BaseMessageManager:
123
122
  break
124
123
 
125
124
 
126
- def create_user_message(text: str) -> Dict[str, str]:
127
- """Create a user message.
128
-
129
- Args:
130
- text: The message text
131
-
132
- Returns:
133
- Message dictionary
134
- """
135
- return {
136
- "role": "user",
137
- "content": text,
138
- }
139
-
140
-
141
- def create_assistant_message(text: str) -> Dict[str, str]:
142
- """Create an assistant message.
143
-
144
- Args:
145
- text: The message text
146
-
147
- Returns:
148
- Message dictionary
149
- """
150
- return {
151
- "role": "assistant",
152
- "content": text,
153
- }
154
-
155
-
156
- def create_system_message(text: str) -> Dict[str, str]:
157
- """Create a system message.
158
-
159
- Args:
160
- text: The message text
161
-
162
- Returns:
163
- Message dictionary
164
- """
165
- return {
166
- "role": "system",
167
- "content": text,
168
- }
169
-
170
-
171
- def create_image_message(
172
- image_base64: Optional[str] = None,
173
- image_path: Optional[str] = None,
174
- image_obj: Optional[Image.Image] = None,
175
- ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
176
- """Create a message with an image.
177
-
178
- Args:
179
- image_base64: Base64 encoded image
180
- image_path: Path to image file
181
- image_obj: PIL Image object
182
-
183
- Returns:
184
- Message dictionary with content list
185
-
186
- Raises:
187
- ValueError: If no image source is provided
188
- """
189
- if not any([image_base64, image_path, image_obj]):
190
- raise ValueError("Must provide one of image_base64, image_path, or image_obj")
191
-
192
- # Convert to base64 if needed
193
- if image_path and not image_base64:
194
- with open(image_path, "rb") as f:
195
- image_bytes = f.read()
196
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
197
- elif image_obj and not image_base64:
198
- buffer = BytesIO()
199
- image_obj.save(buffer, format="PNG")
200
- image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
201
-
202
- return {
203
- "role": "user",
204
- "content": [
205
- {"type": "image", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
206
- ],
207
- }
208
-
209
-
210
- def create_screen_message(
211
- parsed_screen: Dict[str, Any],
212
- include_raw: bool = False,
213
- ) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
214
- """Create a message with screen information.
215
-
216
- Args:
217
- parsed_screen: Dictionary containing parsed screen info
218
- include_raw: Whether to include raw screenshot base64
219
-
220
- Returns:
221
- Message dictionary with content
222
- """
223
- if include_raw and "screenshot_base64" in parsed_screen:
224
- # Create content list with both image and text
225
- return {
226
- "role": "user",
227
- "content": [
228
- {
229
- "type": "image",
230
- "image_url": {
231
- "url": f"data:image/png;base64,{parsed_screen['screenshot_base64']}"
232
- },
233
- },
234
- {
235
- "type": "text",
236
- "text": f"Screen dimensions: {parsed_screen['width']}x{parsed_screen['height']}",
237
- },
238
- ],
239
- }
240
- else:
241
- # Create text-only message with screen info
242
- return {
243
- "role": "user",
244
- "content": f"Screen dimensions: {parsed_screen['width']}x{parsed_screen['height']}",
245
- }
125
+ class StandardMessageManager:
126
+ """Manages messages in a standardized OpenAI format across different providers."""
127
+
128
+ def __init__(self, config: Optional[ImageRetentionConfig] = None):
129
+ """Initialize message manager.
130
+
131
+ Args:
132
+ config: Configuration for image retention
133
+ """
134
+ self.messages: List[Dict[str, Any]] = []
135
+ self.config = config or ImageRetentionConfig()
136
+
137
+ def add_user_message(self, content: Union[str, List[Dict[str, Any]]]) -> None:
138
+ """Add a user message.
139
+
140
+ Args:
141
+ content: Message content (text or multimodal content)
142
+ """
143
+ self.messages.append({"role": "user", "content": content})
144
+
145
+ def add_assistant_message(self, content: Union[str, List[Dict[str, Any]]]) -> None:
146
+ """Add an assistant message.
147
+
148
+ Args:
149
+ content: Message content (text or multimodal content)
150
+ """
151
+ self.messages.append({"role": "assistant", "content": content})
152
+
153
+ def add_system_message(self, content: str) -> None:
154
+ """Add a system message.
155
+
156
+ Args:
157
+ content: System message content
158
+ """
159
+ self.messages.append({"role": "system", "content": content})
160
+
161
+ def get_messages(self) -> List[Dict[str, Any]]:
162
+ """Get all messages in standard format.
163
+
164
+ Returns:
165
+ List of messages
166
+ """
167
+ # If image retention is configured, apply it
168
+ if self.config.num_images_to_keep is not None:
169
+ return self._apply_image_retention(self.messages)
170
+ return self.messages
171
+
172
+ def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
173
+ """Apply image retention policy to messages.
174
+
175
+ Args:
176
+ messages: List of messages
177
+
178
+ Returns:
179
+ List of messages with image retention applied
180
+ """
181
+ if not self.config.num_images_to_keep:
182
+ return messages
183
+
184
+ # Find user messages with images
185
+ image_messages = []
186
+ for msg in messages:
187
+ if msg["role"] == "user" and isinstance(msg["content"], list):
188
+ has_image = any(
189
+ item.get("type") == "image_url" or item.get("type") == "image"
190
+ for item in msg["content"]
191
+ )
192
+ if has_image:
193
+ image_messages.append(msg)
194
+
195
+ # If we don't have more images than the limit, return all messages
196
+ if len(image_messages) <= self.config.num_images_to_keep:
197
+ return messages
198
+
199
+ # Get the most recent N images to keep
200
+ images_to_keep = image_messages[-self.config.num_images_to_keep :]
201
+ images_to_remove = image_messages[: -self.config.num_images_to_keep]
202
+
203
+ # Create a new message list without the older images
204
+ result = []
205
+ for msg in messages:
206
+ if msg in images_to_remove:
207
+ # Skip this message
208
+ continue
209
+ result.append(msg)
210
+
211
+ return result
212
+
213
+ def to_anthropic_format(
214
+ self, messages: List[Dict[str, Any]]
215
+ ) -> Tuple[List[Dict[str, Any]], str]:
216
+ """Convert standard OpenAI format messages to Anthropic format.
217
+
218
+ Args:
219
+ messages: List of messages in OpenAI format
220
+
221
+ Returns:
222
+ Tuple containing (anthropic_messages, system_content)
223
+ """
224
+ result = []
225
+ system_content = ""
226
+
227
+ # Process messages in order to maintain conversation flow
228
+ previous_assistant_tool_use_ids = (
229
+ set()
230
+ ) # Track tool_use_ids in the previous assistant message
231
+
232
+ for i, msg in enumerate(messages):
233
+ role = msg.get("role", "")
234
+ content = msg.get("content", "")
235
+
236
+ if role == "system":
237
+ # Collect system messages for later use
238
+ system_content += content + "\n"
239
+ continue
240
+
241
+ if role == "assistant":
242
+ # Track tool_use_ids in this assistant message for the next user message
243
+ previous_assistant_tool_use_ids = set()
244
+ if isinstance(content, list):
245
+ for item in content:
246
+ if (
247
+ isinstance(item, dict)
248
+ and item.get("type") == "tool_use"
249
+ and "id" in item
250
+ ):
251
+ previous_assistant_tool_use_ids.add(item["id"])
252
+
253
+ logger.info(
254
+ f"Tool use IDs in assistant message #{i}: {previous_assistant_tool_use_ids}"
255
+ )
256
+
257
+ if role in ["user", "assistant"]:
258
+ anthropic_msg = {"role": role}
259
+
260
+ # Convert content based on type
261
+ if isinstance(content, str):
262
+ # Simple text content
263
+ anthropic_msg["content"] = [{"type": "text", "text": content}]
264
+ elif isinstance(content, list):
265
+ # Convert complex content
266
+ anthropic_content = []
267
+ for item in content:
268
+ item_type = item.get("type", "")
269
+
270
+ if item_type == "text":
271
+ anthropic_content.append({"type": "text", "text": item.get("text", "")})
272
+ elif item_type == "image_url":
273
+ # Convert OpenAI image format to Anthropic
274
+ image_url = item.get("image_url", {}).get("url", "")
275
+ if image_url.startswith("data:"):
276
+ # Extract base64 data and media type
277
+ match = re.match(r"data:(.+);base64,(.+)", image_url)
278
+ if match:
279
+ media_type, data = match.groups()
280
+ anthropic_content.append(
281
+ {
282
+ "type": "image",
283
+ "source": {
284
+ "type": "base64",
285
+ "media_type": media_type,
286
+ "data": data,
287
+ },
288
+ }
289
+ )
290
+ else:
291
+ # Regular URL
292
+ anthropic_content.append(
293
+ {
294
+ "type": "image",
295
+ "source": {
296
+ "type": "url",
297
+ "url": image_url,
298
+ },
299
+ }
300
+ )
301
+ elif item_type == "tool_use":
302
+ # Always include tool_use blocks
303
+ anthropic_content.append(item)
304
+ elif item_type == "tool_result":
305
+ # Check if this is a user message AND if the tool_use_id exists in the previous assistant message
306
+ tool_use_id = item.get("tool_use_id")
307
+
308
+ # Only include tool_result if it references a tool_use from the immediately preceding assistant message
309
+ if (
310
+ role == "user"
311
+ and tool_use_id
312
+ and tool_use_id in previous_assistant_tool_use_ids
313
+ ):
314
+ anthropic_content.append(item)
315
+ logger.info(
316
+ f"Including tool_result with tool_use_id: {tool_use_id}"
317
+ )
318
+ else:
319
+ # Convert to text to preserve information
320
+ logger.warning(
321
+ f"Converting tool_result to text. Tool use ID {tool_use_id} not found in previous assistant message"
322
+ )
323
+ content_text = "Tool Result: "
324
+ if "content" in item:
325
+ if isinstance(item["content"], list):
326
+ for content_item in item["content"]:
327
+ if (
328
+ isinstance(content_item, dict)
329
+ and content_item.get("type") == "text"
330
+ ):
331
+ content_text += content_item.get("text", "")
332
+ elif isinstance(item["content"], str):
333
+ content_text += item["content"]
334
+ anthropic_content.append({"type": "text", "text": content_text})
335
+
336
+ anthropic_msg["content"] = anthropic_content
337
+
338
+ result.append(anthropic_msg)
339
+
340
+ return result, system_content
341
+
342
+ def from_anthropic_format(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
343
+ """Convert Anthropic format messages to standard OpenAI format.
344
+
345
+ Args:
346
+ messages: List of messages in Anthropic format
347
+
348
+ Returns:
349
+ List of messages in OpenAI format
350
+ """
351
+ result = []
352
+
353
+ for msg in messages:
354
+ role = msg.get("role", "")
355
+ content = msg.get("content", [])
356
+
357
+ if role in ["user", "assistant"]:
358
+ openai_msg = {"role": role}
359
+
360
+ # Simple case: single text block
361
+ if len(content) == 1 and content[0].get("type") == "text":
362
+ openai_msg["content"] = content[0].get("text", "")
363
+ else:
364
+ # Complex case: multiple blocks or non-text
365
+ openai_content = []
366
+ for item in content:
367
+ item_type = item.get("type", "")
368
+
369
+ if item_type == "text":
370
+ openai_content.append({"type": "text", "text": item.get("text", "")})
371
+ elif item_type == "image":
372
+ # Convert Anthropic image to OpenAI format
373
+ source = item.get("source", {})
374
+ if source.get("type") == "base64":
375
+ media_type = source.get("media_type", "image/png")
376
+ data = source.get("data", "")
377
+ openai_content.append(
378
+ {
379
+ "type": "image_url",
380
+ "image_url": {"url": f"data:{media_type};base64,{data}"},
381
+ }
382
+ )
383
+ else:
384
+ # URL
385
+ openai_content.append(
386
+ {
387
+ "type": "image_url",
388
+ "image_url": {"url": source.get("url", "")},
389
+ }
390
+ )
391
+ elif item_type in ["tool_use", "tool_result"]:
392
+ # Pass through tool-related content
393
+ openai_content.append(item)
394
+
395
+ openai_msg["content"] = openai_content
396
+
397
+ result.append(openai_msg)
398
+
399
+ return result
agent/core/telemetry.py CHANGED
@@ -4,58 +4,70 @@ import logging
4
4
  import os
5
5
  import platform
6
6
  import sys
7
- from typing import Dict, Any
7
+ from typing import Dict, Any, Callable
8
8
 
9
9
  # Import the core telemetry module
10
10
  TELEMETRY_AVAILABLE = False
11
11
 
12
+
13
+ # Local fallbacks in case core telemetry isn't available
14
+ def _noop(*args: Any, **kwargs: Any) -> None:
15
+ """No-op function for when telemetry is not available."""
16
+ pass
17
+
18
+
19
+ # Define default functions with unique names to avoid shadowing
20
+ _default_record_event = _noop
21
+ _default_increment_counter = _noop
22
+ _default_set_dimension = _noop
23
+ _default_get_telemetry_client = lambda: None
24
+ _default_flush = _noop
25
+ _default_is_telemetry_enabled = lambda: False
26
+ _default_is_telemetry_globally_disabled = lambda: True
27
+
28
+ # Set the actual functions to the defaults initially
29
+ record_event = _default_record_event
30
+ increment_counter = _default_increment_counter
31
+ set_dimension = _default_set_dimension
32
+ get_telemetry_client = _default_get_telemetry_client
33
+ flush = _default_flush
34
+ is_telemetry_enabled = _default_is_telemetry_enabled
35
+ is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled
36
+
37
+ logger = logging.getLogger("cua.agent.telemetry")
38
+
12
39
  try:
40
+ # Import from core telemetry
13
41
  from core.telemetry import (
14
- record_event,
15
- increment,
16
- get_telemetry_client,
17
- flush,
18
- is_telemetry_enabled,
19
- is_telemetry_globally_disabled,
42
+ record_event as core_record_event,
43
+ increment as core_increment,
44
+ get_telemetry_client as core_get_telemetry_client,
45
+ flush as core_flush,
46
+ is_telemetry_enabled as core_is_telemetry_enabled,
47
+ is_telemetry_globally_disabled as core_is_telemetry_globally_disabled,
20
48
  )
21
49
 
50
+ # Override the default functions with actual implementations
51
+ record_event = core_record_event
52
+ get_telemetry_client = core_get_telemetry_client
53
+ flush = core_flush
54
+ is_telemetry_enabled = core_is_telemetry_enabled
55
+ is_telemetry_globally_disabled = core_is_telemetry_globally_disabled
56
+
22
57
  def increment_counter(counter_name: str, value: int = 1) -> None:
23
58
  """Wrapper for increment to maintain backward compatibility."""
24
59
  if is_telemetry_enabled():
25
- increment(counter_name, value)
60
+ core_increment(counter_name, value)
26
61
 
27
62
  def set_dimension(name: str, value: Any) -> None:
28
63
  """Set a dimension that will be attached to all events."""
29
- logger = logging.getLogger("cua.agent.telemetry")
30
64
  logger.debug(f"Setting dimension {name}={value}")
31
65
 
32
66
  TELEMETRY_AVAILABLE = True
33
- logger = logging.getLogger("cua.agent.telemetry")
34
67
  logger.info("Successfully imported telemetry")
35
68
  except ImportError as e:
36
- logger = logging.getLogger("cua.agent.telemetry")
37
69
  logger.warning(f"Could not import telemetry: {e}")
38
- TELEMETRY_AVAILABLE = False
39
-
40
-
41
- # Local fallbacks in case core telemetry isn't available
42
- def _noop(*args: Any, **kwargs: Any) -> None:
43
- """No-op function for when telemetry is not available."""
44
- pass
45
-
46
-
47
- logger = logging.getLogger("cua.agent.telemetry")
48
-
49
- # If telemetry isn't available, use no-op functions
50
- if not TELEMETRY_AVAILABLE:
51
70
  logger.debug("Telemetry not available, using no-op functions")
52
- record_event = _noop # type: ignore
53
- increment_counter = _noop # type: ignore
54
- set_dimension = _noop # type: ignore
55
- get_telemetry_client = lambda: None # type: ignore
56
- flush = _noop # type: ignore
57
- is_telemetry_enabled = lambda: False # type: ignore
58
- is_telemetry_globally_disabled = lambda: True # type: ignore
59
71
 
60
72
  # Get system info once to use in telemetry
61
73
  SYSTEM_INFO = {
@@ -71,7 +83,7 @@ def enable_telemetry() -> bool:
71
83
  Returns:
72
84
  bool: True if telemetry was successfully enabled, False otherwise
73
85
  """
74
- global TELEMETRY_AVAILABLE
86
+ global TELEMETRY_AVAILABLE, record_event, increment_counter, get_telemetry_client, flush, is_telemetry_enabled, is_telemetry_globally_disabled
75
87
 
76
88
  # Check if globally disabled using core function
77
89
  if TELEMETRY_AVAILABLE and is_telemetry_globally_disabled():
agent/core/types.py ADDED
@@ -0,0 +1,35 @@
1
+ """Core type definitions."""
2
+
3
+ from typing import Any, Dict, List, Optional, TypedDict, Union
4
+
5
+
6
+ class AgentResponse(TypedDict, total=False):
7
+ """Agent response format."""
8
+
9
+ id: str
10
+ object: str
11
+ created_at: int
12
+ status: str
13
+ error: Optional[str]
14
+ incomplete_details: Optional[Any]
15
+ instructions: Optional[Any]
16
+ max_output_tokens: Optional[int]
17
+ model: str
18
+ output: List[Dict[str, Any]]
19
+ parallel_tool_calls: bool
20
+ previous_response_id: Optional[str]
21
+ reasoning: Dict[str, str]
22
+ store: bool
23
+ temperature: float
24
+ text: Dict[str, Dict[str, str]]
25
+ tool_choice: str
26
+ tools: List[Dict[str, Union[str, int]]]
27
+ top_p: float
28
+ truncation: str
29
+ usage: Dict[str, Any]
30
+ user: Optional[str]
31
+ metadata: Dict[str, Any]
32
+ response: Dict[str, List[Dict[str, Any]]]
33
+ # Additional fields for error responses
34
+ role: str
35
+ content: Union[str, List[Dict[str, Any]]]