cua-agent 0.4.35.tar.gz → 0.4.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (65)
  1. {cua_agent-0.4.35 → cua_agent-0.4.37}/PKG-INFO +1 -1
  2. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/base.py +6 -2
  3. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/cua.py +6 -2
  4. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/custom.py +6 -2
  5. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/omniparser.py +142 -78
  6. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/qwen.py +63 -28
  7. {cua_agent-0.4.35 → cua_agent-0.4.37}/pyproject.toml +1 -1
  8. cua_agent-0.4.37/tests/conftest.py +84 -0
  9. cua_agent-0.4.37/tests/test_computer_agent.py +139 -0
  10. {cua_agent-0.4.35 → cua_agent-0.4.37}/README.md +0 -0
  11. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/__init__.py +0 -0
  12. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/__main__.py +0 -0
  13. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/__init__.py +0 -0
  14. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  15. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/human_adapter.py +0 -0
  16. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/mlxvlm_adapter.py +0 -0
  17. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/__init__.py +0 -0
  18. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/generic.py +0 -0
  19. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/internvl.py +0 -0
  20. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/opencua.py +0 -0
  21. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/qwen2_5_vl.py +0 -0
  22. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/agent.py +0 -0
  23. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/__init__.py +0 -0
  24. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/base.py +0 -0
  25. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/budget_manager.py +0 -0
  26. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/image_retention.py +0 -0
  27. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/logging.py +0 -0
  28. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/operator_validator.py +0 -0
  29. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/pii_anonymization.py +0 -0
  30. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/prompt_instructions.py +0 -0
  31. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/telemetry.py +0 -0
  32. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/trajectory_saver.py +0 -0
  33. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/cli.py +0 -0
  34. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/__init__.py +0 -0
  35. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/decorators.py +0 -0
  36. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/__init__.py +0 -0
  37. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/__main__.py +0 -0
  38. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/server.py +0 -0
  39. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/ui.py +0 -0
  40. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/__init__.py +0 -0
  41. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/agent.py +0 -0
  42. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/proxy.py +0 -0
  43. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/__init__.py +1 -1
  44. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/anthropic.py +0 -0
  45. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/base.py +0 -0
  46. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/composed_grounded.py +0 -0
  47. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/gemini.py +0 -0
  48. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/glm45v.py +0 -0
  49. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/gta1.py +0 -0
  50. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/holo.py +0 -0
  51. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/internvl.py +0 -0
  52. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/model_types.csv +0 -0
  53. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/moondream3.py +0 -0
  54. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/openai.py +0 -0
  55. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/opencua.py +0 -0
  56. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/uitars.py +0 -0
  57. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/proxy/examples.py +0 -0
  58. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/proxy/handlers.py +0 -0
  59. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/responses.py +0 -0
  60. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/types.py +0 -0
  61. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/__init__.py +0 -0
  62. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/__main__.py +0 -0
  63. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/__init__.py +0 -0
  64. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/app.py +0 -0
  65. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/ui_components.py +0 -0
{cua_agent-0.4.35 → cua_agent-0.4.37}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.35
+Version: 0.4.37
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/base.py

@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
         """Get screen dimensions as (width, height)."""
         ...

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...

     async def click(self, x: int, y: int, button: str = "left") -> None:
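The protocol change above widens screenshot() to take an optional text argument (sent by some GPT-4o-style tool calls) while keeping zero-argument calls working; the two handler hunks below apply the same signature. A minimal sketch of a conforming handler, with a made-up class name and stored bytes:

import base64
from typing import Optional


class InMemoryScreenshotHandler:
    """Illustrative handler satisfying the widened protocol (not part of the release)."""

    def __init__(self, png_bytes: bytes) -> None:
        self._png = png_bytes

    async def screenshot(self, text: Optional[str] = None) -> str:
        # `text` is accepted for compatibility but ignored, mirroring the
        # handlers changed in this release.
        return base64.b64encode(self._png).decode("utf-8")

Both await handler.screenshot() and await handler.screenshot("current screen") remain valid calls under the new signature.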
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/cua.py

@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
         screen_size = await self.interface.get_screen_size()
         return screen_size["width"], screen_size["height"]

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         assert self.interface is not None
         screenshot_bytes = await self.interface.screenshot()
         return base64.b64encode(screenshot_bytes).decode("utf-8")
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/custom.py

@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):

         return self._last_screenshot_size

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         result = await self._call_function(self.functions["screenshot"])
         b64_str = self._to_b64_str(result)  # type: ignore

{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/omniparser.py

@@ -14,67 +14,73 @@ import litellm

 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
+)
 from ..types import AgentCapability, AgentResponse, Messages, Tools

 SOM_TOOL_SCHEMA = {
     "type": "function",
-    "name": "computer",
-    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "action": {
-                "type": "string",
-                "enum": [
-                    "screenshot",
-                    "click",
-                    "double_click",
-                    "drag",
-                    "type",
-                    "keypress",
-                    "scroll",
-                    "move",
-                    "wait",
-                    "get_current_url",
-                    "get_dimensions",
-                    "get_environment",
-                ],
-                "description": "The action to perform",
-            },
-            "element_id": {
-                "type": "integer",
-                "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
-            },
-            "start_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to start dragging from (required for drag action)",
-            },
-            "end_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to drag to (required for drag action)",
-            },
-            "text": {
-                "type": "string",
-                "description": "The text to type (required for type action)",
-            },
-            "keys": {
-                "type": "string",
-                "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
-            },
-            "button": {
-                "type": "string",
-                "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-            },
-            "scroll_x": {
-                "type": "integer",
-                "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-            },
-            "scroll_y": {
-                "type": "integer",
-                "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform",
+                },
+                "element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
+                },
+                "start_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to start dragging from (required for drag action)",
+                },
+                "end_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "string",
+                    "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
+                },
+                "button": {
+                    "type": "string",
+                    "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+                },
             },
+            "required": ["action", "element_id"],
         },
-        "required": ["action"],
     },
 }

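The rewrite above nests the whole tool definition under a "function" key and tightens "required". The flat shape is what the OpenAI Responses API expects; the nested shape is what Chat Completions expects, matching this loop's switch from litellm.aresponses to litellm.acompletion later in the diff. A minimal sketch of the two shapes, with the fields abridged:

# Responses-API tool shape (old): the definition sits at the top level.
responses_style_tool = {
    "type": "function",
    "name": "computer",
    "parameters": {"type": "object", "properties": {}},
}

# Chat-Completions tool shape (new): the definition nests under "function".
chat_completions_style_tool = {
    "type": "function",
    "function": {
        "name": "computer",
        "parameters": {"type": "object", "properties": {}},
    },
}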
@@ -243,18 +249,20 @@ async def replace_computer_call_with_function(
                 "id": item.get("id"),
                 "call_id": item.get("call_id"),
                 "status": "completed",
-                # Fall back to string representation
-                "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
             }
         ]

     elif item_type == "computer_call_output":
-        # Simple conversion: computer_call_output -> function_call_output
+        output = item.get("output")
+
+        if isinstance(output, dict):
+            output = [output]
+
         return [
             {
                 "type": "function_call_output",
                 "call_id": item.get("call_id"),
-                "content": [item.get("output")],
+                "output": item.get("output"),
                 "id": item.get("id"),
                 "status": "completed",
             }
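For reference, a sketch of the two item shapes this branch now maps between; the field values are illustrative, not taken from the package:

# A Responses-style computer_call_output item (illustrative values)...
computer_call_output = {
    "type": "computer_call_output",
    "call_id": "call_1",
    "output": {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},
}

# ...becomes a function_call_output item whose "output" key (replacing the
# old "content" key) carries the payload through unchanged.
function_call_output = {
    "type": "function_call_output",
    "call_id": computer_call_output["call_id"],
    "output": computer_call_output["output"],
    "id": None,
    "status": "completed",
}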
@@ -296,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):

         llm_model = model.split("+")[-1]

+        # Get screen dimensions from computer handler
+        try:
+            width, height = await computer_handler.get_dimensions()
+        except Exception:
+            # Fallback to default dimensions if method fails
+            width, height = 1024, 768
+
         # Prepare tools for OpenAI API
         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

@@ -309,27 +324,43 @@
         result = parser.parse(image_data)
         if _on_screenshot:
             await _on_screenshot(result.annotated_image_base64, "annotated_image")
+
+        # Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
         for element in result.elements:
-            id2xy[element.id] = (
-                (element.bbox.x1 + element.bbox.x2) / 2,
-                (element.bbox.y1 + element.bbox.y2) / 2,
-            )
-
-        # handle computer calls -> function calls
-        new_messages = []
-        for message in messages:
+            norm_x = (element.bbox.x1 + element.bbox.x2) / 2
+            norm_y = (element.bbox.y1 + element.bbox.y2) / 2
+            pixel_x = int(norm_x * width)
+            pixel_y = int(norm_y * height)
+            id2xy[element.id] = (pixel_x, pixel_y)
+
+        # Replace the original screenshot with the annotated image
+        annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
+        last_computer_call_output["output"]["image_url"] = annotated_image_url
+
+        xy2id = {v: k for k, v in id2xy.items()}
+        messages_with_element_ids = []
+        for i, message in enumerate(messages):
             if not isinstance(message, dict):
                 message = message.__dict__
-            new_messages += await replace_computer_call_with_function(message, id2xy)  # type: ignore
-        messages = new_messages
+
+            msg_type = message.get("type")
+
+            if msg_type == "computer_call" and "action" in message:
+                action = message.get("action", {})
+
+            converted = await replace_computer_call_with_function(message, xy2id)  # type: ignore
+            messages_with_element_ids += converted
+
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_element_ids, allow_images_in_tool_results=False
+        )

         # Prepare API call kwargs
         api_kwargs = {
             "model": llm_model,
-            "input": messages,
+            "messages": completion_messages,
             "tools": openai_tools if openai_tools else None,
             "stream": stream,
-            "truncation": "auto",
             "num_retries": max_retries,
             **kwargs,
         }
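The coordinate handling above is the core of the change: OmniParser reports bounding boxes normalized to [0, 1], and the loop now scales the box centre by the screen dimensions fetched earlier. A self-contained sketch of that arithmetic:

def bbox_center_px(
    x1: float, y1: float, x2: float, y2: float, width: int, height: int
) -> tuple[int, int]:
    """Map a normalized [0, 1] bbox to its centre in absolute pixels."""
    return int((x1 + x2) / 2 * width), int((y1 + y2) / 2 * height)

# With the 1024x768 fallback dimensions from the previous hunk:
assert bbox_center_px(0.25, 0.25, 0.35, 0.35, 1024, 768) == (307, 230)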
@@ -340,8 +371,8 @@

         print(str(api_kwargs)[:1000])

-        # Use liteLLM responses
-        response = await litellm.aresponses(**api_kwargs)
+        # Use liteLLM completion
+        response = await litellm.acompletion(**api_kwargs)

         # Call API end hook
         if _on_api_end:
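Both entry points appear verbatim in this diff; the call shapes differ roughly as follows (sketch, parameters abridged):

# Old: the Responses API takes response items and a truncation strategy.
#     response = await litellm.aresponses(model=llm_model, input=messages, truncation="auto")
# New: Chat Completions takes a plain message list plus nested-format tools;
# "truncation" has no Chat Completions equivalent, so it is dropped above.
#     response = await litellm.acompletion(model=llm_model, messages=completion_messages, tools=openai_tools)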
@@ -355,12 +386,45 @@
         if _on_usage:
             await _on_usage(usage)

-        # handle som function calls -> xy computer calls
-        new_output = []
-        for i in range(len(response.output)):  # type: ignore
-            new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)  # type: ignore
+        response_dict = response.model_dump()  # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        responses_items = []
+        for choice_message in choice_messages:
+            responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
+
+        # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
+        final_output = []
+        for item in responses_items:
+            if item.get("type") == "computer_call" and "action" in item:
+                action = item["action"].copy()
+
+                # Handle single element_id
+                if "element_id" in action:
+                    element_id = action["element_id"]
+                    if element_id in id2xy:
+                        x, y = id2xy[element_id]
+                        action["x"] = x
+                        action["y"] = y
+                        del action["element_id"]
+
+                # Handle start_element_id and end_element_id for drag operations
+                elif "start_element_id" in action and "end_element_id" in action:
+                    start_id = action["start_element_id"]
+                    end_id = action["end_element_id"]
+                    if start_id in id2xy and end_id in id2xy:
+                        start_x, start_y = id2xy[start_id]
+                        end_x, end_y = id2xy[end_id]
+                        action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+                        del action["start_element_id"]
+                        del action["end_element_id"]
+
+                converted_item = item.copy()
+                converted_item["action"] = action
+                final_output.append(converted_item)
+            else:
+                final_output.append(item)

-        return {"output": new_output, "usage": usage}
+        return {"output": final_output, "usage": usage}

     async def predict_click(
         self, model: str, image_b64: str, instruction: str, **kwargs
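The drag branch above maps two element ids onto a two-point pixel path. A compact sketch with illustrative coordinates:

id2xy = {3: (120, 240), 7: (560, 240)}  # element id -> pixel centre (illustrative)

action = {"type": "drag", "start_element_id": 3, "end_element_id": 7}
sx, sy = id2xy[action.pop("start_element_id")]
ex, ey = id2xy[action.pop("end_element_id")]
action["path"] = [{"x": sx, "y": sy}, {"x": ex, "y": ey}]

assert action == {
    "type": "drag",
    "path": [{"x": 120, "y": 240}, {"x": 560, "y": 240}],
}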
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/qwen.py

@@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling.
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations

-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations

 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (

 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability

 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }

+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}

+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
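The helper introduced above pulls the first <tool_call> JSON block out of free-form model text. A quick sketch of what that regex accepts; the sample text is made up:

import json
import re

text = 'Clicking now. <tool_call>\n{"name": "computer_use", "arguments": {"action": "click"}}\n</tool_call>'
m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
call = json.loads(m.group(1)) if m else None
assert call is not None and call["arguments"]["action"] == "click"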
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None

+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
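Per its docstring, the helper rescales Qwen's 0..1000-space coordinates to the (smart-resized) screen. A minimal standalone sketch of the scaling, assuming simple truncation; the real helper also handles a missing "coordinate" argument:

def unnormalize(x: float, y: float, width: int, height: int) -> tuple[int, int]:
    """Scale a 0..1000-space point to pixel coordinates (sketch only)."""
    return int(x / 1000 * width), int(y / 1000 * height)

assert unnormalize(500, 500, 1920, 1080) == (960, 540)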
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )

         for msg in completion_messages:
             content = msg.get("content")
@@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,<b64>
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
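For context, qwen_vl_utils.smart_resize returns the (height, width) the model effectively sees, snapped to multiples of factor within the min/max pixel budget; the tool-call coordinates parsed later are unnormalized against these resized dims. A sketch of a call, assuming the optional cua-agent[qwen] dependency is installed:

from qwen_vl_utils import smart_resize  # optional dependency (cua-agent[qwen])

# A 768x1024 screenshot with the same bounds as the loop above.
rh, rw = smart_resize(768, 1024, factor=32, min_pixels=3136, max_pixels=12845056)
assert rh % 32 == 0 and rw % 32 == 0  # dims snap to the resize factor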
@@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Parse tool call from text; then convert to responses items via fake tool_calls
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text)

         output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))

             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore

             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
437
468
  # Qwen notebook suggests factor=32 and a wide min/max range
438
469
  rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
439
470
  except Exception:
440
- raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
471
+ raise ImportError(
472
+ "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
473
+ )
441
474
 
442
475
  messages = []
443
476
  if nous_system:
444
477
  messages.append(nous_system)
445
478
  image_block: Dict[str, Any] = {
446
- "type": "image_url",
447
- "image_url": {
448
- "url": f"data:image/png;base64,{image_b64}"
449
- },
479
+ "type": "image_url",
480
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
450
481
  "min_pixels": min_pixels,
451
482
  "max_pixels": max_pixels,
452
483
  }
@@ -461,11 +492,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
             }
         )

-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))
{cua_agent-0.4.35 → cua_agent-0.4.37}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.4.35"
+version = "0.4.37"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
cua_agent-0.4.37/tests/conftest.py

@@ -0,0 +1,84 @@
+"""Pytest configuration and shared fixtures for agent package tests.
+
+This file contains shared fixtures and configuration for all agent tests.
+Following SRP: This file ONLY handles test setup/teardown.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+
+
+@pytest.fixture
+def mock_litellm():
+    """Mock liteLLM completion calls.
+
+    Use this fixture to avoid making real LLM API calls during tests.
+    Returns a mock that simulates LLM responses.
+    """
+    with patch("litellm.acompletion") as mock_completion:
+
+        async def mock_response(*args, **kwargs):
+            """Simulate a typical LLM response."""
+            return {
+                "id": "chatcmpl-test123",
+                "object": "chat.completion",
+                "created": 1234567890,
+                "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is a mocked response for testing.",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30,
+                },
+            }
+
+        mock_completion.side_effect = mock_response
+        yield mock_completion
+
+
+@pytest.fixture
+def mock_computer():
+    """Mock Computer interface for agent tests.
+
+    Use this fixture to test agent logic without requiring a real Computer instance.
+    """
+    computer = AsyncMock()
+    computer.interface = AsyncMock()
+    computer.interface.screenshot = AsyncMock(return_value=b"fake_screenshot_data")
+    computer.interface.left_click = AsyncMock()
+    computer.interface.type = AsyncMock()
+    computer.interface.key = AsyncMock()
+
+    # Mock context manager
+    computer.__aenter__ = AsyncMock(return_value=computer)
+    computer.__aexit__ = AsyncMock()
+
+    return computer
+
+
+@pytest.fixture
+def disable_telemetry(monkeypatch):
+    """Disable telemetry for tests.
+
+    Use this fixture to ensure no telemetry is sent during tests.
+    """
+    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
+
+
+@pytest.fixture
+def sample_messages():
+    """Provide sample messages for testing.
+
+    Returns a list of messages in the expected format.
+    """
+    return [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
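A sketch of how a test might consume these fixtures (hypothetical test, not part of the release); pytest injects the fixtures by name, and the asyncio marker assumes pytest-asyncio, which the new test module below also relies on:

import pytest


@pytest.mark.asyncio
async def test_mock_computer_screenshot(mock_computer, disable_telemetry):
    # No real Computer instance or telemetry is involved here.
    data = await mock_computer.interface.screenshot()
    assert data == b"fake_screenshot_data"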
cua_agent-0.4.37/tests/test_computer_agent.py

@@ -0,0 +1,139 @@
+"""Unit tests for ComputerAgent class.
+
+This file tests ONLY the ComputerAgent initialization and basic functionality.
+Following SRP: This file tests ONE class (ComputerAgent).
+All external dependencies (liteLLM, Computer) are mocked.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+
+
+class TestComputerAgentInitialization:
+    """Test ComputerAgent initialization (SRP: Only tests initialization)."""
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry):
+        """Test that agent can be initialized with a model string."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        assert agent is not None
+        assert hasattr(agent, "model")
+        assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
+        """Test that agent can be initialized with tools."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+
+        assert agent is not None
+        assert hasattr(agent, "tools")
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry):
+        """Test that agent can be initialized with max trajectory budget."""
+        from agent import ComputerAgent
+
+        budget = 5.0
+        agent = ComputerAgent(
+            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
+        )
+
+        assert agent is not None
+
+    @patch("agent.agent.litellm")
+    def test_agent_requires_model(self, mock_litellm, disable_telemetry):
+        """Test that agent requires a model parameter."""
+        from agent import ComputerAgent
+
+        with pytest.raises(TypeError):
+            # Should fail without model parameter - intentionally missing required argument
+            ComputerAgent()  # type: ignore[call-arg]
+
+
+class TestComputerAgentRun:
+    """Test ComputerAgent.run() method (SRP: Only tests run logic)."""
+
+    @pytest.mark.asyncio
+    @patch("agent.agent.litellm")
+    async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages):
+        """Test that agent.run() works with valid messages."""
+        from agent import ComputerAgent
+
+        # Mock liteLLM response
+        mock_response = {
+            "id": "chatcmpl-test",
+            "choices": [
+                {
+                    "message": {"role": "assistant", "content": "Test response"},
+                    "finish_reason": "stop",
+                }
+            ],
+            "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+        }
+
+        mock_litellm.acompletion = AsyncMock(return_value=mock_response)
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Run should return an async generator
+        result_generator = agent.run(sample_messages)
+
+        assert result_generator is not None
+        # Check it's an async generator
+        assert hasattr(result_generator, "__anext__")
+
+    def test_agent_has_run_method(self, disable_telemetry):
+        """Test that agent has run method available."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Verify run method exists
+        assert hasattr(agent, "run")
+        assert callable(agent.run)
+
+    def test_agent_has_agent_loop(self, disable_telemetry):
+        """Test that agent has agent_loop initialized."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Verify agent_loop is initialized
+        assert hasattr(agent, "agent_loop")
+        assert agent.agent_loop is not None
+
+
+class TestComputerAgentTypes:
+    """Test AgentResponse and Messages types (SRP: Only tests type definitions)."""
+
+    def test_messages_type_exists(self):
+        """Test that Messages type is exported."""
+        from agent import Messages
+
+        assert Messages is not None
+
+    def test_agent_response_type_exists(self):
+        """Test that AgentResponse type is exported."""
+        from agent import AgentResponse
+
+        assert AgentResponse is not None
+
+
+class TestComputerAgentIntegration:
+    """Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""
+
+    def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
+        """Test that agent can be initialized with Computer tool."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+
+        # Verify agent accepted the tool
+        assert agent is not None
+        assert hasattr(agent, "tools")
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/__init__.py

@@ -15,8 +15,8 @@ from . import (
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )

 __all__ = [