cua-agent 0.4.36__tar.gz → 0.4.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (65)
  1. {cua_agent-0.4.36 → cua_agent-0.4.37}/PKG-INFO +1 -1
  2. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/omniparser.py +137 -75
  3. {cua_agent-0.4.36 → cua_agent-0.4.37}/pyproject.toml +1 -1
  4. {cua_agent-0.4.36 → cua_agent-0.4.37}/README.md +0 -0
  5. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/__init__.py +0 -0
  6. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/__main__.py +0 -0
  7. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/__init__.py +0 -0
  8. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  9. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/human_adapter.py +0 -0
  10. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/mlxvlm_adapter.py +0 -0
  11. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/__init__.py +0 -0
  12. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/generic.py +0 -0
  13. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/internvl.py +0 -0
  14. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/opencua.py +0 -0
  15. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/adapters/models/qwen2_5_vl.py +0 -0
  16. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/agent.py +0 -0
  17. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/__init__.py +0 -0
  18. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/base.py +0 -0
  19. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/budget_manager.py +0 -0
  20. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/image_retention.py +0 -0
  21. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/logging.py +0 -0
  22. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/operator_validator.py +0 -0
  23. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/pii_anonymization.py +0 -0
  24. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/prompt_instructions.py +0 -0
  25. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/telemetry.py +0 -0
  26. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/callbacks/trajectory_saver.py +0 -0
  27. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/cli.py +0 -0
  28. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/__init__.py +0 -0
  29. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/base.py +0 -0
  30. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/cua.py +0 -0
  31. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/computers/custom.py +0 -0
  32. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/decorators.py +0 -0
  33. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/__init__.py +0 -0
  34. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/__main__.py +0 -0
  35. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/server.py +0 -0
  36. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/human_tool/ui.py +0 -0
  37. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/__init__.py +0 -0
  38. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/agent.py +0 -0
  39. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/integrations/hud/proxy.py +0 -0
  40. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/__init__.py +0 -0
  41. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/anthropic.py +0 -0
  42. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/base.py +0 -0
  43. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/composed_grounded.py +0 -0
  44. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/gemini.py +0 -0
  45. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/glm45v.py +0 -0
  46. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/gta1.py +0 -0
  47. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/holo.py +0 -0
  48. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/internvl.py +0 -0
  49. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/model_types.csv +0 -0
  50. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/moondream3.py +0 -0
  51. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/openai.py +0 -0
  52. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/opencua.py +0 -0
  53. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/qwen.py +0 -0
  54. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/loops/uitars.py +0 -0
  55. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/proxy/examples.py +0 -0
  56. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/proxy/handlers.py +0 -0
  57. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/responses.py +0 -0
  58. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/types.py +0 -0
  59. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/__init__.py +0 -0
  60. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/__main__.py +0 -0
  61. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/__init__.py +0 -0
  62. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/app.py +0 -0
  63. {cua_agent-0.4.36 → cua_agent-0.4.37}/agent/ui/gradio/ui_components.py +0 -0
  64. {cua_agent-0.4.36 → cua_agent-0.4.37}/tests/conftest.py +0 -0
  65. {cua_agent-0.4.36 → cua_agent-0.4.37}/tests/test_computer_agent.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.36
3
+ Version: 0.4.37
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.12
@@ -14,67 +14,73 @@ import litellm
14
14
 
15
15
  from ..decorators import register_agent
16
16
  from ..loops.base import AsyncAgentConfig
17
+ from ..responses import (
18
+ convert_completion_messages_to_responses_items,
19
+ convert_responses_items_to_completion_messages,
20
+ )
17
21
  from ..types import AgentCapability, AgentResponse, Messages, Tools
18
22
 
19
23
  SOM_TOOL_SCHEMA = {
20
24
  "type": "function",
21
- "name": "computer",
22
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
23
- "parameters": {
24
- "type": "object",
25
- "properties": {
26
- "action": {
27
- "type": "string",
28
- "enum": [
29
- "screenshot",
30
- "click",
31
- "double_click",
32
- "drag",
33
- "type",
34
- "keypress",
35
- "scroll",
36
- "move",
37
- "wait",
38
- "get_current_url",
39
- "get_dimensions",
40
- "get_environment",
41
- ],
42
- "description": "The action to perform",
43
- },
44
- "element_id": {
45
- "type": "integer",
46
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
47
- },
48
- "start_element_id": {
49
- "type": "integer",
50
- "description": "The ID of the element to start dragging from (required for drag action)",
51
- },
52
- "end_element_id": {
53
- "type": "integer",
54
- "description": "The ID of the element to drag to (required for drag action)",
55
- },
56
- "text": {
57
- "type": "string",
58
- "description": "The text to type (required for type action)",
59
- },
60
- "keys": {
61
- "type": "string",
62
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
63
- },
64
- "button": {
65
- "type": "string",
66
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
67
- },
68
- "scroll_x": {
69
- "type": "integer",
70
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
71
- },
72
- "scroll_y": {
73
- "type": "integer",
74
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
25
+ "function": {
26
+ "name": "computer",
27
+ "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
28
+ "parameters": {
29
+ "type": "object",
30
+ "properties": {
31
+ "action": {
32
+ "type": "string",
33
+ "enum": [
34
+ "screenshot",
35
+ "click",
36
+ "double_click",
37
+ "drag",
38
+ "type",
39
+ "keypress",
40
+ "scroll",
41
+ "move",
42
+ "wait",
43
+ "get_current_url",
44
+ "get_dimensions",
45
+ "get_environment",
46
+ ],
47
+ "description": "The action to perform",
48
+ },
49
+ "element_id": {
50
+ "type": "integer",
51
+ "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
52
+ },
53
+ "start_element_id": {
54
+ "type": "integer",
55
+ "description": "The ID of the element to start dragging from (required for drag action)",
56
+ },
57
+ "end_element_id": {
58
+ "type": "integer",
59
+ "description": "The ID of the element to drag to (required for drag action)",
60
+ },
61
+ "text": {
62
+ "type": "string",
63
+ "description": "The text to type (required for type action)",
64
+ },
65
+ "keys": {
66
+ "type": "string",
67
+ "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
68
+ },
69
+ "button": {
70
+ "type": "string",
71
+ "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
72
+ },
73
+ "scroll_x": {
74
+ "type": "integer",
75
+ "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
76
+ },
77
+ "scroll_y": {
78
+ "type": "integer",
79
+ "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
80
+ },
75
81
  },
82
+ "required": ["action", "element_id"],
76
83
  },
77
- "required": ["action"],
78
84
  },
79
85
  }
80
86
 
@@ -256,7 +262,7 @@ async def replace_computer_call_with_function(
256
262
  {
257
263
  "type": "function_call_output",
258
264
  "call_id": item.get("call_id"),
259
- "output": output,
265
+ "output": item.get("output"),
260
266
  "id": item.get("id"),
261
267
  "status": "completed",
262
268
  }
@@ -298,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):
298
304
 
299
305
  llm_model = model.split("+")[-1]
300
306
 
307
+ # Get screen dimensions from computer handler
308
+ try:
309
+ width, height = await computer_handler.get_dimensions()
310
+ except Exception:
311
+ # Fallback to default dimensions if method fails
312
+ width, height = 1024, 768
313
+
301
314
  # Prepare tools for OpenAI API
302
315
  openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
303
316
 
@@ -311,27 +324,43 @@ class OmniparserConfig(AsyncAgentConfig):
311
324
  result = parser.parse(image_data)
312
325
  if _on_screenshot:
313
326
  await _on_screenshot(result.annotated_image_base64, "annotated_image")
327
+
328
+ # Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
314
329
  for element in result.elements:
315
- id2xy[element.id] = (
316
- (element.bbox.x1 + element.bbox.x2) / 2,
317
- (element.bbox.y1 + element.bbox.y2) / 2,
318
- )
319
-
320
- # handle computer calls -> function calls
321
- new_messages = []
322
- for message in messages:
330
+ norm_x = (element.bbox.x1 + element.bbox.x2) / 2
331
+ norm_y = (element.bbox.y1 + element.bbox.y2) / 2
332
+ pixel_x = int(norm_x * width)
333
+ pixel_y = int(norm_y * height)
334
+ id2xy[element.id] = (pixel_x, pixel_y)
335
+
336
+ # Replace the original screenshot with the annotated image
337
+ annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
338
+ last_computer_call_output["output"]["image_url"] = annotated_image_url
339
+
340
+ xy2id = {v: k for k, v in id2xy.items()}
341
+ messages_with_element_ids = []
342
+ for i, message in enumerate(messages):
323
343
  if not isinstance(message, dict):
324
344
  message = message.__dict__
325
- new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
326
- messages = new_messages
345
+
346
+ msg_type = message.get("type")
347
+
348
+ if msg_type == "computer_call" and "action" in message:
349
+ action = message.get("action", {})
350
+
351
+ converted = await replace_computer_call_with_function(message, xy2id) # type: ignore
352
+ messages_with_element_ids += converted
353
+
354
+ completion_messages = convert_responses_items_to_completion_messages(
355
+ messages_with_element_ids, allow_images_in_tool_results=False
356
+ )
327
357
 
328
358
  # Prepare API call kwargs
329
359
  api_kwargs = {
330
360
  "model": llm_model,
331
- "input": messages,
361
+ "messages": completion_messages,
332
362
  "tools": openai_tools if openai_tools else None,
333
363
  "stream": stream,
334
- "truncation": "auto",
335
364
  "num_retries": max_retries,
336
365
  **kwargs,
337
366
  }
@@ -342,8 +371,8 @@ class OmniparserConfig(AsyncAgentConfig):
342
371
 
343
372
  print(str(api_kwargs)[:1000])
344
373
 
345
- # Use liteLLM responses
346
- response = await litellm.aresponses(**api_kwargs)
374
+ # Use liteLLM completion
375
+ response = await litellm.acompletion(**api_kwargs)
347
376
 
348
377
  # Call API end hook
349
378
  if _on_api_end:
@@ -357,12 +386,45 @@ class OmniparserConfig(AsyncAgentConfig):
357
386
  if _on_usage:
358
387
  await _on_usage(usage)
359
388
 
360
- # handle som function calls -> xy computer calls
361
- new_output = []
362
- for i in range(len(response.output)): # type: ignore
363
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
389
+ response_dict = response.model_dump() # type: ignore
390
+ choice_messages = [choice["message"] for choice in response_dict["choices"]]
391
+ responses_items = []
392
+ for choice_message in choice_messages:
393
+ responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
394
+
395
+ # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
396
+ final_output = []
397
+ for item in responses_items:
398
+ if item.get("type") == "computer_call" and "action" in item:
399
+ action = item["action"].copy()
400
+
401
+ # Handle single element_id
402
+ if "element_id" in action:
403
+ element_id = action["element_id"]
404
+ if element_id in id2xy:
405
+ x, y = id2xy[element_id]
406
+ action["x"] = x
407
+ action["y"] = y
408
+ del action["element_id"]
409
+
410
+ # Handle start_element_id and end_element_id for drag operations
411
+ elif "start_element_id" in action and "end_element_id" in action:
412
+ start_id = action["start_element_id"]
413
+ end_id = action["end_element_id"]
414
+ if start_id in id2xy and end_id in id2xy:
415
+ start_x, start_y = id2xy[start_id]
416
+ end_x, end_y = id2xy[end_id]
417
+ action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
418
+ del action["start_element_id"]
419
+ del action["end_element_id"]
420
+
421
+ converted_item = item.copy()
422
+ converted_item["action"] = action
423
+ final_output.append(converted_item)
424
+ else:
425
+ final_output.append(item)
364
426
 
365
- return {"output": new_output, "usage": usage}
427
+ return {"output": final_output, "usage": usage}
366
428
 
367
429
  async def predict_click(
368
430
  self, model: str, image_b64: str, instruction: str, **kwargs
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.4.36"
9
+ version = "0.4.37"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes