cua-agent 0.4.31__py3-none-any.whl → 0.4.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/loops/opencua.py ADDED
@@ -0,0 +1,142 @@
1
+ """
2
+ OpenCUA agent loop implementation for click prediction using litellm.acompletion
3
+ Based on OpenCUA model for GUI grounding tasks.
4
+ """
5
+
6
+ import asyncio
7
+ import json
8
+ import re
9
+ import base64
10
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
11
+ from io import BytesIO
12
+ import uuid
13
+ from PIL import Image
14
+ import litellm
15
+ import math
16
+
17
+ from .composed_grounded import ComposedGroundedConfig
18
+ from ..decorators import register_agent
19
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
20
+ from ..loops.base import AsyncAgentConfig
21
+
22
def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
    """Extract click coordinates from a ``pyautogui.click(x=..., y=...)`` call.

    The first matching call found anywhere in ``text`` is used. Optional
    whitespace inside the parentheses, around ``=`` and around the comma is
    tolerated, so ``pyautogui.click( x = 10 , y = 20 )`` parses the same as
    ``pyautogui.click(x=10, y=20)``. Only non-negative integer coordinates
    are accepted; fractional or negative values are deliberately not matched.

    Args:
        text: Model output to scan. Non-string values (for example ``None``
            when the model returned no content) are treated as "no match".

    Returns:
        ``(x, y)`` as integers, or ``None`` when no click call is found.
    """
    # Explicit guard instead of a blanket ``except Exception``: re.search
    # would raise TypeError on None/bytes, which callers expect to map to None.
    if not isinstance(text, str):
        return None

    match = re.search(
        r"pyautogui\.click\(\s*x\s*=\s*(\d+)\s*,\s*y\s*=\s*(\d+)\s*\)",
        text,
    )
    if match is None:
        return None

    try:
        return int(match.group(1)), int(match.group(2))
    except ValueError:
        # int() rejects absurdly long digit strings (int/str conversion limit);
        # treat such output as unparseable rather than raising.
        return None
34
+
35
@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(ComposedGroundedConfig):
    """Agent configuration for OpenCUA models (click prediction).

    Registered for every model name containing ``OpenCUA``, matched
    case-insensitively. Click prediction is done with a single
    ``litellm.acompletion`` call; full agent steps are delegated to the
    parent ``ComposedGroundedConfig``.
    """

    def __init__(self):
        super().__init__()
        # Neither attribute is read or written by the methods of this class.
        self.last_screenshot_b64 = None
        self.current_model = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """Run one agent step by composing the model with itself.

        Every argument is forwarded unchanged to the parent ``predict_step``
        except ``model``, which is rewritten to ``"<model>+<model>"``.

        Returns:
            Whatever the parent ``predict_step`` returns.
        """
        # NOTE(review): presumably the parent splits "a+b" into two model
        # roles - confirm against ComposedGroundedConfig.
        self_composed = f"{model}+{model}"
        return await super().predict_step(
            messages=messages,
            model=self_composed,
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs
        )

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """Ask the OpenCUA model where to click for ``instruction``.

        Args:
            model: The OpenCUA model name passed to litellm.
            image_b64: Base64 encoded screenshot, embedded as a PNG data URL.
            instruction: Description of the element to click.
            **kwargs: Extra options merged into the litellm call; they take
                precedence over the defaults set here.

        Returns:
            Tuple of (x, y) coordinates parsed from the model's
            ``pyautogui.click(...)`` reply, or None if none could be parsed.
        """
        conversation = [
            {
                "role": "system",
                "content": (
                    "You are a GUI agent. You are given a task and a screenshot of the screen. "
                    "You need to perform a series of pyautogui actions to complete the task."
                ),
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": f"Click on {instruction}"},
                ],
            },
        ]

        # NOTE(review): litellm's documented completion parameter is
        # ``max_tokens``; confirm the backing adapter actually reads
        # ``max_new_tokens`` before relying on this limit.
        request = {
            "model": model,
            "messages": conversation,
            "max_new_tokens": 2056,
            "temperature": 0,
        }
        # Applied last so caller-supplied options override the defaults above.
        request.update(kwargs)

        completion = await litellm.acompletion(**request)
        reply_text = completion.choices[0].message.content
        return extract_coordinates_from_pyautogui(reply_text)

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent (click only)."""
        return ["click"]
agent/loops/uitars.py CHANGED
@@ -780,7 +780,7 @@ class UITARSConfig:
780
780
  api_kwargs = {
781
781
  "model": model,
782
782
  "messages": litellm_messages,
783
- "max_tokens": 100,
783
+ "max_tokens": 2056,
784
784
  "temperature": 0.0,
785
785
  "do_sample": False
786
786
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.31
3
+ Version: 0.4.32
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.12
@@ -31,22 +31,38 @@ Provides-Extra: glm45v-hf
31
31
  Requires-Dist: accelerate; extra == "glm45v-hf"
32
32
  Requires-Dist: torch; extra == "glm45v-hf"
33
33
  Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
34
+ Provides-Extra: opencua-hf
35
+ Requires-Dist: accelerate; extra == "opencua-hf"
36
+ Requires-Dist: torch; extra == "opencua-hf"
37
+ Requires-Dist: transformers==4.53.0; extra == "opencua-hf"
38
+ Requires-Dist: tiktoken>=0.11.0; extra == "opencua-hf"
39
+ Requires-Dist: blobfile>=3.0.0; extra == "opencua-hf"
40
+ Provides-Extra: internvl-hf
41
+ Requires-Dist: accelerate; extra == "internvl-hf"
42
+ Requires-Dist: torch; extra == "internvl-hf"
43
+ Requires-Dist: transformers>=4.55.0; extra == "internvl-hf"
44
+ Requires-Dist: einops; extra == "internvl-hf"
45
+ Requires-Dist: timm; extra == "internvl-hf"
34
46
  Provides-Extra: ui
35
47
  Requires-Dist: gradio>=5.23.3; extra == "ui"
36
48
  Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
37
49
  Provides-Extra: cli
38
50
  Requires-Dist: yaspin>=3.1.0; extra == "cli"
39
51
  Provides-Extra: hud
40
- Requires-Dist: hud-python==0.4.19; extra == "hud"
52
+ Requires-Dist: hud-python==0.4.26; extra == "hud"
41
53
  Provides-Extra: all
42
54
  Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
43
55
  Requires-Dist: accelerate; extra == "all"
44
56
  Requires-Dist: torch; extra == "all"
45
- Requires-Dist: transformers>=4.54.0; extra == "all"
57
+ Requires-Dist: transformers>=4.55.0; extra == "all"
58
+ Requires-Dist: einops; extra == "all"
59
+ Requires-Dist: timm; extra == "all"
60
+ Requires-Dist: tiktoken>=0.11.0; extra == "all"
61
+ Requires-Dist: blobfile>=3.0.0; extra == "all"
46
62
  Requires-Dist: gradio>=5.23.3; extra == "all"
47
63
  Requires-Dist: python-dotenv>=1.0.1; extra == "all"
48
64
  Requires-Dist: yaspin>=3.1.0; extra == "all"
49
- Requires-Dist: hud-python==0.4.19; extra == "all"
65
+ Requires-Dist: hud-python==0.4.26; extra == "all"
50
66
  Description-Content-Type: text/markdown
51
67
 
52
68
  <div align="center">
@@ -1,10 +1,15 @@
1
1
  agent/__init__.py,sha256=MaW-BczJ-lCACPYH39DvFhE7ZWiSo7sBO6pBfyO7Nxc,1269
2
2
  agent/__main__.py,sha256=lBUe8Niqa5XoCjwFfXyX7GtnUwjjZXC1-j4V9mvUYSc,538
3
3
  agent/adapters/__init__.py,sha256=Q_OxxwXBcBIetQ_DtHS5bwZWXrvCKPX2grCg8R0UKek,301
4
- agent/adapters/huggingfacelocal_adapter.py,sha256=Uqjtcohhzd33VFh38Ra2y4Uv_lTghMswoqS1t-KKFkw,8480
4
+ agent/adapters/huggingfacelocal_adapter.py,sha256=3ht4jCUP4rpjPxi7vj8xOJNelTgfsUq0YbS44FwVN0c,7089
5
5
  agent/adapters/human_adapter.py,sha256=xT4nnfNXb1z-vnGFlLmFEZN7TMcoMBGS40MtR1Zwv4o,13079
6
6
  agent/adapters/mlxvlm_adapter.py,sha256=4VhhKDZfLLKL5joL1v4PPFvYw-R8spoDsat3vOAGnpE,14864
7
- agent/agent.py,sha256=F_nPTRv-FCo3oKaEFCugun-sj6_iD_iw2RQxcaTMVmM,30192
7
+ agent/adapters/models/__init__.py,sha256=23ETHLbn1C3VW4zMzSn1Ql-gsZz-wQCeJX12Lv_0n8M,1412
8
+ agent/adapters/models/generic.py,sha256=hpJt73jQePV80NYRAJTtOaElcrtGcpYK-QnInZIUkk0,2768
9
+ agent/adapters/models/internvl.py,sha256=PbFjU_Fu1JIahlEtR5pTO7RSV2UYJ9CAlQrMXKXJrTA,11385
10
+ agent/adapters/models/opencua.py,sha256=gPoZBMXyjiPEJ7Py2mpRWHiyay8X-y-pOSKw1LC_ihU,3924
11
+ agent/adapters/models/qwen2_5_vl.py,sha256=kc9YrtCB0FFy-oB1EkD_zasxYnbJg1wGiss6i0ilwdo,2809
12
+ agent/agent.py,sha256=NCRK1xRGt_sa1Yh_do2IPRceV024P2p7KR81wwCYlY8,30462
8
13
  agent/callbacks/__init__.py,sha256=VqYHFt_wk1mc3hKudMZk2Qakrh-bn2rVKh_4xebF0tI,725
9
14
  agent/callbacks/base.py,sha256=UnnnYlh6XCm6HKZZsAPaT_Eyo9LUYLyjyNwF-QRm6Ns,4691
10
15
  agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQWE7k,1832
@@ -14,8 +19,8 @@ agent/callbacks/operator_validator.py,sha256=T5tp62pkShkcdHu2rgREUGdk8fryL_ziJsI
14
19
  agent/callbacks/pii_anonymization.py,sha256=NEkUTUjQBi82nqus7kT-1E4RaeQ2hQrY7YCnKndLhP8,3272
15
20
  agent/callbacks/prompt_instructions.py,sha256=RUqsJhiNiXqaOM_P2AfyBinWUDdgDku46BExLMUJHn4,1517
16
21
  agent/callbacks/telemetry.py,sha256=RbUDhE41mTi8g9hNre0EpltK_NUZkLj8buJLWBzs0Ek,7363
17
- agent/callbacks/trajectory_saver.py,sha256=rslgg4Ak7JHSNmmJgANRQ5TsUYWGuUJDZ6amureaz_o,15963
18
- agent/cli.py,sha256=AgaXwywHd3nGQWuqMRj6SbPyFaCPjfo5980Y1ApQOTQ,12413
22
+ agent/callbacks/trajectory_saver.py,sha256=-XNgiKU6T8Qw_i2AZMQuw0HuUe6MHkU89rjn_T386Rw,16128
23
+ agent/cli.py,sha256=HddU18IvvKdyvQu0ru21nAcNc6k7toYuyjgORIzX_qo,16110
19
24
  agent/computers/__init__.py,sha256=39ISJsaREaQIZckpzxSuLhuR763wUU3TxUux78EKjAg,1477
20
25
  agent/computers/base.py,sha256=hZntX4vgc1ahD3EnFeb9lUjtBmgka1vb27hndPl9tKQ,2187
21
26
  agent/computers/cua.py,sha256=xp2A34kT2C1NKqSRo2GB6766gkraM-UtpFjRv8LUTSc,4889
@@ -28,16 +33,19 @@ agent/human_tool/ui.py,sha256=wu9eZorhxCkyPTlBSZjYaVzutoHMlucAz8UGNpAT4bM,30644
28
33
  agent/integrations/hud/__init__.py,sha256=xir5BVAlG2cFc7rHSx_Ea_2b1kp2TtFuKJk07jny7qY,5969
29
34
  agent/integrations/hud/agent.py,sha256=GBikd9MhjDNKMiMG8J7PE3OMSmvmC_JLZ1p5xr2cZoc,14006
30
35
  agent/integrations/hud/proxy.py,sha256=8HUoh7uZ8Z3vkhPXK0dskgePGsP8oCqyYij0mE_E7X8,10902
31
- agent/loops/__init__.py,sha256=Ef8aj07l3osibwDk-DTo80PrpL4_GdKRTP1ikl_b-BQ,328
32
- agent/loops/anthropic.py,sha256=q7lr1PjI6VPtlozoweluY2c3hCGqa_2s-whzxa37iKE,70250
36
+ agent/loops/__init__.py,sha256=c6stEkT15smK8ZIf9j2kyOko84uz1YIvHXx0Mbe2wq8,472
37
+ agent/loops/anthropic.py,sha256=ODrMvmTkyzIOLjGq6HbKzzgBu19TE_Xlsi--7vc5T6o,70196
33
38
  agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
34
- agent/loops/composed_grounded.py,sha256=8oJoqaRzKWbI9I4VoFuAoUzQ11_CFnYT-EdPOy-NVEQ,12349
35
- agent/loops/glm45v.py,sha256=V1f-5vAifbYcY-qTc7fW2KXVRkAfApQI_EjavH3X2ak,35110
36
- agent/loops/gta1.py,sha256=ha5TaUWqUzTffx_ow1WiBU8i3VNP-6FL5XC66ajPFjg,5829
39
+ agent/loops/composed_grounded.py,sha256=Um_8G0v5DEzF_A9wWIGp_IDPDMvv4IXDTFpEDH92Vto,12367
40
+ agent/loops/glm45v.py,sha256=EKAoh-PWkcCdzBVebjXbdqoDNkXgcmJpIqmTNPiZ8TM,35127
41
+ agent/loops/gta1.py,sha256=uGIcUH5ChzO75eGvoQxuKMBWjX-1J9-xmC7vPetobjU,5831
42
+ agent/loops/holo.py,sha256=peQ0xx4XQDBQ3g2XKRLCgyrU_2PkXe3RaysNBqFyS90,7481
43
+ agent/loops/internvl.py,sha256=iQs6DSoP9JOyUxRAz_HPuv4Hi2Sbv-Jc3022W-oPX5Y,6596
37
44
  agent/loops/model_types.csv,sha256=GmFn4x80yoUpQZuQ-GXtJkPVlOLYWZ5u_5A73HRyeNE,112
38
45
  agent/loops/omniparser.py,sha256=-db8JUL2Orn47ERIaLbuNShAXn4LeIgYzRWphn_9Dg4,15071
39
46
  agent/loops/openai.py,sha256=3UEXdecqGkyknhTgp6zxr_cNCVg5vM-61I6SKMNl6m8,8692
40
- agent/loops/uitars.py,sha256=QyEWyrhkI5MCksTunntY-5jtETd4pwcQB-DyzmiWezo,32350
47
+ agent/loops/opencua.py,sha256=Chb4UASHDrdcX_fO__Gw2e9ay4Hl6Vq38K5x-IoHyuo,4432
48
+ agent/loops/uitars.py,sha256=mVPt4V-HabX7ZiQnM55BVQt73CuZUjmUAsbm4Tf6TXk,32351
41
49
  agent/proxy/examples.py,sha256=GYFJ-sfDsSNZr9n_qpvDx_0rShqoKE5JW0ibbljWfoo,6192
42
50
  agent/proxy/handlers.py,sha256=48mMNyZOU3dJQ6oI5r2kDDe29rcU49MConlB0MZeCsU,9602
43
51
  agent/responses.py,sha256=_SoN4BkaTxMHMB21EOtDc_aDBIJlfDwsCzszMBnIkH0,30764
@@ -47,7 +55,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
47
55
  agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
48
56
  agent/ui/gradio/app.py,sha256=Ol97YEbwREZZQ9_PMjVHlfOcu9BGsawxgAGAm79hT80,9117
49
57
  agent/ui/gradio/ui_components.py,sha256=dJUvKDmc1oSejtoR_gU_oWWYwxaOOQyPloSYRGMrUCQ,36068
50
- cua_agent-0.4.31.dist-info/METADATA,sha256=HywszWj2eDguXdge9eANyyz57gVlY9YZr4yacHHkUhU,5610
51
- cua_agent-0.4.31.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
52
- cua_agent-0.4.31.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
53
- cua_agent-0.4.31.dist-info/RECORD,,
58
+ cua_agent-0.4.32.dist-info/METADATA,sha256=9DM4yfZ8hH6-JeNvke6WOgzZLEF0i3A8cDeb3aTGpyk,6340
59
+ cua_agent-0.4.32.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
60
+ cua_agent-0.4.32.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
61
+ cua_agent-0.4.32.dist-info/RECORD,,