cua-agent 0.4.12__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

@@ -0,0 +1,187 @@
1
+ """HUD Computer Handler for ComputerAgent integration."""
2
+
3
+ import base64
4
+ from io import BytesIO
5
+ from typing import Literal, Optional, Any, Dict, Callable
6
+ from PIL import Image
7
+
8
+ from agent.computers import AsyncComputerHandler
9
+
10
+
11
class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with a HUD environment.

    The handler performs no I/O itself: screenshots are obtained from
    ``screenshot_callback`` and every action is forwarded, as a plain dict
    with a ``"type"`` key, to ``action_callback``. Either callback may be a
    coroutine function or an ordinary function; the result is awaited only
    when it is awaitable. When no action callback is installed, action
    methods are silent no-ops.
    """

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional zero-argument callback that returns
                a screenshot (base64 ``str``, PIL image, or raw bytes)
            action_callback: Optional callback that receives an action dict
                and executes it in the HUD environment
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Most recent screenshot (base64), reused when no fresh one is available
        self._last_screenshot: Optional[str] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    async def _resolve(result: Any) -> Any:
        """Await ``result`` if it is awaitable, otherwise return it unchanged."""
        # Function-scope import: the module-level import block is left untouched.
        import inspect

        if inspect.isawaitable(result):
            return await result
        return result

    @staticmethod
    def _image_to_base64(image: Image.Image) -> str:
        """Encode a PIL image as a base64 PNG string."""
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    async def _send_action(self, action: Dict[str, Any]) -> Any:
        """Forward ``action`` to the action callback, if one is set.

        Returns:
            Whatever the callback returns, or ``None`` when no callback is set.
        """
        if self._action_callback:
            return await self._resolve(self._action_callback(action))
        return None

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    # ------------------------------------------------------------------
    # Environment queries
    # ------------------------------------------------------------------

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return it as a base64 string.

        The screenshot callback, when set, may return a base64 ``str`` (used
        as-is), a PIL image (encoded as PNG), or a bytes-like object (base64
        encoded). Any other result is ignored and the handler falls back to
        the last stored screenshot, or to a blank white image of the
        configured dimensions if none has been stored yet.

        Returns:
            The screenshot as a base64 string; it is also cached as the last
            screenshot.
        """
        if self._screenshot_callback:
            shot = await self._resolve(self._screenshot_callback())

            encoded: Optional[str] = None
            if isinstance(shot, str):
                encoded = shot
            elif isinstance(shot, Image.Image):
                encoded = self._image_to_base64(shot)
            elif isinstance(shot, (bytes, bytearray, memoryview)):
                encoded = base64.b64encode(shot).decode()

            if encoded is not None:
                self._last_screenshot = encoded
                return encoded

        # No usable fresh screenshot: reuse the cached one when it is non-empty
        if self._last_screenshot:
            return self._last_screenshot

        # Nothing cached either: synthesize a blank screenshot as a fallback
        blank_b64 = self._image_to_base64(
            Image.new("RGB", self._dimensions, color="white")
        )
        self._last_screenshot = blank_b64
        return blank_b64

    # ------------------------------------------------------------------
    # Actions (each is forwarded to the action callback as a dict)
    # ------------------------------------------------------------------

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        await self._send_action({"type": "click", "x": x, "y": y, "button": button})

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        await self._send_action({"type": "double_click", "x": x, "y": y})

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        await self._send_action({
            "type": "scroll",
            "x": x,
            "y": y,
            "scroll_x": scroll_x,
            "scroll_y": scroll_y,
        })

    async def type(self, text: str) -> None:
        """Type text."""
        await self._send_action({"type": "type", "text": text})

    async def wait(self, ms: int = 1000) -> None:
        """Request a wait of the specified milliseconds.

        The wait is delegated to the action callback; this method does not
        sleep on its own.
        """
        await self._send_action({"type": "wait", "ms": ms})

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        await self._send_action({"type": "move", "x": x, "y": y})

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination; a single key string is wrapped in a list."""
        key_list = [keys] if isinstance(keys, str) else keys
        await self._send_action({"type": "keypress", "keys": key_list})

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        await self._send_action({"type": "drag", "path": path})

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        await self._send_action({"type": "left_mouse_down", "x": x, "y": y})

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        await self._send_action({"type": "left_mouse_up", "x": x, "y": y})

    async def get_current_url(self) -> str:
        """Get the current URL.

        Returns:
            The action callback's response to a ``get_current_url`` action,
            or an empty string when no action callback is set.
        """
        if not self._action_callback:
            return ""
        return await self._send_action({"type": "get_current_url"})
agent/types.py CHANGED
@@ -9,7 +9,7 @@ from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
9
9
  from collections.abc import Iterable
10
10
 
11
11
  # Agent input types
12
- Messages = str | ResponseInputParam
12
+ Messages = str | ResponseInputParam | List[Dict[str, Any]]
13
13
  Tools = Optional[Iterable[ToolParam]]
14
14
 
15
15
  # Agent output types
@@ -27,55 +27,3 @@ class AgentConfigInfo(BaseModel):
27
27
  def matches_model(self, model: str) -> bool:
28
28
  """Check if this agent config matches the given model"""
29
29
  return bool(re.match(self.models_regex, model))
30
-
31
- # Computer tool interface
32
- class Computer(Protocol):
33
- """Protocol defining the interface for computer interactions."""
34
-
35
- async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
36
- """Get the current environment type."""
37
- ...
38
-
39
- async def get_dimensions(self) -> tuple[int, int]:
40
- """Get screen dimensions as (width, height)."""
41
- ...
42
-
43
- async def screenshot(self) -> str:
44
- """Take a screenshot and return as base64 string."""
45
- ...
46
-
47
- async def click(self, x: int, y: int, button: str = "left") -> None:
48
- """Click at coordinates with specified button."""
49
- ...
50
-
51
- async def double_click(self, x: int, y: int) -> None:
52
- """Double click at coordinates."""
53
- ...
54
-
55
- async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
56
- """Scroll at coordinates with specified scroll amounts."""
57
- ...
58
-
59
- async def type(self, text: str) -> None:
60
- """Type text."""
61
- ...
62
-
63
- async def wait(self, ms: int = 1000) -> None:
64
- """Wait for specified milliseconds."""
65
- ...
66
-
67
- async def move(self, x: int, y: int) -> None:
68
- """Move cursor to coordinates."""
69
- ...
70
-
71
- async def keypress(self, keys: List[str]) -> None:
72
- """Press key combination."""
73
- ...
74
-
75
- async def drag(self, path: List[Dict[str, int]]) -> None:
76
- """Drag along specified path."""
77
- ...
78
-
79
- async def get_current_url(self) -> str:
80
- """Get current URL (for browser environments)."""
81
- ...
agent/ui/gradio/app.py CHANGED
@@ -39,6 +39,7 @@ global_agent = None
39
39
  global_computer = None
40
40
  SETTINGS_FILE = Path(".gradio_settings.json")
41
41
 
42
+ logging.basicConfig(level=logging.INFO)
42
43
 
43
44
  import dotenv
44
45
  if dotenv.load_dotenv():
@@ -187,7 +187,7 @@ if __name__ == "__main__":
187
187
  """
188
188
  <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
189
189
  <img alt="CUA Logo" style="width: 80px;"
190
- src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
190
+ src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" />
191
191
  </div>
192
192
  """
193
193
  )
@@ -201,22 +201,33 @@ if __name__ == "__main__":
201
201
  )
202
202
 
203
203
  with gr.Accordion("Computer Configuration", open=True):
204
- computer_os = gr.Radio(
205
- choices=["macos", "linux", "windows"],
206
- label="Operating System",
207
- value="macos",
208
- info="Select the operating system for the computer",
209
- )
210
-
211
204
  is_windows = platform.system().lower() == "windows"
212
205
  is_mac = platform.system().lower() == "darwin"
213
206
 
214
- providers = ["cloud", "localhost"]
207
+ providers = ["cloud", "localhost", "docker"]
215
208
  if is_mac:
216
209
  providers += ["lume"]
217
210
  if is_windows:
218
211
  providers += ["winsandbox"]
219
212
 
213
+ # Remove unavailable options
214
+ # MacOS is unavailable if Lume is not available
215
+ # Windows is unavailable if Winsandbox is not available
216
+ # Linux is always available
217
+ # This should be removed once we support macOS and Windows on the cloud provider
218
+ computer_choices = ["macos", "linux", "windows"]
219
+ if not is_mac or "lume" not in providers:
220
+ computer_choices.remove("macos")
221
+ if not is_windows or "winsandbox" not in providers:
222
+ computer_choices.remove("windows")
223
+
224
+ computer_os = gr.Radio(
225
+ choices=computer_choices,
226
+ label="Operating System",
227
+ value=computer_choices[0],
228
+ info="Select the operating system for the computer",
229
+ )
230
+
220
231
  computer_provider = gr.Radio(
221
232
  choices=providers,
222
233
  label="Provider",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.12
3
+ Version: 0.4.13
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -28,15 +28,17 @@ Provides-Extra: uitars-hf
28
28
  Requires-Dist: accelerate; extra == "uitars-hf"
29
29
  Requires-Dist: torch; extra == "uitars-hf"
30
30
  Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
31
+ Provides-Extra: glm45v-hf
32
+ Requires-Dist: accelerate; extra == "glm45v-hf"
33
+ Requires-Dist: torch; extra == "glm45v-hf"
34
+ Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
31
35
  Provides-Extra: ui
32
36
  Requires-Dist: gradio>=5.23.3; extra == "ui"
33
37
  Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
34
38
  Provides-Extra: cli
35
39
  Requires-Dist: yaspin>=3.1.0; extra == "cli"
36
- Provides-Extra: glm45v-hf
37
- Requires-Dist: accelerate; extra == "glm45v-hf"
38
- Requires-Dist: torch; extra == "glm45v-hf"
39
- Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
40
+ Provides-Extra: hud
41
+ Requires-Dist: hud-python==0.2.10; extra == "hud"
40
42
  Provides-Extra: all
41
43
  Requires-Dist: ultralytics>=8.0.0; extra == "all"
42
44
  Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
@@ -47,6 +49,7 @@ Requires-Dist: transformers>=4.54.0; extra == "all"
47
49
  Requires-Dist: gradio>=5.23.3; extra == "all"
48
50
  Requires-Dist: python-dotenv>=1.0.1; extra == "all"
49
51
  Requires-Dist: yaspin>=3.1.0; extra == "all"
52
+ Requires-Dist: hud-python==0.2.10; extra == "all"
50
53
  Description-Content-Type: text/markdown
51
54
 
52
55
  <div align="center">
@@ -1,8 +1,9 @@
1
1
  agent/__init__.py,sha256=vWbQYgjkzIso7zILSm4OAbNU_vrmN4HyYkfX8vC-Yi0,1547
2
2
  agent/__main__.py,sha256=lBUe8Niqa5XoCjwFfXyX7GtnUwjjZXC1-j4V9mvUYSc,538
3
- agent/adapters/__init__.py,sha256=szM2HMten2WkcqXeRnan__-sXjpyS4eyvIW0LXSfj4U,178
4
- agent/adapters/huggingfacelocal_adapter.py,sha256=CT3dwJnOWItB5eTqpn5i0Y1Ec6yRjaW7zhA14Ot9gz8,8066
5
- agent/agent.py,sha256=PP6UvNq_QYBYuEt97Dhono7g3hz1fQlMIapSQRhw59c,27761
3
+ agent/adapters/__init__.py,sha256=lNH6srgIMmZOI7dgicJs3LCk_1MeqLF0lou9n7b23Ts,238
4
+ agent/adapters/huggingfacelocal_adapter.py,sha256=Uqjtcohhzd33VFh38Ra2y4Uv_lTghMswoqS1t-KKFkw,8480
5
+ agent/adapters/human_adapter.py,sha256=xT4nnfNXb1z-vnGFlLmFEZN7TMcoMBGS40MtR1Zwv4o,13079
6
+ agent/agent.py,sha256=mEbmN5G6y8jZ0FrlUnHfJQFE7r_GlXrHxqC93twv54k,27881
6
7
  agent/callbacks/__init__.py,sha256=yxxBXUqpXQ-jRi_ixJMtmQPxoNRy5Vz1PUBzNNa1Dwg,538
7
8
  agent/callbacks/base.py,sha256=UnnnYlh6XCm6HKZZsAPaT_Eyo9LUYLyjyNwF-QRm6Ns,4691
8
9
  agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQWE7k,1832
@@ -10,10 +11,21 @@ agent/callbacks/image_retention.py,sha256=tiuRT5ke9xXTb2eP8Gz-2ITyAMY29LURUH6Abj
10
11
  agent/callbacks/logging.py,sha256=OOxU97EzrxlnUAtiEnvy9FB7SwCUK90-rdpDFA2Ae4E,10921
11
12
  agent/callbacks/pii_anonymization.py,sha256=NEkUTUjQBi82nqus7kT-1E4RaeQ2hQrY7YCnKndLhP8,3272
12
13
  agent/callbacks/telemetry.py,sha256=PU7pkK7W1v1xjDN-9gA30lGvn4-WhqK3BPHGW3HpTOc,7497
13
- agent/callbacks/trajectory_saver.py,sha256=POE8aPT-MBzfW873wr6C7iiVUHtp483KwvLPxC1S3EY,11626
14
- agent/cli.py,sha256=cVMwUzwsLXHK9nA-dR7SFx1WwJdSnaQROLtynjEQhD8,12401
15
- agent/computer_handler.py,sha256=32w0Nnby3TJOSgOtceVhj46ExGIhAPSbwqIqm0HGi0A,4649
14
+ agent/callbacks/trajectory_saver.py,sha256=VHbiDQzI_XludkWhZIVqIMrsxgwKfFWwVtqaRot_D4U,12231
15
+ agent/cli.py,sha256=AgaXwywHd3nGQWuqMRj6SbPyFaCPjfo5980Y1ApQOTQ,12413
16
+ agent/computers/__init__.py,sha256=39ISJsaREaQIZckpzxSuLhuR763wUU3TxUux78EKjAg,1477
17
+ agent/computers/base.py,sha256=hZntX4vgc1ahD3EnFeb9lUjtBmgka1vb27hndPl9tKQ,2187
18
+ agent/computers/cua.py,sha256=xp2A34kT2C1NKqSRo2GB6766gkraM-UtpFjRv8LUTSc,4889
19
+ agent/computers/custom.py,sha256=I_CHXvczLg43c_QBk8F_WDOlaSOOzK6b-Tkum2OSRIM,8029
16
20
  agent/decorators.py,sha256=n8VvMsififWkmuk75Q7HIpo0xAA2yAeQ6J-OOiwbAKc,1836
21
+ agent/human_tool/__init__.py,sha256=3m5_g-Fo_0yX5vi7eg-A92oTqO0N3aY929Ajp78HKsE,771
22
+ agent/human_tool/__main__.py,sha256=VsW2BAghlonOuqZbP_xuCsaec9bemA1I_ibnDcED9D4,1068
23
+ agent/human_tool/server.py,sha256=ceuL5kw_RjgAi8fueLU3nTjyzOLE25Shv1oTJnSHsoQ,7964
24
+ agent/human_tool/ui.py,sha256=2Jk3Bh-Jctya8GUG-qtYbdi-1qDdwOtcAlUeiIqsoIE,26584
25
+ agent/integrations/hud/__init__.py,sha256=1lqeM6vJAekr38l7yteLNa-Hn3R2eXCusT2FAaY8VPE,2943
26
+ agent/integrations/hud/adapter.py,sha256=M7J71q29Ndr4xXIW7Y6H_HIlJmnp-JlKG_4zKZTuyps,4088
27
+ agent/integrations/hud/agent.py,sha256=vXmI7OBez5lokQ9dCcgWeT8N68xfWpsWT3S36MLhdas,17264
28
+ agent/integrations/hud/computer_handler.py,sha256=N5pVKeKW9bJ-oceYrE7IIHbx6ZrQRQnHItTGrytoHRM,6788
17
29
  agent/loops/__init__.py,sha256=Ef8aj07l3osibwDk-DTo80PrpL4_GdKRTP1ikl_b-BQ,328
18
30
  agent/loops/anthropic.py,sha256=lvDscOaOcESBWZvnjKntQRWJZ4cEaFJhSsmmFc7J1ow,69562
19
31
  agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
@@ -26,13 +38,13 @@ agent/loops/openai.py,sha256=8Ad_XufpENmLq1nEnhzF3oswPrPK1EPz-C5NU8UOEs0,8035
26
38
  agent/loops/uitars.py,sha256=EDq8AO20lrnwB013uJoWSkkz3TVRU9oG8DQ1VviXltc,31445
27
39
  agent/responses.py,sha256=TTJ3wXN_eb0J26GKhO3cVQngOiZ1AgUPIUadozLUQyE,28991
28
40
  agent/telemetry.py,sha256=87ZTyBaT0wEPQn4v76II3g0V3GERuIVbypoX-Ug6FKQ,4786
29
- agent/types.py,sha256=zXev_CV9LvlYRkxzO_EmW1ZT70Z8qeGG3iHbzyYmV30,2425
41
+ agent/types.py,sha256=ZoWY8a3GZtB8V0SnOzoI7DQy4nP_GRubxJKbuLPOc8c,840
30
42
  agent/ui/__init__.py,sha256=DTZpK85QXscXK2nM9HtpAhVBF13yAamUrtwrQSuV-kM,126
31
43
  agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
32
44
  agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
33
- agent/ui/gradio/app.py,sha256=9UOPwuwspLrnHGY91zdzuRqkMH4cmwOBH-f-BC0gVC4,9077
34
- agent/ui/gradio/ui_components.py,sha256=hVMGZxAEq1LBHOqKj-RbDXJsj1j0Qw5dOV0ecWIHxmc,35397
35
- cua_agent-0.4.12.dist-info/METADATA,sha256=V2vFnsnQ8atsj-8B5aVfBp2-5Hpl2df2uPxflc5vNNo,12496
36
- cua_agent-0.4.12.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
37
- cua_agent-0.4.12.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
38
- cua_agent-0.4.12.dist-info/RECORD,,
45
+ agent/ui/gradio/app.py,sha256=m2yDd6Tua_lMMZT1zCzOty2meYEy756d8OlFF7lpdeU,9117
46
+ agent/ui/gradio/ui_components.py,sha256=vfsqVo_COsFfw11ouMHClib9fdBf3q52G-qbuo0RyOY,36068
47
+ cua_agent-0.4.13.dist-info/METADATA,sha256=Xr2mN1uCE8Mu5TiYskFjb6DpN-FgmihPTyNyPa665yo,12616
48
+ cua_agent-0.4.13.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
49
+ cua_agent-0.4.13.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
50
+ cua_agent-0.4.13.dist-info/RECORD,,