cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of cua-agent has been flagged as potentially problematic.
Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py ADDED
@@ -0,0 +1,339 @@
+"""
+OmniParser (Set-of-Marks) agent loop implementation using liteLLM
+"""
+
+import asyncio
+import json
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+import litellm
+import inspect
+import base64
+
+from ..decorators import agent_loop
+from ..types import Messages, AgentResponse, Tools
+
+SOM_TOOL_SCHEMA = {
+    "type": "function",
+    "name": "computer",
+    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "action": {
+                "type": "string",
+                "enum": [
+                    "screenshot",
+                    "click",
+                    "double_click",
+                    "drag",
+                    "type",
+                    "keypress",
+                    "scroll",
+                    "move",
+                    "wait",
+                    "get_current_url",
+                    "get_dimensions",
+                    "get_environment"
+                ],
+                "description": "The action to perform"
+            },
+            "element_id": {
+                "type": "integer",
+                "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+            },
+            "start_element_id": {
+                "type": "integer",
+                "description": "The ID of the element to start dragging from (required for drag action)"
+            },
+            "end_element_id": {
+                "type": "integer",
+                "description": "The ID of the element to drag to (required for drag action)"
+            },
+            "text": {
+                "type": "string",
+                "description": "The text to type (required for type action)"
+            },
+            "keys": {
+                "type": "string",
+                "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+            },
+            "button": {
+                "type": "string",
+                "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+            },
+            "scroll_x": {
+                "type": "integer",
+                "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+            },
+            "scroll_y": {
+                "type": "integer",
+                "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+            },
+        },
+        "required": [
+            "action"
+        ]
+    }
+}
+
+OMNIPARSER_AVAILABLE = False
+try:
+    from som import OmniParser
+    OMNIPARSER_AVAILABLE = True
+except ImportError:
+    pass
+OMNIPARSER_SINGLETON = None
+
+def get_parser() -> "OmniParser":
+    global OMNIPARSER_SINGLETON
+    if OMNIPARSER_SINGLETON is None:
+        OMNIPARSER_SINGLETON = OmniParser()
+    return OMNIPARSER_SINGLETON
+
+def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Get the last computer_call_output message from a messages list.
+
+    Args:
+        messages: List of messages to search through
+
+    Returns:
+        The last computer_call_output message dict, or None if not found
+    """
+    for message in reversed(messages):
+        if isinstance(message, dict) and message.get("type") == "computer_call_output":
+            return message
+    return None
+
+def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
+    """Prepare tools for OpenAI API format"""
+    omniparser_tools = []
+    id2xy = dict()
+
+    for schema in tool_schemas:
+        if schema["type"] == "computer":
+            omniparser_tools.append(SOM_TOOL_SCHEMA)
+            if "id2xy" in schema:
+                id2xy = schema["id2xy"]
+            else:
+                schema["id2xy"] = id2xy
+        elif schema["type"] == "function":
+            # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+            # Schema should be: {type, name, description, parameters}
+            omniparser_tools.append({ "type": "function", **schema["function"] })
+
+    return omniparser_tools, id2xy
+
+async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
+    item_type = item.get("type")
+
+    def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
+        if element_id is None:
+            return (None, None)
+        return id2xy.get(element_id, (None, None))
+
+    if item_type == "function_call":
+        fn_name = item.get("name")
+        fn_args = json.loads(item.get("arguments", "{}"))
+
+        item_id = item.get("id")
+        call_id = item.get("call_id")
+
+        if fn_name == "computer":
+            action = fn_args.get("action")
+            element_id = fn_args.get("element_id")
+            start_element_id = fn_args.get("start_element_id")
+            end_element_id = fn_args.get("end_element_id")
+            text = fn_args.get("text")
+            keys = fn_args.get("keys")
+            button = fn_args.get("button")
+            scroll_x = fn_args.get("scroll_x")
+            scroll_y = fn_args.get("scroll_y")
+
+            x, y = _get_xy(element_id)
+            start_x, start_y = _get_xy(start_element_id)
+            end_x, end_y = _get_xy(end_element_id)
+
+            action_args = {
+                "type": action,
+                "x": x,
+                "y": y,
+                "start_x": start_x,
+                "start_y": start_y,
+                "end_x": end_x,
+                "end_y": end_y,
+                "text": text,
+                "keys": keys,
+                "button": button,
+                "scroll_x": scroll_x,
+                "scroll_y": scroll_y
+            }
+            # Remove None values to keep the JSON clean
+            action_args = {k: v for k, v in action_args.items() if v is not None}
+
+            return [{
+                "type": "computer_call",
+                "action": action_args,
+                "id": item_id,
+                "call_id": call_id,
+                "status": "completed"
+            }]
+
+    return [item]
+
+async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
+    """
+    Convert computer_call back to function_call format.
+    Also handles computer_call_output -> function_call_output conversion.
+
+    Args:
+        item: The item to convert
+        xy2id: Mapping from (x, y) coordinates to element IDs
+    """
+    item_type = item.get("type")
+
+    def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]:
+        """Get element ID from coordinates, return None if coordinates are None"""
+        if x is None or y is None:
+            return None
+        return xy2id.get((x, y))
+
+    if item_type == "computer_call":
+        action_data = item.get("action", {})
+
+        # Extract coordinates and convert back to element IDs
+        element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
+        start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
+        end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
+
+        # Build function arguments
+        fn_args = {
+            "action": action_data.get("type"),
+            "element_id": element_id,
+            "start_element_id": start_element_id,
+            "end_element_id": end_element_id,
+            "text": action_data.get("text"),
+            "keys": action_data.get("keys"),
+            "button": action_data.get("button"),
+            "scroll_x": action_data.get("scroll_x"),
+            "scroll_y": action_data.get("scroll_y")
+        }
+
+        # Remove None values to keep the JSON clean
+        fn_args = {k: v for k, v in fn_args.items() if v is not None}
+
+        return [{
+            "type": "function_call",
+            "name": "computer",
+            "arguments": json.dumps(fn_args),
+            "id": item.get("id"),
+            "call_id": item.get("call_id"),
+            "status": "completed",
+
+            # Fall back to string representation
+            "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})"
+        }]
+
+    elif item_type == "computer_call_output":
+        # Simple conversion: computer_call_output -> function_call_output
+        return [{
+            "type": "function_call_output",
+            "call_id": item.get("call_id"),
+            "content": [item.get("output")],
+            "id": item.get("id"),
+            "status": "completed"
+        }]
+
+    return [item]
+
+
+@agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
+async def omniparser_loop(
+    messages: Messages,
+    model: str,
+    tools: Optional[List[Dict[str, Any]]] = None,
+    max_retries: Optional[int] = None,
+    stream: bool = False,
+    computer_handler=None,
+    use_prompt_caching: Optional[bool] = False,
+    _on_api_start=None,
+    _on_api_end=None,
+    _on_usage=None,
+    _on_screenshot=None,
+    **kwargs
+) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+    """
+    OmniParser (Set-of-Marks) agent loop using liteLLM responses.
+
+    Annotates screenshots with numbered UI elements so that any liteLLM-compatible
+    model can act on them by element ID.
+    """
+    if not OMNIPARSER_AVAILABLE:
+        raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+    tools = tools or []
+
+    llm_model = model.split('+')[-1]
+
+    # Prepare tools for OpenAI API
+    openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
+
+    # Find last computer_call_output
+    last_computer_call_output = get_last_computer_call_output(messages)
+    if last_computer_call_output:
+        image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+        image_data = image_url.split(",")[-1]
+        if image_data:
+            parser = get_parser()
+            result = parser.parse(image_data)
+            if _on_screenshot:
+                await _on_screenshot(result.annotated_image_base64, "annotated_image")
+            for element in result.elements:
+                id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+    # handle computer calls -> function calls
+    new_messages = []
+    for message in messages:
+        if not isinstance(message, dict):
+            message = message.__dict__
+        new_messages += await replace_computer_call_with_function(message, id2xy)
+    messages = new_messages
+
+    # Prepare API call kwargs
+    api_kwargs = {
+        "model": llm_model,
+        "input": messages,
+        "tools": openai_tools if openai_tools else None,
+        "stream": stream,
+        "reasoning": {"summary": "concise"},
+        "truncation": "auto",
+        "num_retries": max_retries,
+        **kwargs
+    }
+
+    # Call API start hook
+    if _on_api_start:
+        await _on_api_start(api_kwargs)
+
+    print(str(api_kwargs)[:1000])
+
+    # Use liteLLM responses
+    response = await litellm.aresponses(**api_kwargs)
+
+    # Call API end hook
+    if _on_api_end:
+        await _on_api_end(api_kwargs, response)
+
+    # Extract usage information
+    response.usage = {
+        **response.usage.model_dump(),
+        "response_cost": response._hidden_params.get("response_cost", 0.0),
+    }
+    if _on_usage:
+        await _on_usage(response.usage)
+
+    # handle som function calls -> xy computer calls
+    new_output = []
+    for i in range(len(response.output)):
+        new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
+    response.output = new_output
+
+    return response
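
The pair of converters above is the heart of the set-of-marks loop: model output that references numbered elements is translated into pixel-coordinate computer calls, and earlier computer calls are translated back into function calls before being replayed to the model. A minimal usage sketch of that round trip, assuming the new module is importable as `agent.loops.omniparser` from the installed wheel; the element ID and coordinates are invented for illustration:

```python
import asyncio
import json

# Import path assumed from the wheel layout shown above (agent/loops/omniparser.py).
from agent.loops.omniparser import (
    replace_computer_call_with_function,
    replace_function_with_computer_call,
)

async def main() -> None:
    # Hypothetical parse result: element 7 is centered at (412.0, 305.5).
    id2xy = {7: (412.0, 305.5)}
    xy2id = {xy: element_id for element_id, xy in id2xy.items()}

    # A model response that clicks element 7 by its set-of-marks ID.
    function_call = {
        "type": "function_call",
        "name": "computer",
        "arguments": json.dumps({"action": "click", "element_id": 7}),
        "id": "msg_1",
        "call_id": "call_1",
    }

    # SoM function call -> pixel-coordinate computer call.
    [computer_call] = await replace_function_with_computer_call(function_call, id2xy)
    print(computer_call["action"])   # {'type': 'click', 'x': 412.0, 'y': 305.5}

    # Computer call -> function call, for feeding history back to the model.
    [restored] = await replace_computer_call_with_function(computer_call, xy2id)
    print(restored["arguments"])     # {"action": "click", "element_id": 7}

asyncio.run(main())
```

Note that `replace_computer_call_with_function` looks elements up by their (x, y) tuple, which is why the sketch inverts `id2xy` into `xy2id` before the reverse pass.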
agent/loops/openai.py ADDED
@@ -0,0 +1,95 @@
+"""
+OpenAI computer-use-preview agent loop implementation using liteLLM
+"""
+
+import asyncio
+import json
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+import litellm
+
+from ..decorators import agent_loop
+from ..types import Messages, AgentResponse, Tools
+
+def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
+    """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+    return {
+        "type": "computer_use_preview",
+        "display_width": getattr(computer_tool, 'display_width', 1024),
+        "display_height": getattr(computer_tool, 'display_height', 768),
+        "environment": getattr(computer_tool, 'environment', "linux")  # mac, windows, linux, browser
+    }
+
+
+def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+    """Prepare tools for OpenAI API format"""
+    openai_tools = []
+
+    for schema in tool_schemas:
+        if schema["type"] == "computer":
+            # Map computer tool to OpenAI format
+            openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
+        elif schema["type"] == "function":
+            # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+            # Schema should be: {type, name, description, parameters}
+            openai_tools.append({ "type": "function", **schema["function"] })
+
+    return openai_tools
+
+
+@agent_loop(models=r".*computer-use-preview.*", priority=10)
+async def openai_computer_use_loop(
+    messages: Messages,
+    model: str,
+    tools: Optional[List[Dict[str, Any]]] = None,
+    max_retries: Optional[int] = None,
+    stream: bool = False,
+    computer_handler=None,
+    use_prompt_caching: Optional[bool] = False,
+    _on_api_start=None,
+    _on_api_end=None,
+    _on_usage=None,
+    _on_screenshot=None,
+    **kwargs
+) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+    """
+    OpenAI computer-use-preview agent loop using liteLLM responses.
+
+    Supports OpenAI's computer use preview models.
+    """
+    tools = tools or []
+
+    # Prepare tools for OpenAI API
+    openai_tools = _prepare_tools_for_openai(tools)
+
+    # Prepare API call kwargs
+    api_kwargs = {
+        "model": model,
+        "input": messages,
+        "tools": openai_tools if openai_tools else None,
+        "stream": stream,
+        "reasoning": {"summary": "concise"},
+        "truncation": "auto",
+        "num_retries": max_retries,
+        **kwargs
+    }
+
+    # Call API start hook
+    if _on_api_start:
+        await _on_api_start(api_kwargs)
+
+    # Use liteLLM responses
+    response = await litellm.aresponses(**api_kwargs)
+
+    # Call API end hook
+    if _on_api_end:
+        await _on_api_end(api_kwargs, response)
+
+    # Extract usage information
+    response.usage = {
+        **response.usage.model_dump(),
+        "response_cost": response._hidden_params.get("response_cost", 0.0),
+    }
+    if _on_usage:
+        await _on_usage(response.usage)
+
+    return response
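
In the computer-use-preview loop the only real translation happens in `_prepare_tools_for_openai`: a computer tool is mapped to OpenAI's `computer_use_preview` tool description, and function tools pass through unchanged. A short sketch under the same import-path assumption (`agent.loops.openai`); `SimpleComputer` and the `open_url` schema are hypothetical stand-ins:

```python
# Import path assumed from the wheel layout shown above (agent/loops/openai.py).
from agent.loops.openai import _prepare_tools_for_openai

class SimpleComputer:
    # Stand-in for a computer handler; only the attributes read by
    # _map_computer_tool_to_openai are provided here.
    display_width = 1280
    display_height = 800
    environment = "mac"  # mac, windows, linux, or browser

tool_schemas = [
    {"type": "computer", "computer": SimpleComputer()},
    {
        "type": "function",
        "function": {
            "name": "open_url",  # hypothetical function tool
            "description": "Open a URL in the default browser",
            "parameters": {
                "type": "object",
                "properties": {"url": {"type": "string"}},
                "required": ["url"],
            },
        },
    },
]

print(_prepare_tools_for_openai(tool_schemas))
# [{'type': 'computer_use_preview', 'display_width': 1280, 'display_height': 800, 'environment': 'mac'},
#  {'type': 'function', 'name': 'open_url', 'description': 'Open a URL in the default browser', 'parameters': {...}}]
```

The resulting list is what the loop places in api_kwargs["tools"] before calling litellm.aresponses.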