cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
  30. agent/core/__init__.py +0 -27
  31. agent/core/agent.py +0 -210
  32. agent/core/base.py +0 -217
  33. agent/core/callbacks.py +0 -200
  34. agent/core/experiment.py +0 -249
  35. agent/core/factory.py +0 -122
  36. agent/core/messages.py +0 -332
  37. agent/core/provider_config.py +0 -21
  38. agent/core/telemetry.py +0 -142
  39. agent/core/tools/__init__.py +0 -21
  40. agent/core/tools/base.py +0 -74
  41. agent/core/tools/bash.py +0 -52
  42. agent/core/tools/collection.py +0 -46
  43. agent/core/tools/computer.py +0 -113
  44. agent/core/tools/edit.py +0 -67
  45. agent/core/tools/manager.py +0 -56
  46. agent/core/tools.py +0 -32
  47. agent/core/types.py +0 -88
  48. agent/core/visualization.py +0 -197
  49. agent/providers/__init__.py +0 -4
  50. agent/providers/anthropic/__init__.py +0 -6
  51. agent/providers/anthropic/api/client.py +0 -360
  52. agent/providers/anthropic/api/logging.py +0 -150
  53. agent/providers/anthropic/api_handler.py +0 -140
  54. agent/providers/anthropic/callbacks/__init__.py +0 -5
  55. agent/providers/anthropic/callbacks/manager.py +0 -65
  56. agent/providers/anthropic/loop.py +0 -568
  57. agent/providers/anthropic/prompts.py +0 -23
  58. agent/providers/anthropic/response_handler.py +0 -226
  59. agent/providers/anthropic/tools/__init__.py +0 -33
  60. agent/providers/anthropic/tools/base.py +0 -88
  61. agent/providers/anthropic/tools/bash.py +0 -66
  62. agent/providers/anthropic/tools/collection.py +0 -34
  63. agent/providers/anthropic/tools/computer.py +0 -396
  64. agent/providers/anthropic/tools/edit.py +0 -326
  65. agent/providers/anthropic/tools/manager.py +0 -54
  66. agent/providers/anthropic/tools/run.py +0 -42
  67. agent/providers/anthropic/types.py +0 -16
  68. agent/providers/anthropic/utils.py +0 -367
  69. agent/providers/omni/__init__.py +0 -8
  70. agent/providers/omni/api_handler.py +0 -42
  71. agent/providers/omni/clients/anthropic.py +0 -103
  72. agent/providers/omni/clients/base.py +0 -35
  73. agent/providers/omni/clients/oaicompat.py +0 -195
  74. agent/providers/omni/clients/ollama.py +0 -122
  75. agent/providers/omni/clients/openai.py +0 -155
  76. agent/providers/omni/clients/utils.py +0 -25
  77. agent/providers/omni/image_utils.py +0 -34
  78. agent/providers/omni/loop.py +0 -990
  79. agent/providers/omni/parser.py +0 -307
  80. agent/providers/omni/prompts.py +0 -64
  81. agent/providers/omni/tools/__init__.py +0 -30
  82. agent/providers/omni/tools/base.py +0 -29
  83. agent/providers/omni/tools/bash.py +0 -74
  84. agent/providers/omni/tools/computer.py +0 -179
  85. agent/providers/omni/tools/manager.py +0 -61
  86. agent/providers/omni/utils.py +0 -236
  87. agent/providers/openai/__init__.py +0 -6
  88. agent/providers/openai/api_handler.py +0 -456
  89. agent/providers/openai/loop.py +0 -472
  90. agent/providers/openai/response_handler.py +0 -205
  91. agent/providers/openai/tools/__init__.py +0 -15
  92. agent/providers/openai/tools/base.py +0 -79
  93. agent/providers/openai/tools/computer.py +0 -326
  94. agent/providers/openai/tools/manager.py +0 -106
  95. agent/providers/openai/types.py +0 -36
  96. agent/providers/openai/utils.py +0 -98
  97. agent/providers/uitars/__init__.py +0 -1
  98. agent/providers/uitars/clients/base.py +0 -35
  99. agent/providers/uitars/clients/mlxvlm.py +0 -263
  100. agent/providers/uitars/clients/oaicompat.py +0 -214
  101. agent/providers/uitars/loop.py +0 -660
  102. agent/providers/uitars/prompts.py +0 -63
  103. agent/providers/uitars/tools/__init__.py +0 -1
  104. agent/providers/uitars/tools/computer.py +0 -283
  105. agent/providers/uitars/tools/manager.py +0 -60
  106. agent/providers/uitars/utils.py +0 -264
  107. agent/telemetry.py +0 -21
  108. agent/ui/__main__.py +0 -15
  109. cua_agent-0.3.1.dist-info/METADATA +0 -295
  110. cua_agent-0.3.1.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py ADDED
@@ -0,0 +1,339 @@
+ """
+ OpenAI computer-use-preview agent loop implementation using liteLLM
+ """
+
+ import asyncio
+ import json
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+ import litellm
+ import inspect
+ import base64
+
+ from ..decorators import agent_loop
+ from ..types import Messages, AgentResponse, Tools
+
+ SOM_TOOL_SCHEMA = {
+     "type": "function",
+     "name": "computer",
+     "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+     "parameters": {
+         "type": "object",
+         "properties": {
+             "action": {
+                 "type": "string",
+                 "enum": [
+                     "screenshot",
+                     "click",
+                     "double_click",
+                     "drag",
+                     "type",
+                     "keypress",
+                     "scroll",
+                     "move",
+                     "wait",
+                     "get_current_url",
+                     "get_dimensions",
+                     "get_environment"
+                 ],
+                 "description": "The action to perform"
+             },
+             "element_id": {
+                 "type": "integer",
+                 "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+             },
+             "start_element_id": {
+                 "type": "integer",
+                 "description": "The ID of the element to start dragging from (required for drag action)"
+             },
+             "end_element_id": {
+                 "type": "integer",
+                 "description": "The ID of the element to drag to (required for drag action)"
+             },
+             "text": {
+                 "type": "string",
+                 "description": "The text to type (required for type action)"
+             },
+             "keys": {
+                 "type": "string",
+                 "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+             },
+             "button": {
+                 "type": "string",
+                 "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+             },
+             "scroll_x": {
+                 "type": "integer",
+                 "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+             },
+             "scroll_y": {
+                 "type": "integer",
+                 "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+             },
+         },
+         "required": [
+             "action"
+         ]
+     }
+ }
+
+ OMNIPARSER_AVAILABLE = False
+ try:
+     from som import OmniParser
+     OMNIPARSER_AVAILABLE = True
+ except ImportError:
+     pass
+ OMNIPARSER_SINGLETON = None
+
+ def get_parser() -> OmniParser:
+     global OMNIPARSER_SINGLETON
+     if OMNIPARSER_SINGLETON is None:
+         OMNIPARSER_SINGLETON = OmniParser()
+     return OMNIPARSER_SINGLETON
+
+ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+     """Get the last computer_call_output message from a messages list.
+
+     Args:
+         messages: List of messages to search through
+
+     Returns:
+         The last computer_call_output message dict, or None if not found
+     """
+     for message in reversed(messages):
+         if isinstance(message, dict) and message.get("type") == "computer_call_output":
+             return message
+     return None
+
+ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
+     """Prepare tools for OpenAI API format"""
+     omniparser_tools = []
+     id2xy = dict()
+
+     for schema in tool_schemas:
+         if schema["type"] == "computer":
+             omniparser_tools.append(SOM_TOOL_SCHEMA)
+             if "id2xy" in schema:
+                 id2xy = schema["id2xy"]
+             else:
+                 schema["id2xy"] = id2xy
+         elif schema["type"] == "function":
+             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+             # Schema should be: {type, name, description, parameters}
+             omniparser_tools.append({ "type": "function", **schema["function"] })
+
+     return omniparser_tools, id2xy
+
+ async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
+     item_type = item.get("type")
+
+     def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
+         if element_id is None:
+             return (None, None)
+         return id2xy.get(element_id, (None, None))
+
+     if item_type == "function_call":
+         fn_name = item.get("name")
+         fn_args = json.loads(item.get("arguments", "{}"))
+
+         item_id = item.get("id")
+         call_id = item.get("call_id")
+
+         if fn_name == "computer":
+             action = fn_args.get("action")
+             element_id = fn_args.get("element_id")
+             start_element_id = fn_args.get("start_element_id")
+             end_element_id = fn_args.get("end_element_id")
+             text = fn_args.get("text")
+             keys = fn_args.get("keys")
+             button = fn_args.get("button")
+             scroll_x = fn_args.get("scroll_x")
+             scroll_y = fn_args.get("scroll_y")
+
+             x, y = _get_xy(element_id)
+             start_x, start_y = _get_xy(start_element_id)
+             end_x, end_y = _get_xy(end_element_id)
+
+             action_args = {
+                 "type": action,
+                 "x": x,
+                 "y": y,
+                 "start_x": start_x,
+                 "start_y": start_y,
+                 "end_x": end_x,
+                 "end_y": end_y,
+                 "text": text,
+                 "keys": keys,
+                 "button": button,
+                 "scroll_x": scroll_x,
+                 "scroll_y": scroll_y
+             }
+             # Remove None values to keep the JSON clean
+             action_args = {k: v for k, v in action_args.items() if v is not None}
+
+             return [{
+                 "type": "computer_call",
+                 "action": action_args,
+                 "id": item_id,
+                 "call_id": call_id,
+                 "status": "completed"
+             }]
+
+     return [item]
+
+ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
+     """
+     Convert computer_call back to function_call format.
+     Also handles computer_call_output -> function_call_output conversion.
+
+     Args:
+         item: The item to convert
+         xy2id: Mapping from (x, y) coordinates to element IDs
+     """
+     item_type = item.get("type")
+
+     def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]:
+         """Get element ID from coordinates, return None if coordinates are None"""
+         if x is None or y is None:
+             return None
+         return xy2id.get((x, y))
+
+     if item_type == "computer_call":
+         action_data = item.get("action", {})
+
+         # Extract coordinates and convert back to element IDs
+         element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
+         start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
+         end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
+
+         # Build function arguments
+         fn_args = {
+             "action": action_data.get("type"),
+             "element_id": element_id,
+             "start_element_id": start_element_id,
+             "end_element_id": end_element_id,
+             "text": action_data.get("text"),
+             "keys": action_data.get("keys"),
+             "button": action_data.get("button"),
+             "scroll_x": action_data.get("scroll_x"),
+             "scroll_y": action_data.get("scroll_y")
+         }
+
+         # Remove None values to keep the JSON clean
+         fn_args = {k: v for k, v in fn_args.items() if v is not None}
+
+         return [{
+             "type": "function_call",
+             "name": "computer",
+             "arguments": json.dumps(fn_args),
+             "id": item.get("id"),
+             "call_id": item.get("call_id"),
+             "status": "completed",
+
+             # Fall back to string representation
+             "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})"
+         }]
+
+     elif item_type == "computer_call_output":
+         # Simple conversion: computer_call_output -> function_call_output
+         return [{
+             "type": "function_call_output",
+             "call_id": item.get("call_id"),
+             "content": [item.get("output")],
+             "id": item.get("id"),
+             "status": "completed"
+         }]
+
+     return [item]
+
+
+ @agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
+ async def omniparser_loop(
+     messages: Messages,
+     model: str,
+     tools: Optional[List[Dict[str, Any]]] = None,
+     max_retries: Optional[int] = None,
+     stream: bool = False,
+     computer_handler=None,
+     use_prompt_caching: Optional[bool] = False,
+     _on_api_start=None,
+     _on_api_end=None,
+     _on_usage=None,
+     _on_screenshot=None,
+     **kwargs
+ ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+     """
+     OpenAI computer-use-preview agent loop using liteLLM responses.
+
+     Supports OpenAI's computer use preview models.
+     """
+     if not OMNIPARSER_AVAILABLE:
+         raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+     tools = tools or []
+
+     llm_model = model.split('+')[-1]
+
+     # Prepare tools for OpenAI API
+     openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
+
+     # Find last computer_call_output
+     last_computer_call_output = get_last_computer_call_output(messages)
+     if last_computer_call_output:
+         image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+         image_data = image_url.split(",")[-1]
+         if image_data:
+             parser = get_parser()
+             result = parser.parse(image_data)
+             if _on_screenshot:
+                 await _on_screenshot(result.annotated_image_base64, "annotated_image")
+             for element in result.elements:
+                 id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+     # handle computer calls -> function calls
+     new_messages = []
+     for message in messages:
+         if not isinstance(message, dict):
+             message = message.__dict__
+         new_messages += await replace_computer_call_with_function(message, id2xy)
+     messages = new_messages
+
+     # Prepare API call kwargs
+     api_kwargs = {
+         "model": llm_model,
+         "input": messages,
+         "tools": openai_tools if openai_tools else None,
+         "stream": stream,
+         "reasoning": {"summary": "concise"},
+         "truncation": "auto",
+         "num_retries": max_retries,
+         **kwargs
+     }
+
+     # Call API start hook
+     if _on_api_start:
+         await _on_api_start(api_kwargs)
+
+     print(str(api_kwargs)[:1000])
+
+     # Use liteLLM responses
+     response = await litellm.aresponses(**api_kwargs)
+
+     # Call API end hook
+     if _on_api_end:
+         await _on_api_end(api_kwargs, response)
+
+     # Extract usage information
+     response.usage = {
+         **response.usage.model_dump(),
+         "response_cost": response._hidden_params.get("response_cost", 0.0),
+     }
+     if _on_usage:
+         await _on_usage(response.usage)
+
+     # handle som function calls -> xy computer calls
+     new_output = []
+     for i in range(len(response.output)):
+         new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
+     response.output = new_output
+
+     return response
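
For orientation (not part of the published diff): the pair of helpers above is what lets the SoM loop speak in element IDs to the model while the computer handler works in pixel coordinates. A minimal round-trip sketch, assuming cua-agent 0.4.0b1 is installed together with litellm and cua-som so that agent.loops.omniparser imports cleanly; the element ID, coordinates, and call IDs are invented for illustration:

import asyncio
import json

from agent.loops.omniparser import (
    replace_computer_call_with_function,
    replace_function_with_computer_call,
)

async def demo() -> None:
    # Hypothetical SoM table: element 7 was parsed at the box centre (412.0, 300.5).
    id2xy = {7: (412.0, 300.5)}
    xy2id = {xy: el_id for el_id, xy in id2xy.items()}

    # A coordinate-based computer_call, as the computer handler would execute it.
    computer_call = {
        "type": "computer_call",
        "action": {"type": "click", "x": 412.0, "y": 300.5, "button": "left"},
        "id": "item_0",
        "call_id": "call_0",
    }

    # computer_call -> function_call: coordinates fold back into element_id 7,
    # which is the form the SoM-prompted model sees in its history.
    as_function = await replace_computer_call_with_function(computer_call, xy2id)
    print(json.dumps(as_function, indent=2))

    # function_call -> computer_call: element_id 7 resolves back to (412.0, 300.5).
    as_computer = await replace_function_with_computer_call(as_function[0], id2xy)
    print(json.dumps(as_computer, indent=2))

asyncio.run(demo())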
agent/loops/openai.py ADDED
@@ -0,0 +1,95 @@
+ """
+ OpenAI computer-use-preview agent loop implementation using liteLLM
+ """
+
+ import asyncio
+ import json
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ import litellm
+
+ from ..decorators import agent_loop
+ from ..types import Messages, AgentResponse, Tools
+
+ def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
+     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+     return {
+         "type": "computer_use_preview",
+         "display_width": getattr(computer_tool, 'display_width', 1024),
+         "display_height": getattr(computer_tool, 'display_height', 768),
+         "environment": getattr(computer_tool, 'environment', "linux")  # mac, windows, linux, browser
+     }
+
+
+ def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+     """Prepare tools for OpenAI API format"""
+     openai_tools = []
+
+     for schema in tool_schemas:
+         if schema["type"] == "computer":
+             # Map computer tool to OpenAI format
+             openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
+         elif schema["type"] == "function":
+             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
+             # Schema should be: {type, name, description, parameters}
+             openai_tools.append({ "type": "function", **schema["function"] })
+
+     return openai_tools
+
+
+ @agent_loop(models=r".*computer-use-preview.*", priority=10)
+ async def openai_computer_use_loop(
+     messages: Messages,
+     model: str,
+     tools: Optional[List[Dict[str, Any]]] = None,
+     max_retries: Optional[int] = None,
+     stream: bool = False,
+     computer_handler=None,
+     use_prompt_caching: Optional[bool] = False,
+     _on_api_start=None,
+     _on_api_end=None,
+     _on_usage=None,
+     _on_screenshot=None,
+     **kwargs
+ ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+     """
+     OpenAI computer-use-preview agent loop using liteLLM responses.
+
+     Supports OpenAI's computer use preview models.
+     """
+     tools = tools or []
+
+     # Prepare tools for OpenAI API
+     openai_tools = _prepare_tools_for_openai(tools)
+
+     # Prepare API call kwargs
+     api_kwargs = {
+         "model": model,
+         "input": messages,
+         "tools": openai_tools if openai_tools else None,
+         "stream": stream,
+         "reasoning": {"summary": "concise"},
+         "truncation": "auto",
+         "num_retries": max_retries,
+         **kwargs
+     }
+
+     # Call API start hook
+     if _on_api_start:
+         await _on_api_start(api_kwargs)
+
+     # Use liteLLM responses
+     response = await litellm.aresponses(**api_kwargs)
+
+     # Call API end hook
+     if _on_api_end:
+         await _on_api_end(api_kwargs, response)
+
+     # Extract usage information
+     response.usage = {
+         **response.usage.model_dump(),
+         "response_cost": response._hidden_params.get("response_cost", 0.0),
+     }
+     if _on_usage:
+         await _on_usage(response.usage)
+
+     return response
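
For orientation (not part of the published diff): openai_computer_use_loop expects the computer tool to arrive as a schema of the form {"type": "computer", "computer": <handler>}, and _map_computer_tool_to_openai reads only display_width, display_height, and environment off that handler. A minimal sketch of the tool-mapping step, assuming cua-agent 0.4.0b1 and its dependencies (notably litellm) are installed; DummyComputer and the open_url function schema are invented for illustration:

from agent.loops.openai import _prepare_tools_for_openai

class DummyComputer:
    """Hypothetical handler exposing the attributes the mapper reads."""
    display_width = 1280
    display_height = 800
    environment = "mac"  # mac, windows, linux, or browser

tools = [
    {"type": "computer", "computer": DummyComputer()},
    {
        "type": "function",
        "function": {
            "type": "function",
            "name": "open_url",
            "description": "Open a URL in the default browser",
            "parameters": {
                "type": "object",
                "properties": {"url": {"type": "string"}},
                "required": ["url"],
            },
        },
    },
]

# Yields a computer_use_preview entry plus the function schema passed through;
# this is the "tools" value the loop above hands to litellm.aresponses.
print(_prepare_tools_for_openai(tools))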