cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent has been flagged as potentially problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/base.py CHANGED
@@ -2,13 +2,15 @@
 Base protocol for async agent configurations
 """
 
-from typing import Protocol, List, Dict, Any, Optional, Tuple, Union
 from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Protocol, Tuple, Union
+
 from ..types import AgentCapability
 
+
 class AsyncAgentConfig(Protocol):
     """Protocol defining the interface for async agent configurations."""
-
+
     @abstractmethod
     async def predict_step(
         self,
@@ -22,11 +24,11 @@ class AsyncAgentConfig(Protocol):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **generation_config,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format (message, function_call, computer_call)
             model: Model name to use
@@ -38,38 +40,40 @@ class AsyncAgentConfig(Protocol):
             _on_api_end: Callback for API end
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
-            **kwargs: Additional arguments
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         ...
-
+
     @abstractmethod
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **generation_config
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Args:
             model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+            **generation_config: Additional arguments to pass to the model provider
+                - api_key: Optional API key for the provider
+                - api_base: Optional API base URL for the provider
+
        Returns:
            None or tuple with (x, y) coordinates
        """
        ...
-
+
    @abstractmethod
    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.
-
+
        Returns:
            List of capability strings (e.g., ["step", "click"])
        """
agent/loops/composed_grounded.py CHANGED
@@ -3,112 +3,117 @@ Composed-grounded agent loop implementation that combines grounding and thinking
 Uses a two-stage approach: grounding model for element detection, thinking model for reasoning.
 """
 
-import uuid
 import asyncio
-import json
 import base64
-from typing import Dict, List, Any, Optional, Tuple
+import json
+import uuid
 from io import BytesIO
-from PIL import Image
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
+from PIL import Image
 
+from ..agent import find_agent_config
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
-    get_all_element_descriptions
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
+    get_all_element_descriptions,
 )
-from ..agent import find_agent_config
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
-  "type": "function",
-  "function": {
-    "name": "computer",
-    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
-    "parameters": {
-      "type": "object",
-      "properties": {
-        "action": {
-          "type": "string",
-          "enum": [
-            "screenshot",
-            "click",
-            "double_click",
-            "drag",
-            "type",
-            "keypress",
-            "scroll",
-            "move",
-            "wait",
-            "get_current_url",
-            "get_dimensions",
-            "get_environment"
-          ],
-          "description": "The action to perform"
-        },
-        "element_description": {
-          "type": "string",
-          "description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
-        },
-        "start_element_description": {
-          "type": "string",
-          "description": "Description of the element to start dragging from (required for drag action)"
-        },
-        "end_element_description": {
-          "type": "string",
-          "description": "Description of the element to drag to (required for drag action)"
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool uses element descriptions to locate and interact with UI elements on the screen (e.g., 'red submit button', 'search text field', 'hamburger menu icon', 'close button in top right corner').",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform (required for all actions)",
+                },
+                "element_description": {
+                    "type": "string",
+                    "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)",
+                },
+                "start_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to start dragging from (required for drag action)",
+                },
+                "end_element_description": {
+                    "type": "string",
+                    "description": "Description of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "Key(s) to press (required for keypress action)",
+                },
+                "button": {
+                    "type": "string",
+                    "enum": ["left", "right", "wheel", "back", "forward"],
+                    "description": "The mouse button to use for click action (required for click and double_click action)",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (required for scroll action)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (required for scroll action)",
+                },
+            },
+            "required": ["action"],
         },
-        "text": {
-          "type": "string",
-          "description": "The text to type (required for type action)"
-        },
-        "keys": {
-          "type": "string",
-          "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
-        },
-        "button": {
-          "type": "string",
-          "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-        },
-        "scroll_x": {
-          "type": "integer",
-          "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-        },
-        "scroll_y": {
-          "type": "integer",
-          "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
-        },
-      },
-      "required": [
-        "action"
-      ]
-    }
-  }
+    },
 }
 
+
 def _prepare_tools_for_grounded(tool_schemas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Prepare tools for grounded API format"""
     grounded_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
            grounded_tools.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
        else:
            grounded_tools.append(schema)
-
+
    return grounded_tools
 
+
 def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
     """Get the last computer call output image from messages."""
     for message in reversed(messages):
-        if (isinstance(message, dict) and
-            message.get("type") == "computer_call_output" and
-            isinstance(message.get("output"), dict) and
-            message["output"].get("type") == "input_image"):
+        if (
+            isinstance(message, dict)
+            and message.get("type") == "computer_call_output"
+            and isinstance(message.get("output"), dict)
+            and message["output"].get("type") == "input_image"
+        ):
             image_url = message["output"].get("image_url", "")
             if image_url.startswith("data:image/png;base64,"):
                 return image_url.split(",", 1)[1]
@@ -116,17 +121,17 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
-
+
     The model parameter should be in format: "grounding_model+thinking_model"
     e.g., "huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro"
     """
-
+
     def __init__(self):
         self.desc2xy: Dict[str, Tuple[float, float]] = {}
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -140,11 +145,11 @@ class ComposedGroundedConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Composed-grounded predict step implementation.
-
+
         Process:
         0. Store last computer call image, if none then take a screenshot
         1. Convert computer calls from xy to descriptions
@@ -157,18 +162,20 @@ class ComposedGroundedConfig:
         """
         # Parse the composed model
         if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
         grounding_model, thinking_model = model.split("+", 1)
-
+
         pre_output_items = []
-
+
         # Step 0: Store last computer call image, if none then take a screenshot
         last_image_b64 = get_last_computer_call_image(messages)
         if last_image_b64 is None:
             # Take a screenshot
-            screenshot_b64 = await computer_handler.screenshot() # type: ignore
+            screenshot_b64 = await computer_handler.screenshot()  # type: ignore
             if screenshot_b64:
-
+
                 call_id = uuid.uuid4().hex
                 pre_output_items += [
                     {
@@ -177,45 +184,42 @@ class ComposedGroundedConfig:
                         "content": [
                             {
                                 "type": "output_text",
-                                "text": "Taking a screenshot to see the current computer screen."
+                                "text": "Taking a screenshot to see the current computer screen.",
                             }
-                        ]
+                        ],
                     },
                     {
-                        "action": {
-                            "type": "screenshot"
-                        },
+                        "action": {"type": "screenshot"},
                         "call_id": call_id,
                         "status": "completed",
-                        "type": "computer_call"
+                        "type": "computer_call",
                     },
                     {
                         "type": "computer_call_output",
                         "call_id": call_id,
                         "output": {
                             "type": "input_image",
-                            "image_url": f"data:image/png;base64,{screenshot_b64}"
-                        }
+                            "image_url": f"data:image/png;base64,{screenshot_b64}",
+                        },
                     },
                 ]
                 last_image_b64 = screenshot_b64
-
+
                 # Call screenshot callback if provided
                 if _on_screenshot:
                     await _on_screenshot(screenshot_b64)
-
-        tool_schemas = _prepare_tools_for_grounded(tools) # type: ignore
+
+        tool_schemas = _prepare_tools_for_grounded(tools)  # type: ignore
 
         # Step 1: Convert computer calls from xy to descriptions
         input_messages = messages + pre_output_items
         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
-
+
         # Step 2: Convert responses items to completion messages
         completion_messages = convert_responses_items_to_completion_messages(
-            messages_with_descriptions,
-            allow_images_in_tool_results=False
+            messages_with_descriptions, allow_images_in_tool_results=False
         )
-
+
         # Step 3: Call thinking model with litellm.acompletion
         api_kwargs = {
             "model": thinking_model,
@@ -223,96 +227,90 @@ class ComposedGroundedConfig:
             "tools": tool_schemas,
             "max_retries": max_retries,
             "stream": stream,
-            **kwargs
+            **kwargs,
         }
 
         if use_prompt_caching:
             api_kwargs["use_prompt_caching"] = use_prompt_caching
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Make the completion call
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract usage information
         usage = {
-            **response.usage.model_dump(), # type: ignore
+            **response.usage.model_dump(),  # type: ignore
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)
-
+
        # Step 4: Convert completion messages back to responses items format
-        response_dict = response.model_dump() # type: ignore
+        response_dict = response.model_dump()  # type: ignore
        choice_messages = [choice["message"] for choice in response_dict["choices"]]
        thinking_output_items = []
-
+
        for choice_message in choice_messages:
-            thinking_output_items.extend(convert_completion_messages_to_responses_items([choice_message]))
-
+            thinking_output_items.extend(
+                convert_completion_messages_to_responses_items([choice_message])
+            )
+
        # Step 5: Get all element descriptions and populate desc2xy mapping
        element_descriptions = get_all_element_descriptions(thinking_output_items)
-
+
        if element_descriptions and last_image_b64:
            # Use grounding model to predict coordinates for each description
            grounding_agent_conf = find_agent_config(grounding_model)
            if grounding_agent_conf:
                grounding_agent = grounding_agent_conf.agent_class()
-
+
                for desc in element_descriptions:
-                    coords = await grounding_agent.predict_click(
-                        model=grounding_model,
-                        image_b64=last_image_b64,
-                        instruction=desc
-                    )
-                    if coords:
-                        self.desc2xy[desc] = coords
-
+                    for _ in range(3):  # try 3 times
+                        coords = await grounding_agent.predict_click(
+                            model=grounding_model, image_b64=last_image_b64, instruction=desc
+                        )
+                        if coords:
+                            self.desc2xy[desc] = coords
+                            break
+
        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
-
+
        # Step 7: Return output and usage
-        return {
-            "output": pre_output_items + final_output_items,
-            "usage": usage
-        }
-
+        return {"output": pre_output_items + final_output_items, "usage": usage}
+
    async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using the grounding model.
-
+
        For composed models, uses only the grounding model part for click prediction.
        """
        # Parse the composed model to get grounding model
        if "+" not in model:
-            raise ValueError(f"Composed model must be in format 'grounding_model+thinking_model', got: {model}")
+            raise ValueError(
+                f"Composed model must be in format 'grounding_model+thinking_model', got: {model}"
+            )
        grounding_model, thinking_model = model.split("+", 1)
-
+
        # Find and use the grounding agent
        grounding_agent_conf = find_agent_config(grounding_model)
        if grounding_agent_conf:
            grounding_agent = grounding_agent_conf.agent_class()
            return await grounding_agent.predict_click(
-                model=grounding_model,
-                image_b64=image_b64,
-                instruction=instruction,
-                **kwargs
+                model=grounding_model, image_b64=image_b64, instruction=instruction, **kwargs
            )
-
+
        return None
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click", "step"]
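
A hypothetical driver (not shipped in the wheel) showing how the composed loop is addressed: a grounding model and a thinking model joined with "+", with predict_click delegating to the grounding half only. The model string is the example from the class docstring; the screenshot path is a placeholder.

import asyncio
import base64

from agent.loops.composed_grounded import ComposedGroundedConfig


async def main() -> None:
    config = ComposedGroundedConfig()

    # Any PNG screenshot of the screen you want to ground against.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    coords = await config.predict_click(
        model="huggingface-local/HelloKKMe/GTA1-7B+gemini/gemini-1.5-pro",
        image_b64=image_b64,
        instruction="red submit button",
    )
    print("predicted click:", coords)  # None or an (x, y) tuple


asyncio.run(main())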
agent/loops/fara/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+FARA-7B agent loop implementation.
+Original implementation from Microsoft: https://github.com/microsoft/Fara
+"""
+
+from .config import FaraVlmConfig
+
+__all__ = ("FaraVlmConfig",)
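
Downstream code can now import the FARA loop's config straight from the subpackage, e.g. (a trivial sketch; FaraVlmConfig itself is defined in the new agent/loops/fara/config.py, which is not shown in this excerpt):

from agent.loops.fara import FaraVlmConfig  # re-exported via __all__ above

print(FaraVlmConfig)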