cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cua-agent has been flagged as potentially problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/gta1.py CHANGED
@@ -5,75 +5,80 @@ Code: https://github.com/Yan98/GTA1
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from PIL import Image
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-import math
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
-SYSTEM_PROMPT = '''
+SYSTEM_PROMPT = """
 You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
 
 Output the coordinate pair exactly:
 (x,y)
-'''.strip()
+""".strip()
+
 
 def extract_coordinates(raw_string: str) -> Tuple[float, float]:
     """Extract coordinates from model output."""
     try:
         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
-        return tuple(map(float, matches[0])) # type: ignore
+        return tuple(map(float, matches[0]))  # type: ignore
     except:
         return (0.0, 0.0)
 
-def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360) -> Tuple[int, int]:
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+) -> Tuple[int, int]:
     """Smart resize function similar to qwen_vl_utils."""
     # Calculate the total pixels
     total_pixels = height * width
-
+
     # If already within bounds, return original dimensions
     if min_pixels <= total_pixels <= max_pixels:
         # Round to nearest factor
         new_height = (height // factor) * factor
         new_width = (width // factor) * factor
         return new_height, new_width
-
+
     # Calculate scaling factor
     if total_pixels > max_pixels:
         scale = (max_pixels / total_pixels) ** 0.5
     else:
         scale = (min_pixels / total_pixels) ** 0.5
-
+
     # Apply scaling
     new_height = int(height * scale)
     new_width = int(width * scale)
-
+
     # Round to nearest factor
     new_height = (new_height // factor) * factor
     new_width = (new_width // factor) * factor
-
+
     # Ensure minimum size
     new_height = max(new_height, factor)
     new_width = max(new_width, factor)
-
+
     return new_height, new_width
 
+
 @register_agent(models=r".*GTA1.*")
 class GTA1Config(AsyncAgentConfig):
     """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         self.current_model = None
         self.last_screenshot_b64 = None
-
 
     async def predict_step(
         self,
@@ -87,25 +92,21 @@ class GTA1Config(AsyncAgentConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         raise NotImplementedError()
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[float, float]]:
         """
         Predict click coordinates using GTA1 model via litellm.acompletion.
-
+
         Args:
             model: The GTA1 model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+
         Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
@@ -113,66 +114,62 @@ class GTA1Config(AsyncAgentConfig):
         image_data = base64.b64decode(image_b64)
         image = Image.open(BytesIO(image_data))
         width, height = image.width, image.height
-
+
         # Smart resize the image (similar to qwen_vl_utils)
         resized_height, resized_width = smart_resize(
-            height, width,
+            height,
+            width,
             factor=28,  # Default factor for Qwen models
             min_pixels=3136,
-            max_pixels=4096 * 2160
+            max_pixels=4096 * 2160,
         )
         resized_image = image.resize((resized_width, resized_height))
         scale_x, scale_y = width / resized_width, height / resized_height
-
+
         # Convert resized image back to base64
         buffered = BytesIO()
         resized_image.save(buffered, format="PNG")
         resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
-
+
         # Prepare system and user messages
         system_message = {
             "role": "system",
-            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width)
+            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
         }
-
+
         user_message = {
             "role": "user",
             "content": [
                 {
                     "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{resized_image_b64}"
-                    }
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                 },
-                {
-                    "type": "text",
-                    "text": instruction
-                }
-            ]
+                {"type": "text", "text": instruction},
+            ],
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
             "max_tokens": 2056,
             "temperature": 0.0,
-            **kwargs
+            **kwargs,
         }
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Extract response text
-        output_text = response.choices[0].message.content # type: ignore
-
+        output_text = response.choices[0].message.content  # type: ignore
+
         # Extract and rescale coordinates
-        pred_x, pred_y = extract_coordinates(output_text) # type: ignore
+        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
         pred_x *= scale_x
         pred_y *= scale_y
-
+
         return (math.floor(pred_x), math.floor(pred_y))
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
         return ["click"]
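
For context, the predict_click flow reformatted above decodes the base64 screenshot, smart-resizes it to factor-of-28 dimensions within the pixel budget, asks the model for a literal "(x,y)" answer, and rescales that point back into original-screenshot pixels. A minimal usage sketch, assuming a litellm-routable GTA1 deployment (the model string and screenshot filename below are illustrative, not part of this release):

import asyncio
import base64

from agent.loops.gta1 import GTA1Config


async def main() -> None:
    # Load any screenshot and base64-encode it, as predict_click expects.
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    config = GTA1Config()
    # The model name must match the r".*GTA1.*" registration pattern and be
    # routable by litellm (e.g. a hosted vLLM endpoint); this one is assumed.
    coords = await config.predict_click(
        model="hosted_vllm/GTA1-7B",
        image_b64=image_b64,
        instruction="the blue Submit button",
    )
    print(coords)  # (x, y) floored to ints, in original-screenshot pixel space


asyncio.run(main())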
agent/loops/holo.py CHANGED
@@ -21,8 +21,8 @@ import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from .base import AsyncAgentConfig
 from ..types import AgentCapability
+from .base import AsyncAgentConfig
 
 
 def _strip_hf_prefix(model: str) -> str:
@@ -53,7 +53,9 @@ def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tu
     if image_processor is None:
         return image, (orig_w, orig_h)
 
-    factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1)
+    factor = getattr(image_processor, "patch_size", 14) * getattr(
+        image_processor, "merge_size", 1
+    )
     min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
     max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
 
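The reflowed factor computation above reads Qwen-style resize parameters off the Hugging Face image processor, multiplying patch_size by merge_size and falling back to defaults whenever an attribute is absent. A standalone sketch of that fallback pattern (the stub processor class is illustrative only):

class _StubProcessor:
    # Only patch_size is defined here, so merge_size, min_pixels and
    # max_pixels all take the getattr defaults below.
    patch_size = 14


image_processor = _StubProcessor()

factor = getattr(image_processor, "patch_size", 14) * getattr(
    image_processor, "merge_size", 1
)
min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)
print(factor, min_pixels, max_pixels)  # 14 65536 2359296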
agent/loops/internvl.py CHANGED
@@ -18,13 +18,12 @@ import re
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image
 import litellm
+from PIL import Image
 
 from ..decorators import register_agent
-from .composed_grounded import ComposedGroundedConfig
 from ..types import AgentCapability
-
+from .composed_grounded import ComposedGroundedConfig
 
 # Regex patterns for extracting coordinates
 # Accept optional whitespace and optional decimal fractions
@@ -91,7 +90,7 @@ class InternVLConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(
@@ -105,15 +104,11 @@ class InternVLConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using InternVL via litellm.acompletion.
agent/loops/moondream3.py CHANGED
@@ -14,27 +14,28 @@ Differences from composed_grounded:
 
 from __future__ import annotations
 
-import uuid
 import base64
 import io
-from typing import Dict, List, Any, Optional, Tuple, Any
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
 
-from PIL import Image, ImageDraw, ImageFont
 import litellm
+from PIL import Image, ImageDraw, ImageFont
 
 from ..decorators import register_agent
-from ..types import AgentCapability
 from ..loops.base import AsyncAgentConfig
 from ..responses import (
-    convert_computer_calls_xy2desc,
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
     convert_computer_calls_desc2xy,
+    convert_computer_calls_xy2desc,
+    convert_responses_items_to_completion_messages,
     get_all_element_descriptions,
 )
+from ..types import AgentCapability
 
 _MOONDREAM_SINGLETON = None
 
+
 def get_moondream_model() -> Any:
     """Get a singleton instance of the Moondream3 preview model."""
     global _MOONDREAM_SINGLETON
@@ -42,6 +43,7 @@ def get_moondream_model() -> Any:
     try:
         import torch
         from transformers import AutoModelForCausalLM
+
         _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
             "moondream/moondream3-preview",
             trust_remote_code=True,
@@ -95,6 +97,7 @@ def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> L
             filtered.append(msg_copy)
     return filtered
 
+
 def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
 
@@ -132,7 +135,12 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str,
         y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
         x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
         y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
-        left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
+        left, top, right, bottom = (
+            int(x_min * W),
+            int(y_min * H),
+            int(x_max * W),
+            int(y_max * H),
+        )
         left, top = max(0, left), max(0, top)
         right, bottom = min(W - 1, right), min(H - 1, bottom)
         crop = base_img.crop((left, top, right, bottom))
@@ -200,6 +208,7 @@ def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str,
     annotated_b64 = _image_to_b64(annotated)
     return annotated_b64, detected_names
 
+
 GROUNDED_COMPUTER_TOOL_SCHEMA = {
     "type": "function",
     "function": {
@@ -270,6 +279,7 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
     },
 }
 
+
 @register_agent(r"moondream3\+.*", priority=2)
 class Moondream3PlusConfig(AsyncAgentConfig):
     def __init__(self):
@@ -321,14 +331,25 @@ class Moondream3PlusConfig(AsyncAgentConfig):
                 "type": "message",
                 "role": "assistant",
                 "content": [
-                    {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
+                    {
+                        "type": "output_text",
+                        "text": "Taking a screenshot to analyze the current screen.",
+                    }
                 ],
             },
-            {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
+            {
+                "type": "computer_call",
+                "call_id": call_id,
+                "status": "completed",
+                "action": {"type": "screenshot"},
+            },
             {
                 "type": "computer_call_output",
                 "call_id": call_id,
-                "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
+                "output": {
+                    "type": "input_image",
+                    "image_url": f"data:image/png;base64,{screenshot_b64}",
+                },
             },
         ]
         last_image_b64 = screenshot_b64
@@ -354,13 +375,16 @@ class Moondream3PlusConfig(AsyncAgentConfig):
                 "content": [
                     {"type": "input_text", "text": "Detected form UI elements on screen:"},
                     {"type": "input_text", "text": names_text},
-                    {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
+                    {
+                        "type": "input_text",
+                        "text": "Please continue with the next action needed to perform your task.",
+                    },
                 ],
             }
         )
 
         tool_schemas = []
-        for schema in (tools or []):
+        for schema in tools or []:
             if schema.get("type") == "computer":
                 tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
             else: