cua-agent 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of cua-agent has been flagged as potentially problematic.

@@ -1,77 +1,228 @@
- """HUD integration for ComputerAgent."""
-
- import logging
- from typing import Any, Optional, Dict
- from hud import run_job as hud_run_job
-
- from .agent import ComputerAgent
- from .adapter import ComputerAgentAdapter
- from .computer_handler import HUDComputerHandler
-
-
- async def run_job(
-     model: str,
-     task_or_taskset: Any,
-     job_name: str,
-     # Job kwargs
-     auto_reply_question: bool = False,
-     adapter_cls: Any = None,
-     adapter_kwargs: Optional[Dict[str, Any]] = None,
-     max_steps_per_task: int = 20,
-     run_parallel: bool = True,
-     job_metadata: Optional[Dict[str, Any]] = None,
-     show_progress: bool = True,
-     max_concurrent_env_creations: Optional[int] = 30, # Limits gym.make calls
-     max_concurrent_agent_predictions: Optional[int] = None, # No limit on LLM calls
-     max_concurrent_tasks: Optional[int] = 30, # Limits overall task concurrency
-     **agent_kwargs: Any
- ) -> Any:
-     """
-     Run a job using ComputerAgent with the specified model.
-
-     Args:
-         model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
-         task_or_taskset: Task or TaskSet to run
-         job_name: Name for the job
-         auto_reply_question: Whether to auto-reply to questions
-         adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
-         adapter_kwargs: Additional kwargs for the adapter
-         max_steps_per_task: Maximum steps per task
-         run_parallel: Whether to run tasks in parallel
-         job_metadata: Additional metadata for the job
-         show_progress: Whether to show progress
-         max_concurrent_env_creations: Max concurrent environment creations
-         max_concurrent_agent_predictions: Max concurrent agent predictions
-         max_concurrent_tasks: Max concurrent tasks
-         **agent_kwargs: Additional kwargs to pass to ComputerAgent
-
-     Returns:
-         Job instance from HUD
+ """HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
+
+ This module exposes two helpers to evaluate HUD-compatible datasets using
+ HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
+ `FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
+
+ Exports:
+   - run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
+   - run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+ """
+ import time
+ from typing import Any, Optional
+
+ from PIL import Image
+ from datasets import load_dataset, Dataset
+ from hud.agents import OperatorAgent
+ from hud.datasets import Task, run_dataset
+ from hud.tools.computer.settings import computer_settings
+ from hud import trace
+
+ from agent.agent import ComputerAgent as BaseComputerAgent
+ from .proxy import FakeAsyncOpenAI
+
+
+ # ---------------------------------------------------------------------------
+ # Proxy OperatorAgent
+ # ---------------------------------------------------------------------------
+
+
+ class ProxyOperatorAgent(OperatorAgent):
+     """OperatorAgent that proxies model calls through our ComputerAgent.
+
+     Accepts the same config keys we pass via hud.run_dataset `agent_config`:
+       - model: str | None
+       - allowed_tools: list[str] | None
+     Additional kwargs are forwarded to OperatorAgent (if any are supported).
      """
-     # combine verbose and verbosity kwargs
-     if "verbose" in agent_kwargs:
-         agent_kwargs["verbosity"] = logging.INFO
-         del agent_kwargs["verbose"]
-     verbose = True if agent_kwargs.get("verbosity", logging.WARNING) > logging.INFO else False
+
+     def __init__(
+         self,
+         *,
+         model: str | None = None,
+         allowed_tools: list[str] | None = None,
+         trajectory_dir: str | None = None,
+         # === ComputerAgent kwargs ===
+         tools: list[Any] | None = None,
+         custom_loop: Any | None = None,
+         only_n_most_recent_images: int | None = None,
+         callbacks: list[Any] | None = None,
+         verbosity: int | None = None,
+         max_retries: int | None = 3,
+         screenshot_delay: float | int = 0.5,
+         use_prompt_caching: bool | None = False,
+         max_trajectory_budget: float | dict | None = None,
+         telemetry_enabled: bool | None = True,
+         **kwargs: Any,
+     ) -> None:
+         model = model or "computer-use-preview"
+         allowed_tools = allowed_tools or ["openai_computer"]
+
+         computer_shim = {
+             'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
+             'environment': 'linux',
+             'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+         }
+         # Build tools ensuring the computer_shim is included
+         agent_tools: list[Any] = [computer_shim]
+         if tools:
+             agent_tools.extend(tools)
+
+         computer_agent = BaseComputerAgent(
+             model=model,
+             tools=agent_tools,
+             custom_loop=custom_loop,
+             only_n_most_recent_images=only_n_most_recent_images,
+             callbacks=callbacks,
+             verbosity=verbosity,
+             trajectory_dir=trajectory_dir,
+             max_retries=max_retries,
+             screenshot_delay=screenshot_delay,
+             use_prompt_caching=use_prompt_caching,
+             max_trajectory_budget=max_trajectory_budget,
+             telemetry_enabled=telemetry_enabled,
+         )
+         model_client = FakeAsyncOpenAI(computer_agent)
+
+         super().__init__(
+             model_client=model_client, # type: ignore[arg-type]
+             model=model,
+             allowed_tools=allowed_tools,
+             **kwargs,
+         )
+
+
+ # ---------------------------------------------------------------------------
+ # Single-task runner
+ # ---------------------------------------------------------------------------
+
+
+ async def run_single_task(
+     dataset: str | Dataset | list[dict[str, Any]],
+     *,
+     task_id: int = 0,
+     model: str | None = None,
+     allowed_tools: list[str] | None = None,
+     # === ComputerAgent kwargs ===
+     tools: list[Any] | None = None,
+     custom_loop: Any | None = None,
+     only_n_most_recent_images: int | None = None,
+     callbacks: list[Any] | None = None,
+     verbosity: int | None = None,
+     trajectory_dir: str | None = None,
+     max_retries: int | None = 3,
+     screenshot_delay: float | int = 0.5,
+     use_prompt_caching: bool | None = False,
+     max_trajectory_budget: float | dict | None = None,
+     telemetry_enabled: bool | None = True,
+ ) -> None:
+     """Load one task from the dataset and execute it with Operator+CUA proxy."""
+
+     # Load dataset and pick a sample
+     if isinstance(dataset, str):
+         dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
+     elif isinstance(dataset, list):
+         dataset = dataset
+     else:
+         dataset = dataset["train"]
  
-     # run job
-     return await hud_run_job(
-         agent_cls=ComputerAgent,
-         agent_kwargs={"model": model, **agent_kwargs},
-         task_or_taskset=task_or_taskset,
-         job_name=job_name,
-         auto_reply_question=auto_reply_question,
-         adapter_cls=adapter_cls,
-         adapter_kwargs=adapter_kwargs,
-         max_steps_per_task=max_steps_per_task,
-         run_parallel=run_parallel,
-         job_metadata=job_metadata,
-         show_progress=show_progress,
-         verbose=verbose,
-         max_concurrent_env_creations=max_concurrent_env_creations,
-         max_concurrent_agent_predictions=max_concurrent_agent_predictions,
-         max_concurrent_tasks=max_concurrent_tasks
+     sample_task = dataset[task_id] # type: ignore[index]
+     task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}") # type: ignore[attr-defined]
+
+     with trace(name=task_prompt):
+         task = Task(**sample_task) # type: ignore[arg-type]
+
+         agent = ProxyOperatorAgent(
+             model=model,
+             allowed_tools=allowed_tools,
+             # === ComputerAgent kwargs passthrough ===
+             tools=tools,
+             custom_loop=custom_loop,
+             only_n_most_recent_images=only_n_most_recent_images,
+             callbacks=callbacks,
+             verbosity=verbosity,
+             trajectory_dir=trajectory_dir,
+             max_retries=max_retries,
+             screenshot_delay=screenshot_delay,
+             use_prompt_caching=use_prompt_caching,
+             max_trajectory_budget=max_trajectory_budget,
+             telemetry_enabled=telemetry_enabled,
+         )
+         print(f"Running: {task_prompt}")
+         result = await agent.run(task, max_steps=10)
+         print(f"✅ Reward: {getattr(result, 'reward')}")
+
+
+ # ---------------------------------------------------------------------------
+ # Full-dataset runner
+ # ---------------------------------------------------------------------------
+
+
+ async def run_full_dataset(
+     dataset: str | Dataset | list[dict[str, Any]],
+     *,
+     job_name: Optional[str] = None,
+     model: str | None = None,
+     allowed_tools: list[str] | None = None,
+     max_concurrent: int = 30,
+     max_steps: int = 50,
+     split: str = "train",
+     trajectory_dir: str | None = None,
+     # === ComputerAgent kwargs ===
+     tools: list[Any] | None = None,
+     custom_loop: Any | None = None,
+     only_n_most_recent_images: int | None = 5,
+     callbacks: list[Any] | None = None,
+     verbosity: int | None = None,
+     max_retries: int | None = 3,
+     screenshot_delay: float | int = 0.5,
+     use_prompt_caching: bool | None = False,
+     max_trajectory_budget: float | dict | None = None,
+     telemetry_enabled: bool | None = True,
+ ) -> list[Any]:
+     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
+
+     # We pass OperatorAgent as the class and provide a config that injects our
+     # FakeAsyncOpenAI per agent instantiation.
+
+     if isinstance(dataset, str):
+         dataset_name = dataset.split('/')[-1]
+         job_name = job_name or f"Evaluation {dataset_name}"
+         dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
+     else:
+         dataset_name = "custom"
+         job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
+
+     # Execute evaluation
+     return await run_dataset(
+         name=job_name,
+         dataset=dataset,
+         agent_class=ProxyOperatorAgent,
+         agent_config={
+             "model": model,
+             "allowed_tools": allowed_tools,
+             "trajectory_dir": trajectory_dir,
+             # === ComputerAgent kwargs passthrough ===
+             "tools": tools,
+             "custom_loop": custom_loop,
+             "only_n_most_recent_images": only_n_most_recent_images,
+             "callbacks": callbacks,
+             "verbosity": verbosity,
+             "max_retries": max_retries,
+             "screenshot_delay": screenshot_delay,
+             "use_prompt_caching": use_prompt_caching,
+             "max_trajectory_budget": max_trajectory_budget,
+             "telemetry_enabled": telemetry_enabled,
+         },
+         max_concurrent=max_concurrent,
+         metadata={"dataset": dataset_name},
+         max_steps=max_steps,
+         auto_respond=True,
      )
  
  
- __all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
+ __all__ = [
+     "run_single_task",
+     "run_full_dataset",
+     "ProxyOperatorAgent",
+ ]
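
For orientation, here is a minimal usage sketch of the two new helpers exported above. The import path is assumed from the module's package location, and the dataset id and model name are placeholders rather than values taken from this diff.

```python
import asyncio

# Import path assumed from the HUD integration package referenced in the docstring above.
from agent.integrations.hud import run_single_task, run_full_dataset

async def main() -> None:
    # Smoke-test a single task through OperatorAgent + the CUA proxy.
    await run_single_task(
        "hud-evals/example-dataset",  # placeholder dataset id
        model="computer-use-preview",
    )
    # Then fan out over the whole split with bounded concurrency.
    results = await run_full_dataset(
        "hud-evals/example-dataset",
        max_concurrent=10,
        max_steps=30,
    )
    print(f"{len(results)} task results")

asyncio.run(main())
```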
@@ -0,0 +1,183 @@
+ """HUD ComputerAgent wrapper and Fake AsyncOpenAI client.
+
+ Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
+ interface needed by HUD's OperatorAgent. It implements only `responses.create`
+ and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
+ OpenAI-like response blocks. We intentionally only support a single-step call
+ by consuming the first yielded result from `ComputerAgent.run()`.
+ """
+
+ import traceback
+ import time
+ import uuid
+ from typing import Any, Dict, List, Optional
+
+ from agent.agent import ComputerAgent as BaseComputerAgent
+
+ # OpenAI Responses typed models (required)
+ from openai.types.responses import (
+     Response,
+     ResponseInputParam,
+     ResponseOutputItem,
+     ResponseComputerToolCall,
+     ResponseOutputMessage,
+     ResponseOutputText,
+     ResponseReasoningItem,
+     ResponseUsage,
+ )
+
+ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
+     """Map our agent output items to OpenAI ResponseOutputItem typed models.
+
+     Only a subset is supported: computer_call, assistant message (text), and reasoning.
+     Unknown types are ignored.
+     """
+     blocks: List[ResponseOutputItem] = []
+     for item in output_items or []:
+         t = item.get("type")
+         if t == "computer_call":
+             comp = ResponseComputerToolCall.model_validate({
+                 "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
+                 "type": "computer_call",
+                 "call_id": item["call_id"],
+                 "action": item["action"],
+                 "pending_safety_checks": item.get("pending_safety_checks", []),
+                 "status": "completed",
+             })
+             blocks.append(comp)
+             # we will exit early here as the responses api only supports a single step
+             break
+         elif t == "message" and item.get("role") == "assistant":
+             content_blocks: List[ResponseOutputText] = []
+             for c in item.get("content", []) or []:
+                 content_blocks.append(
+                     ResponseOutputText.model_validate({
+                         "type": "output_text",
+                         "text": c["text"],
+                         "annotations": [],
+                     })
+                 )
+             if content_blocks:
+                 msg = ResponseOutputMessage.model_validate({
+                     "id": item.get("id") or f"msg_{uuid.uuid4()}",
+                     "type": "message",
+                     "role": "assistant",
+                     "status": "completed",
+                     "content": [ct.model_dump() for ct in content_blocks],
+                 })
+                 blocks.append(msg)
+         elif t == "reasoning":
+             reasoning = ResponseReasoningItem.model_validate({
+                 "id": item.get("id") or f"rsn_{uuid.uuid4()}",
+                 "type": "reasoning",
+                 "summary": item["summary"],
+             })
+             blocks.append(reasoning)
+         # Unhandled types are ignored
+     return blocks
+
+ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
+     out: List[Dict[str, Any]] = []
+     for it in list(items):
+         if hasattr(it, "model_dump"):
+             out.append(it.model_dump()) # type: ignore[attr-defined]
+         elif isinstance(it, dict):
+             out.append(it)
+         else:
+             # Strict: rely on default __dict__ if present
+             out.append(dict(it)) # may raise if not mapping
+     return out
+
+ class FakeAsyncOpenAI:
+     """Minimal fake OpenAI client with only `responses.create` implemented.
+
+     It uses a provided `ComputerAgent` instance to produce a single-step
+     response compatible with HUD's OperatorAgent loop.
+     """
+
+     def __init__(self, computer_agent: BaseComputerAgent) -> None:
+         self._agent = computer_agent
+         self.responses = self._Responses(self)
+
+     class _Responses:
+         def __init__(self, parent: "FakeAsyncOpenAI") -> None:
+             # Caches for cross-call context when using previous_response_id
+             self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
+             self.context_cache: Dict[str, List[str]] = {}
+             self.agent = parent._agent
+
+         async def create(
+             self,
+             *,
+             model: str,
+             input: ResponseInputParam,
+             tools: Optional[List[Dict[str, Any]]] = None,
+             instructions: Optional[str] = None,
+             previous_response_id: Optional[str] = None,
+             max_retries: int = 5,
+             **_: Any,
+         ) -> Any:
+             for attempt in range(max_retries):
+                 # Prepend cached blocks from previous_response_id to input
+                 full_input = input
+                 if previous_response_id is not None:
+                     prev_block_ids = self.context_cache[previous_response_id]
+                     prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
+                     full_input = _to_plain_dict_list(prev_blocks + input)
+
+                 # Pre-pend instructions message
+                 effective_input = full_input
+                 if instructions:
+                     effective_input = [{
+                         "role": "user",
+                         "content": instructions,
+                     }] + full_input
+
+                 # Run a single iteration of the ComputerAgent
+                 agent_result: Optional[Dict[str, Any]] = None
+                 async for result in self.agent.run(effective_input): # type: ignore[arg-type]
+                     agent_result = result
+                     break
+                 assert agent_result is not None, "Agent failed to produce result"
+
+                 output = _map_agent_output_to_openai_blocks(agent_result["output"])
+                 usage = agent_result["usage"]
+
+                 # Cache conversation context using the last response id
+                 block_ids: List[str] = []
+                 blocks_to_cache = full_input + output
+                 for b in blocks_to_cache:
+                     bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
+                     self.blocks_cache[bid] = b # type: ignore[assignment]
+                     block_ids.append(bid)
+                 response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
+                 self.context_cache[response_id] = block_ids
+
+                 try:
+                     return Response.model_validate({
+                         "id": response_id,
+                         "created_at": time.time(),
+                         "object": "response",
+                         "model": model,
+                         "output": output,
+                         "parallel_tool_calls": False,
+                         "tool_choice": "auto",
+                         "tools": [],
+                         "previous_response_id": previous_response_id,
+                         "usage": ResponseUsage.model_validate({
+                             "input_tokens": usage.get("input_tokens", 0),
+                             "output_tokens": usage.get("output_tokens", 0),
+                             "total_tokens": usage.get("total_tokens", 0),
+                             "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
+                             "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
+                         }),
+                     })
+                 except Exception as e:
+                     print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
+                     if attempt == max_retries - 1:
+                         print(traceback.format_exc())
+                         raise e
+
+ __all__ = [
+     "FakeAsyncOpenAI",
+ ]
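
To make the single-step contract concrete, here is a small sketch of how this fake client might be driven directly. The module path and the ComputerAgent constructor arguments are assumptions for illustration; only `responses.create` and the `previous_response_id` chaining come from the code above.

```python
import asyncio

from agent.agent import ComputerAgent
from agent.integrations.hud.proxy import FakeAsyncOpenAI  # path assumed from the `.proxy` import above

async def main() -> None:
    cua = ComputerAgent(model="openai/computer-use-preview")  # illustrative construction
    client = FakeAsyncOpenAI(cua)

    # One agent step, mapped into an OpenAI Responses-style object.
    first = await client.responses.create(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the settings menu"}],
    )

    # A follow-up turn: cached blocks keyed by `first.id` are prepended to the input.
    second = await client.responses.create(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Now close it"}],
        previous_response_id=first.id,
    )
    print(second.id, [type(block).__name__ for block in second.output])

asyncio.run(main())
```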
agent/loops/anthropic.py CHANGED
@@ -1530,7 +1530,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
              "content": [
                  {
                      "type": "text",
-                     "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                     "text": f"""You are a UI grounding expert. Follow these guidelines:
+ 
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+ 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+ 6. The user has already given you permission by running this agent. No further confirmation is needed.
+ 7. Be decisive and action-oriented. Complete the requested task fully.
+ 
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+ Task: Click {instruction}. Output ONLY a click action on the target element."""
                  },
                  {
                      "type": "image_url",
@@ -48,11 +48,11 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
                      "get_dimensions",
                      "get_environment"
                  ],
-                 "description": "The action to perform"
+                 "description": "The action to perform (required for all actions)"
              },
              "element_description": {
                  "type": "string",
-                 "description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+                 "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
              },
              "start_element_description": {
                  "type": "string",
@@ -67,20 +67,30 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
                  "description": "The text to type (required for type action)"
              },
              "keys": {
-                 "type": "string",
-                 "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+                 "type": "array",
+                 "items": {
+                     "type": "string"
+                 },
+                 "description": "Key(s) to press (required for keypress action)"
              },
              "button": {
                  "type": "string",
-                 "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+                 "enum": [
+                     "left",
+                     "right",
+                     "wheel",
+                     "back",
+                     "forward"
+                 ],
+                 "description": "The mouse button to use for click action (required for click and double_click action)",
              },
              "scroll_x": {
                  "type": "integer",
-                 "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+                 "description": "Horizontal scroll amount for scroll action (required for scroll action)",
              },
              "scroll_y": {
                  "type": "integer",
-                 "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+                 "description": "Vertical scroll amount for scroll action (required for scroll action)",
              },
          },
          "required": [
@@ -266,13 +276,15 @@ class ComposedGroundedConfig:
          grounding_agent = grounding_agent_conf.agent_class()
  
          for desc in element_descriptions:
-             coords = await grounding_agent.predict_click(
-                 model=grounding_model,
-                 image_b64=last_image_b64,
-                 instruction=desc
-             )
-             if coords:
-                 self.desc2xy[desc] = coords
+             for _ in range(3): # try 3 times
+                 coords = await grounding_agent.predict_click(
+                     model=grounding_model,
+                     image_b64=last_image_b64,
+                     instruction=desc
+                 )
+                 if coords:
+                     self.desc2xy[desc] = coords
+                     break
  
          # Step 6: Convert computer calls from descriptions back to xy coordinates
          final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
agent/loops/openai.py CHANGED
@@ -162,7 +162,18 @@ class OpenAIComputerUseConfig:
          input_items = [
              {
                  "role": "user",
-                 "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                 "content": f"""You are a UI grounding expert. Follow these guidelines:
+ 
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+ 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+ 6. The user has already given you permission by running this agent. No further confirmation is needed.
+ 7. Be decisive and action-oriented. Complete the requested task fully.
+ 
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+ Task: Click {instruction}. Output ONLY a click action on the target element."""
              },
              {
                  "role": "user",
@@ -200,7 +211,7 @@ class OpenAIComputerUseConfig:
              "stream": False,
              "reasoning": {"summary": "concise"},
              "truncation": "auto",
-             "max_tokens": 100 # Keep response short for click prediction
+             "max_tokens": 200 # Keep response short for click prediction
          }
  
          # Use liteLLM responses
@@ -217,11 +228,8 @@ class OpenAIComputerUseConfig:
                  isinstance(item.get("action"), dict)):
  
                  action = item["action"]
-                 if action.get("type") == "click":
-                     x = action.get("x")
-                     y = action.get("y")
-                     if x is not None and y is not None:
-                         return (int(x), int(y))
+                 if action.get("x") is not None and action.get("y") is not None:
+                     return (int(action.get("x")), int(action.get("y")))
  
          return None
  
  
agent/loops/uitars.py CHANGED
@@ -228,15 +228,24 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
  
              # Handle coordinate parameters
              if "start_box" in param_name or "end_box" in param_name:
-                 # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
-                 numbers = param.replace("(", "").replace(")", "").split(",")
-                 float_numbers = [float(num.strip()) / 1000 for num in numbers] # Normalize to 0-1 range
+                 # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
+                 # First, remove special tokens
+                 clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+                 # Then remove parentheses and split
+                 numbers = clean_param.replace("(", "").replace(")", "").split(",")
  
-                 if len(float_numbers) == 2:
-                     # Single point, duplicate for box format
-                     float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
- 
-                 action_inputs[param_name.strip()] = str(float_numbers)
+                 try:
+                     float_numbers = [float(num.strip()) / 1000 for num in numbers] # Normalize to 0-1 range
+ 
+                     if len(float_numbers) == 2:
+                         # Single point, duplicate for box format
+                         float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
+ 
+                     action_inputs[param_name.strip()] = str(float_numbers)
+                 except ValueError as e:
+                     # If parsing fails, keep the original parameter value
+                     print(f"Warning: Could not parse coordinates '{param}': {e}")
+                     action_inputs[param_name.strip()] = param
  
      return [{
          "thought": thought,