cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/human_tool/ui.py CHANGED
@@ -1,21 +1,29 @@
1
- import gradio as gr
1
+ import base64
2
+ import io
2
3
  import json
3
4
  import time
4
- from typing import List, Dict, Any, Optional
5
5
  from datetime import datetime
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import gradio as gr
6
9
  import requests
7
- from .server import completion_queue
8
- import base64
9
- import io
10
10
  from PIL import Image
11
11
 
12
+ from .server import completion_queue
13
+
14
+
12
15
  class HumanCompletionUI:
13
16
  def __init__(self, server_url: str = "http://localhost:8002"):
14
17
  self.server_url = server_url
15
18
  self.current_call_id: Optional[str] = None
16
19
  self.refresh_interval = 2.0 # seconds
17
20
  self.last_image = None # Store the last image for display
18
-
21
+ # Track current interactive action controls
22
+ self.current_action_type: str = "click"
23
+ self.current_button: str = "left"
24
+ self.current_scroll_x: int = 0
25
+ self.current_scroll_y: int = -120
26
+
19
27
  def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
20
28
  """Format messages for display in gr.Chatbot with type='messages'."""
21
29
  formatted = []
@@ -23,7 +31,7 @@ class HumanCompletionUI:
23
31
  role = msg.get("role", "user")
24
32
  content = msg.get("content", "")
25
33
  tool_calls = msg.get("tool_calls", [])
26
-
34
+
27
35
  # Handle different content formats
28
36
  if isinstance(content, list):
29
37
  # Multi-modal content - can include text and images
@@ -50,7 +58,7 @@ class HumanCompletionUI:
50
58
  else:
51
59
  # For URL images, create gr.Image with URL
52
60
  formatted_content.append(gr.Image(value=image_url))
53
-
61
+
54
62
  # Determine final content format
55
63
  if len(formatted_content) == 1:
56
64
  content = formatted_content[0]
@@ -58,28 +66,28 @@ class HumanCompletionUI:
58
66
  content = formatted_content
59
67
  else:
60
68
  content = "[Empty content]"
61
-
69
+
62
70
  # Ensure role is valid for Gradio Chatbot
63
71
  if role not in ["user", "assistant"]:
64
72
  role = "assistant" if role == "system" else "user"
65
-
73
+
66
74
  # Invert roles for better display in human UI context
67
75
  # (what the AI says becomes "user", what human should respond becomes "assistant")
68
76
  if role == "user":
69
77
  role = "assistant"
70
78
  else:
71
79
  role = "user"
72
-
80
+
73
81
  # Add the main message if it has content
74
82
  if content and str(content).strip():
75
83
  formatted.append({"role": role, "content": content})
76
-
84
+
77
85
  # Handle tool calls - create separate messages for each tool call
78
86
  if tool_calls:
79
87
  for tool_call in tool_calls:
80
88
  function_name = tool_call.get("function", {}).get("name", "unknown")
81
89
  arguments_str = tool_call.get("function", {}).get("arguments", "{}")
82
-
90
+
83
91
  try:
84
92
  # Parse arguments to format them nicely
85
93
  arguments = json.loads(arguments_str)
@@ -87,18 +95,20 @@ class HumanCompletionUI:
87
95
  except json.JSONDecodeError:
88
96
  # If parsing fails, use the raw string
89
97
  formatted_args = arguments_str
90
-
98
+
91
99
  # Create a formatted message for the tool call
92
100
  tool_call_content = f"```json\n{formatted_args}\n```"
93
-
94
- formatted.append({
95
- "role": role,
96
- "content": tool_call_content,
97
- "metadata": {"title": f"🛠️ Used {function_name}"}
98
- })
99
-
101
+
102
+ formatted.append(
103
+ {
104
+ "role": role,
105
+ "content": tool_call_content,
106
+ "metadata": {"title": f"🛠️ Used {function_name}"},
107
+ }
108
+ )
109
+
100
110
  return formatted
101
-
111
+
102
112
  def get_pending_calls(self) -> List[Dict[str, Any]]:
103
113
  """Get pending calls from the server."""
104
114
  try:
@@ -108,38 +118,39 @@ class HumanCompletionUI:
108
118
  except Exception as e:
109
119
  print(f"Error fetching pending calls: {e}")
110
120
  return []
111
-
121
+
112
122
  def complete_call_with_response(self, call_id: str, response: str) -> bool:
113
123
  """Complete a call with a text response."""
114
124
  try:
115
125
  response_data = {"response": response}
116
126
  response_obj = requests.post(
117
- f"{self.server_url}/complete/{call_id}",
118
- json=response_data,
119
- timeout=10
127
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
120
128
  )
121
129
  response_obj.raise_for_status()
122
130
  return True
123
131
  except requests.RequestException as e:
124
132
  print(f"Error completing call: {e}")
125
133
  return False
126
-
134
+
127
135
  def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
128
136
  """Complete a call with tool calls."""
129
137
  try:
130
138
  response_data = {"tool_calls": tool_calls}
131
139
  response_obj = requests.post(
132
- f"{self.server_url}/complete/{call_id}",
133
- json=response_data,
134
- timeout=10
140
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
135
141
  )
136
142
  response_obj.raise_for_status()
137
143
  return True
138
144
  except requests.RequestException as e:
139
145
  print(f"Error completing call: {e}")
140
146
  return False
141
-
142
- def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
147
+
148
+ def complete_call(
149
+ self,
150
+ call_id: str,
151
+ response: Optional[str] = None,
152
+ tool_calls: Optional[List[Dict[str, Any]]] = None,
153
+ ) -> bool:
143
154
  """Complete a call with either a response or tool calls."""
144
155
  try:
145
156
  response_data = {}
@@ -147,25 +158,23 @@ class HumanCompletionUI:
147
158
  response_data["response"] = response
148
159
  if tool_calls:
149
160
  response_data["tool_calls"] = tool_calls
150
-
161
+
151
162
  response_obj = requests.post(
152
- f"{self.server_url}/complete/{call_id}",
153
- json=response_data,
154
- timeout=10
163
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
155
164
  )
156
165
  response_obj.raise_for_status()
157
166
  return True
158
167
  except requests.RequestException as e:
159
168
  print(f"Error completing call: {e}")
160
169
  return False
161
-
170
+
162
171
  def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
163
172
  """Extract the last image from the messages for display above conversation."""
164
173
  last_image = None
165
-
174
+
166
175
  for msg in reversed(messages): # Start from the last message
167
176
  content = msg.get("content", "")
168
-
177
+
169
178
  if isinstance(content, list):
170
179
  for item in reversed(content): # Get the last image in the message
171
180
  if item.get("type") == "image_url":
@@ -184,41 +193,43 @@ class HumanCompletionUI:
184
193
  else:
185
194
  # For URL images, return the URL
186
195
  return image_url
187
-
196
+
188
197
  return last_image
189
-
198
+
190
199
  def refresh_pending_calls(self):
191
200
  """Refresh the list of pending calls."""
192
201
  pending_calls = self.get_pending_calls()
193
-
202
+
194
203
  if not pending_calls:
195
204
  return (
196
205
  gr.update(choices=["latest"], value="latest"), # dropdown
197
206
  gr.update(value=None), # image (no image)
198
207
  gr.update(value=[]), # chatbot (empty messages)
199
- gr.update(interactive=False) # submit button
208
+ gr.update(interactive=False), # submit button
209
+ gr.update(visible=False), # click_actions_group hidden
210
+ gr.update(visible=False), # actions_group hidden
200
211
  )
201
-
212
+
202
213
  # Sort pending calls by created_at to get oldest first
203
214
  sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
204
-
215
+
205
216
  # Create choices for dropdown
206
217
  choices = [("latest", "latest")] # Add "latest" option first
207
-
218
+
208
219
  for call in sorted_calls:
209
220
  call_id = call["id"]
210
221
  model = call.get("model", "unknown")
211
222
  created_at = call.get("created_at", "")
212
223
  # Format timestamp
213
224
  try:
214
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
225
+ dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
215
226
  time_str = dt.strftime("%H:%M:%S")
216
227
  except:
217
228
  time_str = created_at
218
-
229
+
219
230
  choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
220
231
  choices.append((choice_label, call_id))
221
-
232
+
222
233
  # Default to "latest" which shows the oldest pending conversation
223
234
  selected_call_id = "latest"
224
235
  if selected_call_id == "latest" and sorted_calls:
@@ -232,31 +243,37 @@ class HumanCompletionUI:
232
243
  conversation = []
233
244
  self.current_call_id = None
234
245
  self.last_image = None
235
-
246
+
236
247
  return (
237
248
  gr.update(choices=choices, value="latest"),
238
249
  gr.update(value=self.last_image),
239
250
  gr.update(value=conversation),
240
- gr.update(interactive=bool(choices))
251
+ gr.update(interactive=bool(choices)),
252
+ gr.update(visible=True), # click_actions_group visible when there is a call
253
+ gr.update(visible=True), # actions_group visible when there is a call
241
254
  )
242
-
255
+
243
256
  def on_call_selected(self, selected_choice):
244
257
  """Handle when a call is selected from the dropdown."""
245
258
  if not selected_choice:
246
259
  return (
247
260
  gr.update(value=None), # no image
248
261
  gr.update(value=[]), # empty chatbot
249
- gr.update(interactive=False)
262
+ gr.update(interactive=False),
263
+ gr.update(visible=False), # click_actions_group hidden
264
+ gr.update(visible=False), # actions_group hidden
250
265
  )
251
-
266
+
252
267
  pending_calls = self.get_pending_calls()
253
268
  if not pending_calls:
254
269
  return (
255
270
  gr.update(value=None), # no image
256
271
  gr.update(value=[]), # empty chatbot
257
- gr.update(interactive=False)
272
+ gr.update(interactive=False),
273
+ gr.update(visible=False), # click_actions_group hidden
274
+ gr.update(visible=False), # actions_group hidden
258
275
  )
259
-
276
+
260
277
  # Handle "latest" option
261
278
  if selected_choice == "latest":
262
279
  # Sort calls by created_at to get oldest first
@@ -271,134 +288,143 @@ class HumanCompletionUI:
271
288
  if call_id_short in selected_choice:
272
289
  call_id = call["id"]
273
290
  break
274
-
291
+
275
292
  if not call_id:
276
293
  return (
277
294
  gr.update(value=None), # no image
278
295
  gr.update(value=[]), # empty chatbot
279
- gr.update(interactive=False)
296
+ gr.update(interactive=False),
280
297
  )
281
-
298
+
282
299
  # Find the selected call
283
300
  selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
284
-
301
+
285
302
  if not selected_call:
286
303
  return (
287
304
  gr.update(value=None), # no image
288
305
  gr.update(value=[]), # empty chatbot
289
- gr.update(interactive=False)
306
+ gr.update(interactive=False),
307
+ gr.update(visible=False), # click_actions_group hidden
308
+ gr.update(visible=False), # actions_group hidden
290
309
  )
291
-
310
+
292
311
  conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
293
312
  self.current_call_id = call_id
294
313
  # Get the last image from messages
295
314
  self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
296
-
315
+
297
316
  return (
298
317
  gr.update(value=self.last_image),
299
318
  gr.update(value=conversation),
300
- gr.update(interactive=True)
319
+ gr.update(interactive=True),
320
+ gr.update(visible=True), # click_actions_group visible
321
+ gr.update(visible=True), # actions_group visible
301
322
  )
302
-
323
+
303
324
  def submit_response(self, response_text: str):
304
325
  """Submit a text response to the current call."""
305
326
  if not self.current_call_id:
306
327
  return (
307
328
  gr.update(value=response_text), # keep response text
308
- gr.update(value="❌ No call selected") # status
329
+ gr.update(value="❌ No call selected"), # status
309
330
  )
310
-
331
+
311
332
  if not response_text.strip():
312
333
  return (
313
334
  gr.update(value=response_text), # keep response text
314
- gr.update(value="❌ Response cannot be empty") # status
335
+ gr.update(value="❌ Response cannot be empty"), # status
315
336
  )
316
-
337
+
317
338
  success = self.complete_call_with_response(self.current_call_id, response_text)
318
-
339
+
319
340
  if success:
320
341
  status_msg = "✅ Response submitted successfully!"
321
342
  return (
322
343
  gr.update(value=""), # clear response text
323
- gr.update(value=status_msg) # status
344
+ gr.update(value=status_msg), # status
324
345
  )
325
346
  else:
326
347
  return (
327
348
  gr.update(value=response_text), # keep response text
328
- gr.update(value="❌ Failed to submit response") # status
349
+ gr.update(value="❌ Failed to submit response"), # status
329
350
  )
330
-
351
+
331
352
  def submit_action(self, action_type: str, **kwargs) -> str:
332
353
  """Submit a computer action as a tool call."""
333
354
  if not self.current_call_id:
334
355
  return "❌ No call selected"
335
-
356
+
336
357
  import uuid
337
-
358
+
338
359
  # Create tool call structure
339
360
  action_data = {"type": action_type, **kwargs}
340
361
  tool_call = {
341
362
  "id": f"call_{uuid.uuid4().hex[:24]}",
342
363
  "type": "function",
343
- "function": {
344
- "name": "computer",
345
- "arguments": json.dumps(action_data)
346
- }
364
+ "function": {"name": "computer", "arguments": json.dumps(action_data)},
347
365
  }
348
-
366
+
349
367
  success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
350
-
368
+
351
369
  if success:
352
370
  return f"✅ {action_type.capitalize()} action submitted as tool call"
353
371
  else:
354
372
  return f"❌ Failed to submit {action_type} action"
355
-
356
- def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
373
+
374
+ def submit_click_action(
375
+ self, x: int, y: int, action_type: str = "click", button: str = "left"
376
+ ) -> str:
357
377
  """Submit a coordinate-based action."""
358
378
  if action_type == "click":
359
379
  return self.submit_action(action_type, x=x, y=y, button=button)
360
380
  else:
361
381
  return self.submit_action(action_type, x=x, y=y)
362
-
382
+
363
383
  def submit_type_action(self, text: str) -> str:
364
384
  """Submit a type action."""
365
385
  return self.submit_action("type", text=text)
366
-
386
+
367
387
  def submit_hotkey_action(self, keys: str) -> str:
368
388
  """Submit a hotkey action."""
369
389
  return self.submit_action("keypress", keys=keys)
370
-
371
- def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
390
+
391
+ def submit_wait_action(self) -> str:
392
+ """Submit a wait action with no kwargs."""
393
+ return self.submit_action("wait")
394
+
395
+ def submit_description_click(
396
+ self, description: str, action_type: str = "click", button: str = "left"
397
+ ) -> str:
372
398
  """Submit a description-based action."""
373
399
  if action_type == "click":
374
400
  return self.submit_action(action_type, element_description=description, button=button)
375
401
  else:
376
402
  return self.submit_action(action_type, element_description=description)
377
-
403
+
378
404
  def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
379
405
  """Wait for pending calls to appear or until max_seconds elapsed.
380
-
406
+
381
407
  This method loops and checks for pending calls at regular intervals,
382
408
  returning as soon as a pending call is found or the maximum wait time is reached.
383
-
409
+
384
410
  Args:
385
411
  max_seconds: Maximum number of seconds to wait
386
412
  check_interval: How often to check for pending calls (in seconds)
387
413
  """
388
414
  import time
389
-
415
+
390
416
  start_time = time.time()
391
-
417
+
392
418
  while time.time() - start_time < max_seconds:
393
419
  # Check if there are any pending calls
394
420
  pending_calls = self.get_pending_calls()
395
421
  if pending_calls:
396
422
  # Found pending calls, return immediately
397
423
  return self.refresh_pending_calls()
398
-
424
+
399
425
  # Wait before checking again
400
426
  time.sleep(check_interval)
401
-
427
+
402
428
  # Max wait time reached, return current state
403
429
  return self.refresh_pending_calls()
404
430
 
@@ -406,199 +432,261 @@ class HumanCompletionUI:
406
432
  def create_ui():
407
433
  """Create the Gradio interface."""
408
434
  ui_handler = HumanCompletionUI()
409
-
410
- with gr.Blocks(title="Human-in-the-Loop Agent Tool") as demo:
435
+
436
+ with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
411
437
  gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
412
438
  gr.Markdown("Review AI conversation requests and provide human responses.")
413
-
439
+
414
440
  with gr.Row():
415
441
  with gr.Column(scale=2):
416
442
  with gr.Group():
417
443
  screenshot_image = gr.Image(
418
- label="Screenshot",
419
- interactive=False,
420
- height=600
444
+ label="Interactive Screenshot", interactive=False, height=600
421
445
  )
422
-
423
- # Action type selection for image clicks
424
- with gr.Row():
425
- action_type_radio = gr.Radio(
426
- label="Action Type",
427
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
428
- value="click",
429
- scale=2
430
- )
431
- action_button_radio = gr.Radio(
432
- label="Button (for click only)",
433
- choices=["left", "right", "wheel", "back", "forward"],
434
- value="left",
435
- visible=True,
436
- scale=1
437
- )
438
-
446
+
447
+ # Action type selection for image clicks (wrapped for visibility control)
448
+ with gr.Group(visible=False) as click_actions_group:
449
+ with gr.Row():
450
+ action_type_radio = gr.Dropdown(
451
+ label="Interactive Action",
452
+ choices=[
453
+ "click",
454
+ "double_click",
455
+ "move",
456
+ "left_mouse_up",
457
+ "left_mouse_down",
458
+ "scroll",
459
+ ],
460
+ value="click",
461
+ scale=2,
462
+ )
463
+ action_button_radio = gr.Dropdown(
464
+ label="Button",
465
+ choices=["left", "right", "wheel", "back", "forward"],
466
+ value="left",
467
+ visible=True,
468
+ scale=1,
469
+ )
470
+ scroll_x_input = gr.Number(
471
+ label="scroll_x", value=0, visible=False, scale=1
472
+ )
473
+ scroll_y_input = gr.Number(
474
+ label="scroll_y", value=-120, visible=False, scale=1
475
+ )
476
+
439
477
  conversation_chatbot = gr.Chatbot(
440
- label="Messages",
441
- type="messages",
442
- height=500,
443
- show_copy_button=True
478
+ label="Conversation", height=500, buttons=["copy"]
444
479
  )
445
-
480
+
446
481
  with gr.Column(scale=1):
447
482
  with gr.Group():
448
483
  call_dropdown = gr.Dropdown(
449
- label="Select a pending call",
484
+ label="Select a pending conversation request",
450
485
  choices=["latest"],
451
486
  interactive=True,
452
- value="latest"
487
+ value="latest",
453
488
  )
454
489
  refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
490
+ status_display = gr.Textbox(
491
+ label="Status", interactive=False, value="Ready to receive requests..."
492
+ )
455
493
 
456
494
  with gr.Group():
457
495
  response_text = gr.Textbox(
458
- label="Response",
459
- lines=3,
460
- placeholder="Enter your response here..."
496
+ label="Message", lines=3, placeholder="Enter your message here..."
461
497
  )
462
- submit_btn = gr.Button("📤 Submit Response", variant="primary", interactive=False)
463
-
464
- # Action Accordions
465
- with gr.Accordion("🖱️ Click Actions", open=False):
466
- with gr.Group():
467
- with gr.Row():
468
- click_x = gr.Number(label="X", value=0, minimum=0)
469
- click_y = gr.Number(label="Y", value=0, minimum=0)
470
- with gr.Row():
471
- click_action_type = gr.Dropdown(
472
- label="Action Type",
473
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
474
- value="click"
475
- )
476
- click_button = gr.Dropdown(
477
- label="Button (for click only)",
478
- choices=["left", "right", "wheel", "back", "forward"],
479
- value="left"
480
- )
481
- click_submit_btn = gr.Button("Submit Action")
482
-
483
- with gr.Accordion("📝 Type Action", open=False):
484
- with gr.Group():
485
- type_text = gr.Textbox(
486
- label="Text to Type",
487
- placeholder="Enter text to type..."
488
- )
489
- type_submit_btn = gr.Button("Submit Type")
490
-
491
- with gr.Accordion("⌨️ Keypress Action", open=False):
492
- with gr.Group():
493
- keypress_text = gr.Textbox(
494
- label="Keys",
495
- placeholder="e.g., ctrl+c, alt+tab"
496
- )
497
- keypress_submit_btn = gr.Button("Submit Keypress")
498
-
499
- with gr.Accordion("🎯 Description Action", open=False):
500
- with gr.Group():
501
- description_text = gr.Textbox(
502
- label="Element Description",
503
- placeholder="e.g., 'Privacy and security option in left sidebar'"
504
- )
505
- with gr.Row():
506
- description_action_type = gr.Dropdown(
507
- label="Action Type",
508
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
509
- value="click"
510
- )
511
- description_button = gr.Radio(
512
- label="Button (for click only)",
513
- choices=["left", "right", "wheel", "back", "forward"],
514
- value="left"
515
- )
516
- description_submit_btn = gr.Button("Submit Description Action")
517
-
518
- status_display = gr.Textbox(
519
- label="Status",
520
- interactive=False,
521
- value="Ready to receive calls..."
522
- )
523
-
498
+ submit_btn = gr.Button(
499
+ "📤 Submit Message", variant="primary", interactive=False
500
+ )
501
+
502
+ # Action Accordions (wrapped for visibility control)
503
+ with gr.Group(visible=False) as actions_group:
504
+ with gr.Tabs():
505
+ with gr.Tab("🖱️ Click Actions"):
506
+ with gr.Group():
507
+ description_text = gr.Textbox(
508
+ label="Element Description",
509
+ placeholder="e.g., 'Privacy and security option in left sidebar'",
510
+ )
511
+ with gr.Row():
512
+ description_action_type = gr.Dropdown(
513
+ label="Action",
514
+ choices=[
515
+ "click",
516
+ "double_click",
517
+ "move",
518
+ "left_mouse_up",
519
+ "left_mouse_down",
520
+ ],
521
+ value="click",
522
+ )
523
+ description_button = gr.Dropdown(
524
+ label="Button",
525
+ choices=["left", "right", "wheel", "back", "forward"],
526
+ value="left",
527
+ )
528
+ description_submit_btn = gr.Button("Submit Click Action")
529
+
530
+ with gr.Tab("📝 Type Action"):
531
+ with gr.Group():
532
+ type_text = gr.Textbox(
533
+ label="Text to Type", placeholder="Enter text to type..."
534
+ )
535
+ type_submit_btn = gr.Button("Submit Type")
536
+
537
+ with gr.Tab("⌨️ Keypress Action"):
538
+ with gr.Group():
539
+ keypress_text = gr.Textbox(
540
+ label="Keys", placeholder="e.g., ctrl+c, alt+tab"
541
+ )
542
+ keypress_submit_btn = gr.Button("Submit Keypress")
543
+
544
+ with gr.Tab("🧰 Misc Actions"):
545
+ with gr.Group():
546
+ misc_action_dropdown = gr.Dropdown(
547
+ label="Action", choices=["wait"], value="wait"
548
+ )
549
+ misc_submit_btn = gr.Button("Submit Action")
550
+
524
551
  # Event handlers
525
552
  refresh_btn.click(
526
553
  fn=ui_handler.refresh_pending_calls,
527
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
554
+ outputs=[
555
+ call_dropdown,
556
+ screenshot_image,
557
+ conversation_chatbot,
558
+ submit_btn,
559
+ click_actions_group,
560
+ actions_group,
561
+ ],
528
562
  )
529
-
563
+
530
564
  call_dropdown.change(
531
565
  fn=ui_handler.on_call_selected,
532
566
  inputs=[call_dropdown],
533
- outputs=[screenshot_image, conversation_chatbot, submit_btn]
567
+ outputs=[
568
+ screenshot_image,
569
+ conversation_chatbot,
570
+ submit_btn,
571
+ click_actions_group,
572
+ actions_group,
573
+ ],
534
574
  )
535
-
575
+
536
576
  def handle_image_click(evt: gr.SelectData):
537
577
  if evt.index is not None:
538
578
  x, y = evt.index
539
- action_type = action_type_radio.value or "click"
540
- button = action_button_radio.value or "left"
541
- result = ui_handler.submit_click_action(x, y, action_type, button)
579
+ action_type = ui_handler.current_action_type or "click"
580
+ button = ui_handler.current_button or "left"
581
+ if action_type == "scroll":
582
+ sx_i = int(ui_handler.current_scroll_x or 0)
583
+ sy_i = int(ui_handler.current_scroll_y or 0)
584
+ # Submit a scroll action with x,y position and scroll deltas
585
+ result = ui_handler.submit_action(
586
+ "scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
587
+ )
588
+ else:
589
+ result = ui_handler.submit_click_action(x, y, action_type, button)
542
590
  ui_handler.wait_for_pending_calls()
543
591
  return result
544
592
  return "No coordinates selected"
545
593
 
546
- screenshot_image.select(
547
- fn=handle_image_click,
548
- outputs=[status_display]
549
- ).then(
594
+ screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
550
595
  fn=ui_handler.wait_for_pending_calls,
551
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
596
+ outputs=[
597
+ call_dropdown,
598
+ screenshot_image,
599
+ conversation_chatbot,
600
+ submit_btn,
601
+ click_actions_group,
602
+ actions_group,
603
+ ],
552
604
  )
553
605
 
554
606
  # Response submission
555
607
  submit_btn.click(
556
608
  fn=ui_handler.submit_response,
557
609
  inputs=[response_text],
558
- outputs=[response_text, status_display]
610
+ outputs=[response_text, status_display],
559
611
  ).then(
560
612
  fn=ui_handler.refresh_pending_calls,
561
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
613
+ outputs=[
614
+ call_dropdown,
615
+ screenshot_image,
616
+ conversation_chatbot,
617
+ submit_btn,
618
+ click_actions_group,
619
+ actions_group,
620
+ ],
562
621
  )
563
-
564
- # Toggle button radio visibility based on action type
565
- def toggle_button_visibility(action_type):
566
- return gr.update(visible=(action_type == "click"))
567
-
622
+
623
+ # Toggle visibility of controls based on action type
624
+ def toggle_action_controls(action_type):
625
+ # Button visible only for click
626
+ button_vis = gr.update(visible=(action_type == "click"))
627
+ # Scroll inputs visible only for scroll
628
+ scroll_x_vis = gr.update(visible=(action_type == "scroll"))
629
+ scroll_y_vis = gr.update(visible=(action_type == "scroll"))
630
+ # Update state
631
+ ui_handler.current_action_type = action_type or "click"
632
+ return button_vis, scroll_x_vis, scroll_y_vis
633
+
568
634
  action_type_radio.change(
569
- fn=toggle_button_visibility,
635
+ fn=toggle_action_controls,
570
636
  inputs=[action_type_radio],
571
- outputs=[action_button_radio]
637
+ outputs=[action_button_radio, scroll_x_input, scroll_y_input],
572
638
  )
573
639
 
574
- # Action accordion handlers
575
- click_submit_btn.click(
576
- fn=ui_handler.submit_click_action,
577
- inputs=[click_x, click_y, click_action_type, click_button],
578
- outputs=[status_display]
579
- ).then(
580
- fn=ui_handler.wait_for_pending_calls,
581
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
582
- )
583
-
640
+ # Keep other control values in ui_handler state
641
+ def on_button_change(val):
642
+ ui_handler.current_button = val or "left"
643
+
644
+ action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
645
+
646
+ def on_scroll_x_change(val):
647
+ try:
648
+ ui_handler.current_scroll_x = int(val) if val is not None else 0
649
+ except Exception:
650
+ ui_handler.current_scroll_x = 0
651
+
652
+ scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
653
+
654
+ def on_scroll_y_change(val):
655
+ try:
656
+ ui_handler.current_scroll_y = int(val) if val is not None else 0
657
+ except Exception:
658
+ ui_handler.current_scroll_y = 0
659
+
660
+ scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
661
+
584
662
  type_submit_btn.click(
585
- fn=ui_handler.submit_type_action,
586
- inputs=[type_text],
587
- outputs=[status_display]
663
+ fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
588
664
  ).then(
589
665
  fn=ui_handler.wait_for_pending_calls,
590
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
666
+ outputs=[
667
+ call_dropdown,
668
+ screenshot_image,
669
+ conversation_chatbot,
670
+ submit_btn,
671
+ click_actions_group,
672
+ actions_group,
673
+ ],
591
674
  )
592
-
675
+
593
676
  keypress_submit_btn.click(
594
- fn=ui_handler.submit_hotkey_action,
595
- inputs=[keypress_text],
596
- outputs=[status_display]
677
+ fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
597
678
  ).then(
598
679
  fn=ui_handler.wait_for_pending_calls,
599
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
680
+ outputs=[
681
+ call_dropdown,
682
+ screenshot_image,
683
+ conversation_chatbot,
684
+ submit_btn,
685
+ click_actions_group,
686
+ actions_group,
687
+ ],
600
688
  )
601
-
689
+
602
690
  def handle_description_submit(description, action_type, button):
603
691
  if description:
604
692
  result = ui_handler.submit_description_click(description, action_type, button)
@@ -609,18 +697,54 @@ def create_ui():
609
697
  description_submit_btn.click(
610
698
  fn=handle_description_submit,
611
699
  inputs=[description_text, description_action_type, description_button],
612
- outputs=[status_display]
700
+ outputs=[status_display],
613
701
  ).then(
614
702
  fn=ui_handler.wait_for_pending_calls,
615
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
703
+ outputs=[
704
+ call_dropdown,
705
+ screenshot_image,
706
+ conversation_chatbot,
707
+ submit_btn,
708
+ click_actions_group,
709
+ actions_group,
710
+ ],
616
711
  )
617
-
712
+
713
+ # Misc action handler
714
+ def handle_misc_submit(selected_action):
715
+ if selected_action == "wait":
716
+ result = ui_handler.submit_wait_action()
717
+ ui_handler.wait_for_pending_calls()
718
+ return result
719
+ return f"Unsupported misc action: {selected_action}"
720
+
721
+ misc_submit_btn.click(
722
+ fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
723
+ ).then(
724
+ fn=ui_handler.wait_for_pending_calls,
725
+ outputs=[
726
+ call_dropdown,
727
+ screenshot_image,
728
+ conversation_chatbot,
729
+ submit_btn,
730
+ click_actions_group,
731
+ actions_group,
732
+ ],
733
+ )
734
+
618
735
  # Load initial data
619
736
  demo.load(
620
737
  fn=ui_handler.refresh_pending_calls,
621
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
738
+ outputs=[
739
+ call_dropdown,
740
+ screenshot_image,
741
+ conversation_chatbot,
742
+ submit_btn,
743
+ click_actions_group,
744
+ actions_group,
745
+ ],
622
746
  )
623
-
747
+
624
748
  return demo
625
749
 
626
750