cua-agent: cua_agent-0.4.34-py3-none-any.whl → cua_agent-0.4.36-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/human_tool/ui.py CHANGED
@@ -1,14 +1,17 @@
1
- import gradio as gr
1
+ import base64
2
+ import io
2
3
  import json
3
4
  import time
4
- from typing import List, Dict, Any, Optional
5
5
  from datetime import datetime
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import gradio as gr
6
9
  import requests
7
- from .server import completion_queue
8
- import base64
9
- import io
10
10
  from PIL import Image
11
11
 
12
+ from .server import completion_queue
13
+
14
+
12
15
  class HumanCompletionUI:
13
16
  def __init__(self, server_url: str = "http://localhost:8002"):
14
17
  self.server_url = server_url
@@ -20,7 +23,7 @@ class HumanCompletionUI:
20
23
  self.current_button: str = "left"
21
24
  self.current_scroll_x: int = 0
22
25
  self.current_scroll_y: int = -120
23
-
26
+
24
27
  def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
25
28
  """Format messages for display in gr.Chatbot with type='messages'."""
26
29
  formatted = []
@@ -28,7 +31,7 @@ class HumanCompletionUI:
28
31
  role = msg.get("role", "user")
29
32
  content = msg.get("content", "")
30
33
  tool_calls = msg.get("tool_calls", [])
31
-
34
+
32
35
  # Handle different content formats
33
36
  if isinstance(content, list):
34
37
  # Multi-modal content - can include text and images
@@ -55,7 +58,7 @@ class HumanCompletionUI:
55
58
  else:
56
59
  # For URL images, create gr.Image with URL
57
60
  formatted_content.append(gr.Image(value=image_url))
58
-
61
+
59
62
  # Determine final content format
60
63
  if len(formatted_content) == 1:
61
64
  content = formatted_content[0]
@@ -63,28 +66,28 @@ class HumanCompletionUI:
63
66
  content = formatted_content
64
67
  else:
65
68
  content = "[Empty content]"
66
-
69
+
67
70
  # Ensure role is valid for Gradio Chatbot
68
71
  if role not in ["user", "assistant"]:
69
72
  role = "assistant" if role == "system" else "user"
70
-
73
+
71
74
  # Invert roles for better display in human UI context
72
75
  # (what the AI says becomes "user", what human should respond becomes "assistant")
73
76
  if role == "user":
74
77
  role = "assistant"
75
78
  else:
76
79
  role = "user"
77
-
80
+
78
81
  # Add the main message if it has content
79
82
  if content and str(content).strip():
80
83
  formatted.append({"role": role, "content": content})
81
-
84
+
82
85
  # Handle tool calls - create separate messages for each tool call
83
86
  if tool_calls:
84
87
  for tool_call in tool_calls:
85
88
  function_name = tool_call.get("function", {}).get("name", "unknown")
86
89
  arguments_str = tool_call.get("function", {}).get("arguments", "{}")
87
-
90
+
88
91
  try:
89
92
  # Parse arguments to format them nicely
90
93
  arguments = json.loads(arguments_str)
@@ -92,18 +95,20 @@ class HumanCompletionUI:
92
95
  except json.JSONDecodeError:
93
96
  # If parsing fails, use the raw string
94
97
  formatted_args = arguments_str
95
-
98
+
96
99
  # Create a formatted message for the tool call
97
100
  tool_call_content = f"```json\n{formatted_args}\n```"
98
-
99
- formatted.append({
100
- "role": role,
101
- "content": tool_call_content,
102
- "metadata": {"title": f"🛠️ Used {function_name}"}
103
- })
104
-
101
+
102
+ formatted.append(
103
+ {
104
+ "role": role,
105
+ "content": tool_call_content,
106
+ "metadata": {"title": f"🛠️ Used {function_name}"},
107
+ }
108
+ )
109
+
105
110
  return formatted
106
-
111
+
107
112
  def get_pending_calls(self) -> List[Dict[str, Any]]:
108
113
  """Get pending calls from the server."""
109
114
  try:
@@ -113,38 +118,39 @@ class HumanCompletionUI:
113
118
  except Exception as e:
114
119
  print(f"Error fetching pending calls: {e}")
115
120
  return []
116
-
121
+
117
122
  def complete_call_with_response(self, call_id: str, response: str) -> bool:
118
123
  """Complete a call with a text response."""
119
124
  try:
120
125
  response_data = {"response": response}
121
126
  response_obj = requests.post(
122
- f"{self.server_url}/complete/{call_id}",
123
- json=response_data,
124
- timeout=10
127
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
125
128
  )
126
129
  response_obj.raise_for_status()
127
130
  return True
128
131
  except requests.RequestException as e:
129
132
  print(f"Error completing call: {e}")
130
133
  return False
131
-
134
+
132
135
  def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
133
136
  """Complete a call with tool calls."""
134
137
  try:
135
138
  response_data = {"tool_calls": tool_calls}
136
139
  response_obj = requests.post(
137
- f"{self.server_url}/complete/{call_id}",
138
- json=response_data,
139
- timeout=10
140
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
140
141
  )
141
142
  response_obj.raise_for_status()
142
143
  return True
143
144
  except requests.RequestException as e:
144
145
  print(f"Error completing call: {e}")
145
146
  return False
146
-
147
- def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
147
+
148
+ def complete_call(
149
+ self,
150
+ call_id: str,
151
+ response: Optional[str] = None,
152
+ tool_calls: Optional[List[Dict[str, Any]]] = None,
153
+ ) -> bool:
148
154
  """Complete a call with either a response or tool calls."""
149
155
  try:
150
156
  response_data = {}
@@ -152,25 +158,23 @@ class HumanCompletionUI:
152
158
  response_data["response"] = response
153
159
  if tool_calls:
154
160
  response_data["tool_calls"] = tool_calls
155
-
161
+
156
162
  response_obj = requests.post(
157
- f"{self.server_url}/complete/{call_id}",
158
- json=response_data,
159
- timeout=10
163
+ f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
160
164
  )
161
165
  response_obj.raise_for_status()
162
166
  return True
163
167
  except requests.RequestException as e:
164
168
  print(f"Error completing call: {e}")
165
169
  return False
166
-
170
+
167
171
  def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
168
172
  """Extract the last image from the messages for display above conversation."""
169
173
  last_image = None
170
-
174
+
171
175
  for msg in reversed(messages): # Start from the last message
172
176
  content = msg.get("content", "")
173
-
177
+
174
178
  if isinstance(content, list):
175
179
  for item in reversed(content): # Get the last image in the message
176
180
  if item.get("type") == "image_url":
@@ -189,13 +193,13 @@ class HumanCompletionUI:
189
193
  else:
190
194
  # For URL images, return the URL
191
195
  return image_url
192
-
196
+
193
197
  return last_image
194
-
198
+
195
199
  def refresh_pending_calls(self):
196
200
  """Refresh the list of pending calls."""
197
201
  pending_calls = self.get_pending_calls()
198
-
202
+
199
203
  if not pending_calls:
200
204
  return (
201
205
  gr.update(choices=["latest"], value="latest"), # dropdown
@@ -205,27 +209,27 @@ class HumanCompletionUI:
205
209
  gr.update(visible=False), # click_actions_group hidden
206
210
  gr.update(visible=False), # actions_group hidden
207
211
  )
208
-
212
+
209
213
  # Sort pending calls by created_at to get oldest first
210
214
  sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
211
-
215
+
212
216
  # Create choices for dropdown
213
217
  choices = [("latest", "latest")] # Add "latest" option first
214
-
218
+
215
219
  for call in sorted_calls:
216
220
  call_id = call["id"]
217
221
  model = call.get("model", "unknown")
218
222
  created_at = call.get("created_at", "")
219
223
  # Format timestamp
220
224
  try:
221
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
225
+ dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
222
226
  time_str = dt.strftime("%H:%M:%S")
223
227
  except:
224
228
  time_str = created_at
225
-
229
+
226
230
  choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
227
231
  choices.append((choice_label, call_id))
228
-
232
+
229
233
  # Default to "latest" which shows the oldest pending conversation
230
234
  selected_call_id = "latest"
231
235
  if selected_call_id == "latest" and sorted_calls:
@@ -239,7 +243,7 @@ class HumanCompletionUI:
239
243
  conversation = []
240
244
  self.current_call_id = None
241
245
  self.last_image = None
242
-
246
+
243
247
  return (
244
248
  gr.update(choices=choices, value="latest"),
245
249
  gr.update(value=self.last_image),
@@ -248,7 +252,7 @@ class HumanCompletionUI:
248
252
  gr.update(visible=True), # click_actions_group visible when there is a call
249
253
  gr.update(visible=True), # actions_group visible when there is a call
250
254
  )
251
-
255
+
252
256
  def on_call_selected(self, selected_choice):
253
257
  """Handle when a call is selected from the dropdown."""
254
258
  if not selected_choice:
@@ -259,7 +263,7 @@ class HumanCompletionUI:
259
263
  gr.update(visible=False), # click_actions_group hidden
260
264
  gr.update(visible=False), # actions_group hidden
261
265
  )
262
-
266
+
263
267
  pending_calls = self.get_pending_calls()
264
268
  if not pending_calls:
265
269
  return (
@@ -269,7 +273,7 @@ class HumanCompletionUI:
269
273
  gr.update(visible=False), # click_actions_group hidden
270
274
  gr.update(visible=False), # actions_group hidden
271
275
  )
272
-
276
+
273
277
  # Handle "latest" option
274
278
  if selected_choice == "latest":
275
279
  # Sort calls by created_at to get oldest first
@@ -284,17 +288,17 @@ class HumanCompletionUI:
284
288
  if call_id_short in selected_choice:
285
289
  call_id = call["id"]
286
290
  break
287
-
291
+
288
292
  if not call_id:
289
293
  return (
290
294
  gr.update(value=None), # no image
291
295
  gr.update(value=[]), # empty chatbot
292
- gr.update(interactive=False)
296
+ gr.update(interactive=False),
293
297
  )
294
-
298
+
295
299
  # Find the selected call
296
300
  selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
297
-
301
+
298
302
  if not selected_call:
299
303
  return (
300
304
  gr.update(value=None), # no image
@@ -303,12 +307,12 @@ class HumanCompletionUI:
303
307
  gr.update(visible=False), # click_actions_group hidden
304
308
  gr.update(visible=False), # actions_group hidden
305
309
  )
306
-
310
+
307
311
  conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
308
312
  self.current_call_id = call_id
309
313
  # Get the last image from messages
310
314
  self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
311
-
315
+
312
316
  return (
313
317
  gr.update(value=self.last_image),
314
318
  gr.update(value=conversation),
@@ -316,110 +320,111 @@ class HumanCompletionUI:
316
320
  gr.update(visible=True), # click_actions_group visible
317
321
  gr.update(visible=True), # actions_group visible
318
322
  )
319
-
323
+
320
324
  def submit_response(self, response_text: str):
321
325
  """Submit a text response to the current call."""
322
326
  if not self.current_call_id:
323
327
  return (
324
328
  gr.update(value=response_text), # keep response text
325
- gr.update(value="❌ No call selected") # status
329
+ gr.update(value="❌ No call selected"), # status
326
330
  )
327
-
331
+
328
332
  if not response_text.strip():
329
333
  return (
330
334
  gr.update(value=response_text), # keep response text
331
- gr.update(value="❌ Response cannot be empty") # status
335
+ gr.update(value="❌ Response cannot be empty"), # status
332
336
  )
333
-
337
+
334
338
  success = self.complete_call_with_response(self.current_call_id, response_text)
335
-
339
+
336
340
  if success:
337
341
  status_msg = "✅ Response submitted successfully!"
338
342
  return (
339
343
  gr.update(value=""), # clear response text
340
- gr.update(value=status_msg) # status
344
+ gr.update(value=status_msg), # status
341
345
  )
342
346
  else:
343
347
  return (
344
348
  gr.update(value=response_text), # keep response text
345
- gr.update(value="❌ Failed to submit response") # status
349
+ gr.update(value="❌ Failed to submit response"), # status
346
350
  )
347
-
351
+
348
352
  def submit_action(self, action_type: str, **kwargs) -> str:
349
353
  """Submit a computer action as a tool call."""
350
354
  if not self.current_call_id:
351
355
  return "❌ No call selected"
352
-
356
+
353
357
  import uuid
354
-
358
+
355
359
  # Create tool call structure
356
360
  action_data = {"type": action_type, **kwargs}
357
361
  tool_call = {
358
362
  "id": f"call_{uuid.uuid4().hex[:24]}",
359
363
  "type": "function",
360
- "function": {
361
- "name": "computer",
362
- "arguments": json.dumps(action_data)
363
- }
364
+ "function": {"name": "computer", "arguments": json.dumps(action_data)},
364
365
  }
365
-
366
+
366
367
  success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
367
-
368
+
368
369
  if success:
369
370
  return f"✅ {action_type.capitalize()} action submitted as tool call"
370
371
  else:
371
372
  return f"❌ Failed to submit {action_type} action"
372
-
373
- def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
373
+
374
+ def submit_click_action(
375
+ self, x: int, y: int, action_type: str = "click", button: str = "left"
376
+ ) -> str:
374
377
  """Submit a coordinate-based action."""
375
378
  if action_type == "click":
376
379
  return self.submit_action(action_type, x=x, y=y, button=button)
377
380
  else:
378
381
  return self.submit_action(action_type, x=x, y=y)
379
-
382
+
380
383
  def submit_type_action(self, text: str) -> str:
381
384
  """Submit a type action."""
382
385
  return self.submit_action("type", text=text)
383
-
386
+
384
387
  def submit_hotkey_action(self, keys: str) -> str:
385
388
  """Submit a hotkey action."""
386
389
  return self.submit_action("keypress", keys=keys)
387
-
390
+
388
391
  def submit_wait_action(self) -> str:
389
392
  """Submit a wait action with no kwargs."""
390
393
  return self.submit_action("wait")
391
-
392
- def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
394
+
395
+ def submit_description_click(
396
+ self, description: str, action_type: str = "click", button: str = "left"
397
+ ) -> str:
393
398
  """Submit a description-based action."""
394
399
  if action_type == "click":
395
400
  return self.submit_action(action_type, element_description=description, button=button)
396
401
  else:
397
402
  return self.submit_action(action_type, element_description=description)
398
-
403
+
399
404
  def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
400
405
  """Wait for pending calls to appear or until max_seconds elapsed.
401
-
406
+
402
407
  This method loops and checks for pending calls at regular intervals,
403
408
  returning as soon as a pending call is found or the maximum wait time is reached.
404
-
409
+
405
410
  Args:
406
411
  max_seconds: Maximum number of seconds to wait
407
412
  check_interval: How often to check for pending calls (in seconds)
408
413
  """
409
414
  import time
410
-
415
+
411
416
  start_time = time.time()
412
-
417
+
413
418
  while time.time() - start_time < max_seconds:
414
419
  # Check if there are any pending calls
415
420
  pending_calls = self.get_pending_calls()
416
421
  if pending_calls:
417
422
  # Found pending calls, return immediately
418
423
  return self.refresh_pending_calls()
419
-
424
+
420
425
  # Wait before checking again
421
426
  time.sleep(check_interval)
422
-
427
+
423
428
  # Max wait time reached, return current state
424
429
  return self.refresh_pending_calls()
425
430
 
@@ -427,79 +432,73 @@ class HumanCompletionUI:
427
432
  def create_ui():
428
433
  """Create the Gradio interface."""
429
434
  ui_handler = HumanCompletionUI()
430
-
435
+
431
436
  with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
432
437
  gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
433
438
  gr.Markdown("Review AI conversation requests and provide human responses.")
434
-
439
+
435
440
  with gr.Row():
436
441
  with gr.Column(scale=2):
437
442
  with gr.Group():
438
443
  screenshot_image = gr.Image(
439
- label="Interactive Screenshot",
440
- interactive=False,
441
- height=600
444
+ label="Interactive Screenshot", interactive=False, height=600
442
445
  )
443
-
446
+
444
447
  # Action type selection for image clicks (wrapped for visibility control)
445
448
  with gr.Group(visible=False) as click_actions_group:
446
449
  with gr.Row():
447
450
  action_type_radio = gr.Dropdown(
448
451
  label="Interactive Action",
449
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
452
+ choices=[
453
+ "click",
454
+ "double_click",
455
+ "move",
456
+ "left_mouse_up",
457
+ "left_mouse_down",
458
+ "scroll",
459
+ ],
450
460
  value="click",
451
- scale=2
461
+ scale=2,
452
462
  )
453
463
  action_button_radio = gr.Dropdown(
454
464
  label="Button",
455
465
  choices=["left", "right", "wheel", "back", "forward"],
456
466
  value="left",
457
467
  visible=True,
458
- scale=1
468
+ scale=1,
459
469
  )
460
470
  scroll_x_input = gr.Number(
461
- label="scroll_x",
462
- value=0,
463
- visible=False,
464
- scale=1
471
+ label="scroll_x", value=0, visible=False, scale=1
465
472
  )
466
473
  scroll_y_input = gr.Number(
467
- label="scroll_y",
468
- value=-120,
469
- visible=False,
470
- scale=1
474
+ label="scroll_y", value=-120, visible=False, scale=1
471
475
  )
472
-
476
+
473
477
  conversation_chatbot = gr.Chatbot(
474
- label="Conversation",
475
- type="messages",
476
- height=500,
477
- show_copy_button=True
478
+ label="Conversation", type="messages", height=500, show_copy_button=True
478
479
  )
479
-
480
+
480
481
  with gr.Column(scale=1):
481
482
  with gr.Group():
482
483
  call_dropdown = gr.Dropdown(
483
484
  label="Select a pending conversation request",
484
485
  choices=["latest"],
485
486
  interactive=True,
486
- value="latest"
487
+ value="latest",
487
488
  )
488
489
  refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
489
490
  status_display = gr.Textbox(
490
- label="Status",
491
- interactive=False,
492
- value="Ready to receive requests..."
491
+ label="Status", interactive=False, value="Ready to receive requests..."
493
492
  )
494
493
 
495
494
  with gr.Group():
496
495
  response_text = gr.Textbox(
497
- label="Message",
498
- lines=3,
499
- placeholder="Enter your message here..."
496
+ label="Message", lines=3, placeholder="Enter your message here..."
500
497
  )
501
- submit_btn = gr.Button("📤 Submit Message", variant="primary", interactive=False)
502
-
498
+ submit_btn = gr.Button(
499
+ "📤 Submit Message", variant="primary", interactive=False
500
+ )
501
+
503
502
  # Action Accordions (wrapped for visibility control)
504
503
  with gr.Group(visible=False) as actions_group:
505
504
  with gr.Tabs():
@@ -507,58 +506,73 @@ def create_ui():
507
506
  with gr.Group():
508
507
  description_text = gr.Textbox(
509
508
  label="Element Description",
510
- placeholder="e.g., 'Privacy and security option in left sidebar'"
509
+ placeholder="e.g., 'Privacy and security option in left sidebar'",
511
510
  )
512
511
  with gr.Row():
513
512
  description_action_type = gr.Dropdown(
514
513
  label="Action",
515
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
516
- value="click"
514
+ choices=[
515
+ "click",
516
+ "double_click",
517
+ "move",
518
+ "left_mouse_up",
519
+ "left_mouse_down",
520
+ ],
521
+ value="click",
517
522
  )
518
523
  description_button = gr.Dropdown(
519
524
  label="Button",
520
525
  choices=["left", "right", "wheel", "back", "forward"],
521
- value="left"
526
+ value="left",
522
527
  )
523
528
  description_submit_btn = gr.Button("Submit Click Action")
524
-
529
+
525
530
  with gr.Tab("📝 Type Action"):
526
531
  with gr.Group():
527
532
  type_text = gr.Textbox(
528
- label="Text to Type",
529
- placeholder="Enter text to type..."
533
+ label="Text to Type", placeholder="Enter text to type..."
530
534
  )
531
535
  type_submit_btn = gr.Button("Submit Type")
532
-
536
+
533
537
  with gr.Tab("⌨️ Keypress Action"):
534
538
  with gr.Group():
535
539
  keypress_text = gr.Textbox(
536
- label="Keys",
537
- placeholder="e.g., ctrl+c, alt+tab"
540
+ label="Keys", placeholder="e.g., ctrl+c, alt+tab"
538
541
  )
539
542
  keypress_submit_btn = gr.Button("Submit Keypress")
540
-
543
+
541
544
  with gr.Tab("🧰 Misc Actions"):
542
545
  with gr.Group():
543
546
  misc_action_dropdown = gr.Dropdown(
544
- label="Action",
545
- choices=["wait"],
546
- value="wait"
547
+ label="Action", choices=["wait"], value="wait"
547
548
  )
548
549
  misc_submit_btn = gr.Button("Submit Action")
549
-
550
+
550
551
  # Event handlers
551
552
  refresh_btn.click(
552
553
  fn=ui_handler.refresh_pending_calls,
553
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
554
+ outputs=[
555
+ call_dropdown,
556
+ screenshot_image,
557
+ conversation_chatbot,
558
+ submit_btn,
559
+ click_actions_group,
560
+ actions_group,
561
+ ],
554
562
  )
555
-
563
+
556
564
  call_dropdown.change(
557
565
  fn=ui_handler.on_call_selected,
558
566
  inputs=[call_dropdown],
559
- outputs=[screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
567
+ outputs=[
568
+ screenshot_image,
569
+ conversation_chatbot,
570
+ submit_btn,
571
+ click_actions_group,
572
+ actions_group,
573
+ ],
560
574
  )
561
-
575
+
562
576
  def handle_image_click(evt: gr.SelectData):
563
577
  if evt.index is not None:
564
578
  x, y = evt.index
@@ -568,31 +582,44 @@ def create_ui():
568
582
  sx_i = int(ui_handler.current_scroll_x or 0)
569
583
  sy_i = int(ui_handler.current_scroll_y or 0)
570
584
  # Submit a scroll action with x,y position and scroll deltas
571
- result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
585
+ result = ui_handler.submit_action(
586
+ "scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
587
+ )
572
588
  else:
573
589
  result = ui_handler.submit_click_action(x, y, action_type, button)
574
590
  ui_handler.wait_for_pending_calls()
575
591
  return result
576
592
  return "No coordinates selected"
577
593
 
578
- screenshot_image.select(
579
- fn=handle_image_click,
580
- outputs=[status_display]
581
- ).then(
594
+ screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
582
595
  fn=ui_handler.wait_for_pending_calls,
583
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
596
+ outputs=[
597
+ call_dropdown,
598
+ screenshot_image,
599
+ conversation_chatbot,
600
+ submit_btn,
601
+ click_actions_group,
602
+ actions_group,
603
+ ],
584
604
  )
585
605
 
586
606
  # Response submission
587
607
  submit_btn.click(
588
608
  fn=ui_handler.submit_response,
589
609
  inputs=[response_text],
590
- outputs=[response_text, status_display]
610
+ outputs=[response_text, status_display],
591
611
  ).then(
592
612
  fn=ui_handler.refresh_pending_calls,
593
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
613
+ outputs=[
614
+ call_dropdown,
615
+ screenshot_image,
616
+ conversation_chatbot,
617
+ submit_btn,
618
+ click_actions_group,
619
+ actions_group,
620
+ ],
594
621
  )
595
-
622
+
596
623
  # Toggle visibility of controls based on action type
597
624
  def toggle_action_controls(action_type):
598
625
  # Button visible only for click
@@ -603,59 +630,63 @@ def create_ui():
603
630
  # Update state
604
631
  ui_handler.current_action_type = action_type or "click"
605
632
  return button_vis, scroll_x_vis, scroll_y_vis
606
-
633
+
607
634
  action_type_radio.change(
608
635
  fn=toggle_action_controls,
609
636
  inputs=[action_type_radio],
610
- outputs=[action_button_radio, scroll_x_input, scroll_y_input]
637
+ outputs=[action_button_radio, scroll_x_input, scroll_y_input],
611
638
  )
612
639
 
613
640
  # Keep other control values in ui_handler state
614
641
  def on_button_change(val):
615
- ui_handler.current_button = (val or "left")
616
- action_button_radio.change(
617
- fn=on_button_change,
618
- inputs=[action_button_radio]
619
- )
642
+ ui_handler.current_button = val or "left"
643
+
644
+ action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
620
645
 
621
646
  def on_scroll_x_change(val):
622
647
  try:
623
648
  ui_handler.current_scroll_x = int(val) if val is not None else 0
624
649
  except Exception:
625
650
  ui_handler.current_scroll_x = 0
626
- scroll_x_input.change(
627
- fn=on_scroll_x_change,
628
- inputs=[scroll_x_input]
629
- )
651
+
652
+ scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
630
653
 
631
654
  def on_scroll_y_change(val):
632
655
  try:
633
656
  ui_handler.current_scroll_y = int(val) if val is not None else 0
634
657
  except Exception:
635
658
  ui_handler.current_scroll_y = 0
636
- scroll_y_input.change(
637
- fn=on_scroll_y_change,
638
- inputs=[scroll_y_input]
639
- )
640
-
659
+
660
+ scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
661
+
641
662
  type_submit_btn.click(
642
- fn=ui_handler.submit_type_action,
643
- inputs=[type_text],
644
- outputs=[status_display]
663
+ fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
645
664
  ).then(
646
665
  fn=ui_handler.wait_for_pending_calls,
647
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
666
+ outputs=[
667
+ call_dropdown,
668
+ screenshot_image,
669
+ conversation_chatbot,
670
+ submit_btn,
671
+ click_actions_group,
672
+ actions_group,
673
+ ],
648
674
  )
649
-
675
+
650
676
  keypress_submit_btn.click(
651
- fn=ui_handler.submit_hotkey_action,
652
- inputs=[keypress_text],
653
- outputs=[status_display]
677
+ fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
654
678
  ).then(
655
679
  fn=ui_handler.wait_for_pending_calls,
656
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
680
+ outputs=[
681
+ call_dropdown,
682
+ screenshot_image,
683
+ conversation_chatbot,
684
+ submit_btn,
685
+ click_actions_group,
686
+ actions_group,
687
+ ],
657
688
  )
658
-
689
+
659
690
  def handle_description_submit(description, action_type, button):
660
691
  if description:
661
692
  result = ui_handler.submit_description_click(description, action_type, button)
@@ -666,12 +697,19 @@ def create_ui():
666
697
  description_submit_btn.click(
667
698
  fn=handle_description_submit,
668
699
  inputs=[description_text, description_action_type, description_button],
669
- outputs=[status_display]
700
+ outputs=[status_display],
670
701
  ).then(
671
702
  fn=ui_handler.wait_for_pending_calls,
672
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
703
+ outputs=[
704
+ call_dropdown,
705
+ screenshot_image,
706
+ conversation_chatbot,
707
+ submit_btn,
708
+ click_actions_group,
709
+ actions_group,
710
+ ],
673
711
  )
674
-
712
+
675
713
  # Misc action handler
676
714
  def handle_misc_submit(selected_action):
677
715
  if selected_action == "wait":
@@ -681,20 +719,32 @@ def create_ui():
681
719
  return f"Unsupported misc action: {selected_action}"
682
720
 
683
721
  misc_submit_btn.click(
684
- fn=handle_misc_submit,
685
- inputs=[misc_action_dropdown],
686
- outputs=[status_display]
722
+ fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
687
723
  ).then(
688
724
  fn=ui_handler.wait_for_pending_calls,
689
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
725
+ outputs=[
726
+ call_dropdown,
727
+ screenshot_image,
728
+ conversation_chatbot,
729
+ submit_btn,
730
+ click_actions_group,
731
+ actions_group,
732
+ ],
690
733
  )
691
-
734
+
692
735
  # Load initial data
693
736
  demo.load(
694
737
  fn=ui_handler.refresh_pending_calls,
695
- outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
738
+ outputs=[
739
+ call_dropdown,
740
+ screenshot_image,
741
+ conversation_chatbot,
742
+ submit_btn,
743
+ click_actions_group,
744
+ actions_group,
745
+ ],
696
746
  )
697
-
747
+
698
748
  return demo
699
749
 
700
750