cua-agent 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/human_tool/ui.py ADDED
@@ -0,0 +1,630 @@
1
+ import gradio as gr
2
+ import json
3
+ import time
4
+ from typing import List, Dict, Any, Optional
5
+ from datetime import datetime
6
+ import requests
7
+ from .server import completion_queue
8
+ import base64
9
+ import io
10
+ from PIL import Image
11
+
12
class HumanCompletionUI:
    """Gradio-side client for the human-completion server.

    The class polls the server at ``server_url`` for pending completion
    calls, converts their messages into the ``gr.Chatbot`` "messages"
    format, and posts the operator's answer (plain text or a ``computer``
    tool call) back to the server.
    """

    def __init__(self, server_url: str = "http://localhost:8002"):
        """Store the server address and reset the per-session selection state."""
        self.server_url = server_url
        # ID of the call the operator is currently answering (None = none).
        self.current_call_id: Optional[str] = None
        self.refresh_interval = 2.0  # seconds
        self.last_image = None  # Store the last image for display

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _decode_data_url(image_url: str):
        """Decode a ``data:image...`` URL into a PIL image.

        Any error from splitting, base64 decoding, or PIL propagates to
        the caller, which decides how to report it.
        """
        _, encoded = image_url.split(",", 1)
        return Image.open(io.BytesIO(base64.b64decode(encoded)))

    @staticmethod
    def _oldest_first(pending_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Return the calls sorted by ``created_at`` ascending.

        A missing or ``None`` timestamp sorts as the empty string so the
        comparison never mixes ``None`` with ``str``.
        """
        return sorted(pending_calls, key=lambda call: call.get("created_at") or "")

    @staticmethod
    def _find_selected_call(
        selected_choice: str, pending_calls: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Resolve a dropdown selection to one of the pending calls.

        ``"latest"`` maps to the oldest pending call. Otherwise an exact id
        match is preferred, because the dropdown value is the full call id;
        matching only on the 8-character prefix could pick the wrong call
        when two ids share a prefix. The prefix-in-label match is kept as a
        fallback for selections that carry the display label instead.
        """
        if not pending_calls:
            return None
        if selected_choice == "latest":
            return HumanCompletionUI._oldest_first(pending_calls)[0]
        for call in pending_calls:
            if call["id"] == selected_choice:
                return call
        for call in pending_calls:
            if call["id"][:8] in selected_choice:
                return call
        return None

    def _post_completion(self, call_id: str, payload: Dict[str, Any]) -> bool:
        """POST ``payload`` to ``/complete/{call_id}``; return True on success."""
        try:
            response_obj = requests.post(
                f"{self.server_url}/complete/{call_id}",
                json=payload,
                timeout=10,
            )
            response_obj.raise_for_status()
            return True
        except requests.RequestException as e:
            print(f"Error completing call: {e}")
            return False

    def _render_pending(self, pending_calls: List[Dict[str, Any]]):
        """Build the (dropdown, image, chatbot, submit button) updates.

        Also selects the oldest pending call as the current one, or clears
        the selection when nothing is pending.
        """
        if not pending_calls:
            # Nothing is displayed, so nothing may be answered: clear the
            # selection so action buttons cannot post to a stale call id.
            self.current_call_id = None
            self.last_image = None
            return (
                gr.update(choices=["latest"], value="latest"),  # dropdown
                gr.update(value=None),  # image (no image)
                gr.update(value=[]),  # chatbot (empty messages)
                gr.update(interactive=False),  # submit button
            )

        ordered = self._oldest_first(pending_calls)

        # "latest" is always offered first; it resolves to the oldest call.
        choices = [("latest", "latest")]
        for call in ordered:
            call_id = call["id"]
            model = call.get("model", "unknown")
            created_at = call.get("created_at", "")
            try:
                stamp = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
                time_str = stamp.strftime("%H:%M:%S")
            except (AttributeError, TypeError, ValueError):
                # Not an ISO-8601 string: show the raw value instead.
                time_str = created_at
            choices.append((f"{call_id[:8]}... ({model}) - {time_str}", call_id))

        oldest = ordered[0]
        messages = oldest.get("messages", [])
        conversation = self.format_messages_for_chatbot(messages)
        self.current_call_id = oldest["id"]
        self.last_image = self.get_last_image_from_messages(messages)

        return (
            gr.update(choices=choices, value="latest"),
            gr.update(value=self.last_image),
            gr.update(value=conversation),
            gr.update(interactive=True),
        )

    # ------------------------------------------------------------------
    # Message formatting
    # ------------------------------------------------------------------
    def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Format messages for display in gr.Chatbot with type='messages'.

        Args:
            messages: Chat messages with ``role``, ``content`` (a string or a
                list of ``text`` / ``image_url`` parts) and optional
                ``tool_calls``.

        Returns:
            A list of ``{"role", "content"}`` dicts. Each tool call becomes
            its own entry with a ``metadata`` title and a fenced JSON body.
        """
        formatted = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            tool_calls = msg.get("tool_calls", [])

            # Multi-modal content: a list of text and image parts.
            if isinstance(content, list):
                parts = []
                for item in content:
                    if isinstance(item, str):
                        # Bare strings inside the list are treated as text.
                        item = {"type": "text", "text": item}
                    elif not isinstance(item, dict):
                        continue

                    kind = item.get("type")
                    if kind == "text":
                        text = item.get("text") or ""
                        if text.strip():  # Only add non-empty text
                            parts.append(text)
                    elif kind == "image_url":
                        image_url = (item.get("image_url") or {}).get("url", "")
                        if not image_url:
                            continue
                        if image_url.startswith("data:image"):
                            # Base64 image: decode and wrap in gr.Image.
                            try:
                                parts.append(gr.Image(value=self._decode_data_url(image_url)))
                            except Exception as e:
                                print(f"Error loading image: {e}")
                                parts.append(f"[Image loading error: {e}]")
                        else:
                            # Plain URL: gr.Image loads it directly.
                            parts.append(gr.Image(value=image_url))

                if len(parts) == 1:
                    content = parts[0]
                elif parts:
                    content = parts
                else:
                    content = "[Empty content]"

            # Roles are inverted for the human operator's point of view:
            # what the AI side said ("assistant"/"system") is shown as "user",
            # everything else (including "user" and "tool") as "assistant".
            role = "user" if role in ("assistant", "system") else "assistant"

            # Add the main message if it has content.
            if content and str(content).strip():
                formatted.append({"role": role, "content": content})

            # Each tool call gets its own message.
            for tool_call in tool_calls or []:
                function = tool_call.get("function") or {}
                function_name = function.get("name", "unknown")
                raw_args = function.get("arguments", "{}")

                try:
                    # Arguments normally arrive as a JSON string, but accept
                    # an already-parsed object too instead of raising.
                    parsed = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
                    formatted_args = json.dumps(parsed, indent=2)
                except (TypeError, ValueError):
                    # json.JSONDecodeError is a ValueError; fall back to raw text.
                    formatted_args = str(raw_args)

                formatted.append({
                    "role": role,
                    "content": f"```json\n{formatted_args}\n```",
                    "metadata": {"title": f"🛠️ Used {function_name}"},
                })

        return formatted

    def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
        """Extract the last image from the messages for display above conversation.

        Returns:
            A PIL image for base64 data URLs, the URL string for remote
            images, or ``None`` when no usable image is found. A data URL
            that fails to decode is skipped and the search continues
            backwards.
        """
        for msg in reversed(messages):  # Newest message first
            content = msg.get("content", "")
            if not isinstance(content, list):
                continue

            for item in reversed(content):  # Last image in the message first
                if not isinstance(item, dict) or item.get("type") != "image_url":
                    continue
                image_url = (item.get("image_url") or {}).get("url", "")
                if not image_url:
                    continue
                if not image_url.startswith("data:image"):
                    return image_url
                try:
                    return self._decode_data_url(image_url)
                except Exception as e:
                    print(f"Error loading image: {e}")

        return None

    # ------------------------------------------------------------------
    # Server communication
    # ------------------------------------------------------------------
    def get_pending_calls(self) -> List[Dict[str, Any]]:
        """Get pending calls from the server.

        Returns an empty list on a non-200 status or on any error, so the UI
        keeps working while the server is unavailable.
        """
        try:
            response = requests.get(f"{self.server_url}/pending", timeout=5)
            if response.status_code == 200:
                return response.json().get("pending_calls", [])
        except Exception as e:
            print(f"Error fetching pending calls: {e}")
        return []

    def complete_call_with_response(self, call_id: str, response: str) -> bool:
        """Complete a call with a text response."""
        return self._post_completion(call_id, {"response": response})

    def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
        """Complete a call with tool calls."""
        return self._post_completion(call_id, {"tool_calls": tool_calls})

    def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
        """Complete a call with either a response or tool calls.

        Falsy arguments (``None``, ``""``, ``[]``) are left out of the payload.
        """
        payload: Dict[str, Any] = {}
        if response:
            payload["response"] = response
        if tool_calls:
            payload["tool_calls"] = tool_calls
        return self._post_completion(call_id, payload)

    # ------------------------------------------------------------------
    # Gradio callbacks
    # ------------------------------------------------------------------
    def refresh_pending_calls(self):
        """Refresh the list of pending calls.

        Returns updates for (dropdown, screenshot image, chatbot, submit
        button) and selects the oldest pending call as the current one.
        """
        return self._render_pending(self.get_pending_calls())

    def on_call_selected(self, selected_choice):
        """Handle when a call is selected from the dropdown.

        Returns updates for (screenshot image, chatbot, submit button).
        """
        blank = (
            gr.update(value=None),  # no image
            gr.update(value=[]),  # empty chatbot
            gr.update(interactive=False),
        )
        if not selected_choice:
            return blank

        selected_call = self._find_selected_call(selected_choice, self.get_pending_calls())
        if not selected_call:
            return blank

        messages = selected_call.get("messages", [])
        conversation = self.format_messages_for_chatbot(messages)
        self.current_call_id = selected_call["id"]
        self.last_image = self.get_last_image_from_messages(messages)

        return (
            gr.update(value=self.last_image),
            gr.update(value=conversation),
            gr.update(interactive=True),
        )

    def submit_response(self, response_text: str):
        """Submit a text response to the current call.

        Returns updates for (response textbox, status display). The typed
        text is kept on failure and cleared on success.
        """
        if not self.current_call_id:
            return (
                gr.update(value=response_text),  # keep response text
                gr.update(value="❌ No call selected"),  # status
            )

        if not response_text or not response_text.strip():
            return (
                gr.update(value=response_text),  # keep response text
                gr.update(value="❌ Response cannot be empty"),  # status
            )

        if self.complete_call_with_response(self.current_call_id, response_text):
            return (
                gr.update(value=""),  # clear response text
                gr.update(value="✅ Response submitted successfully!"),  # status
            )
        return (
            gr.update(value=response_text),  # keep response text
            gr.update(value="❌ Failed to submit response"),  # status
        )

    def submit_action(self, action_type: str, **kwargs) -> str:
        """Submit a computer action as a tool call.

        The action is sent as a single ``computer`` function call whose JSON
        arguments are ``{"type": action_type, **kwargs}``.

        Returns:
            A status string for the UI.
        """
        if not self.current_call_id:
            return "❌ No call selected"

        import uuid

        action_data = {"type": action_type, **kwargs}
        tool_call = {
            "id": f"call_{uuid.uuid4().hex[:24]}",
            "type": "function",
            "function": {
                "name": "computer",
                "arguments": json.dumps(action_data),
            },
        }

        if self.complete_call_with_tool_calls(self.current_call_id, [tool_call]):
            return f"✅ {action_type.capitalize()} action submitted as tool call"
        return f"❌ Failed to submit {action_type} action"

    def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
        """Submit a coordinate-based action; ``button`` is sent only for "click"."""
        if action_type == "click":
            return self.submit_action(action_type, x=x, y=y, button=button)
        return self.submit_action(action_type, x=x, y=y)

    def submit_type_action(self, text: str) -> str:
        """Submit a type action."""
        return self.submit_action("type", text=text)

    def submit_hotkey_action(self, keys: str) -> str:
        """Submit a hotkey action."""
        return self.submit_action("keypress", keys=keys)

    def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
        """Submit a description-based action; ``button`` is sent only for "click"."""
        if action_type == "click":
            return self.submit_action(action_type, element_description=description, button=button)
        return self.submit_action(action_type, element_description=description)

    def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
        """Wait for pending calls to appear or until max_seconds elapsed.

        Polls the server every ``check_interval`` seconds and returns as soon
        as a pending call is found or the maximum wait time is reached.

        Args:
            max_seconds: Maximum number of seconds to wait
            check_interval: How often to check for pending calls (in seconds)

        Returns:
            The same update tuple as ``refresh_pending_calls``.
        """
        # A monotonic clock keeps the timeout correct if the wall clock jumps.
        deadline = time.monotonic() + max_seconds

        while time.monotonic() < deadline:
            pending_calls = self.get_pending_calls()
            if pending_calls:
                # Render what was just fetched instead of requesting it again.
                return self._render_pending(pending_calls)
            time.sleep(check_interval)

        # Max wait time reached, return current state
        return self.refresh_pending_calls()
404
+
405
+
406
def create_ui():
    """Create the Gradio interface.

    Builds a two-column layout: screenshot and conversation on the left,
    call selection, text response and action forms on the right. All
    callbacks are wired to a single ``HumanCompletionUI`` instance.

    Returns:
        The assembled ``gr.Blocks`` app (not yet queued or launched).
    """
    ui_handler = HumanCompletionUI()

    # Shared option lists for the pointer-action widgets.
    pointer_actions = ["click", "double_click", "move", "left_mouse_up", "left_mouse_down"]
    pointer_buttons = ["left", "right", "wheel", "back", "forward"]

    with gr.Blocks(title="Human-in-the-Loop Agent Tool") as demo:
        gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
        gr.Markdown("Review AI conversation requests and provide human responses.")

        with gr.Row():
            with gr.Column(scale=2):
                with gr.Group():
                    screenshot_image = gr.Image(
                        label="Screenshot",
                        interactive=False,
                        height=600
                    )

                    # Action type selection for image clicks
                    with gr.Row():
                        action_type_radio = gr.Radio(
                            label="Action Type",
                            choices=pointer_actions,
                            value="click",
                            scale=2
                        )
                        action_button_radio = gr.Radio(
                            label="Button (for click only)",
                            choices=pointer_buttons,
                            value="left",
                            visible=True,
                            scale=1
                        )

                conversation_chatbot = gr.Chatbot(
                    label="Messages",
                    type="messages",
                    height=500,
                    show_copy_button=True
                )

            with gr.Column(scale=1):
                with gr.Group():
                    call_dropdown = gr.Dropdown(
                        label="Select a pending call",
                        choices=["latest"],
                        interactive=True,
                        value="latest"
                    )
                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")

                with gr.Group():
                    response_text = gr.Textbox(
                        label="Response",
                        lines=3,
                        placeholder="Enter your response here..."
                    )
                    submit_btn = gr.Button("📤 Submit Response", variant="primary", interactive=False)

                # Action Accordions
                with gr.Accordion("🖱️ Click Actions", open=False):
                    with gr.Group():
                        with gr.Row():
                            click_x = gr.Number(label="X", value=0, minimum=0)
                            click_y = gr.Number(label="Y", value=0, minimum=0)
                        with gr.Row():
                            click_action_type = gr.Dropdown(
                                label="Action Type",
                                choices=pointer_actions,
                                value="click"
                            )
                            click_button = gr.Dropdown(
                                label="Button (for click only)",
                                choices=pointer_buttons,
                                value="left"
                            )
                        click_submit_btn = gr.Button("Submit Action")

                with gr.Accordion("📝 Type Action", open=False):
                    with gr.Group():
                        type_text = gr.Textbox(
                            label="Text to Type",
                            placeholder="Enter text to type..."
                        )
                        type_submit_btn = gr.Button("Submit Type")

                with gr.Accordion("⌨️ Keypress Action", open=False):
                    with gr.Group():
                        keypress_text = gr.Textbox(
                            label="Keys",
                            placeholder="e.g., ctrl+c, alt+tab"
                        )
                        keypress_submit_btn = gr.Button("Submit Keypress")

                with gr.Accordion("🎯 Description Action", open=False):
                    with gr.Group():
                        description_text = gr.Textbox(
                            label="Element Description",
                            placeholder="e.g., 'Privacy and security option in left sidebar'"
                        )
                        with gr.Row():
                            description_action_type = gr.Dropdown(
                                label="Action Type",
                                choices=pointer_actions,
                                value="click"
                            )
                            description_button = gr.Radio(
                                label="Button (for click only)",
                                choices=pointer_buttons,
                                value="left"
                            )
                        description_submit_btn = gr.Button("Submit Description Action")

                status_display = gr.Textbox(
                    label="Status",
                    interactive=False,
                    value="Ready to receive calls..."
                )

        # Components rewritten by every refresh/wait callback.
        refresh_outputs = [call_dropdown, screenshot_image, conversation_chatbot, submit_btn]

        # Event handlers
        refresh_btn.click(
            fn=ui_handler.refresh_pending_calls,
            outputs=refresh_outputs
        )

        call_dropdown.change(
            fn=ui_handler.on_call_selected,
            inputs=[call_dropdown],
            outputs=[screenshot_image, conversation_chatbot, submit_btn]
        )

        def handle_image_click(action_type, button, evt: gr.SelectData):
            """Turn a click on the screenshot into a pointer action.

            The radio values arrive as event inputs: reading ``.value`` on
            the component objects would only return their construction-time
            defaults, not what the operator currently has selected.
            """
            if evt.index is not None:
                x, y = evt.index
                # Waiting for the next call is left to the chained .then()
                # step, so the UI does not block for two timeouts in a row.
                return ui_handler.submit_click_action(
                    x, y, action_type or "click", button or "left"
                )
            return "No coordinates selected"

        screenshot_image.select(
            fn=handle_image_click,
            inputs=[action_type_radio, action_button_radio],
            outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=refresh_outputs
        )

        # Response submission
        submit_btn.click(
            fn=ui_handler.submit_response,
            inputs=[response_text],
            outputs=[response_text, status_display]
        ).then(
            fn=ui_handler.refresh_pending_calls,
            outputs=refresh_outputs
        )

        # Toggle button radio visibility based on action type
        def toggle_button_visibility(action_type):
            """Show the mouse-button radio only for the "click" action."""
            return gr.update(visible=(action_type == "click"))

        action_type_radio.change(
            fn=toggle_button_visibility,
            inputs=[action_type_radio],
            outputs=[action_button_radio]
        )

        # Action accordion handlers
        click_submit_btn.click(
            fn=ui_handler.submit_click_action,
            inputs=[click_x, click_y, click_action_type, click_button],
            outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=refresh_outputs
        )

        type_submit_btn.click(
            fn=ui_handler.submit_type_action,
            inputs=[type_text],
            outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=refresh_outputs
        )

        keypress_submit_btn.click(
            fn=ui_handler.submit_hotkey_action,
            inputs=[keypress_text],
            outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=refresh_outputs
        )

        def handle_description_submit(description, action_type, button):
            """Submit a description-based action, rejecting an empty description."""
            if description:
                # The chained .then() step performs the wait for the next call.
                return ui_handler.submit_description_click(description, action_type, button)
            return "Please enter a description"

        description_submit_btn.click(
            fn=handle_description_submit,
            inputs=[description_text, description_action_type, description_button],
            outputs=[status_display]
        ).then(
            fn=ui_handler.wait_for_pending_calls,
            outputs=refresh_outputs
        )

        # Load initial data
        demo.load(
            fn=ui_handler.refresh_pending_calls,
            outputs=refresh_outputs
        )

    return demo
625
+
626
+
627
if __name__ == "__main__":
    # Script entry point: build the interface, enable request queuing, then
    # serve it on every network interface at port 7860.
    demo = create_ui()
    demo.queue()
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )