cua-agent: cua_agent-0.4.34-py3-none-any.whl → cua_agent-0.4.35-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/human_tool/ui.py
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
1
|
-
import gradio as gr
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
2
3
|
import json
|
|
3
4
|
import time
|
|
4
|
-
from typing import List, Dict, Any, Optional
|
|
5
5
|
from datetime import datetime
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
import gradio as gr
|
|
6
9
|
import requests
|
|
7
|
-
from .server import completion_queue
|
|
8
|
-
import base64
|
|
9
|
-
import io
|
|
10
10
|
from PIL import Image
|
|
11
11
|
|
|
12
|
+
from .server import completion_queue
|
|
13
|
+
|
|
14
|
+
|
|
12
15
|
class HumanCompletionUI:
|
|
13
16
|
def __init__(self, server_url: str = "http://localhost:8002"):
|
|
14
17
|
self.server_url = server_url
|
|
@@ -20,7 +23,7 @@ class HumanCompletionUI:
|
|
|
20
23
|
self.current_button: str = "left"
|
|
21
24
|
self.current_scroll_x: int = 0
|
|
22
25
|
self.current_scroll_y: int = -120
|
|
23
|
-
|
|
26
|
+
|
|
24
27
|
def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
25
28
|
"""Format messages for display in gr.Chatbot with type='messages'."""
|
|
26
29
|
formatted = []
|
|
@@ -28,7 +31,7 @@ class HumanCompletionUI:
|
|
|
28
31
|
role = msg.get("role", "user")
|
|
29
32
|
content = msg.get("content", "")
|
|
30
33
|
tool_calls = msg.get("tool_calls", [])
|
|
31
|
-
|
|
34
|
+
|
|
32
35
|
# Handle different content formats
|
|
33
36
|
if isinstance(content, list):
|
|
34
37
|
# Multi-modal content - can include text and images
|
|
@@ -55,7 +58,7 @@ class HumanCompletionUI:
|
|
|
55
58
|
else:
|
|
56
59
|
# For URL images, create gr.Image with URL
|
|
57
60
|
formatted_content.append(gr.Image(value=image_url))
|
|
58
|
-
|
|
61
|
+
|
|
59
62
|
# Determine final content format
|
|
60
63
|
if len(formatted_content) == 1:
|
|
61
64
|
content = formatted_content[0]
|
|
@@ -63,28 +66,28 @@ class HumanCompletionUI:
|
|
|
63
66
|
content = formatted_content
|
|
64
67
|
else:
|
|
65
68
|
content = "[Empty content]"
|
|
66
|
-
|
|
69
|
+
|
|
67
70
|
# Ensure role is valid for Gradio Chatbot
|
|
68
71
|
if role not in ["user", "assistant"]:
|
|
69
72
|
role = "assistant" if role == "system" else "user"
|
|
70
|
-
|
|
73
|
+
|
|
71
74
|
# Invert roles for better display in human UI context
|
|
72
75
|
# (what the AI says becomes "user", what human should respond becomes "assistant")
|
|
73
76
|
if role == "user":
|
|
74
77
|
role = "assistant"
|
|
75
78
|
else:
|
|
76
79
|
role = "user"
|
|
77
|
-
|
|
80
|
+
|
|
78
81
|
# Add the main message if it has content
|
|
79
82
|
if content and str(content).strip():
|
|
80
83
|
formatted.append({"role": role, "content": content})
|
|
81
|
-
|
|
84
|
+
|
|
82
85
|
# Handle tool calls - create separate messages for each tool call
|
|
83
86
|
if tool_calls:
|
|
84
87
|
for tool_call in tool_calls:
|
|
85
88
|
function_name = tool_call.get("function", {}).get("name", "unknown")
|
|
86
89
|
arguments_str = tool_call.get("function", {}).get("arguments", "{}")
|
|
87
|
-
|
|
90
|
+
|
|
88
91
|
try:
|
|
89
92
|
# Parse arguments to format them nicely
|
|
90
93
|
arguments = json.loads(arguments_str)
|
|
@@ -92,18 +95,20 @@ class HumanCompletionUI:
|
|
|
92
95
|
except json.JSONDecodeError:
|
|
93
96
|
# If parsing fails, use the raw string
|
|
94
97
|
formatted_args = arguments_str
|
|
95
|
-
|
|
98
|
+
|
|
96
99
|
# Create a formatted message for the tool call
|
|
97
100
|
tool_call_content = f"```json\n{formatted_args}\n```"
|
|
98
|
-
|
|
99
|
-
formatted.append(
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
101
|
+
|
|
102
|
+
formatted.append(
|
|
103
|
+
{
|
|
104
|
+
"role": role,
|
|
105
|
+
"content": tool_call_content,
|
|
106
|
+
"metadata": {"title": f"🛠️ Used {function_name}"},
|
|
107
|
+
}
|
|
108
|
+
)
|
|
109
|
+
|
|
105
110
|
return formatted
|
|
106
|
-
|
|
111
|
+
|
|
107
112
|
def get_pending_calls(self) -> List[Dict[str, Any]]:
|
|
108
113
|
"""Get pending calls from the server."""
|
|
109
114
|
try:
|
|
@@ -113,38 +118,39 @@ class HumanCompletionUI:
|
|
|
113
118
|
except Exception as e:
|
|
114
119
|
print(f"Error fetching pending calls: {e}")
|
|
115
120
|
return []
|
|
116
|
-
|
|
121
|
+
|
|
117
122
|
def complete_call_with_response(self, call_id: str, response: str) -> bool:
|
|
118
123
|
"""Complete a call with a text response."""
|
|
119
124
|
try:
|
|
120
125
|
response_data = {"response": response}
|
|
121
126
|
response_obj = requests.post(
|
|
122
|
-
f"{self.server_url}/complete/{call_id}",
|
|
123
|
-
json=response_data,
|
|
124
|
-
timeout=10
|
|
127
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
125
128
|
)
|
|
126
129
|
response_obj.raise_for_status()
|
|
127
130
|
return True
|
|
128
131
|
except requests.RequestException as e:
|
|
129
132
|
print(f"Error completing call: {e}")
|
|
130
133
|
return False
|
|
131
|
-
|
|
134
|
+
|
|
132
135
|
def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
|
|
133
136
|
"""Complete a call with tool calls."""
|
|
134
137
|
try:
|
|
135
138
|
response_data = {"tool_calls": tool_calls}
|
|
136
139
|
response_obj = requests.post(
|
|
137
|
-
f"{self.server_url}/complete/{call_id}",
|
|
138
|
-
json=response_data,
|
|
139
|
-
timeout=10
|
|
140
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
140
141
|
)
|
|
141
142
|
response_obj.raise_for_status()
|
|
142
143
|
return True
|
|
143
144
|
except requests.RequestException as e:
|
|
144
145
|
print(f"Error completing call: {e}")
|
|
145
146
|
return False
|
|
146
|
-
|
|
147
|
-
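def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool: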
|
|
147
|
+
|
|
148
|
+
def complete_call(
|
|
149
|
+
self,
|
|
150
|
+
call_id: str,
|
|
151
|
+
response: Optional[str] = None,
|
|
152
|
+
tool_calls: Optional[List[Dict[str, Any]]] = None,
|
|
153
|
+
) -> bool:
|
|
148
154
|
"""Complete a call with either a response or tool calls."""
|
|
149
155
|
try:
|
|
150
156
|
response_data = {}
|
|
@@ -152,25 +158,23 @@ class HumanCompletionUI:
|
|
|
152
158
|
response_data["response"] = response
|
|
153
159
|
if tool_calls:
|
|
154
160
|
response_data["tool_calls"] = tool_calls
|
|
155
|
-
|
|
161
|
+
|
|
156
162
|
response_obj = requests.post(
|
|
157
|
-
f"{self.server_url}/complete/{call_id}",
|
|
158
|
-
json=response_data,
|
|
159
|
-
timeout=10
|
|
163
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
160
164
|
)
|
|
161
165
|
response_obj.raise_for_status()
|
|
162
166
|
return True
|
|
163
167
|
except requests.RequestException as e:
|
|
164
168
|
print(f"Error completing call: {e}")
|
|
165
169
|
return False
|
|
166
|
-
|
|
170
|
+
|
|
167
171
|
def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
|
|
168
172
|
"""Extract the last image from the messages for display above conversation."""
|
|
169
173
|
last_image = None
|
|
170
|
-
|
|
174
|
+
|
|
171
175
|
for msg in reversed(messages): # Start from the last message
|
|
172
176
|
content = msg.get("content", "")
|
|
173
|
-
|
|
177
|
+
|
|
174
178
|
if isinstance(content, list):
|
|
175
179
|
for item in reversed(content): # Get the last image in the message
|
|
176
180
|
if item.get("type") == "image_url":
|
|
@@ -189,13 +193,13 @@ class HumanCompletionUI:
|
|
|
189
193
|
else:
|
|
190
194
|
# For URL images, return the URL
|
|
191
195
|
return image_url
|
|
192
|
-
|
|
196
|
+
|
|
193
197
|
return last_image
|
|
194
|
-
|
|
198
|
+
|
|
195
199
|
def refresh_pending_calls(self):
|
|
196
200
|
"""Refresh the list of pending calls."""
|
|
197
201
|
pending_calls = self.get_pending_calls()
|
|
198
|
-
|
|
202
|
+
|
|
199
203
|
if not pending_calls:
|
|
200
204
|
return (
|
|
201
205
|
gr.update(choices=["latest"], value="latest"), # dropdown
|
|
@@ -205,27 +209,27 @@ class HumanCompletionUI:
|
|
|
205
209
|
gr.update(visible=False), # click_actions_group hidden
|
|
206
210
|
gr.update(visible=False), # actions_group hidden
|
|
207
211
|
)
|
|
208
|
-
|
|
212
|
+
|
|
209
213
|
# Sort pending calls by created_at to get oldest first
|
|
210
214
|
sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
|
|
211
|
-
|
|
215
|
+
|
|
212
216
|
# Create choices for dropdown
|
|
213
217
|
choices = [("latest", "latest")] # Add "latest" option first
|
|
214
|
-
|
|
218
|
+
|
|
215
219
|
for call in sorted_calls:
|
|
216
220
|
call_id = call["id"]
|
|
217
221
|
model = call.get("model", "unknown")
|
|
218
222
|
created_at = call.get("created_at", "")
|
|
219
223
|
# Format timestamp
|
|
220
224
|
try:
|
|
221
|
-
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
|
|
225
|
+
dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
|
|
222
226
|
time_str = dt.strftime("%H:%M:%S")
|
|
223
227
|
except:
|
|
224
228
|
time_str = created_at
|
|
225
|
-
|
|
229
|
+
|
|
226
230
|
choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
|
|
227
231
|
choices.append((choice_label, call_id))
|
|
228
|
-
|
|
232
|
+
|
|
229
233
|
# Default to "latest" which shows the oldest pending conversation
|
|
230
234
|
selected_call_id = "latest"
|
|
231
235
|
if selected_call_id == "latest" and sorted_calls:
|
|
@@ -239,7 +243,7 @@ class HumanCompletionUI:
|
|
|
239
243
|
conversation = []
|
|
240
244
|
self.current_call_id = None
|
|
241
245
|
self.last_image = None
|
|
242
|
-
|
|
246
|
+
|
|
243
247
|
return (
|
|
244
248
|
gr.update(choices=choices, value="latest"),
|
|
245
249
|
gr.update(value=self.last_image),
|
|
@@ -248,7 +252,7 @@ class HumanCompletionUI:
|
|
|
248
252
|
gr.update(visible=True), # click_actions_group visible when there is a call
|
|
249
253
|
gr.update(visible=True), # actions_group visible when there is a call
|
|
250
254
|
)
|
|
251
|
-
|
|
255
|
+
|
|
252
256
|
def on_call_selected(self, selected_choice):
|
|
253
257
|
"""Handle when a call is selected from the dropdown."""
|
|
254
258
|
if not selected_choice:
|
|
@@ -259,7 +263,7 @@ class HumanCompletionUI:
|
|
|
259
263
|
gr.update(visible=False), # click_actions_group hidden
|
|
260
264
|
gr.update(visible=False), # actions_group hidden
|
|
261
265
|
)
|
|
262
|
-
|
|
266
|
+
|
|
263
267
|
pending_calls = self.get_pending_calls()
|
|
264
268
|
if not pending_calls:
|
|
265
269
|
return (
|
|
@@ -269,7 +273,7 @@ class HumanCompletionUI:
|
|
|
269
273
|
gr.update(visible=False), # click_actions_group hidden
|
|
270
274
|
gr.update(visible=False), # actions_group hidden
|
|
271
275
|
)
|
|
272
|
-
|
|
276
|
+
|
|
273
277
|
# Handle "latest" option
|
|
274
278
|
if selected_choice == "latest":
|
|
275
279
|
# Sort calls by created_at to get oldest first
|
|
@@ -284,17 +288,17 @@ class HumanCompletionUI:
|
|
|
284
288
|
if call_id_short in selected_choice:
|
|
285
289
|
call_id = call["id"]
|
|
286
290
|
break
|
|
287
|
-
|
|
291
|
+
|
|
288
292
|
if not call_id:
|
|
289
293
|
return (
|
|
290
294
|
gr.update(value=None), # no image
|
|
291
295
|
gr.update(value=[]), # empty chatbot
|
|
292
|
-
gr.update(interactive=False)
|
|
296
|
+
gr.update(interactive=False),
|
|
293
297
|
)
|
|
294
|
-
|
|
298
|
+
|
|
295
299
|
# Find the selected call
|
|
296
300
|
selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
|
|
297
|
-
|
|
301
|
+
|
|
298
302
|
if not selected_call:
|
|
299
303
|
return (
|
|
300
304
|
gr.update(value=None), # no image
|
|
@@ -303,12 +307,12 @@ class HumanCompletionUI:
|
|
|
303
307
|
gr.update(visible=False), # click_actions_group hidden
|
|
304
308
|
gr.update(visible=False), # actions_group hidden
|
|
305
309
|
)
|
|
306
|
-
|
|
310
|
+
|
|
307
311
|
conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
|
|
308
312
|
self.current_call_id = call_id
|
|
309
313
|
# Get the last image from messages
|
|
310
314
|
self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
|
|
311
|
-
|
|
315
|
+
|
|
312
316
|
return (
|
|
313
317
|
gr.update(value=self.last_image),
|
|
314
318
|
gr.update(value=conversation),
|
|
@@ -316,110 +320,111 @@ class HumanCompletionUI:
|
|
|
316
320
|
gr.update(visible=True), # click_actions_group visible
|
|
317
321
|
gr.update(visible=True), # actions_group visible
|
|
318
322
|
)
|
|
319
|
-
|
|
323
|
+
|
|
320
324
|
def submit_response(self, response_text: str):
|
|
321
325
|
"""Submit a text response to the current call."""
|
|
322
326
|
if not self.current_call_id:
|
|
323
327
|
return (
|
|
324
328
|
gr.update(value=response_text), # keep response text
|
|
325
|
-
gr.update(value="❌ No call selected") # status
|
|
329
|
+
gr.update(value="❌ No call selected"), # status
|
|
326
330
|
)
|
|
327
|
-
|
|
331
|
+
|
|
328
332
|
if not response_text.strip():
|
|
329
333
|
return (
|
|
330
334
|
gr.update(value=response_text), # keep response text
|
|
331
|
-
gr.update(value="❌ Response cannot be empty") # status
|
|
335
|
+
gr.update(value="❌ Response cannot be empty"), # status
|
|
332
336
|
)
|
|
333
|
-
|
|
337
|
+
|
|
334
338
|
success = self.complete_call_with_response(self.current_call_id, response_text)
|
|
335
|
-
|
|
339
|
+
|
|
336
340
|
if success:
|
|
337
341
|
status_msg = "✅ Response submitted successfully!"
|
|
338
342
|
return (
|
|
339
343
|
gr.update(value=""), # clear response text
|
|
340
|
-
gr.update(value=status_msg) # status
|
|
344
|
+
gr.update(value=status_msg), # status
|
|
341
345
|
)
|
|
342
346
|
else:
|
|
343
347
|
return (
|
|
344
348
|
gr.update(value=response_text), # keep response text
|
|
345
|
-
gr.update(value="❌ Failed to submit response") # status
|
|
349
|
+
gr.update(value="❌ Failed to submit response"), # status
|
|
346
350
|
)
|
|
347
|
-
|
|
351
|
+
|
|
348
352
|
def submit_action(self, action_type: str, **kwargs) -> str:
|
|
349
353
|
"""Submit a computer action as a tool call."""
|
|
350
354
|
if not self.current_call_id:
|
|
351
355
|
return "❌ No call selected"
|
|
352
|
-
|
|
356
|
+
|
|
353
357
|
import uuid
|
|
354
|
-
|
|
358
|
+
|
|
355
359
|
# Create tool call structure
|
|
356
360
|
action_data = {"type": action_type, **kwargs}
|
|
357
361
|
tool_call = {
|
|
358
362
|
"id": f"call_{uuid.uuid4().hex[:24]}",
|
|
359
363
|
"type": "function",
|
|
360
|
-
"function": {
|
|
361
|
-
"name": "computer",
|
|
362
|
-
"arguments": json.dumps(action_data)
|
|
363
|
-
}
|
|
364
|
+
"function": {"name": "computer", "arguments": json.dumps(action_data)},
|
|
364
365
|
}
|
|
365
|
-
|
|
366
|
+
|
|
366
367
|
success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
|
|
367
|
-
|
|
368
|
+
|
|
368
369
|
if success:
|
|
369
370
|
return f"✅ {action_type.capitalize()} action submitted as tool call"
|
|
370
371
|
else:
|
|
371
372
|
return f"❌ Failed to submit {action_type} action"
|
|
372
|
-
|
|
373
|
-
def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
|
|
373
|
+
|
|
374
|
+
def submit_click_action(
|
|
375
|
+
self, x: int, y: int, action_type: str = "click", button: str = "left"
|
|
376
|
+
) -> str:
|
|
374
377
|
"""Submit a coordinate-based action."""
|
|
375
378
|
if action_type == "click":
|
|
376
379
|
return self.submit_action(action_type, x=x, y=y, button=button)
|
|
377
380
|
else:
|
|
378
381
|
return self.submit_action(action_type, x=x, y=y)
|
|
379
|
-
|
|
382
|
+
|
|
380
383
|
def submit_type_action(self, text: str) -> str:
|
|
381
384
|
"""Submit a type action."""
|
|
382
385
|
return self.submit_action("type", text=text)
|
|
383
|
-
|
|
386
|
+
|
|
384
387
|
def submit_hotkey_action(self, keys: str) -> str:
|
|
385
388
|
"""Submit a hotkey action."""
|
|
386
389
|
return self.submit_action("keypress", keys=keys)
|
|
387
|
-
|
|
390
|
+
|
|
388
391
|
def submit_wait_action(self) -> str:
|
|
389
392
|
"""Submit a wait action with no kwargs."""
|
|
390
393
|
return self.submit_action("wait")
|
|
391
|
-
|
|
392
|
-
def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
|
|
394
|
+
|
|
395
|
+
def submit_description_click(
|
|
396
|
+
self, description: str, action_type: str = "click", button: str = "left"
|
|
397
|
+
) -> str:
|
|
393
398
|
"""Submit a description-based action."""
|
|
394
399
|
if action_type == "click":
|
|
395
400
|
return self.submit_action(action_type, element_description=description, button=button)
|
|
396
401
|
else:
|
|
397
402
|
return self.submit_action(action_type, element_description=description)
|
|
398
|
-
|
|
403
|
+
|
|
399
404
|
def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
|
|
400
405
|
"""Wait for pending calls to appear or until max_seconds elapsed.
|
|
401
|
-
|
|
406
|
+
|
|
402
407
|
This method loops and checks for pending calls at regular intervals,
|
|
403
408
|
returning as soon as a pending call is found or the maximum wait time is reached.
|
|
404
|
-
|
|
409
|
+
|
|
405
410
|
Args:
|
|
406
411
|
max_seconds: Maximum number of seconds to wait
|
|
407
412
|
check_interval: How often to check for pending calls (in seconds)
|
|
408
413
|
"""
|
|
409
414
|
import time
|
|
410
|
-
|
|
415
|
+
|
|
411
416
|
start_time = time.time()
|
|
412
|
-
|
|
417
|
+
|
|
413
418
|
while time.time() - start_time < max_seconds:
|
|
414
419
|
# Check if there are any pending calls
|
|
415
420
|
pending_calls = self.get_pending_calls()
|
|
416
421
|
if pending_calls:
|
|
417
422
|
# Found pending calls, return immediately
|
|
418
423
|
return self.refresh_pending_calls()
|
|
419
|
-
|
|
424
|
+
|
|
420
425
|
# Wait before checking again
|
|
421
426
|
time.sleep(check_interval)
|
|
422
|
-
|
|
427
|
+
|
|
423
428
|
# Max wait time reached, return current state
|
|
424
429
|
return self.refresh_pending_calls()
|
|
425
430
|
|
|
@@ -427,79 +432,73 @@ class HumanCompletionUI:
|
|
|
427
432
|
def create_ui():
|
|
428
433
|
"""Create the Gradio interface."""
|
|
429
434
|
ui_handler = HumanCompletionUI()
|
|
430
|
-
|
|
435
|
+
|
|
431
436
|
with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
|
|
432
437
|
gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
|
|
433
438
|
gr.Markdown("Review AI conversation requests and provide human responses.")
|
|
434
|
-
|
|
439
|
+
|
|
435
440
|
with gr.Row():
|
|
436
441
|
with gr.Column(scale=2):
|
|
437
442
|
with gr.Group():
|
|
438
443
|
screenshot_image = gr.Image(
|
|
439
|
-
label="Interactive Screenshot",
|
|
440
|
-
interactive=False,
|
|
441
|
-
height=600
|
|
444
|
+
label="Interactive Screenshot", interactive=False, height=600
|
|
442
445
|
)
|
|
443
|
-
|
|
446
|
+
|
|
444
447
|
# Action type selection for image clicks (wrapped for visibility control)
|
|
445
448
|
with gr.Group(visible=False) as click_actions_group:
|
|
446
449
|
with gr.Row():
|
|
447
450
|
action_type_radio = gr.Dropdown(
|
|
448
451
|
label="Interactive Action",
|
|
449
|
-
choices=[
|
|
452
|
+
choices=[
|
|
453
|
+
"click",
|
|
454
|
+
"double_click",
|
|
455
|
+
"move",
|
|
456
|
+
"left_mouse_up",
|
|
457
|
+
"left_mouse_down",
|
|
458
|
+
"scroll",
|
|
459
|
+
],
|
|
450
460
|
value="click",
|
|
451
|
-
scale=2
|
|
461
|
+
scale=2,
|
|
452
462
|
)
|
|
453
463
|
action_button_radio = gr.Dropdown(
|
|
454
464
|
label="Button",
|
|
455
465
|
choices=["left", "right", "wheel", "back", "forward"],
|
|
456
466
|
value="left",
|
|
457
467
|
visible=True,
|
|
458
|
-
scale=1
|
|
468
|
+
scale=1,
|
|
459
469
|
)
|
|
460
470
|
scroll_x_input = gr.Number(
|
|
461
|
-
label="scroll_x",
|
|
462
|
-
value=0,
|
|
463
|
-
visible=False,
|
|
464
|
-
scale=1
|
|
471
|
+
label="scroll_x", value=0, visible=False, scale=1
|
|
465
472
|
)
|
|
466
473
|
scroll_y_input = gr.Number(
|
|
467
|
-
label="scroll_y",
|
|
468
|
-
value=-120,
|
|
469
|
-
visible=False,
|
|
470
|
-
scale=1
|
|
474
|
+
label="scroll_y", value=-120, visible=False, scale=1
|
|
471
475
|
)
|
|
472
|
-
|
|
476
|
+
|
|
473
477
|
conversation_chatbot = gr.Chatbot(
|
|
474
|
-
label="Conversation",
|
|
475
|
-
type="messages",
|
|
476
|
-
height=500,
|
|
477
|
-
show_copy_button=True
|
|
478
|
+
label="Conversation", type="messages", height=500, show_copy_button=True
|
|
478
479
|
)
|
|
479
|
-
|
|
480
|
+
|
|
480
481
|
with gr.Column(scale=1):
|
|
481
482
|
with gr.Group():
|
|
482
483
|
call_dropdown = gr.Dropdown(
|
|
483
484
|
label="Select a pending conversation request",
|
|
484
485
|
choices=["latest"],
|
|
485
486
|
interactive=True,
|
|
486
|
-
value="latest"
|
|
487
|
+
value="latest",
|
|
487
488
|
)
|
|
488
489
|
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
|
|
489
490
|
status_display = gr.Textbox(
|
|
490
|
-
label="Status",
|
|
491
|
-
interactive=False,
|
|
492
|
-
value="Ready to receive requests..."
|
|
491
|
+
label="Status", interactive=False, value="Ready to receive requests..."
|
|
493
492
|
)
|
|
494
493
|
|
|
495
494
|
with gr.Group():
|
|
496
495
|
response_text = gr.Textbox(
|
|
497
|
-
label="Message",
|
|
498
|
-
lines=3,
|
|
499
|
-
placeholder="Enter your message here..."
|
|
496
|
+
label="Message", lines=3, placeholder="Enter your message here..."
|
|
500
497
|
)
|
|
501
|
-
submit_btn = gr.Button("📤 Submit Message", variant="primary", interactive=False)
|
|
502
|
-
|
|
498
|
+
submit_btn = gr.Button(
|
|
499
|
+
"📤 Submit Message", variant="primary", interactive=False
|
|
500
|
+
)
|
|
501
|
+
|
|
503
502
|
# Action Accordions (wrapped for visibility control)
|
|
504
503
|
with gr.Group(visible=False) as actions_group:
|
|
505
504
|
with gr.Tabs():
|
|
@@ -507,58 +506,73 @@ def create_ui():
|
|
|
507
506
|
with gr.Group():
|
|
508
507
|
description_text = gr.Textbox(
|
|
509
508
|
label="Element Description",
|
|
510
|
-
placeholder="e.g., 'Privacy and security option in left sidebar'"
|
|
509
|
+
placeholder="e.g., 'Privacy and security option in left sidebar'",
|
|
511
510
|
)
|
|
512
511
|
with gr.Row():
|
|
513
512
|
description_action_type = gr.Dropdown(
|
|
514
513
|
label="Action",
|
|
515
|
-
choices=[
|
|
516
|
-
|
|
514
|
+
choices=[
|
|
515
|
+
"click",
|
|
516
|
+
"double_click",
|
|
517
|
+
"move",
|
|
518
|
+
"left_mouse_up",
|
|
519
|
+
"left_mouse_down",
|
|
520
|
+
],
|
|
521
|
+
value="click",
|
|
517
522
|
)
|
|
518
523
|
description_button = gr.Dropdown(
|
|
519
524
|
label="Button",
|
|
520
525
|
choices=["left", "right", "wheel", "back", "forward"],
|
|
521
|
-
value="left"
|
|
526
|
+
value="left",
|
|
522
527
|
)
|
|
523
528
|
description_submit_btn = gr.Button("Submit Click Action")
|
|
524
|
-
|
|
529
|
+
|
|
525
530
|
with gr.Tab("📝 Type Action"):
|
|
526
531
|
with gr.Group():
|
|
527
532
|
type_text = gr.Textbox(
|
|
528
|
-
label="Text to Type",
|
|
529
|
-
placeholder="Enter text to type..."
|
|
533
|
+
label="Text to Type", placeholder="Enter text to type..."
|
|
530
534
|
)
|
|
531
535
|
type_submit_btn = gr.Button("Submit Type")
|
|
532
|
-
|
|
536
|
+
|
|
533
537
|
with gr.Tab("⌨️ Keypress Action"):
|
|
534
538
|
with gr.Group():
|
|
535
539
|
keypress_text = gr.Textbox(
|
|
536
|
-
label="Keys",
|
|
537
|
-
placeholder="e.g., ctrl+c, alt+tab"
|
|
540
|
+
label="Keys", placeholder="e.g., ctrl+c, alt+tab"
|
|
538
541
|
)
|
|
539
542
|
keypress_submit_btn = gr.Button("Submit Keypress")
|
|
540
|
-
|
|
543
|
+
|
|
541
544
|
with gr.Tab("🧰 Misc Actions"):
|
|
542
545
|
with gr.Group():
|
|
543
546
|
misc_action_dropdown = gr.Dropdown(
|
|
544
|
-
label="Action",
|
|
545
|
-
choices=["wait"],
|
|
546
|
-
value="wait"
|
|
547
|
+
label="Action", choices=["wait"], value="wait"
|
|
547
548
|
)
|
|
548
549
|
misc_submit_btn = gr.Button("Submit Action")
|
|
549
|
-
|
|
550
|
+
|
|
550
551
|
# Event handlers
|
|
551
552
|
refresh_btn.click(
|
|
552
553
|
fn=ui_handler.refresh_pending_calls,
|
|
553
|
-
outputs=[
|
|
554
|
+
outputs=[
|
|
555
|
+
call_dropdown,
|
|
556
|
+
screenshot_image,
|
|
557
|
+
conversation_chatbot,
|
|
558
|
+
submit_btn,
|
|
559
|
+
click_actions_group,
|
|
560
|
+
actions_group,
|
|
561
|
+
],
|
|
554
562
|
)
|
|
555
|
-
|
|
563
|
+
|
|
556
564
|
call_dropdown.change(
|
|
557
565
|
fn=ui_handler.on_call_selected,
|
|
558
566
|
inputs=[call_dropdown],
|
|
559
|
-
outputs=[
|
|
567
|
+
outputs=[
|
|
568
|
+
screenshot_image,
|
|
569
|
+
conversation_chatbot,
|
|
570
|
+
submit_btn,
|
|
571
|
+
click_actions_group,
|
|
572
|
+
actions_group,
|
|
573
|
+
],
|
|
560
574
|
)
|
|
561
|
-
|
|
575
|
+
|
|
562
576
|
def handle_image_click(evt: gr.SelectData):
|
|
563
577
|
if evt.index is not None:
|
|
564
578
|
x, y = evt.index
|
|
@@ -568,31 +582,44 @@ def create_ui():
|
|
|
568
582
|
sx_i = int(ui_handler.current_scroll_x or 0)
|
|
569
583
|
sy_i = int(ui_handler.current_scroll_y or 0)
|
|
570
584
|
# Submit a scroll action with x,y position and scroll deltas
|
|
571
|
-
result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
|
|
585
|
+
result = ui_handler.submit_action(
|
|
586
|
+
"scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
|
|
587
|
+
)
|
|
572
588
|
else:
|
|
573
589
|
result = ui_handler.submit_click_action(x, y, action_type, button)
|
|
574
590
|
ui_handler.wait_for_pending_calls()
|
|
575
591
|
return result
|
|
576
592
|
return "No coordinates selected"
|
|
577
593
|
|
|
578
|
-
screenshot_image.select(
|
|
579
|
-
fn=handle_image_click,
|
|
580
|
-
outputs=[status_display]
|
|
581
|
-
).then(
|
|
594
|
+
screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
|
|
582
595
|
fn=ui_handler.wait_for_pending_calls,
|
|
583
|
-
outputs=[
|
|
596
|
+
outputs=[
|
|
597
|
+
call_dropdown,
|
|
598
|
+
screenshot_image,
|
|
599
|
+
conversation_chatbot,
|
|
600
|
+
submit_btn,
|
|
601
|
+
click_actions_group,
|
|
602
|
+
actions_group,
|
|
603
|
+
],
|
|
584
604
|
)
|
|
585
605
|
|
|
586
606
|
# Response submission
|
|
587
607
|
submit_btn.click(
|
|
588
608
|
fn=ui_handler.submit_response,
|
|
589
609
|
inputs=[response_text],
|
|
590
|
-
outputs=[response_text, status_display]
|
|
610
|
+
outputs=[response_text, status_display],
|
|
591
611
|
).then(
|
|
592
612
|
fn=ui_handler.refresh_pending_calls,
|
|
593
|
-
outputs=[
|
|
613
|
+
outputs=[
|
|
614
|
+
call_dropdown,
|
|
615
|
+
screenshot_image,
|
|
616
|
+
conversation_chatbot,
|
|
617
|
+
submit_btn,
|
|
618
|
+
click_actions_group,
|
|
619
|
+
actions_group,
|
|
620
|
+
],
|
|
594
621
|
)
|
|
595
|
-
|
|
622
|
+
|
|
596
623
|
# Toggle visibility of controls based on action type
|
|
597
624
|
def toggle_action_controls(action_type):
|
|
598
625
|
# Button visible only for click
|
|
@@ -603,59 +630,63 @@ def create_ui():
|
|
|
603
630
|
# Update state
|
|
604
631
|
ui_handler.current_action_type = action_type or "click"
|
|
605
632
|
return button_vis, scroll_x_vis, scroll_y_vis
|
|
606
|
-
|
|
633
|
+
|
|
607
634
|
action_type_radio.change(
|
|
608
635
|
fn=toggle_action_controls,
|
|
609
636
|
inputs=[action_type_radio],
|
|
610
|
-
outputs=[action_button_radio, scroll_x_input, scroll_y_input]
|
|
637
|
+
outputs=[action_button_radio, scroll_x_input, scroll_y_input],
|
|
611
638
|
)
|
|
612
639
|
|
|
613
640
|
# Keep other control values in ui_handler state
|
|
614
641
|
def on_button_change(val):
|
|
615
|
-
ui_handler.current_button =
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
inputs=[action_button_radio]
|
|
619
|
-
)
|
|
642
|
+
ui_handler.current_button = val or "left"
|
|
643
|
+
|
|
644
|
+
action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
|
|
620
645
|
|
|
621
646
|
def on_scroll_x_change(val):
|
|
622
647
|
try:
|
|
623
648
|
ui_handler.current_scroll_x = int(val) if val is not None else 0
|
|
624
649
|
except Exception:
|
|
625
650
|
ui_handler.current_scroll_x = 0
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
inputs=[scroll_x_input]
|
|
629
|
-
)
|
|
651
|
+
|
|
652
|
+
scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
|
|
630
653
|
|
|
631
654
|
def on_scroll_y_change(val):
|
|
632
655
|
try:
|
|
633
656
|
ui_handler.current_scroll_y = int(val) if val is not None else 0
|
|
634
657
|
except Exception:
|
|
635
658
|
ui_handler.current_scroll_y = 0
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
)
|
|
640
|
-
|
|
659
|
+
|
|
660
|
+
scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
|
|
661
|
+
|
|
641
662
|
type_submit_btn.click(
|
|
642
|
-
fn=ui_handler.submit_type_action,
|
|
643
|
-
inputs=[type_text],
|
|
644
|
-
outputs=[status_display]
|
|
663
|
+
fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
|
|
645
664
|
).then(
|
|
646
665
|
fn=ui_handler.wait_for_pending_calls,
|
|
647
|
-
outputs=[
|
|
666
|
+
outputs=[
|
|
667
|
+
call_dropdown,
|
|
668
|
+
screenshot_image,
|
|
669
|
+
conversation_chatbot,
|
|
670
|
+
submit_btn,
|
|
671
|
+
click_actions_group,
|
|
672
|
+
actions_group,
|
|
673
|
+
],
|
|
648
674
|
)
|
|
649
|
-
|
|
675
|
+
|
|
650
676
|
keypress_submit_btn.click(
|
|
651
|
-
fn=ui_handler.submit_hotkey_action,
|
|
652
|
-
inputs=[keypress_text],
|
|
653
|
-
outputs=[status_display]
|
|
677
|
+
fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
|
|
654
678
|
).then(
|
|
655
679
|
fn=ui_handler.wait_for_pending_calls,
|
|
656
|
-
outputs=[
|
|
680
|
+
outputs=[
|
|
681
|
+
call_dropdown,
|
|
682
|
+
screenshot_image,
|
|
683
|
+
conversation_chatbot,
|
|
684
|
+
submit_btn,
|
|
685
|
+
click_actions_group,
|
|
686
|
+
actions_group,
|
|
687
|
+
],
|
|
657
688
|
)
|
|
658
|
-
|
|
689
|
+
|
|
659
690
|
def handle_description_submit(description, action_type, button):
|
|
660
691
|
if description:
|
|
661
692
|
result = ui_handler.submit_description_click(description, action_type, button)
|
|
@@ -666,12 +697,19 @@ def create_ui():
|
|
|
666
697
|
description_submit_btn.click(
|
|
667
698
|
fn=handle_description_submit,
|
|
668
699
|
inputs=[description_text, description_action_type, description_button],
|
|
669
|
-
outputs=[status_display]
|
|
700
|
+
outputs=[status_display],
|
|
670
701
|
).then(
|
|
671
702
|
fn=ui_handler.wait_for_pending_calls,
|
|
672
|
-
outputs=[
|
|
703
|
+
outputs=[
|
|
704
|
+
call_dropdown,
|
|
705
|
+
screenshot_image,
|
|
706
|
+
conversation_chatbot,
|
|
707
|
+
submit_btn,
|
|
708
|
+
click_actions_group,
|
|
709
|
+
actions_group,
|
|
710
|
+
],
|
|
673
711
|
)
|
|
674
|
-
|
|
712
|
+
|
|
675
713
|
# Misc action handler
|
|
676
714
|
def handle_misc_submit(selected_action):
|
|
677
715
|
if selected_action == "wait":
|
|
@@ -681,20 +719,32 @@ def create_ui():
|
|
|
681
719
|
return f"Unsupported misc action: {selected_action}"
|
|
682
720
|
|
|
683
721
|
misc_submit_btn.click(
|
|
684
|
-
fn=handle_misc_submit,
|
|
685
|
-
inputs=[misc_action_dropdown],
|
|
686
|
-
outputs=[status_display]
|
|
722
|
+
fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
|
|
687
723
|
).then(
|
|
688
724
|
fn=ui_handler.wait_for_pending_calls,
|
|
689
|
-
outputs=[
|
|
725
|
+
outputs=[
|
|
726
|
+
call_dropdown,
|
|
727
|
+
screenshot_image,
|
|
728
|
+
conversation_chatbot,
|
|
729
|
+
submit_btn,
|
|
730
|
+
click_actions_group,
|
|
731
|
+
actions_group,
|
|
732
|
+
],
|
|
690
733
|
)
|
|
691
|
-
|
|
734
|
+
|
|
692
735
|
# Load initial data
|
|
693
736
|
demo.load(
|
|
694
737
|
fn=ui_handler.refresh_pending_calls,
|
|
695
|
-
outputs=[
|
|
738
|
+
outputs=[
|
|
739
|
+
call_dropdown,
|
|
740
|
+
screenshot_image,
|
|
741
|
+
conversation_chatbot,
|
|
742
|
+
submit_btn,
|
|
743
|
+
click_actions_group,
|
|
744
|
+
actions_group,
|
|
745
|
+
],
|
|
696
746
|
)
|
|
697
|
-
|
|
747
|
+
|
|
698
748
|
return demo
|
|
699
749
|
|
|
700
750
|
|