cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/human_tool/ui.py
CHANGED
|
@@ -1,21 +1,29 @@
|
|
|
1
|
-
import gradio as gr
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
2
3
|
import json
|
|
3
4
|
import time
|
|
4
|
-
from typing import List, Dict, Any, Optional
|
|
5
5
|
from datetime import datetime
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
import gradio as gr
|
|
6
9
|
import requests
|
|
7
|
-
from .server import completion_queue
|
|
8
|
-
import base64
|
|
9
|
-
import io
|
|
10
10
|
from PIL import Image
|
|
11
11
|
|
|
12
|
+
from .server import completion_queue
|
|
13
|
+
|
|
14
|
+
|
|
12
15
|
class HumanCompletionUI:
|
|
13
16
|
def __init__(self, server_url: str = "http://localhost:8002"):
|
|
14
17
|
self.server_url = server_url
|
|
15
18
|
self.current_call_id: Optional[str] = None
|
|
16
19
|
self.refresh_interval = 2.0 # seconds
|
|
17
20
|
self.last_image = None # Store the last image for display
|
|
18
|
-
|
|
21
|
+
# Track current interactive action controls
|
|
22
|
+
self.current_action_type: str = "click"
|
|
23
|
+
self.current_button: str = "left"
|
|
24
|
+
self.current_scroll_x: int = 0
|
|
25
|
+
self.current_scroll_y: int = -120
|
|
26
|
+
|
|
19
27
|
def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
20
28
|
"""Format messages for display in gr.Chatbot with type='messages'."""
|
|
21
29
|
formatted = []
|
|
@@ -23,7 +31,7 @@ class HumanCompletionUI:
|
|
|
23
31
|
role = msg.get("role", "user")
|
|
24
32
|
content = msg.get("content", "")
|
|
25
33
|
tool_calls = msg.get("tool_calls", [])
|
|
26
|
-
|
|
34
|
+
|
|
27
35
|
# Handle different content formats
|
|
28
36
|
if isinstance(content, list):
|
|
29
37
|
# Multi-modal content - can include text and images
|
|
@@ -50,7 +58,7 @@ class HumanCompletionUI:
|
|
|
50
58
|
else:
|
|
51
59
|
# For URL images, create gr.Image with URL
|
|
52
60
|
formatted_content.append(gr.Image(value=image_url))
|
|
53
|
-
|
|
61
|
+
|
|
54
62
|
# Determine final content format
|
|
55
63
|
if len(formatted_content) == 1:
|
|
56
64
|
content = formatted_content[0]
|
|
@@ -58,28 +66,28 @@ class HumanCompletionUI:
|
|
|
58
66
|
content = formatted_content
|
|
59
67
|
else:
|
|
60
68
|
content = "[Empty content]"
|
|
61
|
-
|
|
69
|
+
|
|
62
70
|
# Ensure role is valid for Gradio Chatbot
|
|
63
71
|
if role not in ["user", "assistant"]:
|
|
64
72
|
role = "assistant" if role == "system" else "user"
|
|
65
|
-
|
|
73
|
+
|
|
66
74
|
# Invert roles for better display in human UI context
|
|
67
75
|
# (what the AI says becomes "user", what human should respond becomes "assistant")
|
|
68
76
|
if role == "user":
|
|
69
77
|
role = "assistant"
|
|
70
78
|
else:
|
|
71
79
|
role = "user"
|
|
72
|
-
|
|
80
|
+
|
|
73
81
|
# Add the main message if it has content
|
|
74
82
|
if content and str(content).strip():
|
|
75
83
|
formatted.append({"role": role, "content": content})
|
|
76
|
-
|
|
84
|
+
|
|
77
85
|
# Handle tool calls - create separate messages for each tool call
|
|
78
86
|
if tool_calls:
|
|
79
87
|
for tool_call in tool_calls:
|
|
80
88
|
function_name = tool_call.get("function", {}).get("name", "unknown")
|
|
81
89
|
arguments_str = tool_call.get("function", {}).get("arguments", "{}")
|
|
82
|
-
|
|
90
|
+
|
|
83
91
|
try:
|
|
84
92
|
# Parse arguments to format them nicely
|
|
85
93
|
arguments = json.loads(arguments_str)
|
|
@@ -87,18 +95,20 @@ class HumanCompletionUI:
|
|
|
87
95
|
except json.JSONDecodeError:
|
|
88
96
|
# If parsing fails, use the raw string
|
|
89
97
|
formatted_args = arguments_str
|
|
90
|
-
|
|
98
|
+
|
|
91
99
|
# Create a formatted message for the tool call
|
|
92
100
|
tool_call_content = f"```json\n{formatted_args}\n```"
|
|
93
|
-
|
|
94
|
-
formatted.append(
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
101
|
+
|
|
102
|
+
formatted.append(
|
|
103
|
+
{
|
|
104
|
+
"role": role,
|
|
105
|
+
"content": tool_call_content,
|
|
106
|
+
"metadata": {"title": f"🛠️ Used {function_name}"},
|
|
107
|
+
}
|
|
108
|
+
)
|
|
109
|
+
|
|
100
110
|
return formatted
|
|
101
|
-
|
|
111
|
+
|
|
102
112
|
def get_pending_calls(self) -> List[Dict[str, Any]]:
|
|
103
113
|
"""Get pending calls from the server."""
|
|
104
114
|
try:
|
|
@@ -108,38 +118,39 @@ class HumanCompletionUI:
|
|
|
108
118
|
except Exception as e:
|
|
109
119
|
print(f"Error fetching pending calls: {e}")
|
|
110
120
|
return []
|
|
111
|
-
|
|
121
|
+
|
|
112
122
|
def complete_call_with_response(self, call_id: str, response: str) -> bool:
|
|
113
123
|
"""Complete a call with a text response."""
|
|
114
124
|
try:
|
|
115
125
|
response_data = {"response": response}
|
|
116
126
|
response_obj = requests.post(
|
|
117
|
-
f"{self.server_url}/complete/{call_id}",
|
|
118
|
-
json=response_data,
|
|
119
|
-
timeout=10
|
|
127
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
120
128
|
)
|
|
121
129
|
response_obj.raise_for_status()
|
|
122
130
|
return True
|
|
123
131
|
except requests.RequestException as e:
|
|
124
132
|
print(f"Error completing call: {e}")
|
|
125
133
|
return False
|
|
126
|
-
|
|
134
|
+
|
|
127
135
|
def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
|
|
128
136
|
"""Complete a call with tool calls."""
|
|
129
137
|
try:
|
|
130
138
|
response_data = {"tool_calls": tool_calls}
|
|
131
139
|
response_obj = requests.post(
|
|
132
|
-
f"{self.server_url}/complete/{call_id}",
|
|
133
|
-
json=response_data,
|
|
134
|
-
timeout=10
|
|
140
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
135
141
|
)
|
|
136
142
|
response_obj.raise_for_status()
|
|
137
143
|
return True
|
|
138
144
|
except requests.RequestException as e:
|
|
139
145
|
print(f"Error completing call: {e}")
|
|
140
146
|
return False
|
|
141
|
-
|
|
142
|
-
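def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool: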
|
|
147
|
+
|
|
148
|
+
def complete_call(
|
|
149
|
+
self,
|
|
150
|
+
call_id: str,
|
|
151
|
+
response: Optional[str] = None,
|
|
152
|
+
tool_calls: Optional[List[Dict[str, Any]]] = None,
|
|
153
|
+
) -> bool:
|
|
143
154
|
"""Complete a call with either a response or tool calls."""
|
|
144
155
|
try:
|
|
145
156
|
response_data = {}
|
|
@@ -147,25 +158,23 @@ class HumanCompletionUI:
|
|
|
147
158
|
response_data["response"] = response
|
|
148
159
|
if tool_calls:
|
|
149
160
|
response_data["tool_calls"] = tool_calls
|
|
150
|
-
|
|
161
|
+
|
|
151
162
|
response_obj = requests.post(
|
|
152
|
-
f"{self.server_url}/complete/{call_id}",
|
|
153
|
-
json=response_data,
|
|
154
|
-
timeout=10
|
|
163
|
+
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
|
|
155
164
|
)
|
|
156
165
|
response_obj.raise_for_status()
|
|
157
166
|
return True
|
|
158
167
|
except requests.RequestException as e:
|
|
159
168
|
print(f"Error completing call: {e}")
|
|
160
169
|
return False
|
|
161
|
-
|
|
170
|
+
|
|
162
171
|
def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
|
|
163
172
|
"""Extract the last image from the messages for display above conversation."""
|
|
164
173
|
last_image = None
|
|
165
|
-
|
|
174
|
+
|
|
166
175
|
for msg in reversed(messages): # Start from the last message
|
|
167
176
|
content = msg.get("content", "")
|
|
168
|
-
|
|
177
|
+
|
|
169
178
|
if isinstance(content, list):
|
|
170
179
|
for item in reversed(content): # Get the last image in the message
|
|
171
180
|
if item.get("type") == "image_url":
|
|
@@ -184,41 +193,43 @@ class HumanCompletionUI:
|
|
|
184
193
|
else:
|
|
185
194
|
# For URL images, return the URL
|
|
186
195
|
return image_url
|
|
187
|
-
|
|
196
|
+
|
|
188
197
|
return last_image
|
|
189
|
-
|
|
198
|
+
|
|
190
199
|
def refresh_pending_calls(self):
|
|
191
200
|
"""Refresh the list of pending calls."""
|
|
192
201
|
pending_calls = self.get_pending_calls()
|
|
193
|
-
|
|
202
|
+
|
|
194
203
|
if not pending_calls:
|
|
195
204
|
return (
|
|
196
205
|
gr.update(choices=["latest"], value="latest"), # dropdown
|
|
197
206
|
gr.update(value=None), # image (no image)
|
|
198
207
|
gr.update(value=[]), # chatbot (empty messages)
|
|
199
|
-
gr.update(interactive=False) # submit button
|
|
208
|
+
gr.update(interactive=False), # submit button
|
|
209
|
+
gr.update(visible=False), # click_actions_group hidden
|
|
210
|
+
gr.update(visible=False), # actions_group hidden
|
|
200
211
|
)
|
|
201
|
-
|
|
212
|
+
|
|
202
213
|
# Sort pending calls by created_at to get oldest first
|
|
203
214
|
sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
|
|
204
|
-
|
|
215
|
+
|
|
205
216
|
# Create choices for dropdown
|
|
206
217
|
choices = [("latest", "latest")] # Add "latest" option first
|
|
207
|
-
|
|
218
|
+
|
|
208
219
|
for call in sorted_calls:
|
|
209
220
|
call_id = call["id"]
|
|
210
221
|
model = call.get("model", "unknown")
|
|
211
222
|
created_at = call.get("created_at", "")
|
|
212
223
|
# Format timestamp
|
|
213
224
|
try:
|
|
214
|
-
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
|
|
225
|
+
dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
|
|
215
226
|
time_str = dt.strftime("%H:%M:%S")
|
|
216
227
|
except:
|
|
217
228
|
time_str = created_at
|
|
218
|
-
|
|
229
|
+
|
|
219
230
|
choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
|
|
220
231
|
choices.append((choice_label, call_id))
|
|
221
|
-
|
|
232
|
+
|
|
222
233
|
# Default to "latest" which shows the oldest pending conversation
|
|
223
234
|
selected_call_id = "latest"
|
|
224
235
|
if selected_call_id == "latest" and sorted_calls:
|
|
@@ -232,31 +243,37 @@ class HumanCompletionUI:
|
|
|
232
243
|
conversation = []
|
|
233
244
|
self.current_call_id = None
|
|
234
245
|
self.last_image = None
|
|
235
|
-
|
|
246
|
+
|
|
236
247
|
return (
|
|
237
248
|
gr.update(choices=choices, value="latest"),
|
|
238
249
|
gr.update(value=self.last_image),
|
|
239
250
|
gr.update(value=conversation),
|
|
240
|
-
gr.update(interactive=bool(choices))
|
|
251
|
+
gr.update(interactive=bool(choices)),
|
|
252
|
+
gr.update(visible=True), # click_actions_group visible when there is a call
|
|
253
|
+
gr.update(visible=True), # actions_group visible when there is a call
|
|
241
254
|
)
|
|
242
|
-
|
|
255
|
+
|
|
243
256
|
def on_call_selected(self, selected_choice):
|
|
244
257
|
"""Handle when a call is selected from the dropdown."""
|
|
245
258
|
if not selected_choice:
|
|
246
259
|
return (
|
|
247
260
|
gr.update(value=None), # no image
|
|
248
261
|
gr.update(value=[]), # empty chatbot
|
|
249
|
-
gr.update(interactive=False)
|
|
262
|
+
gr.update(interactive=False),
|
|
263
|
+
gr.update(visible=False), # click_actions_group hidden
|
|
264
|
+
gr.update(visible=False), # actions_group hidden
|
|
250
265
|
)
|
|
251
|
-
|
|
266
|
+
|
|
252
267
|
pending_calls = self.get_pending_calls()
|
|
253
268
|
if not pending_calls:
|
|
254
269
|
return (
|
|
255
270
|
gr.update(value=None), # no image
|
|
256
271
|
gr.update(value=[]), # empty chatbot
|
|
257
|
-
gr.update(interactive=False)
|
|
272
|
+
gr.update(interactive=False),
|
|
273
|
+
gr.update(visible=False), # click_actions_group hidden
|
|
274
|
+
gr.update(visible=False), # actions_group hidden
|
|
258
275
|
)
|
|
259
|
-
|
|
276
|
+
|
|
260
277
|
# Handle "latest" option
|
|
261
278
|
if selected_choice == "latest":
|
|
262
279
|
# Sort calls by created_at to get oldest first
|
|
@@ -271,134 +288,143 @@ class HumanCompletionUI:
|
|
|
271
288
|
if call_id_short in selected_choice:
|
|
272
289
|
call_id = call["id"]
|
|
273
290
|
break
|
|
274
|
-
|
|
291
|
+
|
|
275
292
|
if not call_id:
|
|
276
293
|
return (
|
|
277
294
|
gr.update(value=None), # no image
|
|
278
295
|
gr.update(value=[]), # empty chatbot
|
|
279
|
-
gr.update(interactive=False)
|
|
296
|
+
gr.update(interactive=False),
|
|
280
297
|
)
|
|
281
|
-
|
|
298
|
+
|
|
282
299
|
# Find the selected call
|
|
283
300
|
selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
|
|
284
|
-
|
|
301
|
+
|
|
285
302
|
if not selected_call:
|
|
286
303
|
return (
|
|
287
304
|
gr.update(value=None), # no image
|
|
288
305
|
gr.update(value=[]), # empty chatbot
|
|
289
|
-
gr.update(interactive=False)
|
|
306
|
+
gr.update(interactive=False),
|
|
307
|
+
gr.update(visible=False), # click_actions_group hidden
|
|
308
|
+
gr.update(visible=False), # actions_group hidden
|
|
290
309
|
)
|
|
291
|
-
|
|
310
|
+
|
|
292
311
|
conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
|
|
293
312
|
self.current_call_id = call_id
|
|
294
313
|
# Get the last image from messages
|
|
295
314
|
self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
|
|
296
|
-
|
|
315
|
+
|
|
297
316
|
return (
|
|
298
317
|
gr.update(value=self.last_image),
|
|
299
318
|
gr.update(value=conversation),
|
|
300
|
-
gr.update(interactive=True)
|
|
319
|
+
gr.update(interactive=True),
|
|
320
|
+
gr.update(visible=True), # click_actions_group visible
|
|
321
|
+
gr.update(visible=True), # actions_group visible
|
|
301
322
|
)
|
|
302
|
-
|
|
323
|
+
|
|
303
324
|
def submit_response(self, response_text: str):
|
|
304
325
|
"""Submit a text response to the current call."""
|
|
305
326
|
if not self.current_call_id:
|
|
306
327
|
return (
|
|
307
328
|
gr.update(value=response_text), # keep response text
|
|
308
|
-
gr.update(value="❌ No call selected") # status
|
|
329
|
+
gr.update(value="❌ No call selected"), # status
|
|
309
330
|
)
|
|
310
|
-
|
|
331
|
+
|
|
311
332
|
if not response_text.strip():
|
|
312
333
|
return (
|
|
313
334
|
gr.update(value=response_text), # keep response text
|
|
314
|
-
gr.update(value="❌ Response cannot be empty") # status
|
|
335
|
+
gr.update(value="❌ Response cannot be empty"), # status
|
|
315
336
|
)
|
|
316
|
-
|
|
337
|
+
|
|
317
338
|
success = self.complete_call_with_response(self.current_call_id, response_text)
|
|
318
|
-
|
|
339
|
+
|
|
319
340
|
if success:
|
|
320
341
|
status_msg = "✅ Response submitted successfully!"
|
|
321
342
|
return (
|
|
322
343
|
gr.update(value=""), # clear response text
|
|
323
|
-
gr.update(value=status_msg) # status
|
|
344
|
+
gr.update(value=status_msg), # status
|
|
324
345
|
)
|
|
325
346
|
else:
|
|
326
347
|
return (
|
|
327
348
|
gr.update(value=response_text), # keep response text
|
|
328
|
-
gr.update(value="❌ Failed to submit response") # status
|
|
349
|
+
gr.update(value="❌ Failed to submit response"), # status
|
|
329
350
|
)
|
|
330
|
-
|
|
351
|
+
|
|
331
352
|
def submit_action(self, action_type: str, **kwargs) -> str:
|
|
332
353
|
"""Submit a computer action as a tool call."""
|
|
333
354
|
if not self.current_call_id:
|
|
334
355
|
return "❌ No call selected"
|
|
335
|
-
|
|
356
|
+
|
|
336
357
|
import uuid
|
|
337
|
-
|
|
358
|
+
|
|
338
359
|
# Create tool call structure
|
|
339
360
|
action_data = {"type": action_type, **kwargs}
|
|
340
361
|
tool_call = {
|
|
341
362
|
"id": f"call_{uuid.uuid4().hex[:24]}",
|
|
342
363
|
"type": "function",
|
|
343
|
-
"function": {
|
|
344
|
-
"name": "computer",
|
|
345
|
-
"arguments": json.dumps(action_data)
|
|
346
|
-
}
|
|
364
|
+
"function": {"name": "computer", "arguments": json.dumps(action_data)},
|
|
347
365
|
}
|
|
348
|
-
|
|
366
|
+
|
|
349
367
|
success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
|
|
350
|
-
|
|
368
|
+
|
|
351
369
|
if success:
|
|
352
370
|
return f"✅ {action_type.capitalize()} action submitted as tool call"
|
|
353
371
|
else:
|
|
354
372
|
return f"❌ Failed to submit {action_type} action"
|
|
355
|
-
|
|
356
|
-
def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
|
|
373
|
+
|
|
374
|
+
def submit_click_action(
|
|
375
|
+
self, x: int, y: int, action_type: str = "click", button: str = "left"
|
|
376
|
+
) -> str:
|
|
357
377
|
"""Submit a coordinate-based action."""
|
|
358
378
|
if action_type == "click":
|
|
359
379
|
return self.submit_action(action_type, x=x, y=y, button=button)
|
|
360
380
|
else:
|
|
361
381
|
return self.submit_action(action_type, x=x, y=y)
|
|
362
|
-
|
|
382
|
+
|
|
363
383
|
def submit_type_action(self, text: str) -> str:
|
|
364
384
|
"""Submit a type action."""
|
|
365
385
|
return self.submit_action("type", text=text)
|
|
366
|
-
|
|
386
|
+
|
|
367
387
|
def submit_hotkey_action(self, keys: str) -> str:
|
|
368
388
|
"""Submit a hotkey action."""
|
|
369
389
|
return self.submit_action("keypress", keys=keys)
|
|
370
|
-
|
|
371
|
-
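def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str: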
|
|
390
|
+
|
|
391
|
+
def submit_wait_action(self) -> str:
|
|
392
|
+
"""Submit a wait action with no kwargs."""
|
|
393
|
+
return self.submit_action("wait")
|
|
394
|
+
|
|
395
|
+
def submit_description_click(
|
|
396
|
+
self, description: str, action_type: str = "click", button: str = "left"
|
|
397
|
+
) -> str:
|
|
372
398
|
"""Submit a description-based action."""
|
|
373
399
|
if action_type == "click":
|
|
374
400
|
return self.submit_action(action_type, element_description=description, button=button)
|
|
375
401
|
else:
|
|
376
402
|
return self.submit_action(action_type, element_description=description)
|
|
377
|
-
|
|
403
|
+
|
|
378
404
|
def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
|
|
379
405
|
"""Wait for pending calls to appear or until max_seconds elapsed.
|
|
380
|
-
|
|
406
|
+
|
|
381
407
|
This method loops and checks for pending calls at regular intervals,
|
|
382
408
|
returning as soon as a pending call is found or the maximum wait time is reached.
|
|
383
|
-
|
|
409
|
+
|
|
384
410
|
Args:
|
|
385
411
|
max_seconds: Maximum number of seconds to wait
|
|
386
412
|
check_interval: How often to check for pending calls (in seconds)
|
|
387
413
|
"""
|
|
388
414
|
import time
|
|
389
|
-
|
|
415
|
+
|
|
390
416
|
start_time = time.time()
|
|
391
|
-
|
|
417
|
+
|
|
392
418
|
while time.time() - start_time < max_seconds:
|
|
393
419
|
# Check if there are any pending calls
|
|
394
420
|
pending_calls = self.get_pending_calls()
|
|
395
421
|
if pending_calls:
|
|
396
422
|
# Found pending calls, return immediately
|
|
397
423
|
return self.refresh_pending_calls()
|
|
398
|
-
|
|
424
|
+
|
|
399
425
|
# Wait before checking again
|
|
400
426
|
time.sleep(check_interval)
|
|
401
|
-
|
|
427
|
+
|
|
402
428
|
# Max wait time reached, return current state
|
|
403
429
|
return self.refresh_pending_calls()
|
|
404
430
|
|
|
@@ -406,199 +432,261 @@ class HumanCompletionUI:
|
|
|
406
432
|
def create_ui():
|
|
407
433
|
"""Create the Gradio interface."""
|
|
408
434
|
ui_handler = HumanCompletionUI()
|
|
409
|
-
|
|
410
|
-
with gr.Blocks(title="Human-in-the-Loop Agent Tool") as demo:
|
|
435
|
+
|
|
436
|
+
with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
|
|
411
437
|
gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
|
|
412
438
|
gr.Markdown("Review AI conversation requests and provide human responses.")
|
|
413
|
-
|
|
439
|
+
|
|
414
440
|
with gr.Row():
|
|
415
441
|
with gr.Column(scale=2):
|
|
416
442
|
with gr.Group():
|
|
417
443
|
screenshot_image = gr.Image(
|
|
418
|
-
label="Screenshot",
|
|
419
|
-
interactive=False,
|
|
420
|
-
height=600
|
|
444
|
+
label="Interactive Screenshot", interactive=False, height=600
|
|
421
445
|
)
|
|
422
|
-
|
|
423
|
-
# Action type selection for image clicks
|
|
424
|
-
with gr.
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
446
|
+
|
|
447
|
+
# Action type selection for image clicks (wrapped for visibility control)
|
|
448
|
+
with gr.Group(visible=False) as click_actions_group:
|
|
449
|
+
with gr.Row():
|
|
450
|
+
action_type_radio = gr.Dropdown(
|
|
451
|
+
label="Interactive Action",
|
|
452
|
+
choices=[
|
|
453
|
+
"click",
|
|
454
|
+
"double_click",
|
|
455
|
+
"move",
|
|
456
|
+
"left_mouse_up",
|
|
457
|
+
"left_mouse_down",
|
|
458
|
+
"scroll",
|
|
459
|
+
],
|
|
460
|
+
value="click",
|
|
461
|
+
scale=2,
|
|
462
|
+
)
|
|
463
|
+
action_button_radio = gr.Dropdown(
|
|
464
|
+
label="Button",
|
|
465
|
+
choices=["left", "right", "wheel", "back", "forward"],
|
|
466
|
+
value="left",
|
|
467
|
+
visible=True,
|
|
468
|
+
scale=1,
|
|
469
|
+
)
|
|
470
|
+
scroll_x_input = gr.Number(
|
|
471
|
+
label="scroll_x", value=0, visible=False, scale=1
|
|
472
|
+
)
|
|
473
|
+
scroll_y_input = gr.Number(
|
|
474
|
+
label="scroll_y", value=-120, visible=False, scale=1
|
|
475
|
+
)
|
|
476
|
+
|
|
439
477
|
conversation_chatbot = gr.Chatbot(
|
|
440
|
-
label="
|
|
441
|
-
type="messages",
|
|
442
|
-
height=500,
|
|
443
|
-
show_copy_button=True
|
|
478
|
+
label="Conversation", height=500, buttons=["copy"]
|
|
444
479
|
)
|
|
445
|
-
|
|
480
|
+
|
|
446
481
|
with gr.Column(scale=1):
|
|
447
482
|
with gr.Group():
|
|
448
483
|
call_dropdown = gr.Dropdown(
|
|
449
|
-
label="Select a pending
|
|
484
|
+
label="Select a pending conversation request",
|
|
450
485
|
choices=["latest"],
|
|
451
486
|
interactive=True,
|
|
452
|
-
value="latest"
|
|
487
|
+
value="latest",
|
|
453
488
|
)
|
|
454
489
|
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
|
|
490
|
+
status_display = gr.Textbox(
|
|
491
|
+
label="Status", interactive=False, value="Ready to receive requests..."
|
|
492
|
+
)
|
|
455
493
|
|
|
456
494
|
with gr.Group():
|
|
457
495
|
response_text = gr.Textbox(
|
|
458
|
-
label="
|
|
459
|
-
lines=3,
|
|
460
|
-
placeholder="Enter your response here..."
|
|
496
|
+
label="Message", lines=3, placeholder="Enter your message here..."
|
|
461
497
|
)
|
|
462
|
-
submit_btn = gr.Button(
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
)
|
|
516
|
-
description_submit_btn = gr.Button("Submit Description Action")
|
|
517
|
-
|
|
518
|
-
status_display = gr.Textbox(
|
|
519
|
-
label="Status",
|
|
520
|
-
interactive=False,
|
|
521
|
-
value="Ready to receive calls..."
|
|
522
|
-
)
|
|
523
|
-
|
|
498
|
+
submit_btn = gr.Button(
|
|
499
|
+
"📤 Submit Message", variant="primary", interactive=False
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
# Action Accordions (wrapped for visibility control)
|
|
503
|
+
with gr.Group(visible=False) as actions_group:
|
|
504
|
+
with gr.Tabs():
|
|
505
|
+
with gr.Tab("🖱️ Click Actions"):
|
|
506
|
+
with gr.Group():
|
|
507
|
+
description_text = gr.Textbox(
|
|
508
|
+
label="Element Description",
|
|
509
|
+
placeholder="e.g., 'Privacy and security option in left sidebar'",
|
|
510
|
+
)
|
|
511
|
+
with gr.Row():
|
|
512
|
+
description_action_type = gr.Dropdown(
|
|
513
|
+
label="Action",
|
|
514
|
+
choices=[
|
|
515
|
+
"click",
|
|
516
|
+
"double_click",
|
|
517
|
+
"move",
|
|
518
|
+
"left_mouse_up",
|
|
519
|
+
"left_mouse_down",
|
|
520
|
+
],
|
|
521
|
+
value="click",
|
|
522
|
+
)
|
|
523
|
+
description_button = gr.Dropdown(
|
|
524
|
+
label="Button",
|
|
525
|
+
choices=["left", "right", "wheel", "back", "forward"],
|
|
526
|
+
value="left",
|
|
527
|
+
)
|
|
528
|
+
description_submit_btn = gr.Button("Submit Click Action")
|
|
529
|
+
|
|
530
|
+
with gr.Tab("📝 Type Action"):
|
|
531
|
+
with gr.Group():
|
|
532
|
+
type_text = gr.Textbox(
|
|
533
|
+
label="Text to Type", placeholder="Enter text to type..."
|
|
534
|
+
)
|
|
535
|
+
type_submit_btn = gr.Button("Submit Type")
|
|
536
|
+
|
|
537
|
+
with gr.Tab("⌨️ Keypress Action"):
|
|
538
|
+
with gr.Group():
|
|
539
|
+
keypress_text = gr.Textbox(
|
|
540
|
+
label="Keys", placeholder="e.g., ctrl+c, alt+tab"
|
|
541
|
+
)
|
|
542
|
+
keypress_submit_btn = gr.Button("Submit Keypress")
|
|
543
|
+
|
|
544
|
+
with gr.Tab("🧰 Misc Actions"):
|
|
545
|
+
with gr.Group():
|
|
546
|
+
misc_action_dropdown = gr.Dropdown(
|
|
547
|
+
label="Action", choices=["wait"], value="wait"
|
|
548
|
+
)
|
|
549
|
+
misc_submit_btn = gr.Button("Submit Action")
|
|
550
|
+
|
|
524
551
|
# Event handlers
|
|
525
552
|
refresh_btn.click(
|
|
526
553
|
fn=ui_handler.refresh_pending_calls,
|
|
527
|
-
outputs=[
|
|
554
|
+
outputs=[
|
|
555
|
+
call_dropdown,
|
|
556
|
+
screenshot_image,
|
|
557
|
+
conversation_chatbot,
|
|
558
|
+
submit_btn,
|
|
559
|
+
click_actions_group,
|
|
560
|
+
actions_group,
|
|
561
|
+
],
|
|
528
562
|
)
|
|
529
|
-
|
|
563
|
+
|
|
530
564
|
call_dropdown.change(
|
|
531
565
|
fn=ui_handler.on_call_selected,
|
|
532
566
|
inputs=[call_dropdown],
|
|
533
|
-
outputs=[
|
|
567
|
+
outputs=[
|
|
568
|
+
screenshot_image,
|
|
569
|
+
conversation_chatbot,
|
|
570
|
+
submit_btn,
|
|
571
|
+
click_actions_group,
|
|
572
|
+
actions_group,
|
|
573
|
+
],
|
|
534
574
|
)
|
|
535
|
-
|
|
575
|
+
|
|
536
576
|
def handle_image_click(evt: gr.SelectData):
|
|
537
577
|
if evt.index is not None:
|
|
538
578
|
x, y = evt.index
|
|
539
|
-
action_type =
|
|
540
|
-
button =
|
|
541
|
-
|
|
579
|
+
action_type = ui_handler.current_action_type or "click"
|
|
580
|
+
button = ui_handler.current_button or "left"
|
|
581
|
+
if action_type == "scroll":
|
|
582
|
+
sx_i = int(ui_handler.current_scroll_x or 0)
|
|
583
|
+
sy_i = int(ui_handler.current_scroll_y or 0)
|
|
584
|
+
# Submit a scroll action with x,y position and scroll deltas
|
|
585
|
+
result = ui_handler.submit_action(
|
|
586
|
+
"scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
|
|
587
|
+
)
|
|
588
|
+
else:
|
|
589
|
+
result = ui_handler.submit_click_action(x, y, action_type, button)
|
|
542
590
|
ui_handler.wait_for_pending_calls()
|
|
543
591
|
return result
|
|
544
592
|
return "No coordinates selected"
|
|
545
593
|
|
|
546
|
-
screenshot_image.select(
|
|
547
|
-
fn=handle_image_click,
|
|
548
|
-
outputs=[status_display]
|
|
549
|
-
).then(
|
|
594
|
+
screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
|
|
550
595
|
fn=ui_handler.wait_for_pending_calls,
|
|
551
|
-
outputs=[
|
|
596
|
+
outputs=[
|
|
597
|
+
call_dropdown,
|
|
598
|
+
screenshot_image,
|
|
599
|
+
conversation_chatbot,
|
|
600
|
+
submit_btn,
|
|
601
|
+
click_actions_group,
|
|
602
|
+
actions_group,
|
|
603
|
+
],
|
|
552
604
|
)
|
|
553
605
|
|
|
554
606
|
# Response submission
|
|
555
607
|
submit_btn.click(
|
|
556
608
|
fn=ui_handler.submit_response,
|
|
557
609
|
inputs=[response_text],
|
|
558
|
-
outputs=[response_text, status_display]
|
|
610
|
+
outputs=[response_text, status_display],
|
|
559
611
|
).then(
|
|
560
612
|
fn=ui_handler.refresh_pending_calls,
|
|
561
|
-
outputs=[
|
|
613
|
+
outputs=[
|
|
614
|
+
call_dropdown,
|
|
615
|
+
screenshot_image,
|
|
616
|
+
conversation_chatbot,
|
|
617
|
+
submit_btn,
|
|
618
|
+
click_actions_group,
|
|
619
|
+
actions_group,
|
|
620
|
+
],
|
|
562
621
|
)
|
|
563
|
-
|
|
564
|
-
# Toggle
|
|
565
|
-
def
|
|
566
|
-
|
|
567
|
-
|
|
622
|
+
|
|
623
|
+
# Toggle visibility of controls based on action type
|
|
624
|
+
def toggle_action_controls(action_type):
|
|
625
|
+
# Button visible only for click
|
|
626
|
+
button_vis = gr.update(visible=(action_type == "click"))
|
|
627
|
+
# Scroll inputs visible only for scroll
|
|
628
|
+
scroll_x_vis = gr.update(visible=(action_type == "scroll"))
|
|
629
|
+
scroll_y_vis = gr.update(visible=(action_type == "scroll"))
|
|
630
|
+
# Update state
|
|
631
|
+
ui_handler.current_action_type = action_type or "click"
|
|
632
|
+
return button_vis, scroll_x_vis, scroll_y_vis
|
|
633
|
+
|
|
568
634
|
action_type_radio.change(
|
|
569
|
-
fn=
|
|
635
|
+
fn=toggle_action_controls,
|
|
570
636
|
inputs=[action_type_radio],
|
|
571
|
-
outputs=[action_button_radio]
|
|
637
|
+
outputs=[action_button_radio, scroll_x_input, scroll_y_input],
|
|
572
638
|
)
|
|
573
639
|
|
|
574
|
-
#
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
640
|
+
# Keep other control values in ui_handler state
|
|
641
|
+
def on_button_change(val):
|
|
642
|
+
ui_handler.current_button = val or "left"
|
|
643
|
+
|
|
644
|
+
action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
|
|
645
|
+
|
|
646
|
+
def on_scroll_x_change(val):
|
|
647
|
+
try:
|
|
648
|
+
ui_handler.current_scroll_x = int(val) if val is not None else 0
|
|
649
|
+
except Exception:
|
|
650
|
+
ui_handler.current_scroll_x = 0
|
|
651
|
+
|
|
652
|
+
scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
|
|
653
|
+
|
|
654
|
+
def on_scroll_y_change(val):
|
|
655
|
+
try:
|
|
656
|
+
ui_handler.current_scroll_y = int(val) if val is not None else 0
|
|
657
|
+
except Exception:
|
|
658
|
+
ui_handler.current_scroll_y = 0
|
|
659
|
+
|
|
660
|
+
scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
|
|
661
|
+
|
|
584
662
|
type_submit_btn.click(
|
|
585
|
-
fn=ui_handler.submit_type_action,
|
|
586
|
-
inputs=[type_text],
|
|
587
|
-
outputs=[status_display]
|
|
663
|
+
fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
|
|
588
664
|
).then(
|
|
589
665
|
fn=ui_handler.wait_for_pending_calls,
|
|
590
|
-
outputs=[
|
|
666
|
+
outputs=[
|
|
667
|
+
call_dropdown,
|
|
668
|
+
screenshot_image,
|
|
669
|
+
conversation_chatbot,
|
|
670
|
+
submit_btn,
|
|
671
|
+
click_actions_group,
|
|
672
|
+
actions_group,
|
|
673
|
+
],
|
|
591
674
|
)
|
|
592
|
-
|
|
675
|
+
|
|
593
676
|
keypress_submit_btn.click(
|
|
594
|
-
fn=ui_handler.submit_hotkey_action,
|
|
595
|
-
inputs=[keypress_text],
|
|
596
|
-
outputs=[status_display]
|
|
677
|
+
fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
|
|
597
678
|
).then(
|
|
598
679
|
fn=ui_handler.wait_for_pending_calls,
|
|
599
|
-
outputs=[
|
|
680
|
+
outputs=[
|
|
681
|
+
call_dropdown,
|
|
682
|
+
screenshot_image,
|
|
683
|
+
conversation_chatbot,
|
|
684
|
+
submit_btn,
|
|
685
|
+
click_actions_group,
|
|
686
|
+
actions_group,
|
|
687
|
+
],
|
|
600
688
|
)
|
|
601
|
-
|
|
689
|
+
|
|
602
690
|
def handle_description_submit(description, action_type, button):
|
|
603
691
|
if description:
|
|
604
692
|
result = ui_handler.submit_description_click(description, action_type, button)
|
|
@@ -609,18 +697,54 @@ def create_ui():
|
|
|
609
697
|
description_submit_btn.click(
|
|
610
698
|
fn=handle_description_submit,
|
|
611
699
|
inputs=[description_text, description_action_type, description_button],
|
|
612
|
-
outputs=[status_display]
|
|
700
|
+
outputs=[status_display],
|
|
613
701
|
).then(
|
|
614
702
|
fn=ui_handler.wait_for_pending_calls,
|
|
615
|
-
outputs=[
|
|
703
|
+
outputs=[
|
|
704
|
+
call_dropdown,
|
|
705
|
+
screenshot_image,
|
|
706
|
+
conversation_chatbot,
|
|
707
|
+
submit_btn,
|
|
708
|
+
click_actions_group,
|
|
709
|
+
actions_group,
|
|
710
|
+
],
|
|
616
711
|
)
|
|
617
|
-
|
|
712
|
+
|
|
713
|
+
# Misc action handler
|
|
714
|
+
def handle_misc_submit(selected_action):
|
|
715
|
+
if selected_action == "wait":
|
|
716
|
+
result = ui_handler.submit_wait_action()
|
|
717
|
+
ui_handler.wait_for_pending_calls()
|
|
718
|
+
return result
|
|
719
|
+
return f"Unsupported misc action: {selected_action}"
|
|
720
|
+
|
|
721
|
+
misc_submit_btn.click(
|
|
722
|
+
fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
|
|
723
|
+
).then(
|
|
724
|
+
fn=ui_handler.wait_for_pending_calls,
|
|
725
|
+
outputs=[
|
|
726
|
+
call_dropdown,
|
|
727
|
+
screenshot_image,
|
|
728
|
+
conversation_chatbot,
|
|
729
|
+
submit_btn,
|
|
730
|
+
click_actions_group,
|
|
731
|
+
actions_group,
|
|
732
|
+
],
|
|
733
|
+
)
|
|
734
|
+
|
|
618
735
|
# Load initial data
|
|
619
736
|
demo.load(
|
|
620
737
|
fn=ui_handler.refresh_pending_calls,
|
|
621
|
-
outputs=[
|
|
738
|
+
outputs=[
|
|
739
|
+
call_dropdown,
|
|
740
|
+
screenshot_image,
|
|
741
|
+
conversation_chatbot,
|
|
742
|
+
submit_btn,
|
|
743
|
+
click_actions_group,
|
|
744
|
+
actions_group,
|
|
745
|
+
],
|
|
622
746
|
)
|
|
623
|
-
|
|
747
|
+
|
|
624
748
|
return demo
|
|
625
749
|
|
|
626
750
|
|