computer_use_ootb_internal-0.0.107-py3-none-any.whl → computer_use_ootb_internal-0.0.109-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- computer_use_ootb_internal/app_teachmode.py +8 -14
- computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
- computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
- computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
- computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
- computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
- computer_use_ootb_internal/run_teachmode_ootb_args.py +26 -11
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
--- computer_use_ootb_internal/app_teachmode.py
+++ computer_use_ootb_internal/app_teachmode.py
@@ -170,7 +170,7 @@ async def update_parameters(request: Request):
 
     # Update shared state when parameters change
     shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
-    shared_state.task = getattr(shared_state.args, 'task', "
+    shared_state.task = getattr(shared_state.args, 'task', "Following the instructions to complete the task.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "hero_cases")
     shared_state.trace_id = getattr(shared_state.args, 'trace_id', "build_scroll_combat")
@@ -227,7 +227,7 @@ async def get_messages(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_messages", {})
+    # log_ootb_request(shared_state.server_url, "get_messages", {})
 
     # Return all messages in the queue and clear it
     messages = shared_state.message_queue.copy()
@@ -338,7 +338,7 @@ async def get_status(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_status", {})
+    # log_ootb_request(shared_state.server_url, "get_status", {})
 
     print(f"Status check - Processing: {shared_state.is_processing}, Paused: {shared_state.is_paused}")
     return JSONResponse(
@@ -393,14 +393,8 @@ def process_input():
                 print("Processing stopped while paused or resuming")
                 break
 
-
-
-                message = {"role": "user", "content": loop_msg}
-            else:
-                message = {"role": "assistant", "content": loop_msg}
-
-            shared_state.chatbot_messages.append(message)
-            shared_state.message_queue.append(message)
+            shared_state.chatbot_messages.append(loop_msg)
+            shared_state.message_queue.append(loop_msg)
 
             # Short sleep to allow stop signals to be processed
             for _ in range(5):  # Check 5 times per second
@@ -416,17 +410,17 @@ def process_input():
         # Handle any exceptions in the processing loop
         error_msg = f"Error during task processing: {str(e)}"
         print(error_msg)
-        error_message = {"role": "assistant", "content": error_msg}
+        error_message = {"role": "assistant", "content": error_msg, "type": "error"}
         shared_state.message_queue.append(error_message)
 
     finally:
         # Handle completion or interruption
         if shared_state.should_stop or shared_state.stop_event.is_set():
             stop_msg = f"Task '{shared_state.task}' was stopped. Ready for new tasks."
-            final_message = {"role": "assistant", "content": stop_msg}
+            final_message = {"role": "assistant", "content": stop_msg, "type": "text"}
         else:
             complete_msg = f"Task '{shared_state.task}' completed. Thanks for using Teachmode-OOTB."
-            final_message = {"role": "assistant", "content": complete_msg}
+            final_message = {"role": "assistant", "content": complete_msg, "type": "text"}
 
         shared_state.chatbot_messages.append(final_message)
         shared_state.message_queue.append(final_message)

--- computer_use_ootb_internal/app_teachmode_gradio.py
+++ computer_use_ootb_internal/app_teachmode_gradio.py
@@ -42,7 +42,7 @@ async def update_parameters(request: Request):
     shared_state.task_updated = True
 
     # Update shared state when parameters change
-    shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
+    # shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
     shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "a_test")

--- computer_use_ootb_internal/computer_use_demo/animation/click_animation.py
+++ computer_use_ootb_internal/computer_use_demo/animation/click_animation.py
@@ -64,7 +64,7 @@ def show_click(x: int, y: int, duration_ms: int = 800, existing_ms: int = 800):
     ).start()
 
 def show_move_to(x1: int, y1: int, x2: int, y2: int,
-                 duration_ms: int =
+                 duration_ms: int = 800, existing_ms: int = 800):
     if not CLICK_GIF.exists():
         raise FileNotFoundError(f"GIF not found at {CLICK_GIF}")
     mp.get_context("spawn").Process(

--- computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py
+++ computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py
@@ -3,13 +3,9 @@ import json
 import asyncio
 from typing import Any, Dict, cast, List, Union
 import uuid
-from anthropic.types.beta import (
-
-
-    BetaToolResultBlockParam,
-)
-from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
-from computer_use_ootb_internal.computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+from anthropic.types.beta import BetaToolUseBlock
+from computer_use_ootb_internal.computer_use_demo.tools import ComputerTool, ToolCollection
+from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError
 
 
 class TeachmodeExecutor:
@@ -48,12 +44,13 @@ class TeachmodeExecutor:
 
 
 
-    def __call__(self,
-
-        # response is expected to be
+    def __call__(self, response: str):
+
+        # response is expected to be:
         # {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
 
-
+        # str -> dict
+        action_dict = self._format_actor_output(response)
 
         actions = action_dict["content"]
 
@@ -72,13 +69,9 @@ class TeachmodeExecutor:
 
         print("Parsed Action List:", action_list)
 
-        tool_result_content = None
-
         if action_list is not None and len(action_list) > 0:
 
-            for action in action_list:
-
-                tool_result_content: list[BetaToolResultBlockParam] = []
+            for action in action_list:
 
                 # self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
                 print("Converted Action:", action)
@@ -86,23 +79,28 @@ class TeachmodeExecutor:
                 sim_content_block = BetaToolUseBlock(
                     id=f'toolu_{uuid.uuid4()}',
                     input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
-                    name='computer',
+                    name='computer',
+                    type='tool_use'
+                )
 
                 # Run the asynchronous tool execution in a synchronous context
-
-
-
-
+                tool_result = asyncio.run(
+                    self.tool_collection.run(
+                        name=sim_content_block.name,
+                        tool_input=cast(dict[str, Any], sim_content_block.input),
+                    ))
 
-
-
-
-
-
-
-
-
-
+                if isinstance(tool_result, ToolResult):
+                    print(f"[teachmode_executor] tool_result: {tool_result}")
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "action", "action_type": tool_result['base_type']}
+                    yield tool_result_message
+
+                elif isinstance(tool_result, ToolError):
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "error"}
+                    yield tool_result_message
+
+            return tool_result_message
+
 
     def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
         if type(action_output) == dict:
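
Note: with this hunk, __call__ becomes a generator that yields one chat-style message dict per executed action, replacing the old BetaToolResultBlockParam accumulation. A minimal consumption sketch under stated assumptions (a default-constructible TeachmodeExecutor and an illustrative actor payload; neither is confirmed by this diff):

    from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor

    executor = TeachmodeExecutor()  # assumption: no required constructor args
    # Illustrative actor output, in the format the docstring above describes.
    response = {"role": "assistant",
                "content": "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}"}
    for message in executor(response):
        # Each yielded dict carries role/content plus a "type" tag
        # ("action" or "error") that a frontend can route on.
        print(message["type"], "->", message["content"])
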
@@ -172,8 +170,8 @@ class TeachmodeExecutor:
 
         elif action_item["action"] == "PRESS":  # 7. press
             x, y = action_item["position"]
-            action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
-                                       int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
+            # action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
+            #                            int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
             refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
             refined_output.append({"action": "left_press", "text": None, "coordinate": None})
 
@@ -316,43 +314,43 @@ class TeachmodeExecutor:
 
 
 
-    def _make_api_tool_result(
-        result: ToolResult, tool_use_id: str
-    ) -> BetaToolResultBlockParam:
-        """Convert an agent ToolResult to an API ToolResultBlockParam."""
-        tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
-        is_error = False
-        if result.error:
-            is_error = True
-            tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
-        else:
-            if result.output:
-                tool_result_content.append(
-                    {
-                        "type": "text",
-                        "text": _maybe_prepend_system_tool_result(result, result.output),
-                    }
-                )
-            if result.base64_image:
-                tool_result_content.append(
-                    {
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": "image/png",
-                            "data": result.base64_image,
-                        },
-                    }
-                )
-        return {
-            "type": "tool_result",
-            "content": tool_result_content,
-            "tool_use_id": tool_use_id,
-            "is_error": is_error,
-        }
+    # def _make_api_tool_result(
+    #     result: ToolResult, tool_use_id: str
+    # ) -> BetaToolResultBlockParam:
+    #     """Convert an agent ToolResult to an API ToolResultBlockParam."""
+    #     tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
+    #     is_error = False
+    #     if result.error:
+    #         is_error = True
+    #         tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
+    #     else:
+    #         if result.output:
+    #             tool_result_content.append(
+    #                 {
+    #                     "type": "text",
+    #                     "text": _maybe_prepend_system_tool_result(result, result.output),
+    #                 }
+    #             )
+    #         if result.base64_image:
+    #             tool_result_content.append(
+    #                 {
+    #                     "type": "image",
+    #                     "source": {
+    #                         "type": "base64",
+    #                         "media_type": "image/png",
+    #                         "data": result.base64_image,
+    #                     },
+    #                 }
+    #             )
+    #     return {
+    #         "type": "tool_result",
+    #         "content": tool_result_content,
+    #         "tool_use_id": tool_use_id,
+    #         "is_error": is_error,
+    #     }
 
 
-    def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
-        if result.system:
-            result_text = f"<system>{result.system}</system>\n{result_text}"
-        return result_text
+    # def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
+    #     if result.system:
+    #         result_text = f"<system>{result.system}</system>\n{result_text}"
+    #     return result_text

--- computer_use_ootb_internal/computer_use_demo/tools/base.py
+++ computer_use_ootb_internal/computer_use_demo/tools/base.py
@@ -28,6 +28,7 @@ class ToolResult:
     error: str | None = None
     base64_image: str | None = None
     system: str | None = None
+    base_type: str | None = None
 
     def __bool__(self):
         return any(getattr(self, field.name) for field in fields(self))
@@ -65,5 +66,6 @@ class ToolFailure(ToolResult):
 class ToolError(Exception):
     """Raised when a tool encounters an error."""
 
-    def __init__(self,
-        self.
+    def __init__(self, output: str, base_type: str):
+        self.output = output
+        self.base_type = base_type
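
Note: together these two hunks define the new result contract: ToolResult gains an optional base_type tag, and ToolError now takes output and base_type explicitly. A short sketch of both paths (the try/except body is illustrative):

    from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError

    # Success path: results are tagged with the kind of action performed.
    result = ToolResult(output="Left click", base_type="click")

    # Failure path: ToolError carries the same fields for uniform error routing.
    try:
        raise ToolError(output="coordinate is required for mouse_move", base_type="error")
    except ToolError as err:
        print(err.output, err.base_type)
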

--- computer_use_ootb_internal/computer_use_demo/tools/computer.py
+++ computer_use_ootb_internal/computer_use_demo/tools/computer.py
@@ -217,13 +217,13 @@ class ComputerTool(BaseAnthropicTool):
 
         if action in ("mouse_move", "left_click_drag"):
             if coordinate is None:
-                raise ToolError(f"coordinate is required for {action}")
+                raise ToolError(output=f"coordinate is required for {action}", base_type="error")
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
-                raise ToolError(f"{coordinate} must be a tuple of length 2")
+                raise ToolError(output=f"{coordinate} must be a tuple of length 2", base_type="error")
             if not all(isinstance(i, int) for i in coordinate):
-                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
+                raise ToolError(output=f"{coordinate} must be a tuple of non-negative ints", base_type="error")
 
             if self.is_scaling:
                 x, y = self.scale_coordinates(
@@ -237,21 +237,22 @@ class ComputerTool(BaseAnthropicTool):
 
             if action == "mouse_move":
                 pyautogui.moveTo(x, y)
-                return ToolResult(output=f"
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             elif action == "left_click_drag":
                 current_x, current_y = pyautogui.position()
                 pyautogui.dragTo(x, y, duration=0.5)  # Adjust duration as needed
-                return ToolResult(output=f"
+                return ToolResult(output=f"Mouse drag", base_type="move")
 
         # Action Type 2: Required text (keynames)
         # Actions: key, type, key_down, key_up
         if action in ("key", "type", "key_down", "key_up"):
             if text is None:
-                raise ToolError(f"text is required for {action}")
+                raise ToolError(output=f"text is required for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if not isinstance(text, str):
-                raise ToolError(output=f"{text} must be a string")
+                raise ToolError(output=f"{text} must be a string", base_type="error")
 
             if action == "key":
                 # Handle key combinations
@@ -264,19 +265,19 @@ class ComputerTool(BaseAnthropicTool):
                     key = self.key_conversion.get(key.strip(), key.strip())
                     key = key.lower()
                     pyautogui.keyUp(key)  # Release each key in reverse order
-                return ToolResult(output=f"
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
 
             elif action == "key_down":
                 pyautogui.keyDown(text)
-                return ToolResult(output=f"
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
             elif action == "key_up":
                 pyautogui.keyUp(text)
-                return ToolResult(output=f"
+                return ToolResult(output=f"Release key '{text}'", base_type="key")
 
             elif action == "type":
                 pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)  # Convert ms to seconds
-                screenshot_base64 = (await self.screenshot()).base64_image
-                return ToolResult(output=text, base64_image=screenshot_base64)
+                # screenshot_base64 = (await self.screenshot()).base64_image
+                return ToolResult(output=f"Type '{text}'", base_type="type")  # base64_image=screenshot_base64)
 
         # Action Type 3: No required text or coordinates
         # Actions: left_click, right_click, double_click, middle_click, left_press, scroll_down, scroll_up
@@ -291,76 +292,81 @@ class ComputerTool(BaseAnthropicTool):
             "wait",
         ):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
 
             if action == "left_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.click()
+                return ToolResult(output="Left click", base_type="click")
             elif action == "right_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.rightClick()
+                return ToolResult(output="Right click", base_type="click")
             elif action == "middle_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.middleClick()
+                return ToolResult(output="Middle click", base_type="click")
             elif action == "double_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.doubleClick()
+                return ToolResult(output="Double click", base_type="click")
             elif action == "left_press":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.mouseDown()
                 time.sleep(1)
                 pyautogui.mouseUp()
+                return ToolResult(output="Left press", base_type="click")
             elif action == "scroll_down":
                 pyautogui.scroll(-200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled down")
+                return ToolResult(output="Scrolled down", base_type="scroll")
 
             elif action == "scroll_up":
                 pyautogui.scroll(200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled up")
+                return ToolResult(output="Scrolled up", base_type="scroll")
 
             elif action == "wait":
                 time.sleep(15)
-                return ToolResult(output="
+                return ToolResult(output="Wait for next event", base_type="wait")
 
-            return ToolResult(output=f"Performed {action}")
+            return ToolResult(output=f"Performed {action}", base_type="unknown")
 
         # Action Type 4: Miscs. No required text or coordinates
         # Actions: screenshot, cursor_position
         if action in ("screenshot", "cursor_position"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if action == "screenshot":
                 return await self.screenshot()
             elif action == "cursor_position":
                 x, y = pyautogui.position()
-                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
-                return ToolResult(output=f"
+                # x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
+                return ToolResult(output=f"Cursor position ({x},{y})", base_type="unknown")
 
         # Action Type 5: StarRail Mode
         # Actions: sr_scroll_down, sr_scroll_up
         if action in ("sr_scroll_down", "sr_scroll_up"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
 
             if action == "sr_scroll_down":
                 for _ in range(20):
                     pyautogui.scroll(-100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="
+                return ToolResult(output="Scroll down", base_type="scroll")
             elif action == "sr_scroll_up":
                 for _ in range(20):
                     pyautogui.scroll(100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="
+                return ToolResult(output="Scroll up", base_type="scroll")
 
         # starrail browser mode
         if action in ("left_click_windll", "mouse_move_windll", "right_click_windll", "key_down_windll", "key_up_windll"):
|
|
374
380
|
y = coordinate[1]+self.offset_y
|
375
381
|
show_click(x, y)
|
376
382
|
self.marbot_auto_gui.click(x=x, y=y)
|
383
|
+
return ToolResult(output=f"Left click", base_type="click")
|
377
384
|
|
378
385
|
elif action == "mouse_move_windll":
|
379
386
|
if coordinate is None:
|
380
|
-
raise ToolError(f"coordinate is required for {action}")
|
387
|
+
raise ToolError(output=f"coordinate is required for {action}", base_type="error")
|
381
388
|
|
382
389
|
x0, y0 = pyautogui.position()
|
383
390
|
# x0, y0 = self.scale_coordinates(ScalingSource.COMPUTER, x0, y0)
|
@@ -386,16 +393,21 @@ class ComputerTool(BaseAnthropicTool):
 
                 show_move_to(x0, y0, x1, y1, duration_ms=1000)
                 self.marbot_auto_gui.moveTo(x=x1, y=y1)
-
+
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             # elif action == "right_click_windll":
             #     self.marbot_auto_gui.rightClick(x=coordinate[0], y=coordinate[1])
             elif action == "key_down_windll":
                 self.marbot_auto_gui.keyDown(text)
+                return ToolResult(output=f"Key down '{text}'", base_type="key")
             elif action == "key_up_windll":
                 self.marbot_auto_gui.keyUp(text)
-
+                return ToolResult(output=f"Key up '{text}'", base_type="key")
+
+            return ToolResult(output=f"Performed dll action:{action}", base_type="unknown")
 
-        raise ToolError(f"Invalid action: {action}")
+        raise ToolError(output=f"Invalid action: {action}", base_type="error")
 
 
     async def screenshot(self):
@@ -486,9 +498,9 @@ class ComputerTool(BaseAnthropicTool):
 
         if path.exists():
             # Return a ToolResult instance instead of a dictionary
-            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
+            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode(), base_type="screenshot")
 
-        raise ToolError(f"Failed to take screenshot: {path} does not exist.")
+        raise ToolError(output=f"Failed to take screenshot: {path} does not exist.", base_type="error")
 
     def padding_image(self, screenshot):
         """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
@@ -500,17 +512,17 @@ class ComputerTool(BaseAnthropicTool):
         padding_image.paste(screenshot, (0, 0))
         return padding_image
 
-    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
-        """Run a shell command and return the output, error, and optionally a screenshot."""
-        _, stdout, stderr = await run(command)
-        base64_image = None
+    # async def shell(self, command: str, take_screenshot=True) -> ToolResult:
+    #     """Run a shell command and return the output, error, and optionally a screenshot."""
+    #     _, stdout, stderr = await run(command)
+    #     base64_image = None
 
-        if take_screenshot:
-            # delay to let things settle before taking a screenshot
-            await asyncio.sleep(self._screenshot_delay)
-            base64_image = (await self.screenshot()).base64_image
+    #     if take_screenshot:
+    #         # delay to let things settle before taking a screenshot
+    #         await asyncio.sleep(self._screenshot_delay)
+    #         base64_image = (await self.screenshot()).base64_image
 
-        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
+    #     return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
 
     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
         """Scale coordinates to a target maximum resolution."""
@@ -538,7 +550,7 @@ class ComputerTool(BaseAnthropicTool):
         y_scaling_factor = target_dimension["height"] / self.height
         if source == ScalingSource.API:
             if x > self.width or y > self.height:
-                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
+                raise ToolError(output=f"Coordinates {x}, {y} are out of bounds", base_type="error")
             # scale up
             return round(x / x_scaling_factor), round(y / y_scaling_factor)
         # scale down
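
Note: the pattern across these computer.py hunks is uniform: every action branch now returns a ToolResult tagged with base_type, and every ToolError is raised with explicit output/base_type. The tags introduced in this diff are "move", "key", "type", "click", "scroll", "wait", "screenshot", "unknown", and "error". An illustrative dispatcher over those tags (not part of the package):

    def describe(result) -> str:
        # Map base_type tags to short human-readable summaries;
        # anything untagged falls back to the raw output string.
        handlers = {
            "click": lambda r: f"clicked: {r.output}",
            "key": lambda r: f"keyboard: {r.output}",
            "type": lambda r: f"typed: {r.output}",
            "scroll": lambda r: f"scrolled: {r.output}",
            "move": lambda r: f"pointer: {r.output}",
        }
        return handlers.get(result.base_type, lambda r: r.output or "")(result)
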

--- computer_use_ootb_internal/run_teachmode_ootb_args.py
+++ computer_use_ootb_internal/run_teachmode_ootb_args.py
@@ -2,15 +2,13 @@ import argparse
 import time
 import json
 import platform
-from typing import Callable
-from collections.abc import Callable
 import uuid
 import datetime
 from datetime import datetime, timedelta, timezone
 
 from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.gui_capture import get_screenshot
-from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.
+from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import encode_image, is_image_path
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.icon_detection.icon_detection import get_screen_resize_factor
 from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
@@ -41,7 +39,7 @@ def simple_teachmode_sampling_loop(
     if "star_rail" in user_id or "star_rail" in user_id:
         full_screen_game_mode = 1
 
-    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id:
+    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id or "offical" in user_id:
         full_screen_game_mode = 2
 
     print(f"Full Screen Game Mode: {full_screen_game_mode}")
@@ -53,7 +51,7 @@ def simple_teachmode_sampling_loop(
     timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
 
     step_count = 1
-    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:
+    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"
 
     print("[simple_teachmode_sampling_loop] starting task: ", task)
     print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")
@@ -68,6 +66,10 @@ def simple_teachmode_sampling_loop(
 
         uia_meta, sc_path = get_screenshot_external_cmd(selected_screen=selected_screen,
                                                         capture_uia_data=full_screen_game_mode==0)
+
+        if is_image_path(sc_path):
+            screenshot_message = {"role": "user", "content": sc_path, "type": "image"}
+            yield screenshot_message
 
         payload = {
             "task_id": unique_task_id,
@@ -102,27 +104,40 @@ def simple_teachmode_sampling_loop(
 
         try:
             step_plan = infer_server_response["generated_plan"]
-
+            step_reasoning = step_plan["reasoning"]
+            step_info = step_plan["step_info"]
             step_action = infer_server_response["generated_action"]["content"]
             step_traj_idx = infer_server_response["current_traj_step"]
 
         except Exception as e:
             print("Error parsing generated_action content:", e)
             continue
+
+        plan_message = {"role": "assistant", "content": step_reasoning, "type": "text"}
+        yield plan_message
 
         if step_action.get("action") == "STOP":
             final_sc, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)
-            action_history = []  # reset action history
-            break
 
-
+            final_message = {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
+            yield final_message
+
+            final_sc_message = {"role": "user", "content": final_sc_path, "type": "image"}
+            yield final_sc_message
+
+            # reset action history
+            action_history = []
+            break
 
         action_history.append(f"Executing guidance trajectory step [{step_traj_idx}], Plan: {step_info}, Action: {step_action};\n")
 
-        for
-        yield
+        for exec_message in executor({"role": "assistant", "content": step_action}):
+            yield exec_message
 
         step_count += 1
+
+        # reset action history
+        action_history = []
 
 
 
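
Note: the net effect in run_teachmode_ootb_args.py is that simple_teachmode_sampling_loop now yields typed messages as it runs: screenshots, plan reasoning, executor actions, and final results. A minimal consumption sketch; the argument values are placeholder defaults seen elsewhere in this diff, and the real signature takes additional parameters not shown here:

    from computer_use_ootb_internal.run_teachmode_ootb_args import simple_teachmode_sampling_loop

    # Argument list abridged; see the function signature in this file.
    loop = simple_teachmode_sampling_loop(
        task="Following the instructions to complete the task.",
        user_id="hero_cases",
        trace_id="build_scroll_combat",
        selected_screen=0,
    )
    for message in loop:
        if message["type"] == "image":
            print("screenshot:", message["content"])  # content is a file path
        else:
            print(f'[{message["role"]}/{message["type"]}]', message["content"])
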