computer-use-ootb-internal 0.0.108__py3-none-any.whl → 0.0.109.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- computer_use_ootb_internal/app_teachmode.py +8 -16
- computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
- computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
- computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/utils.py +7 -0
- computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
- computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
- computer_use_ootb_internal/run_teachmode_ootb_args.py +24 -13
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/METADATA +1 -1
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/RECORD +12 -17
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/WHEEL +0 -0
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/entry_points.txt +0 -0
@@ -5,14 +5,12 @@ from datetime import datetime
|
|
5
5
|
import threading
|
6
6
|
import requests
|
7
7
|
import platform # Add platform import
|
8
|
-
import subprocess # Add subprocess import
|
9
8
|
import pyautogui # Add pyautogui import
|
10
9
|
import webbrowser # Add webbrowser import
|
11
10
|
import os # Import os for path joining
|
12
11
|
from fastapi import FastAPI, Request
|
13
12
|
from fastapi.responses import JSONResponse
|
14
13
|
from fastapi.middleware.cors import CORSMiddleware
|
15
|
-
from screeninfo import get_monitors
|
16
14
|
from computer_use_ootb_internal.computer_use_demo.tools.computer import get_screen_details
|
17
15
|
from computer_use_ootb_internal.run_teachmode_ootb_args import simple_teachmode_sampling_loop
|
18
16
|
|
@@ -170,7 +168,7 @@ async def update_parameters(request: Request):
|
|
170
168
|
|
171
169
|
# Update shared state when parameters change
|
172
170
|
shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
|
173
|
-
shared_state.task = getattr(shared_state.args, 'task', "
|
171
|
+
shared_state.task = getattr(shared_state.args, 'task', "Following the instructions to complete the task.")
|
174
172
|
shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
|
175
173
|
shared_state.user_id = getattr(shared_state.args, 'user_id', "hero_cases")
|
176
174
|
shared_state.trace_id = getattr(shared_state.args, 'trace_id', "build_scroll_combat")
|
@@ -227,7 +225,7 @@ async def get_messages(request: Request):
|
|
227
225
|
status_code=429
|
228
226
|
)
|
229
227
|
|
230
|
-
log_ootb_request(shared_state.server_url, "get_messages", {})
|
228
|
+
# log_ootb_request(shared_state.server_url, "get_messages", {})
|
231
229
|
|
232
230
|
# Return all messages in the queue and clear it
|
233
231
|
messages = shared_state.message_queue.copy()
|
@@ -338,7 +336,7 @@ async def get_status(request: Request):
|
|
338
336
|
status_code=429
|
339
337
|
)
|
340
338
|
|
341
|
-
log_ootb_request(shared_state.server_url, "get_status", {})
|
339
|
+
# log_ootb_request(shared_state.server_url, "get_status", {})
|
342
340
|
|
343
341
|
print(f"Status check - Processing: {shared_state.is_processing}, Paused: {shared_state.is_paused}")
|
344
342
|
return JSONResponse(
|
@@ -393,14 +391,8 @@ def process_input():
|
|
393
391
|
print("Processing stopped while paused or resuming")
|
394
392
|
break
|
395
393
|
|
396
|
-
|
397
|
-
|
398
|
-
message = {"role": "user", "content": loop_msg}
|
399
|
-
else:
|
400
|
-
message = {"role": "assistant", "content": loop_msg}
|
401
|
-
|
402
|
-
shared_state.chatbot_messages.append(message)
|
403
|
-
shared_state.message_queue.append(message)
|
394
|
+
shared_state.chatbot_messages.append(loop_msg)
|
395
|
+
shared_state.message_queue.append(loop_msg)
|
404
396
|
|
405
397
|
# Short sleep to allow stop signals to be processed
|
406
398
|
for _ in range(5): # Check 5 times per second
|
@@ -416,17 +408,17 @@ def process_input():
|
|
416
408
|
# Handle any exceptions in the processing loop
|
417
409
|
error_msg = f"Error during task processing: {str(e)}"
|
418
410
|
print(error_msg)
|
419
|
-
error_message = {"role": "assistant", "content": error_msg}
|
411
|
+
error_message = {"role": "assistant", "content": error_msg, "type": "error"}
|
420
412
|
shared_state.message_queue.append(error_message)
|
421
413
|
|
422
414
|
finally:
|
423
415
|
# Handle completion or interruption
|
424
416
|
if shared_state.should_stop or shared_state.stop_event.is_set():
|
425
417
|
stop_msg = f"Task '{shared_state.task}' was stopped. Ready for new tasks."
|
426
|
-
final_message = {"role": "assistant", "content": stop_msg}
|
418
|
+
final_message = {"role": "assistant", "content": stop_msg, "type": "text"}
|
427
419
|
else:
|
428
420
|
complete_msg = f"Task '{shared_state.task}' completed. Thanks for using Teachmode-OOTB."
|
429
|
-
final_message = {"role": "assistant", "content": complete_msg}
|
421
|
+
final_message = {"role": "assistant", "content": complete_msg, "type": "text"}
|
430
422
|
|
431
423
|
shared_state.chatbot_messages.append(final_message)
|
432
424
|
shared_state.message_queue.append(final_message)
|
@@ -42,7 +42,7 @@ async def update_parameters(request: Request):
|
|
42
42
|
shared_state.task_updated = True
|
43
43
|
|
44
44
|
# Update shared state when parameters change
|
45
|
-
shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
|
45
|
+
# shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
|
46
46
|
shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
|
47
47
|
shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
|
48
48
|
shared_state.user_id = getattr(shared_state.args, 'user_id', "a_test")
|
@@ -64,7 +64,7 @@ def show_click(x: int, y: int, duration_ms: int = 800, existing_ms: int = 800):
|
|
64
64
|
).start()
|
65
65
|
|
66
66
|
def show_move_to(x1: int, y1: int, x2: int, y2: int,
|
67
|
-
duration_ms: int =
|
67
|
+
duration_ms: int = 800, existing_ms: int = 800):
|
68
68
|
if not CLICK_GIF.exists():
|
69
69
|
raise FileNotFoundError(f"GIF not found at {CLICK_GIF}")
|
70
70
|
mp.get_context("spawn").Process(
|
@@ -3,13 +3,9 @@ import json
|
|
3
3
|
import asyncio
|
4
4
|
from typing import Any, Dict, cast, List, Union
|
5
5
|
import uuid
|
6
|
-
from anthropic.types.beta import
|
7
|
-
|
8
|
-
|
9
|
-
BetaToolResultBlockParam,
|
10
|
-
)
|
11
|
-
from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
|
12
|
-
from computer_use_ootb_internal.computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
|
6
|
+
from anthropic.types.beta import BetaToolUseBlock
|
7
|
+
from computer_use_ootb_internal.computer_use_demo.tools import ComputerTool, ToolCollection
|
8
|
+
from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError
|
13
9
|
|
14
10
|
|
15
11
|
class TeachmodeExecutor:
|
@@ -48,12 +44,13 @@ class TeachmodeExecutor:
|
|
48
44
|
|
49
45
|
|
50
46
|
|
51
|
-
def __call__(self,
|
52
|
-
|
53
|
-
# response is expected to be
|
47
|
+
def __call__(self, response: str):
|
48
|
+
|
49
|
+
# response is expected to be:
|
54
50
|
# {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
|
55
51
|
|
56
|
-
|
52
|
+
# str -> dict
|
53
|
+
action_dict = self._format_actor_output(response)
|
57
54
|
|
58
55
|
actions = action_dict["content"]
|
59
56
|
|
@@ -72,13 +69,9 @@ class TeachmodeExecutor:
|
|
72
69
|
|
73
70
|
print("Parsed Action List:", action_list)
|
74
71
|
|
75
|
-
tool_result_content = None
|
76
|
-
|
77
72
|
if action_list is not None and len(action_list) > 0:
|
78
73
|
|
79
|
-
for action in action_list:
|
80
|
-
|
81
|
-
tool_result_content: list[BetaToolResultBlockParam] = []
|
74
|
+
for action in action_list:
|
82
75
|
|
83
76
|
# self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
|
84
77
|
print("Converted Action:", action)
|
@@ -86,23 +79,28 @@ class TeachmodeExecutor:
|
|
86
79
|
sim_content_block = BetaToolUseBlock(
|
87
80
|
id=f'toolu_{uuid.uuid4()}',
|
88
81
|
input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
|
89
|
-
name='computer',
|
82
|
+
name='computer',
|
83
|
+
type='tool_use'
|
84
|
+
)
|
90
85
|
|
91
86
|
# Run the asynchronous tool execution in a synchronous context
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
87
|
+
tool_result = asyncio.run(
|
88
|
+
self.tool_collection.run(
|
89
|
+
name=sim_content_block.name,
|
90
|
+
tool_input=cast(dict[str, Any], sim_content_block.input),
|
91
|
+
))
|
96
92
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
93
|
+
if isinstance(tool_result, ToolResult):
|
94
|
+
print(f"[teachmode_executor] tool_result: {tool_result}")
|
95
|
+
tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "action", "action_type": tool_result['base_type']}
|
96
|
+
yield tool_result_message
|
97
|
+
|
98
|
+
elif isinstance(tool_result, ToolError):
|
99
|
+
tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "error"}
|
100
|
+
yield tool_result_message
|
101
|
+
|
102
|
+
return tool_result_message
|
103
|
+
|
106
104
|
|
107
105
|
def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
|
108
106
|
if type(action_output) == dict:
|
@@ -172,8 +170,8 @@ class TeachmodeExecutor:
|
|
172
170
|
|
173
171
|
elif action_item["action"] == "PRESS": # 7. press
|
174
172
|
x, y = action_item["position"]
|
175
|
-
action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
|
176
|
-
|
173
|
+
# action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
|
174
|
+
# int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
|
177
175
|
refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
|
178
176
|
refined_output.append({"action": "left_press", "text": None, "coordinate": None})
|
179
177
|
|
@@ -316,43 +314,43 @@ class TeachmodeExecutor:
|
|
316
314
|
|
317
315
|
|
318
316
|
|
319
|
-
def _make_api_tool_result(
|
320
|
-
|
321
|
-
) -> BetaToolResultBlockParam:
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
|
356
|
-
|
357
|
-
|
358
|
-
|
317
|
+
# def _make_api_tool_result(
|
318
|
+
# result: ToolResult, tool_use_id: str
|
319
|
+
# ) -> BetaToolResultBlockParam:
|
320
|
+
# """Convert an agent ToolResult to an API ToolResultBlockParam."""
|
321
|
+
# tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
|
322
|
+
# is_error = False
|
323
|
+
# if result.error:
|
324
|
+
# is_error = True
|
325
|
+
# tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
|
326
|
+
# else:
|
327
|
+
# if result.output:
|
328
|
+
# tool_result_content.append(
|
329
|
+
# {
|
330
|
+
# "type": "text",
|
331
|
+
# "text": _maybe_prepend_system_tool_result(result, result.output),
|
332
|
+
# }
|
333
|
+
# )
|
334
|
+
# if result.base64_image:
|
335
|
+
# tool_result_content.append(
|
336
|
+
# {
|
337
|
+
# "type": "image",
|
338
|
+
# "source": {
|
339
|
+
# "type": "base64",
|
340
|
+
# "media_type": "image/png",
|
341
|
+
# "data": result.base64_image,
|
342
|
+
# },
|
343
|
+
# }
|
344
|
+
# )
|
345
|
+
# return {
|
346
|
+
# "type": "tool_result",
|
347
|
+
# "content": tool_result_content,
|
348
|
+
# "tool_use_id": tool_use_id,
|
349
|
+
# "is_error": is_error,
|
350
|
+
# }
|
351
|
+
|
352
|
+
|
353
|
+
# def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
|
354
|
+
# if result.system:
|
355
|
+
# result_text = f"<system>{result.system}</system>\n{result_text}"
|
356
|
+
# return result_text
|
@@ -4,6 +4,13 @@ import datetime
|
|
4
4
|
import numpy as np
|
5
5
|
import cv2
|
6
6
|
|
7
|
+
def get_screen_resize_factor():
|
8
|
+
# import ctypes
|
9
|
+
# scaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
|
10
|
+
# scaleFactor = str(scaleFactor) + "x"
|
11
|
+
# return scaleFactor
|
12
|
+
return "1.0x"
|
13
|
+
|
7
14
|
|
8
15
|
def multivalue_image(img, mode='None', thresholds=None, interval_values=None, save=True, cache_folder=None):
|
9
16
|
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
@@ -28,6 +28,7 @@ class ToolResult:
|
|
28
28
|
error: str | None = None
|
29
29
|
base64_image: str | None = None
|
30
30
|
system: str | None = None
|
31
|
+
base_type: str | None = None
|
31
32
|
|
32
33
|
def __bool__(self):
|
33
34
|
return any(getattr(self, field.name) for field in fields(self))
|
@@ -65,5 +66,6 @@ class ToolFailure(ToolResult):
|
|
65
66
|
class ToolError(Exception):
|
66
67
|
"""Raised when a tool encounters an error."""
|
67
68
|
|
68
|
-
def __init__(self,
|
69
|
-
self.
|
69
|
+
def __init__(self, output: str, base_type: str):
|
70
|
+
self.output = output
|
71
|
+
self.base_type = base_type
|
@@ -217,13 +217,13 @@ class ComputerTool(BaseAnthropicTool):
|
|
217
217
|
|
218
218
|
if action in ("mouse_move", "left_click_drag"):
|
219
219
|
if coordinate is None:
|
220
|
-
raise ToolError(f"coordinate is required for {action}")
|
220
|
+
raise ToolError(output=f"coordinate is required for {action}", base_type="error")
|
221
221
|
if text is not None:
|
222
|
-
raise ToolError(f"text is not accepted for {action}")
|
222
|
+
raise ToolError(output=f"text is not accepted for {action}", base_type="error")
|
223
223
|
if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
|
224
|
-
raise ToolError(f"{coordinate} must be a tuple of length 2")
|
224
|
+
raise ToolError(output=f"{coordinate} must be a tuple of length 2", base_type="error")
|
225
225
|
if not all(isinstance(i, int) for i in coordinate):
|
226
|
-
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
|
226
|
+
raise ToolError(output=f"{coordinate} must be a tuple of non-negative ints", base_type="error")
|
227
227
|
|
228
228
|
if self.is_scaling:
|
229
229
|
x, y = self.scale_coordinates(
|
@@ -237,21 +237,22 @@ class ComputerTool(BaseAnthropicTool):
|
|
237
237
|
|
238
238
|
if action == "mouse_move":
|
239
239
|
pyautogui.moveTo(x, y)
|
240
|
-
return ToolResult(output=f"
|
240
|
+
return ToolResult(output=f"Mouse move", base_type="move")
|
241
|
+
|
241
242
|
elif action == "left_click_drag":
|
242
243
|
current_x, current_y = pyautogui.position()
|
243
244
|
pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
|
244
|
-
return ToolResult(output=f"
|
245
|
+
return ToolResult(output=f"Mouse drag", base_type="move")
|
245
246
|
|
246
247
|
# Action Type 2: Required text (keynames)
|
247
248
|
# Actions: key, type, key_down, key_up
|
248
249
|
if action in ("key", "type", "key_down", "key_up"):
|
249
250
|
if text is None:
|
250
|
-
raise ToolError(f"text is required for {action}")
|
251
|
+
raise ToolError(output=f"text is required for {action}", base_type="error")
|
251
252
|
if coordinate is not None:
|
252
|
-
raise ToolError(f"coordinate is not accepted for {action}")
|
253
|
+
raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
|
253
254
|
if not isinstance(text, str):
|
254
|
-
raise ToolError(output=f"{text} must be a string")
|
255
|
+
raise ToolError(output=f"{text} must be a string", base_type="error")
|
255
256
|
|
256
257
|
if action == "key":
|
257
258
|
# Handle key combinations
|
@@ -264,19 +265,19 @@ class ComputerTool(BaseAnthropicTool):
|
|
264
265
|
key = self.key_conversion.get(key.strip(), key.strip())
|
265
266
|
key = key.lower()
|
266
267
|
pyautogui.keyUp(key) # Release each key in reverse order
|
267
|
-
return ToolResult(output=f"
|
268
|
+
return ToolResult(output=f"Press key '{text}'", base_type="key")
|
268
269
|
|
269
270
|
elif action == "key_down":
|
270
271
|
pyautogui.keyDown(text)
|
271
|
-
return ToolResult(output=f"
|
272
|
+
return ToolResult(output=f"Press key '{text}'", base_type="key")
|
272
273
|
elif action == "key_up":
|
273
274
|
pyautogui.keyUp(text)
|
274
|
-
return ToolResult(output=f"
|
275
|
+
return ToolResult(output=f"Release key '{text}'", base_type="key")
|
275
276
|
|
276
277
|
elif action == "type":
|
277
278
|
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
|
278
|
-
screenshot_base64 = (await self.screenshot()).base64_image
|
279
|
-
return ToolResult(output=text, base64_image=screenshot_base64)
|
279
|
+
# screenshot_base64 = (await self.screenshot()).base64_image
|
280
|
+
return ToolResult(output=f"Type '{text}'", base_type="type") # base64_image=screenshot_base64)
|
280
281
|
|
281
282
|
# Action Type 3: No required text or coordinates
|
282
283
|
# Actions: left_click, right_click, double_click, middle_click, left_press, scroll_down, scroll_up
|
@@ -291,76 +292,81 @@ class ComputerTool(BaseAnthropicTool):
|
|
291
292
|
"wait",
|
292
293
|
):
|
293
294
|
if text is not None:
|
294
|
-
raise ToolError(f"text is not accepted for {action}")
|
295
|
+
raise ToolError(output=f"text is not accepted for {action}", base_type="error")
|
295
296
|
if coordinate is not None:
|
296
|
-
raise ToolError(f"coordinate is not accepted for {action}")
|
297
|
+
raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
|
297
298
|
|
298
299
|
if action == "left_click":
|
299
300
|
x, y = pyautogui.position()
|
300
301
|
show_click(x, y)
|
301
302
|
pyautogui.click()
|
303
|
+
return ToolResult(output="Left click", base_type="click")
|
302
304
|
elif action == "right_click":
|
303
305
|
x, y = pyautogui.position()
|
304
306
|
show_click(x, y)
|
305
307
|
pyautogui.rightClick()
|
308
|
+
return ToolResult(output="Right click", base_type="click")
|
306
309
|
elif action == "middle_click":
|
307
310
|
x, y = pyautogui.position()
|
308
311
|
show_click(x, y)
|
309
312
|
pyautogui.middleClick()
|
313
|
+
return ToolResult(output="Middle click", base_type="click")
|
310
314
|
elif action == "double_click":
|
311
315
|
x, y = pyautogui.position()
|
312
316
|
show_click(x, y)
|
313
317
|
pyautogui.doubleClick()
|
318
|
+
return ToolResult(output="Double click", base_type="click")
|
314
319
|
elif action == "left_press":
|
315
320
|
x, y = pyautogui.position()
|
316
321
|
show_click(x, y)
|
317
322
|
pyautogui.mouseDown()
|
318
323
|
time.sleep(1)
|
319
324
|
pyautogui.mouseUp()
|
325
|
+
return ToolResult(output="Left press", base_type="click")
|
320
326
|
elif action == "scroll_down":
|
321
327
|
pyautogui.scroll(-200) # Adjust scroll amount as needed
|
322
|
-
return ToolResult(output="Scrolled down")
|
328
|
+
return ToolResult(output="Scrolled down", base_type="scroll")
|
323
329
|
|
324
330
|
elif action == "scroll_up":
|
325
331
|
pyautogui.scroll(200) # Adjust scroll amount as needed
|
326
|
-
return ToolResult(output="Scrolled up")
|
332
|
+
return ToolResult(output="Scrolled up", base_type="scroll")
|
327
333
|
|
328
334
|
elif action == "wait":
|
329
335
|
time.sleep(15)
|
330
|
-
return ToolResult(output="
|
336
|
+
return ToolResult(output="Wait for next event", base_type="wait")
|
331
337
|
|
332
|
-
return ToolResult(output=f"Performed {action}")
|
338
|
+
return ToolResult(output=f"Performed {action}", base_type="unknown")
|
333
339
|
|
334
340
|
# Action Type 4: Miscs. No required text or coordinates
|
335
341
|
# Actions: screenshot, cursor_position
|
336
342
|
if action in ("screenshot", "cursor_position"):
|
337
343
|
if text is not None:
|
338
|
-
raise ToolError(f"text is not accepted for {action}")
|
344
|
+
raise ToolError(output=f"text is not accepted for {action}", base_type="error")
|
339
345
|
if coordinate is not None:
|
340
|
-
raise ToolError(f"coordinate is not accepted for {action}")
|
346
|
+
raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
|
341
347
|
if action == "screenshot":
|
342
348
|
return await self.screenshot()
|
343
349
|
elif action == "cursor_position":
|
344
350
|
x, y = pyautogui.position()
|
345
|
-
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
|
346
|
-
return ToolResult(output=f"
|
351
|
+
# x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
|
352
|
+
return ToolResult(output=f"Cursor position ({x},{y})", base_type="unknown")
|
347
353
|
|
348
354
|
# Action Type 5: StarRail Mode
|
349
355
|
# Actions: sr_scroll_down, sr_scroll_up
|
350
356
|
if action in ("sr_scroll_down", "sr_scroll_up"):
|
351
357
|
if text is not None:
|
352
|
-
raise ToolError(f"text is not accepted for {action}")
|
358
|
+
raise ToolError(output=f"text is not accepted for {action}", base_type="error")
|
353
359
|
|
354
360
|
if action == "sr_scroll_down":
|
355
361
|
for _ in range(20):
|
356
362
|
pyautogui.scroll(-100) # Adjust scroll amount as needed
|
357
363
|
time.sleep(0.001)
|
358
|
-
return ToolResult(output="
|
364
|
+
return ToolResult(output="Scroll down", base_type="scroll")
|
359
365
|
elif action == "sr_scroll_up":
|
360
366
|
for _ in range(20):
|
361
367
|
pyautogui.scroll(100) # Adjust scroll amount as needed
|
362
368
|
time.sleep(0.001)
|
363
|
-
return ToolResult(output="
|
369
|
+
return ToolResult(output="Scroll up", base_type="scroll")
|
364
370
|
|
365
371
|
# starrail browser mode
|
366
372
|
if action in ("left_click_windll", "mouse_move_windll", "right_click_windll", "key_down_windll", "key_up_windll"):
|
@@ -374,10 +380,11 @@ class ComputerTool(BaseAnthropicTool):
|
|
374
380
|
y = coordinate[1]+self.offset_y
|
375
381
|
show_click(x, y)
|
376
382
|
self.marbot_auto_gui.click(x=x, y=y)
|
383
|
+
return ToolResult(output=f"Left click", base_type="click")
|
377
384
|
|
378
385
|
elif action == "mouse_move_windll":
|
379
386
|
if coordinate is None:
|
380
|
-
raise ToolError(f"coordinate is required for {action}")
|
387
|
+
raise ToolError(output=f"coordinate is required for {action}", base_type="error")
|
381
388
|
|
382
389
|
x0, y0 = pyautogui.position()
|
383
390
|
# x0, y0 = self.scale_coordinates(ScalingSource.COMPUTER, x0, y0)
|
@@ -386,16 +393,21 @@ class ComputerTool(BaseAnthropicTool):
|
|
386
393
|
|
387
394
|
show_move_to(x0, y0, x1, y1, duration_ms=1000)
|
388
395
|
self.marbot_auto_gui.moveTo(x=x1, y=y1)
|
389
|
-
|
396
|
+
|
397
|
+
return ToolResult(output=f"Mouse move", base_type="move")
|
398
|
+
|
390
399
|
# elif action == "right_click_windll":
|
391
400
|
# self.marbot_auto_gui.rightClick(x=coordinate[0], y=coordinate[1])
|
392
401
|
elif action == "key_down_windll":
|
393
402
|
self.marbot_auto_gui.keyDown(text)
|
403
|
+
return ToolResult(output=f"Key down '{text}'", base_type="key")
|
394
404
|
elif action == "key_up_windll":
|
395
405
|
self.marbot_auto_gui.keyUp(text)
|
396
|
-
|
406
|
+
return ToolResult(output=f"Key up '{text}'", base_type="key")
|
407
|
+
|
408
|
+
return ToolResult(output=f"Performed dll action:{action}", base_type="unknown")
|
397
409
|
|
398
|
-
raise ToolError(f"Invalid action: {action}")
|
410
|
+
raise ToolError(output=f"Invalid action: {action}", base_type="error")
|
399
411
|
|
400
412
|
|
401
413
|
async def screenshot(self):
|
@@ -486,9 +498,9 @@ class ComputerTool(BaseAnthropicTool):
|
|
486
498
|
|
487
499
|
if path.exists():
|
488
500
|
# Return a ToolResult instance instead of a dictionary
|
489
|
-
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
|
501
|
+
return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode(), base_type="screenshot")
|
490
502
|
|
491
|
-
raise ToolError(f"Failed to take screenshot: {path} does not exist.")
|
503
|
+
raise ToolError(output=f"Failed to take screenshot: {path} does not exist.", base_type="error")
|
492
504
|
|
493
505
|
def padding_image(self, screenshot):
|
494
506
|
"""Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
|
@@ -500,17 +512,17 @@ class ComputerTool(BaseAnthropicTool):
|
|
500
512
|
padding_image.paste(screenshot, (0, 0))
|
501
513
|
return padding_image
|
502
514
|
|
503
|
-
async def shell(self, command: str, take_screenshot=True) -> ToolResult:
|
504
|
-
|
505
|
-
|
506
|
-
|
515
|
+
# async def shell(self, command: str, take_screenshot=True) -> ToolResult:
|
516
|
+
# """Run a shell command and return the output, error, and optionally a screenshot."""
|
517
|
+
# _, stdout, stderr = await run(command)
|
518
|
+
# base64_image = None
|
507
519
|
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
520
|
+
# if take_screenshot:
|
521
|
+
# # delay to let things settle before taking a screenshot
|
522
|
+
# await asyncio.sleep(self._screenshot_delay)
|
523
|
+
# base64_image = (await self.screenshot()).base64_image
|
512
524
|
|
513
|
-
|
525
|
+
# return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
|
514
526
|
|
515
527
|
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
|
516
528
|
"""Scale coordinates to a target maximum resolution."""
|
@@ -538,7 +550,7 @@ class ComputerTool(BaseAnthropicTool):
|
|
538
550
|
y_scaling_factor = target_dimension["height"] / self.height
|
539
551
|
if source == ScalingSource.API:
|
540
552
|
if x > self.width or y > self.height:
|
541
|
-
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
|
553
|
+
raise ToolError(output=f"Coordinates {x}, {y} are out of bounds", base_type="error")
|
542
554
|
# scale up
|
543
555
|
return round(x / x_scaling_factor), round(y / y_scaling_factor)
|
544
556
|
# scale down
|