computer-use-ootb-internal 0.0.108__py3-none-any.whl → 0.0.109.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (17)
  1. computer_use_ootb_internal/app_teachmode.py +8 -16
  2. computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
  3. computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
  4. computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
  5. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/utils.py +7 -0
  6. computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
  7. computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
  8. computer_use_ootb_internal/run_teachmode_ootb_args.py +24 -13
  9. {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/METADATA +1 -1
  10. {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/RECORD +12 -17
  11. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
  12. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
  13. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
  14. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
  15. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
  16. {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/WHEEL +0 -0
  17. {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.post1.dist-info}/entry_points.txt +0 -0
@@ -5,14 +5,12 @@ from datetime import datetime
 import threading
 import requests
 import platform # Add platform import
-import subprocess # Add subprocess import
 import pyautogui # Add pyautogui import
 import webbrowser # Add webbrowser import
 import os # Import os for path joining
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from screeninfo import get_monitors
 from computer_use_ootb_internal.computer_use_demo.tools.computer import get_screen_details
 from computer_use_ootb_internal.run_teachmode_ootb_args import simple_teachmode_sampling_loop
 
@@ -170,7 +168,7 @@ async def update_parameters(request: Request):
 
     # Update shared state when parameters change
    shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
-    shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
+    shared_state.task = getattr(shared_state.args, 'task', "Following the instructions to complete the task.")
    shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
    shared_state.user_id = getattr(shared_state.args, 'user_id', "hero_cases")
    shared_state.trace_id = getattr(shared_state.args, 'trace_id', "build_scroll_combat")
@@ -227,7 +225,7 @@ async def get_messages(request: Request):
            status_code=429
        )
 
-    log_ootb_request(shared_state.server_url, "get_messages", {})
+    # log_ootb_request(shared_state.server_url, "get_messages", {})
 
    # Return all messages in the queue and clear it
    messages = shared_state.message_queue.copy()
@@ -338,7 +336,7 @@ async def get_status(request: Request):
            status_code=429
        )
 
-    log_ootb_request(shared_state.server_url, "get_status", {})
+    # log_ootb_request(shared_state.server_url, "get_status", {})
 
    print(f"Status check - Processing: {shared_state.is_processing}, Paused: {shared_state.is_paused}")
    return JSONResponse(
@@ -393,14 +391,8 @@ def process_input():
                print("Processing stopped while paused or resuming")
                break
 
-            # Process the message
-            if loop_msg.startswith('<img'):
-                message = {"role": "user", "content": loop_msg}
-            else:
-                message = {"role": "assistant", "content": loop_msg}
-
-            shared_state.chatbot_messages.append(message)
-            shared_state.message_queue.append(message)
+            shared_state.chatbot_messages.append(loop_msg)
+            shared_state.message_queue.append(loop_msg)
 
            # Short sleep to allow stop signals to be processed
            for _ in range(5): # Check 5 times per second
@@ -416,17 +408,17 @@ def process_input():
        # Handle any exceptions in the processing loop
        error_msg = f"Error during task processing: {str(e)}"
        print(error_msg)
-        error_message = {"role": "assistant", "content": error_msg}
+        error_message = {"role": "assistant", "content": error_msg, "type": "error"}
        shared_state.message_queue.append(error_message)
 
    finally:
        # Handle completion or interruption
        if shared_state.should_stop or shared_state.stop_event.is_set():
            stop_msg = f"Task '{shared_state.task}' was stopped. Ready for new tasks."
-            final_message = {"role": "assistant", "content": stop_msg}
+            final_message = {"role": "assistant", "content": stop_msg, "type": "text"}
        else:
            complete_msg = f"Task '{shared_state.task}' completed. Thanks for using Teachmode-OOTB."
-            final_message = {"role": "assistant", "content": complete_msg}
+            final_message = {"role": "assistant", "content": complete_msg, "type": "text"}
 
        shared_state.chatbot_messages.append(final_message)
        shared_state.message_queue.append(final_message)
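
With these changes, every entry pushed onto `shared_state.message_queue` is a pre-shaped chat dict tagged with a `"type"` key: `"text"` for normal completions, `"error"` for failures, plus the executor's `"action"` messages shown later in this diff. A minimal sketch of a consumer, assuming only the shapes visible in this diff; `render_message` is illustrative, not part of the package:

```python
# Hypothetical consumer of the queue entries above; the "type" values
# ("text", "error", "action") are taken from this diff, not a documented schema.
def render_message(msg: dict) -> str:
    role = msg.get("role", "assistant")
    kind = msg.get("type", "text")
    if kind == "error":
        return f"[{role}] ERROR: {msg['content']}"
    if kind == "action":
        return f"[{role}] ({msg.get('action_type', 'unknown')}) {msg['content']}"
    return f"[{role}] {msg['content']}"

print(render_message({"role": "assistant", "content": "Task 'demo' completed.", "type": "text"}))
```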
@@ -42,7 +42,7 @@ async def update_parameters(request: Request):
    shared_state.task_updated = True
 
    # Update shared state when parameters change
-    shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
+    # shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
    shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
    shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
    shared_state.user_id = getattr(shared_state.args, 'user_id', "a_test")
@@ -64,7 +64,7 @@ def show_click(x: int, y: int, duration_ms: int = 800, existing_ms: int = 800):
    ).start()
 
 def show_move_to(x1: int, y1: int, x2: int, y2: int,
-                 duration_ms: int = 1000, existing_ms: int = 800):
+                 duration_ms: int = 800, existing_ms: int = 800):
    if not CLICK_GIF.exists():
        raise FileNotFoundError(f"GIF not found at {CLICK_GIF}")
    mp.get_context("spawn").Process(
@@ -3,13 +3,9 @@ import json
 import asyncio
 from typing import Any, Dict, cast, List, Union
 import uuid
-from anthropic.types.beta import (
-    BetaImageBlockParam,
-    BetaTextBlockParam,
-    BetaToolResultBlockParam,
-)
-from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
-from computer_use_ootb_internal.computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+from anthropic.types.beta import BetaToolUseBlock
+from computer_use_ootb_internal.computer_use_demo.tools import ComputerTool, ToolCollection
+from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError
 
 
 class TeachmodeExecutor:
  class TeachmodeExecutor:
@@ -48,12 +44,13 @@ class TeachmodeExecutor:
48
44
 
49
45
 
50
46
 
51
- def __call__(self,
52
- response: str):
53
- # response is expected to be :
47
+ def __call__(self, response: str):
48
+
49
+ # response is expected to be:
54
50
  # {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
55
51
 
56
- action_dict = self._format_actor_output(response) # str -> dict
52
+ # str -> dict
53
+ action_dict = self._format_actor_output(response)
57
54
 
58
55
  actions = action_dict["content"]
59
56
 
@@ -72,13 +69,9 @@ class TeachmodeExecutor:
 
        print("Parsed Action List:", action_list)
 
-        tool_result_content = None
-
        if action_list is not None and len(action_list) > 0:
 
-            for action in action_list: # Execute the tool (adapting the code from anthropic_executor.py)
-
-                tool_result_content: list[BetaToolResultBlockParam] = []
+            for action in action_list:
 
                # self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
                print("Converted Action:", action)
@@ -86,23 +79,28 @@ class TeachmodeExecutor:
                sim_content_block = BetaToolUseBlock(
                    id=f'toolu_{uuid.uuid4()}',
                    input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
-                    name='computer', type='tool_use')
+                    name='computer',
+                    type='tool_use'
+                )
 
                # Run the asynchronous tool execution in a synchronous context
-                result = asyncio.run(self.tool_collection.run(
-                    name=sim_content_block.name,
-                    tool_input=cast(dict[str, Any], sim_content_block.input),
-                ))
+                tool_result = asyncio.run(
+                    self.tool_collection.run(
+                        name=sim_content_block.name,
+                        tool_input=cast(dict[str, Any], sim_content_block.input),
+                    ))
 
-                tool_result_content.append(
-                    _make_api_tool_result(result, sim_content_block.id)
-                )
-                print(f"[teachmode_executor] tool_result_content: {tool_result_content}")
-
-                yield tool_result_content[0]['content'][0]['text']
-
-        return tool_result_content[0]['content'][0]['text']
-
+                if isinstance(tool_result, ToolResult):
+                    print(f"[teachmode_executor] tool_result: {tool_result}")
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "action", "action_type": tool_result['base_type']}
+                    yield tool_result_message
+
+                elif isinstance(tool_result, ToolError):
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "error"}
+                    yield tool_result_message
+
+        return tool_result_message
+
 
    def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
        if type(action_output) == dict:
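
After this hunk, `TeachmodeExecutor.__call__` is a generator that yields one structured message per executed action instead of raw tool-result text. A rough driving loop, assuming `executor` is an already-constructed `TeachmodeExecutor` and `response` is the actor output string described in the comment inside `__call__` (both names are illustrative):

```python
# Sketch only: each yielded dict carries role/content plus "type"
# ("action" or "error") and, for actions, "action_type" taken from
# the ToolResult's base_type field introduced in this release.
for message in executor(response):
    if message["type"] == "error":
        print("tool error:", message["content"])
        break
    print(f"executed {message['action_type']}: {message['content']}")
```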
@@ -172,8 +170,8 @@ class TeachmodeExecutor:
 
            elif action_item["action"] == "PRESS": # 7. press
                x, y = action_item["position"]
-                action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
-                                           int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
+                # action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
+                #                            int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
                refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
                refined_output.append({"action": "left_press", "text": None, "coordinate": None})
 
@@ -316,43 +314,43 @@ class TeachmodeExecutor:
 
 
 
-def _make_api_tool_result(
-    result: ToolResult, tool_use_id: str
-) -> BetaToolResultBlockParam:
-    """Convert an agent ToolResult to an API ToolResultBlockParam."""
-    tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
-    is_error = False
-    if result.error:
-        is_error = True
-        tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
-    else:
-        if result.output:
-            tool_result_content.append(
-                {
-                    "type": "text",
-                    "text": _maybe_prepend_system_tool_result(result, result.output),
-                }
-            )
-        if result.base64_image:
-            tool_result_content.append(
-                {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": result.base64_image,
-                    },
-                }
-            )
-    return {
-        "type": "tool_result",
-        "content": tool_result_content,
-        "tool_use_id": tool_use_id,
-        "is_error": is_error,
-    }
-
-
-def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
-    if result.system:
-        result_text = f"<system>{result.system}</system>\n{result_text}"
-    return result_text
+# def _make_api_tool_result(
+#     result: ToolResult, tool_use_id: str
+# ) -> BetaToolResultBlockParam:
+#     """Convert an agent ToolResult to an API ToolResultBlockParam."""
+#     tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
+#     is_error = False
+#     if result.error:
+#         is_error = True
+#         tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
+#     else:
+#         if result.output:
+#             tool_result_content.append(
+#                 {
+#                     "type": "text",
+#                     "text": _maybe_prepend_system_tool_result(result, result.output),
+#                 }
+#             )
+#         if result.base64_image:
+#             tool_result_content.append(
+#                 {
+#                     "type": "image",
+#                     "source": {
+#                         "type": "base64",
+#                         "media_type": "image/png",
+#                         "data": result.base64_image,
+#                     },
+#                 }
+#             )
+#     return {
+#         "type": "tool_result",
+#         "content": tool_result_content,
+#         "tool_use_id": tool_use_id,
+#         "is_error": is_error,
+#     }
+
+
+# def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
+#     if result.system:
+#         result_text = f"<system>{result.system}</system>\n{result_text}"
+#     return result_text
@@ -4,6 +4,13 @@ import datetime
 import numpy as np
 import cv2
 
+def get_screen_resize_factor():
+    # import ctypes
+    # scaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
+    # scaleFactor = str(scaleFactor) + "x"
+    # return scaleFactor
+    return "1.0x"
+
 
 def multivalue_image(img, mode='None', thresholds=None, interval_values=None, save=True, cache_folder=None):
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
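
The new `get_screen_resize_factor` stub always reports `"1.0x"`; the commented-out lines show the Windows-only DPI query it sidesteps. If that path were re-enabled, a platform-guarded version might look like the following sketch. `ctypes.windll.shcore.GetScaleFactorForDevice` is a real Windows API (shcore.dll, Windows 8.1+) and is exactly what the commented code calls; the wrapper itself is hypothetical:

```python
import platform

def get_screen_resize_factor_dpi_aware() -> str:
    """Hypothetical DPI-aware variant of the stub above (not in the package)."""
    if platform.system() == "Windows":
        import ctypes
        # GetScaleFactorForDevice(0) returns e.g. 150 for 150% display scaling
        scale_factor = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
        return f"{scale_factor}x"
    # Other platforms fall back to the value this release hardcodes
    return "1.0x"
```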
@@ -28,6 +28,7 @@ class ToolResult:
    error: str | None = None
    base64_image: str | None = None
    system: str | None = None
+    base_type: str | None = None
 
    def __bool__(self):
        return any(getattr(self, field.name) for field in fields(self))
@@ -65,5 +66,6 @@ class ToolFailure(ToolResult):
 class ToolError(Exception):
    """Raised when a tool encounters an error."""
 
-    def __init__(self, message):
-        self.message = message
+    def __init__(self, output: str, base_type: str):
+        self.output = output
+        self.base_type = base_type
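
`ToolError` now mirrors `ToolResult`'s new fields: both carry `output` and `base_type`, so successes and failures can be rendered the same way downstream. Note the old `message` attribute is gone, so any caller still formatting `ToolError.message` needs updating. A small usage sketch of the new signature; the `validate_coordinate` helper is illustrative, not part of the package:

```python
from computer_use_ootb_internal.computer_use_demo.tools.base import ToolError

def validate_coordinate(coordinate):  # illustrative helper
    if coordinate is None:
        raise ToolError(output="coordinate is required for mouse_move", base_type="error")

try:
    validate_coordinate(None)
except ToolError as err:
    # err.output / err.base_type replace the old single `message` attribute
    print(f"[{err.base_type}] {err.output}")
```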
@@ -217,13 +217,13 @@ class ComputerTool(BaseAnthropicTool):
 
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
-                raise ToolError(f"coordinate is required for {action}")
+                raise ToolError(output=f"coordinate is required for {action}", base_type="error")
            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
-                raise ToolError(f"{coordinate} must be a tuple of length 2")
+                raise ToolError(output=f"{coordinate} must be a tuple of length 2", base_type="error")
            if not all(isinstance(i, int) for i in coordinate):
-                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
+                raise ToolError(output=f"{coordinate} must be a tuple of non-negative ints", base_type="error")
 
        if self.is_scaling:
            x, y = self.scale_coordinates(
@@ -237,21 +237,22 @@ class ComputerTool(BaseAnthropicTool):
 
            if action == "mouse_move":
                pyautogui.moveTo(x, y)
-                return ToolResult(output=f"Moved mouse to ({x}, {y})")
+                return ToolResult(output=f"Mouse move", base_type="move")
+
            elif action == "left_click_drag":
                current_x, current_y = pyautogui.position()
                pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
-                return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
+                return ToolResult(output=f"Mouse drag", base_type="move")
 
        # Action Type 2: Required text (keynames)
        # Actions: key, type, key_down, key_up
        if action in ("key", "type", "key_down", "key_up"):
            if text is None:
-                raise ToolError(f"text is required for {action}")
+                raise ToolError(output=f"text is required for {action}", base_type="error")
            if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
            if not isinstance(text, str):
-                raise ToolError(output=f"{text} must be a string")
+                raise ToolError(output=f"{text} must be a string", base_type="error")
 
            if action == "key":
                # Handle key combinations
@@ -264,19 +265,19 @@ class ComputerTool(BaseAnthropicTool):
                    key = self.key_conversion.get(key.strip(), key.strip())
                    key = key.lower()
                    pyautogui.keyUp(key) # Release each key in reverse order
-                return ToolResult(output=f"Pressed keys: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
 
            elif action == "key_down":
                pyautogui.keyDown(text)
-                return ToolResult(output=f"Pressed key: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
            elif action == "key_up":
                pyautogui.keyUp(text)
-                return ToolResult(output=f"Released key: {text}")
+                return ToolResult(output=f"Release key '{text}'", base_type="key")
 
            elif action == "type":
                pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
-                screenshot_base64 = (await self.screenshot()).base64_image
-                return ToolResult(output=text, base64_image=screenshot_base64)
+                # screenshot_base64 = (await self.screenshot()).base64_image
+                return ToolResult(output=f"Type '{text}'", base_type="type") # base64_image=screenshot_base64)
 
        # Action Type 3: No required text or coordinates
        # Actions: left_click, right_click, double_click, middle_click, left_press, scroll_down, scroll_up
@@ -291,76 +292,81 @@ class ComputerTool(BaseAnthropicTool):
            "wait",
        ):
            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
            if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
 
            if action == "left_click":
                x, y = pyautogui.position()
                show_click(x, y)
                pyautogui.click()
+                return ToolResult(output="Left click", base_type="click")
            elif action == "right_click":
                x, y = pyautogui.position()
                show_click(x, y)
                pyautogui.rightClick()
+                return ToolResult(output="Right click", base_type="click")
            elif action == "middle_click":
                x, y = pyautogui.position()
                show_click(x, y)
                pyautogui.middleClick()
+                return ToolResult(output="Middle click", base_type="click")
            elif action == "double_click":
                x, y = pyautogui.position()
                show_click(x, y)
                pyautogui.doubleClick()
+                return ToolResult(output="Double click", base_type="click")
            elif action == "left_press":
                x, y = pyautogui.position()
                show_click(x, y)
                pyautogui.mouseDown()
                time.sleep(1)
                pyautogui.mouseUp()
+                return ToolResult(output="Left press", base_type="click")
            elif action == "scroll_down":
                pyautogui.scroll(-200) # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled down")
+                return ToolResult(output="Scrolled down", base_type="scroll")
 
            elif action == "scroll_up":
                pyautogui.scroll(200) # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled up")
+                return ToolResult(output="Scrolled up", base_type="scroll")
 
            elif action == "wait":
                time.sleep(15)
-                return ToolResult(output="Waited")
+                return ToolResult(output="Wait for next event", base_type="wait")
 
-            return ToolResult(output=f"Performed {action}")
+            return ToolResult(output=f"Performed {action}", base_type="unknown")
 
        # Action Type 4: Miscs. No required text or coordinates
        # Actions: screenshot, cursor_position
        if action in ("screenshot", "cursor_position"):
            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
            if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                x, y = pyautogui.position()
-                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
-                return ToolResult(output=f"X={x},Y={y}")
+                # x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
+                return ToolResult(output=f"Cursor position ({x},{y})", base_type="unknown")
 
        # Action Type 5: StarRail Mode
        # Actions: sr_scroll_down, sr_scroll_up
        if action in ("sr_scroll_down", "sr_scroll_up"):
            if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
 
            if action == "sr_scroll_down":
                for _ in range(20):
                    pyautogui.scroll(-100) # Adjust scroll amount as needed
                    time.sleep(0.001)
-                return ToolResult(output="SR Scrolled down")
+                return ToolResult(output="Scroll down", base_type="scroll")
            elif action == "sr_scroll_up":
                for _ in range(20):
                    pyautogui.scroll(100) # Adjust scroll amount as needed
                    time.sleep(0.001)
-                return ToolResult(output="SR Scrolled up")
+                return ToolResult(output="Scroll up", base_type="scroll")
 
        # starrail browser mode
        if action in ("left_click_windll", "mouse_move_windll", "right_click_windll", "key_down_windll", "key_up_windll"):
@@ -374,10 +380,11 @@ class ComputerTool(BaseAnthropicTool):
            y = coordinate[1]+self.offset_y
            show_click(x, y)
            self.marbot_auto_gui.click(x=x, y=y)
+            return ToolResult(output=f"Left click", base_type="click")
 
        elif action == "mouse_move_windll":
            if coordinate is None:
-                raise ToolError(f"coordinate is required for {action}")
+                raise ToolError(output=f"coordinate is required for {action}", base_type="error")
 
            x0, y0 = pyautogui.position()
            # x0, y0 = self.scale_coordinates(ScalingSource.COMPUTER, x0, y0)
@@ -386,16 +393,21 @@ class ComputerTool(BaseAnthropicTool):
 
            show_move_to(x0, y0, x1, y1, duration_ms=1000)
            self.marbot_auto_gui.moveTo(x=x1, y=y1)
-
+
+            return ToolResult(output=f"Mouse move", base_type="move")
+
        # elif action == "right_click_windll":
        #     self.marbot_auto_gui.rightClick(x=coordinate[0], y=coordinate[1])
        elif action == "key_down_windll":
            self.marbot_auto_gui.keyDown(text)
+            return ToolResult(output=f"Key down '{text}'", base_type="key")
        elif action == "key_up_windll":
            self.marbot_auto_gui.keyUp(text)
-        return ToolResult(output=f"Performed dll action:{action}")
+            return ToolResult(output=f"Key up '{text}'", base_type="key")
+
+        return ToolResult(output=f"Performed dll action:{action}", base_type="unknown")
 
-        raise ToolError(f"Invalid action: {action}")
+        raise ToolError(output=f"Invalid action: {action}", base_type="error")
 
 
    async def screenshot(self):
@@ -486,9 +498,9 @@ class ComputerTool(BaseAnthropicTool):
 
        if path.exists():
            # Return a ToolResult instance instead of a dictionary
-            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
+            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode(), base_type="screenshot")
 
-        raise ToolError(f"Failed to take screenshot: {path} does not exist.")
+        raise ToolError(output=f"Failed to take screenshot: {path} does not exist.", base_type="error")
 
    def padding_image(self, screenshot):
        """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
@@ -500,17 +512,17 @@ class ComputerTool(BaseAnthropicTool):
        padding_image.paste(screenshot, (0, 0))
        return padding_image
 
-    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
-        """Run a shell command and return the output, error, and optionally a screenshot."""
-        _, stdout, stderr = await run(command)
-        base64_image = None
+    # async def shell(self, command: str, take_screenshot=True) -> ToolResult:
+    #     """Run a shell command and return the output, error, and optionally a screenshot."""
+    #     _, stdout, stderr = await run(command)
+    #     base64_image = None
 
-        if take_screenshot:
-            # delay to let things settle before taking a screenshot
-            await asyncio.sleep(self._screenshot_delay)
-            base64_image = (await self.screenshot()).base64_image
+    #     if take_screenshot:
+    #         # delay to let things settle before taking a screenshot
+    #         await asyncio.sleep(self._screenshot_delay)
+    #         base64_image = (await self.screenshot()).base64_image
 
-        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
+    #     return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
 
    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution."""
@@ -538,7 +550,7 @@ class ComputerTool(BaseAnthropicTool):
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
-                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
+                raise ToolError(output=f"Coordinates {x}, {y} are out of bounds", base_type="error")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
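
For reference, the scaling logic around this hunk maps between the model's coordinate space and the physical screen: API coordinates scale up by dividing by the per-axis factor, and screen coordinates scale down by multiplying. A toy calculation, assuming a 1920×1080 screen and a 1366×768 API target (both dimensions are illustrative):

```python
# Illustrative numbers only; the real values come from the tool's
# configured screen size and target_dimension.
width, height = 1920, 1080
target = {"width": 1366, "height": 768}

x_scaling_factor = target["width"] / width    # ~0.7115
y_scaling_factor = target["height"] / height  # ~0.7111

# API -> screen ("scale up"), as in the branch shown above
api_x, api_y = 683, 384
print(round(api_x / x_scaling_factor), round(api_y / y_scaling_factor))  # ~ 960 540
```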