computer-use-ootb-internal 0.0.107__py3-none-any.whl → 0.0.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. computer_use_ootb_internal/app_teachmode.py +8 -14
  2. computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
  3. computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
  4. computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
  5. computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
  6. computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
  7. computer_use_ootb_internal/run_teachmode_ootb_args.py +26 -11
  8. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
  9. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
  10. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
  11. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
  12. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
  13. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
  14. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
  15. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
  16. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
@@ -170,7 +170,7 @@ async def update_parameters(request: Request):
 
     # Update shared state when parameters change
     shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
-    shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
+    shared_state.task = getattr(shared_state.args, 'task', "Following the instructions to complete the task.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "hero_cases")
     shared_state.trace_id = getattr(shared_state.args, 'trace_id', "build_scroll_combat")
@@ -227,7 +227,7 @@ async def get_messages(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_messages", {})
+    # log_ootb_request(shared_state.server_url, "get_messages", {})
 
     # Return all messages in the queue and clear it
    messages = shared_state.message_queue.copy()
@@ -338,7 +338,7 @@ async def get_status(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_status", {})
+    # log_ootb_request(shared_state.server_url, "get_status", {})
 
     print(f"Status check - Processing: {shared_state.is_processing}, Paused: {shared_state.is_paused}")
     return JSONResponse(
@@ -393,14 +393,8 @@ def process_input():
                 print("Processing stopped while paused or resuming")
                 break
 
-            # Process the message
-            if loop_msg.startswith('<img'):
-                message = {"role": "user", "content": loop_msg}
-            else:
-                message = {"role": "assistant", "content": loop_msg}
-
-            shared_state.chatbot_messages.append(message)
-            shared_state.message_queue.append(message)
+            shared_state.chatbot_messages.append(loop_msg)
+            shared_state.message_queue.append(loop_msg)
 
             # Short sleep to allow stop signals to be processed
             for _ in range(5):  # Check 5 times per second
@@ -416,17 +410,17 @@ def process_input():
         # Handle any exceptions in the processing loop
         error_msg = f"Error during task processing: {str(e)}"
         print(error_msg)
-        error_message = {"role": "assistant", "content": error_msg}
+        error_message = {"role": "assistant", "content": error_msg, "type": "error"}
         shared_state.message_queue.append(error_message)
 
     finally:
         # Handle completion or interruption
         if shared_state.should_stop or shared_state.stop_event.is_set():
             stop_msg = f"Task '{shared_state.task}' was stopped. Ready for new tasks."
-            final_message = {"role": "assistant", "content": stop_msg}
+            final_message = {"role": "assistant", "content": stop_msg, "type": "text"}
         else:
             complete_msg = f"Task '{shared_state.task}' completed. Thanks for using Teachmode-OOTB."
-            final_message = {"role": "assistant", "content": complete_msg}
+            final_message = {"role": "assistant", "content": complete_msg, "type": "text"}
 
         shared_state.chatbot_messages.append(final_message)
         shared_state.message_queue.append(final_message)
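
A note on the two process_input hunks above: queue messages now carry an explicit "type" field, and later hunks add "action" and "image" variants, so the frontend can branch on message kind instead of sniffing content (the old startswith('<img') check). A minimal sketch of the resulting message shape; the TypedDict itself is illustrative and not part of the package:

from typing import TypedDict, NotRequired

class ChatMessage(TypedDict):
    role: str                      # "assistant" or "user"
    content: str                   # plain text, or an image path when type == "image"
    type: str                      # "text" | "error" | "action" | "image"
    action_type: NotRequired[str]  # present only on type == "action" messages

msg: ChatMessage = {"role": "assistant", "content": "Task stopped.", "type": "text"}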
@@ -42,7 +42,7 @@ async def update_parameters(request: Request):
     shared_state.task_updated = True
 
     # Update shared state when parameters change
-    shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
+    # shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
     shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "a_test")
@@ -64,7 +64,7 @@ def show_click(x: int, y: int, duration_ms: int = 800, existing_ms: int = 800):
     ).start()
 
 def show_move_to(x1: int, y1: int, x2: int, y2: int,
-                 duration_ms: int = 1000, existing_ms: int = 800):
+                 duration_ms: int = 800, existing_ms: int = 800):
     if not CLICK_GIF.exists():
         raise FileNotFoundError(f"GIF not found at {CLICK_GIF}")
     mp.get_context("spawn").Process(
@@ -3,13 +3,9 @@ import json
 import asyncio
 from typing import Any, Dict, cast, List, Union
 import uuid
-from anthropic.types.beta import (
-    BetaImageBlockParam,
-    BetaTextBlockParam,
-    BetaToolResultBlockParam,
-)
-from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
-from computer_use_ootb_internal.computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+from anthropic.types.beta import BetaToolUseBlock
+from computer_use_ootb_internal.computer_use_demo.tools import ComputerTool, ToolCollection
+from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError
 
 
 class TeachmodeExecutor:
@@ -48,12 +44,13 @@ class TeachmodeExecutor:
 
 
 
-    def __call__(self,
-                 response: str):
-        # response is expected to be :
+    def __call__(self, response: str):
+
+        # response is expected to be:
         # {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
 
-        action_dict = self._format_actor_output(response)  # str -> dict
+        # str -> dict
+        action_dict = self._format_actor_output(response)
 
         actions = action_dict["content"]
 
@@ -72,13 +69,9 @@ class TeachmodeExecutor:
 
         print("Parsed Action List:", action_list)
 
-        tool_result_content = None
-
         if action_list is not None and len(action_list) > 0:
 
-            for action in action_list:  # Execute the tool (adapting the code from anthropic_executor.py)
-
-                tool_result_content: list[BetaToolResultBlockParam] = []
+            for action in action_list:
 
                 # self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
                 print("Converted Action:", action)
@@ -86,23 +79,28 @@ class TeachmodeExecutor:
                 sim_content_block = BetaToolUseBlock(
                     id=f'toolu_{uuid.uuid4()}',
                     input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
-                    name='computer', type='tool_use')
+                    name='computer',
+                    type='tool_use'
+                )
 
                 # Run the asynchronous tool execution in a synchronous context
-                result = asyncio.run(self.tool_collection.run(
-                    name=sim_content_block.name,
-                    tool_input=cast(dict[str, Any], sim_content_block.input),
-                ))
+                tool_result = asyncio.run(
+                    self.tool_collection.run(
+                        name=sim_content_block.name,
+                        tool_input=cast(dict[str, Any], sim_content_block.input),
+                    ))
 
-                tool_result_content.append(
-                    _make_api_tool_result(result, sim_content_block.id)
-                )
-                print(f"[teachmode_executor] tool_result_content: {tool_result_content}")
-
-                yield tool_result_content[0]['content'][0]['text']
-
-        return tool_result_content[0]['content'][0]['text']
-
+                if isinstance(tool_result, ToolResult):
+                    print(f"[teachmode_executor] tool_result: {tool_result}")
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "action", "action_type": tool_result['base_type']}
+                    yield tool_result_message
+
+                elif isinstance(tool_result, ToolError):
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "error"}
+                    yield tool_result_message
+
+        return tool_result_message
+
 
     def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
         if type(action_output) == dict:
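
With the Anthropic tool-result plumbing gone, __call__ now yields plain message dicts rather than extracting text from BetaToolResultBlockParam structures. A minimal consumption sketch, assuming a default-constructible executor (constructor arguments are not shown in this diff):

executor = TeachmodeExecutor()  # assumed; real constructor arguments not shown here
response = {"role": "assistant",
            "content": "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}"}

for msg in executor(response):
    # e.g. {"role": "assistant", "content": "Left click", "type": "action", "action_type": "click"}
    print(msg["type"], msg["content"])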
@@ -172,8 +170,8 @@ class TeachmodeExecutor:
 
             elif action_item["action"] == "PRESS":  # 7. press
                 x, y = action_item["position"]
-                action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
-                                           int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
+                # action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
+                #                            int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
                 refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
                 refined_output.append({"action": "left_press", "text": None, "coordinate": None})
 
@@ -316,43 +314,43 @@ class TeachmodeExecutor:
 
 
 
-def _make_api_tool_result(
-    result: ToolResult, tool_use_id: str
-) -> BetaToolResultBlockParam:
-    """Convert an agent ToolResult to an API ToolResultBlockParam."""
-    tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
-    is_error = False
-    if result.error:
-        is_error = True
-        tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
-    else:
-        if result.output:
-            tool_result_content.append(
-                {
-                    "type": "text",
-                    "text": _maybe_prepend_system_tool_result(result, result.output),
-                }
-            )
-        if result.base64_image:
-            tool_result_content.append(
-                {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": result.base64_image,
-                    },
-                }
-            )
-    return {
-        "type": "tool_result",
-        "content": tool_result_content,
-        "tool_use_id": tool_use_id,
-        "is_error": is_error,
-    }
-
-
-def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
-    if result.system:
-        result_text = f"<system>{result.system}</system>\n{result_text}"
-    return result_text
+# def _make_api_tool_result(
+#     result: ToolResult, tool_use_id: str
+# ) -> BetaToolResultBlockParam:
+#     """Convert an agent ToolResult to an API ToolResultBlockParam."""
+#     tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
+#     is_error = False
+#     if result.error:
+#         is_error = True
+#         tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
+#     else:
+#         if result.output:
+#             tool_result_content.append(
+#                 {
+#                     "type": "text",
+#                     "text": _maybe_prepend_system_tool_result(result, result.output),
+#                 }
+#             )
+#         if result.base64_image:
+#             tool_result_content.append(
+#                 {
+#                     "type": "image",
+#                     "source": {
+#                         "type": "base64",
+#                         "media_type": "image/png",
+#                         "data": result.base64_image,
+#                     },
+#                 }
+#             )
+#     return {
+#         "type": "tool_result",
+#         "content": tool_result_content,
+#         "tool_use_id": tool_use_id,
+#         "is_error": is_error,
+#     }
+
+
+# def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
+#     if result.system:
+#         result_text = f"<system>{result.system}</system>\n{result_text}"
+#     return result_text
@@ -28,6 +28,7 @@ class ToolResult:
     error: str | None = None
     base64_image: str | None = None
     system: str | None = None
+    base_type: str | None = None
 
     def __bool__(self):
         return any(getattr(self, field.name) for field in fields(self))
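
ToolResult is a dataclass whose __bool__ scans every field, so the new base_type field now also makes a result truthy on its own. A quick sketch with illustrative values:

result = ToolResult(output="Left click", base_type="click")
assert bool(result)                       # any non-empty field makes the result truthy
print(result.output, result.base_type)    # Left click click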
@@ -65,5 +66,6 @@ class ToolFailure(ToolResult):
 class ToolError(Exception):
     """Raised when a tool encounters an error."""
 
-    def __init__(self, message):
-        self.message = message
+    def __init__(self, output: str, base_type: str):
+        self.output = output
+        self.base_type = base_type
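
The exception now mirrors ToolResult's vocabulary: output carries the human-readable message, and base_type is "error" at every call site in this release. A usage sketch (the handler is illustrative):

try:
    raise ToolError(output="coordinate is required for mouse_move", base_type="error")
except ToolError as e:
    print(e.output, e.base_type)   # coordinate is required for mouse_move error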
@@ -217,13 +217,13 @@ class ComputerTool(BaseAnthropicTool):
 
         if action in ("mouse_move", "left_click_drag"):
             if coordinate is None:
-                raise ToolError(f"coordinate is required for {action}")
+                raise ToolError(output=f"coordinate is required for {action}", base_type="error")
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
-                raise ToolError(f"{coordinate} must be a tuple of length 2")
+                raise ToolError(output=f"{coordinate} must be a tuple of length 2", base_type="error")
             if not all(isinstance(i, int) for i in coordinate):
-                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
+                raise ToolError(output=f"{coordinate} must be a tuple of non-negative ints", base_type="error")
 
             if self.is_scaling:
                 x, y = self.scale_coordinates(
@@ -237,21 +237,22 @@ class ComputerTool(BaseAnthropicTool):
 
             if action == "mouse_move":
                 pyautogui.moveTo(x, y)
-                return ToolResult(output=f"Moved mouse to ({x}, {y})")
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             elif action == "left_click_drag":
                 current_x, current_y = pyautogui.position()
                 pyautogui.dragTo(x, y, duration=0.5)  # Adjust duration as needed
-                return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
+                return ToolResult(output=f"Mouse drag", base_type="move")
 
         # Action Type 2: Required text (keynames)
         # Actions: key, type, key_down, key_up
         if action in ("key", "type", "key_down", "key_up"):
             if text is None:
-                raise ToolError(f"text is required for {action}")
+                raise ToolError(output=f"text is required for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if not isinstance(text, str):
-                raise ToolError(output=f"{text} must be a string")
+                raise ToolError(output=f"{text} must be a string", base_type="error")
 
             if action == "key":
                 # Handle key combinations
@@ -264,19 +265,19 @@ class ComputerTool(BaseAnthropicTool):
                     key = self.key_conversion.get(key.strip(), key.strip())
                     key = key.lower()
                     pyautogui.keyUp(key)  # Release each key in reverse order
-                return ToolResult(output=f"Pressed keys: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
 
             elif action == "key_down":
                 pyautogui.keyDown(text)
-                return ToolResult(output=f"Pressed key: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
             elif action == "key_up":
                 pyautogui.keyUp(text)
-                return ToolResult(output=f"Released key: {text}")
+                return ToolResult(output=f"Release key '{text}'", base_type="key")
 
             elif action == "type":
                 pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)  # Convert ms to seconds
-                screenshot_base64 = (await self.screenshot()).base64_image
-                return ToolResult(output=text, base64_image=screenshot_base64)
+                # screenshot_base64 = (await self.screenshot()).base64_image
+                return ToolResult(output=f"Type '{text}'", base_type="type")  # base64_image=screenshot_base64)
 
         # Action Type 3: No required text or coordinates
         # Actions: left_click, right_click, double_click, middle_click, left_press, scroll_down, scroll_up
@@ -291,76 +292,81 @@ class ComputerTool(BaseAnthropicTool):
             "wait",
         ):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
 
             if action == "left_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.click()
+                return ToolResult(output="Left click", base_type="click")
             elif action == "right_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.rightClick()
+                return ToolResult(output="Right click", base_type="click")
             elif action == "middle_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.middleClick()
+                return ToolResult(output="Middle click", base_type="click")
             elif action == "double_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.doubleClick()
+                return ToolResult(output="Double click", base_type="click")
             elif action == "left_press":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.mouseDown()
                 time.sleep(1)
                 pyautogui.mouseUp()
+                return ToolResult(output="Left press", base_type="click")
             elif action == "scroll_down":
                 pyautogui.scroll(-200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled down")
+                return ToolResult(output="Scrolled down", base_type="scroll")
 
             elif action == "scroll_up":
                 pyautogui.scroll(200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled up")
+                return ToolResult(output="Scrolled up", base_type="scroll")
 
             elif action == "wait":
                 time.sleep(15)
-                return ToolResult(output="Waited")
+                return ToolResult(output="Wait for next event", base_type="wait")
 
-            return ToolResult(output=f"Performed {action}")
+            return ToolResult(output=f"Performed {action}", base_type="unknown")
 
         # Action Type 4: Miscs. No required text or coordinates
         # Actions: screenshot, cursor_position
         if action in ("screenshot", "cursor_position"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if action == "screenshot":
                 return await self.screenshot()
             elif action == "cursor_position":
                 x, y = pyautogui.position()
-                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
-                return ToolResult(output=f"X={x},Y={y}")
+                # x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
+                return ToolResult(output=f"Cursor position ({x},{y})", base_type="unknown")
 
         # Action Type 5: StarRail Mode
         # Actions: sr_scroll_down, sr_scroll_up
         if action in ("sr_scroll_down", "sr_scroll_up"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
 
             if action == "sr_scroll_down":
                 for _ in range(20):
                     pyautogui.scroll(-100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="SR Scrolled down")
+                return ToolResult(output="Scroll down", base_type="scroll")
             elif action == "sr_scroll_up":
                 for _ in range(20):
                     pyautogui.scroll(100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="SR Scrolled up")
+                return ToolResult(output="Scroll up", base_type="scroll")
 
         # starrail browser mode
         if action in ("left_click_windll", "mouse_move_windll", "right_click_windll", "key_down_windll", "key_up_windll"):
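
Taken together, the computer.py hunks assign every action a base_type that downstream messages surface as action_type. Collected from the changes above; the mapping dict itself is illustrative, not part of the package:

ACTION_BASE_TYPES = {
    "mouse_move": "move", "left_click_drag": "move",
    "key": "key", "key_down": "key", "key_up": "key",
    "type": "type",
    "left_click": "click", "right_click": "click", "middle_click": "click",
    "double_click": "click", "left_press": "click",
    "scroll_down": "scroll", "scroll_up": "scroll",
    "sr_scroll_down": "scroll", "sr_scroll_up": "scroll",
    "wait": "wait",
    "screenshot": "screenshot",
    "cursor_position": "unknown",  # falls through to base_type="unknown"
}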
@@ -374,10 +380,11 @@ class ComputerTool(BaseAnthropicTool):
                 y = coordinate[1]+self.offset_y
                 show_click(x, y)
                 self.marbot_auto_gui.click(x=x, y=y)
+                return ToolResult(output=f"Left click", base_type="click")
 
             elif action == "mouse_move_windll":
                 if coordinate is None:
-                    raise ToolError(f"coordinate is required for {action}")
+                    raise ToolError(output=f"coordinate is required for {action}", base_type="error")
 
                 x0, y0 = pyautogui.position()
                 # x0, y0 = self.scale_coordinates(ScalingSource.COMPUTER, x0, y0)
@@ -386,16 +393,21 @@ class ComputerTool(BaseAnthropicTool):
 
                 show_move_to(x0, y0, x1, y1, duration_ms=1000)
                 self.marbot_auto_gui.moveTo(x=x1, y=y1)
-
+
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             # elif action == "right_click_windll":
             #     self.marbot_auto_gui.rightClick(x=coordinate[0], y=coordinate[1])
             elif action == "key_down_windll":
                 self.marbot_auto_gui.keyDown(text)
+                return ToolResult(output=f"Key down '{text}'", base_type="key")
             elif action == "key_up_windll":
                 self.marbot_auto_gui.keyUp(text)
-            return ToolResult(output=f"Performed dll action:{action}")
+                return ToolResult(output=f"Key up '{text}'", base_type="key")
+
+            return ToolResult(output=f"Performed dll action:{action}", base_type="unknown")
 
-        raise ToolError(f"Invalid action: {action}")
+        raise ToolError(output=f"Invalid action: {action}", base_type="error")
 
 
     async def screenshot(self):
@@ -486,9 +498,9 @@ class ComputerTool(BaseAnthropicTool):
 
         if path.exists():
             # Return a ToolResult instance instead of a dictionary
-            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
+            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode(), base_type="screenshot")
 
-        raise ToolError(f"Failed to take screenshot: {path} does not exist.")
+        raise ToolError(output=f"Failed to take screenshot: {path} does not exist.", base_type="error")
 
     def padding_image(self, screenshot):
         """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
@@ -500,17 +512,17 @@ class ComputerTool(BaseAnthropicTool):
         padding_image.paste(screenshot, (0, 0))
         return padding_image
 
-    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
-        """Run a shell command and return the output, error, and optionally a screenshot."""
-        _, stdout, stderr = await run(command)
-        base64_image = None
+    # async def shell(self, command: str, take_screenshot=True) -> ToolResult:
+    #     """Run a shell command and return the output, error, and optionally a screenshot."""
+    #     _, stdout, stderr = await run(command)
+    #     base64_image = None
 
-        if take_screenshot:
-            # delay to let things settle before taking a screenshot
-            await asyncio.sleep(self._screenshot_delay)
-            base64_image = (await self.screenshot()).base64_image
+    #     if take_screenshot:
+    #         # delay to let things settle before taking a screenshot
+    #         await asyncio.sleep(self._screenshot_delay)
+    #         base64_image = (await self.screenshot()).base64_image
 
-        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
+    #     return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
 
     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
         """Scale coordinates to a target maximum resolution."""
@@ -538,7 +550,7 @@ class ComputerTool(BaseAnthropicTool):
         y_scaling_factor = target_dimension["height"] / self.height
         if source == ScalingSource.API:
             if x > self.width or y > self.height:
-                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
+                raise ToolError(output=f"Coordinates {x}, {y} are out of bounds", base_type="error")
             # scale up
             return round(x / x_scaling_factor), round(y / y_scaling_factor)
         # scale down
@@ -2,15 +2,13 @@ import argparse
 import time
 import json
 import platform
-from typing import Callable
-from collections.abc import Callable
 import uuid
 import datetime
 from datetime import datetime, timedelta, timezone
 
 from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.gui_capture import get_screenshot
-from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.oai import encode_image
+from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import encode_image, is_image_path
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.icon_detection.icon_detection import get_screen_resize_factor
 from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
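
The rewritten import pulls is_image_path from llm_utils alongside encode_image; its implementation is not part of this diff. A hypothetical minimal stand-in with the behavior the new call site below needs:

def is_image_path(path: str) -> bool:
    # hypothetical: treat common raster-image extensions as image paths
    return isinstance(path, str) and path.lower().endswith(
        (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp"))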
@@ -41,7 +39,7 @@ def simple_teachmode_sampling_loop(
     if "star_rail" in user_id or "star_rail" in user_id:
         full_screen_game_mode = 1
 
-    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id:
+    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id or "offical" in user_id:
         full_screen_game_mode = 2
 
     print(f"Full Screen Game Mode: {full_screen_game_mode}")
@@ -53,7 +51,7 @@ def simple_teachmode_sampling_loop(
     timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
 
     step_count = 1
-    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:4]}"
+    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"
 
     print("[simple_teachmode_sampling_loop] starting task: ", task)
     print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")
@@ -68,6 +66,10 @@ def simple_teachmode_sampling_loop(
 
         uia_meta, sc_path = get_screenshot_external_cmd(selected_screen=selected_screen,
                                                         capture_uia_data=full_screen_game_mode==0)
+
+        if is_image_path(sc_path):
+            screenshot_message = {"role": "user", "content": sc_path, "type": "image"}
+            yield screenshot_message
 
         payload = {
             "task_id": unique_task_id,
@@ -102,27 +104,40 @@ def simple_teachmode_sampling_loop(
 
         try:
             step_plan = infer_server_response["generated_plan"]
-            step_info = infer_server_response["generated_plan"]["step_info"]
+            step_reasoning = step_plan["reasoning"]
+            step_info = step_plan["step_info"]
             step_action = infer_server_response["generated_action"]["content"]
             step_traj_idx = infer_server_response["current_traj_step"]
 
         except Exception as e:
            print("Error parsing generated_action content:", e)
            continue
+
+        plan_message = {"role": "assistant", "content": step_reasoning, "type": "text"}
+        yield plan_message
 
         if step_action.get("action") == "STOP":
             final_sc, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)
-            action_history = []  # reset action history
-            break
 
-        # action_history.append(f"Executing Step: {step_count} - Trajectory Step: {step_traj_idx} - Plan: {step_plan} - Action: {step_action};\n")
+            final_message = {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
+            yield final_message
+
+            final_sc_message = {"role": "user", "content": final_sc_path, "type": "image"}
+            yield final_sc_message
+
+            # reset action history
+            action_history = []
+            break
 
         action_history.append(f"Executing guidance trajectory step [{step_traj_idx}], Plan: {step_info}, Action: {step_action};\n")
 
-        for message in executor({"role": "assistant", "content": step_action}):
-            yield message
+        for exec_message in executor({"role": "assistant", "content": step_action}):
+            yield exec_message
 
         step_count += 1
+
+        # reset action history
+        action_history = []
 
 
 
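End to end, simple_teachmode_sampling_loop is now a generator of typed messages: screenshots, plan reasoning, executor actions, and a completion notice. A driving sketch, assuming keyword arguments matching the shared-state fields seen earlier (the exact signature is not shown in this diff):

for message in simple_teachmode_sampling_loop(
        model="teach-mode-gpt-4o",
        task="Following the instructions to complete the task.",
        selected_screen=0,
        user_id="hero_cases",
        trace_id="build_scroll_combat"):
    if message["type"] == "image":
        print("screenshot at:", message["content"])
    else:
        print(f"[{message['type']}]", message["content"])
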
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: computer-use-ootb-internal
-Version: 0.0.107
+Version: 0.0.109
 Summary: Computer Use OOTB
 Author-email: Siyuan Hu <siyuan.hu.sg@gmail.com>
 Requires-Python: >=3.11