computer-use-ootb-internal 0.0.107__py3-none-any.whl → 0.0.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. computer_use_ootb_internal/app_teachmode.py +8 -14
  2. computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
  3. computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
  4. computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
  5. computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
  6. computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
  7. computer_use_ootb_internal/run_teachmode_ootb_args.py +26 -11
  8. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
  9. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
  10. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
  11. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
  12. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
  13. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
  14. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
  15. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
  16. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
@@ -170,7 +170,7 @@ async def update_parameters(request: Request):
 
     # Update shared state when parameters change
     shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
-    shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
+    shared_state.task = getattr(shared_state.args, 'task', "Following the instructions to complete the task.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "hero_cases")
     shared_state.trace_id = getattr(shared_state.args, 'trace_id', "build_scroll_combat")
@@ -227,7 +227,7 @@ async def get_messages(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_messages", {})
+    # log_ootb_request(shared_state.server_url, "get_messages", {})
 
     # Return all messages in the queue and clear it
    messages = shared_state.message_queue.copy()
@@ -338,7 +338,7 @@ async def get_status(request: Request):
             status_code=429
         )
 
-    log_ootb_request(shared_state.server_url, "get_status", {})
+    # log_ootb_request(shared_state.server_url, "get_status", {})
 
     print(f"Status check - Processing: {shared_state.is_processing}, Paused: {shared_state.is_paused}")
     return JSONResponse(
@@ -393,14 +393,8 @@ def process_input():
                 print("Processing stopped while paused or resuming")
                 break
 
-            # Process the message
-            if loop_msg.startswith('<img'):
-                message = {"role": "user", "content": loop_msg}
-            else:
-                message = {"role": "assistant", "content": loop_msg}
-
-            shared_state.chatbot_messages.append(message)
-            shared_state.message_queue.append(message)
+            shared_state.chatbot_messages.append(loop_msg)
+            shared_state.message_queue.append(loop_msg)
 
             # Short sleep to allow stop signals to be processed
             for _ in range(5):  # Check 5 times per second
@@ -416,17 +410,17 @@ def process_input():
         # Handle any exceptions in the processing loop
         error_msg = f"Error during task processing: {str(e)}"
         print(error_msg)
-        error_message = {"role": "assistant", "content": error_msg}
+        error_message = {"role": "assistant", "content": error_msg, "type": "error"}
         shared_state.message_queue.append(error_message)
 
     finally:
         # Handle completion or interruption
         if shared_state.should_stop or shared_state.stop_event.is_set():
             stop_msg = f"Task '{shared_state.task}' was stopped. Ready for new tasks."
-            final_message = {"role": "assistant", "content": stop_msg}
+            final_message = {"role": "assistant", "content": stop_msg, "type": "text"}
         else:
             complete_msg = f"Task '{shared_state.task}' completed. Thanks for using Teachmode-OOTB."
-            final_message = {"role": "assistant", "content": complete_msg}
+            final_message = {"role": "assistant", "content": complete_msg, "type": "text"}
 
         shared_state.chatbot_messages.append(final_message)
         shared_state.message_queue.append(final_message)
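
A note on the two process_input hunks above: queue messages now carry an explicit "type" field, and later hunks add "action" and "image" variants, so the frontend can branch on message kind instead of sniffing content (the old startswith('<img') check). A minimal sketch of the resulting message shape; the TypedDict itself is illustrative and not part of the package:

from typing import TypedDict, NotRequired

class ChatMessage(TypedDict):
    role: str                      # "assistant" or "user"
    content: str                   # plain text, or an image path when type == "image"
    type: str                      # "text" | "error" | "action" | "image"
    action_type: NotRequired[str]  # present only on type == "action" messages

msg: ChatMessage = {"role": "assistant", "content": "Task stopped.", "type": "text"}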
@@ -42,7 +42,7 @@ async def update_parameters(request: Request):
     shared_state.task_updated = True
 
     # Update shared state when parameters change
-    shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
+    # shared_state.model = getattr(shared_state.args, 'model', "teach-mode-gpt-4o")
     shared_state.task = getattr(shared_state.args, 'task', "Create a claim on the SAP system, using Receipt.pdf as attachment.")
     shared_state.selected_screen = getattr(shared_state.args, 'selected_screen', 0)
     shared_state.user_id = getattr(shared_state.args, 'user_id', "a_test")
@@ -64,7 +64,7 @@ def show_click(x: int, y: int, duration_ms: int = 800, existing_ms: int = 800):
     ).start()
 
 def show_move_to(x1: int, y1: int, x2: int, y2: int,
-                 duration_ms: int = 1000, existing_ms: int = 800):
+                 duration_ms: int = 800, existing_ms: int = 800):
     if not CLICK_GIF.exists():
         raise FileNotFoundError(f"GIF not found at {CLICK_GIF}")
     mp.get_context("spawn").Process(
@@ -3,13 +3,9 @@ import json
 import asyncio
 from typing import Any, Dict, cast, List, Union
 import uuid
-from anthropic.types.beta import (
-    BetaImageBlockParam,
-    BetaTextBlockParam,
-    BetaToolResultBlockParam,
-)
-from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
-from computer_use_ootb_internal.computer_use_demo.tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+from anthropic.types.beta import BetaToolUseBlock
+from computer_use_ootb_internal.computer_use_demo.tools import ComputerTool, ToolCollection
+from computer_use_ootb_internal.computer_use_demo.tools.base import ToolResult, ToolError
 
 
 class TeachmodeExecutor:
@@ -48,12 +44,13 @@ class TeachmodeExecutor:
 
 
 
-    def __call__(self,
-                 response: str):
-        # response is expected to be :
+    def __call__(self, response: str):
+
+        # response is expected to be:
         # {'content': "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}, ...", 'role': 'assistant'},
 
-        action_dict = self._format_actor_output(response)  # str -> dict
+        # str -> dict
+        action_dict = self._format_actor_output(response)
 
         actions = action_dict["content"]
 
@@ -72,13 +69,9 @@ class TeachmodeExecutor:
 
         print("Parsed Action List:", action_list)
 
-        tool_result_content = None
-
         if action_list is not None and len(action_list) > 0:
 
-            for action in action_list:  # Execute the tool (adapting the code from anthropic_executor.py)
-
-                tool_result_content: list[BetaToolResultBlockParam] = []
+            for action in action_list:
 
                 # self.output_callback(f"{colorful_text_showui}:\n{action}", sender="bot")
                 print("Converted Action:", action)
@@ -86,23 +79,28 @@ class TeachmodeExecutor:
                 sim_content_block = BetaToolUseBlock(
                     id=f'toolu_{uuid.uuid4()}',
                     input={'action': action["action"], 'text': action["text"], 'coordinate': action["coordinate"]},
-                    name='computer', type='tool_use')
+                    name='computer',
+                    type='tool_use'
+                )
 
                 # Run the asynchronous tool execution in a synchronous context
-                result = asyncio.run(self.tool_collection.run(
-                    name=sim_content_block.name,
-                    tool_input=cast(dict[str, Any], sim_content_block.input),
-                ))
+                tool_result = asyncio.run(
+                    self.tool_collection.run(
+                        name=sim_content_block.name,
+                        tool_input=cast(dict[str, Any], sim_content_block.input),
+                    ))
 
-                tool_result_content.append(
-                    _make_api_tool_result(result, sim_content_block.id)
-                )
-                print(f"[teachmode_executor] tool_result_content: {tool_result_content}")
-
-                yield tool_result_content[0]['content'][0]['text']
-
-        return tool_result_content[0]['content'][0]['text']
-
+                if isinstance(tool_result, ToolResult):
+                    print(f"[teachmode_executor] tool_result: {tool_result}")
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "action", "action_type": tool_result['base_type']}
+                    yield tool_result_message
+
+                elif isinstance(tool_result, ToolError):
+                    tool_result_message = {"role": "assistant", "content": tool_result['output'], "type": "error"}
+                    yield tool_result_message
+
+        return tool_result_message
+
 
     def _format_actor_output(self, action_output: str|dict) -> Dict[str, Any]:
         if type(action_output) == dict:
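
With the Anthropic tool-result plumbing gone, __call__ now yields plain message dicts rather than extracting text from BetaToolResultBlockParam structures. A minimal consumption sketch, assuming a default-constructible executor (constructor arguments are not shown in this diff):

executor = TeachmodeExecutor()  # assumed; real constructor arguments not shown here
response = {"role": "assistant",
            "content": "{'action': 'CLICK', 'value': None, 'position': [0.83, 0.15]}"}

for msg in executor(response):
    # e.g. {"role": "assistant", "content": "Left click", "type": "action", "action_type": "click"}
    print(msg["type"], msg["content"])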
@@ -172,8 +170,8 @@ class TeachmodeExecutor:
 
             elif action_item["action"] == "PRESS":  # 7. press
                 x, y = action_item["position"]
-                action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
-                                           int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
+                # action_item["position"] = (int(x * (self.screen_bbox[2] - self.screen_bbox[0])),
+                #                            int(y * (self.screen_bbox[3] - self.screen_bbox[1])))
                 refined_output.append({"action": "mouse_move", "text": None, "coordinate": tuple(action_item["position"])})
                 refined_output.append({"action": "left_press", "text": None, "coordinate": None})
 
@@ -316,43 +314,43 @@ class TeachmodeExecutor:
 
 
 
-def _make_api_tool_result(
-    result: ToolResult, tool_use_id: str
-) -> BetaToolResultBlockParam:
-    """Convert an agent ToolResult to an API ToolResultBlockParam."""
-    tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
-    is_error = False
-    if result.error:
-        is_error = True
-        tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
-    else:
-        if result.output:
-            tool_result_content.append(
-                {
-                    "type": "text",
-                    "text": _maybe_prepend_system_tool_result(result, result.output),
-                }
-            )
-        if result.base64_image:
-            tool_result_content.append(
-                {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": result.base64_image,
-                    },
-                }
-            )
-    return {
-        "type": "tool_result",
-        "content": tool_result_content,
-        "tool_use_id": tool_use_id,
-        "is_error": is_error,
-    }
-
-
-def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
-    if result.system:
-        result_text = f"<system>{result.system}</system>\n{result_text}"
-    return result_text
+# def _make_api_tool_result(
+#     result: ToolResult, tool_use_id: str
+# ) -> BetaToolResultBlockParam:
+#     """Convert an agent ToolResult to an API ToolResultBlockParam."""
+#     tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
+#     is_error = False
+#     if result.error:
+#         is_error = True
+#         tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
+#     else:
+#         if result.output:
+#             tool_result_content.append(
+#                 {
+#                     "type": "text",
+#                     "text": _maybe_prepend_system_tool_result(result, result.output),
+#                 }
+#             )
+#         if result.base64_image:
+#             tool_result_content.append(
+#                 {
+#                     "type": "image",
+#                     "source": {
+#                         "type": "base64",
+#                         "media_type": "image/png",
+#                         "data": result.base64_image,
+#                     },
+#                 }
+#             )
+#     return {
+#         "type": "tool_result",
+#         "content": tool_result_content,
+#         "tool_use_id": tool_use_id,
+#         "is_error": is_error,
+#     }
+
+
+# def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
+#     if result.system:
+#         result_text = f"<system>{result.system}</system>\n{result_text}"
+#     return result_text
@@ -28,6 +28,7 @@ class ToolResult:
     error: str | None = None
     base64_image: str | None = None
     system: str | None = None
+    base_type: str | None = None
 
     def __bool__(self):
         return any(getattr(self, field.name) for field in fields(self))
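
ToolResult is a dataclass whose __bool__ scans every field, so the new base_type field now also makes a result truthy on its own. A quick sketch with illustrative values:

result = ToolResult(output="Left click", base_type="click")
assert bool(result)                       # any non-empty field makes the result truthy
print(result.output, result.base_type)    # Left click click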
@@ -65,5 +66,6 @@ class ToolFailure(ToolResult):
 class ToolError(Exception):
     """Raised when a tool encounters an error."""
 
-    def __init__(self, message):
-        self.message = message
+    def __init__(self, output: str, base_type: str):
+        self.output = output
+        self.base_type = base_type
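
The exception now mirrors ToolResult's vocabulary: output carries the human-readable message, and base_type is "error" at every call site in this release. A usage sketch (the handler is illustrative):

try:
    raise ToolError(output="coordinate is required for mouse_move", base_type="error")
except ToolError as e:
    print(e.output, e.base_type)   # coordinate is required for mouse_move error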
@@ -217,13 +217,13 @@ class ComputerTool(BaseAnthropicTool):
 
         if action in ("mouse_move", "left_click_drag"):
             if coordinate is None:
-                raise ToolError(f"coordinate is required for {action}")
+                raise ToolError(output=f"coordinate is required for {action}", base_type="error")
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
-                raise ToolError(f"{coordinate} must be a tuple of length 2")
+                raise ToolError(output=f"{coordinate} must be a tuple of length 2", base_type="error")
             if not all(isinstance(i, int) for i in coordinate):
-                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
+                raise ToolError(output=f"{coordinate} must be a tuple of non-negative ints", base_type="error")
 
             if self.is_scaling:
                 x, y = self.scale_coordinates(
@@ -237,21 +237,22 @@ class ComputerTool(BaseAnthropicTool):
 
             if action == "mouse_move":
                 pyautogui.moveTo(x, y)
-                return ToolResult(output=f"Moved mouse to ({x}, {y})")
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             elif action == "left_click_drag":
                 current_x, current_y = pyautogui.position()
                 pyautogui.dragTo(x, y, duration=0.5)  # Adjust duration as needed
-                return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
+                return ToolResult(output=f"Mouse drag", base_type="move")
 
         # Action Type 2: Required text (keynames)
         # Actions: key, type, key_down, key_up
         if action in ("key", "type", "key_down", "key_up"):
             if text is None:
-                raise ToolError(f"text is required for {action}")
+                raise ToolError(output=f"text is required for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if not isinstance(text, str):
-                raise ToolError(output=f"{text} must be a string")
+                raise ToolError(output=f"{text} must be a string", base_type="error")
 
             if action == "key":
                 # Handle key combinations
@@ -264,19 +265,19 @@ class ComputerTool(BaseAnthropicTool):
                     key = self.key_conversion.get(key.strip(), key.strip())
                     key = key.lower()
                     pyautogui.keyUp(key)  # Release each key in reverse order
-                return ToolResult(output=f"Pressed keys: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
 
             elif action == "key_down":
                 pyautogui.keyDown(text)
-                return ToolResult(output=f"Pressed key: {text}")
+                return ToolResult(output=f"Press key '{text}'", base_type="key")
             elif action == "key_up":
                 pyautogui.keyUp(text)
-                return ToolResult(output=f"Released key: {text}")
+                return ToolResult(output=f"Release key '{text}'", base_type="key")
 
             elif action == "type":
                 pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)  # Convert ms to seconds
-                screenshot_base64 = (await self.screenshot()).base64_image
-                return ToolResult(output=text, base64_image=screenshot_base64)
+                # screenshot_base64 = (await self.screenshot()).base64_image
+                return ToolResult(output=f"Type '{text}'", base_type="type")  # base64_image=screenshot_base64)
 
         # Action Type 3: No required text or coordinates
         # Actions: left_click, right_click, double_click, middle_click, left_press, scroll_down, scroll_up
@@ -291,76 +292,81 @@ class ComputerTool(BaseAnthropicTool):
             "wait",
         ):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
 
             if action == "left_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.click()
+                return ToolResult(output="Left click", base_type="click")
             elif action == "right_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.rightClick()
+                return ToolResult(output="Right click", base_type="click")
             elif action == "middle_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.middleClick()
+                return ToolResult(output="Middle click", base_type="click")
             elif action == "double_click":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.doubleClick()
+                return ToolResult(output="Double click", base_type="click")
             elif action == "left_press":
                 x, y = pyautogui.position()
                 show_click(x, y)
                 pyautogui.mouseDown()
                 time.sleep(1)
                 pyautogui.mouseUp()
+                return ToolResult(output="Left press", base_type="click")
             elif action == "scroll_down":
                 pyautogui.scroll(-200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled down")
+                return ToolResult(output="Scrolled down", base_type="scroll")
 
             elif action == "scroll_up":
                 pyautogui.scroll(200)  # Adjust scroll amount as needed
-                return ToolResult(output="Scrolled up")
+                return ToolResult(output="Scrolled up", base_type="scroll")
 
             elif action == "wait":
                 time.sleep(15)
-                return ToolResult(output="Waited")
+                return ToolResult(output="Wait for next event", base_type="wait")
 
-            return ToolResult(output=f"Performed {action}")
+            return ToolResult(output=f"Performed {action}", base_type="unknown")
 
         # Action Type 4: Miscs. No required text or coordinates
         # Actions: screenshot, cursor_position
         if action in ("screenshot", "cursor_position"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
             if coordinate is not None:
-                raise ToolError(f"coordinate is not accepted for {action}")
+                raise ToolError(output=f"coordinate is not accepted for {action}", base_type="error")
             if action == "screenshot":
                 return await self.screenshot()
             elif action == "cursor_position":
                 x, y = pyautogui.position()
-                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
-                return ToolResult(output=f"X={x},Y={y}")
+                # x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
+                return ToolResult(output=f"Cursor position ({x},{y})", base_type="unknown")
 
         # Action Type 5: StarRail Mode
         # Actions: sr_scroll_down, sr_scroll_up
         if action in ("sr_scroll_down", "sr_scroll_up"):
             if text is not None:
-                raise ToolError(f"text is not accepted for {action}")
+                raise ToolError(output=f"text is not accepted for {action}", base_type="error")
 
             if action == "sr_scroll_down":
                 for _ in range(20):
                     pyautogui.scroll(-100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="SR Scrolled down")
+                return ToolResult(output="Scroll down", base_type="scroll")
             elif action == "sr_scroll_up":
                 for _ in range(20):
                     pyautogui.scroll(100)  # Adjust scroll amount as needed
                     time.sleep(0.001)
-                return ToolResult(output="SR Scrolled up")
+                return ToolResult(output="Scroll up", base_type="scroll")
 
         # starrail browser mode
         if action in ("left_click_windll", "mouse_move_windll", "right_click_windll", "key_down_windll", "key_up_windll"):
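
Taken together, the computer.py hunks assign every action a base_type that downstream messages surface as action_type. Collected from the changes above; the mapping dict itself is illustrative, not part of the package:

ACTION_BASE_TYPES = {
    "mouse_move": "move", "left_click_drag": "move",
    "key": "key", "key_down": "key", "key_up": "key",
    "type": "type",
    "left_click": "click", "right_click": "click", "middle_click": "click",
    "double_click": "click", "left_press": "click",
    "scroll_down": "scroll", "scroll_up": "scroll",
    "sr_scroll_down": "scroll", "sr_scroll_up": "scroll",
    "wait": "wait",
    "screenshot": "screenshot",
    "cursor_position": "unknown",  # falls through to base_type="unknown"
}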
@@ -374,10 +380,11 @@ class ComputerTool(BaseAnthropicTool):
                 y = coordinate[1]+self.offset_y
                 show_click(x, y)
                 self.marbot_auto_gui.click(x=x, y=y)
+                return ToolResult(output=f"Left click", base_type="click")
 
             elif action == "mouse_move_windll":
                 if coordinate is None:
-                    raise ToolError(f"coordinate is required for {action}")
+                    raise ToolError(output=f"coordinate is required for {action}", base_type="error")
 
                 x0, y0 = pyautogui.position()
                 # x0, y0 = self.scale_coordinates(ScalingSource.COMPUTER, x0, y0)
@@ -386,16 +393,21 @@ class ComputerTool(BaseAnthropicTool):
 
                 show_move_to(x0, y0, x1, y1, duration_ms=1000)
                 self.marbot_auto_gui.moveTo(x=x1, y=y1)
-
+
+                return ToolResult(output=f"Mouse move", base_type="move")
+
             # elif action == "right_click_windll":
             #     self.marbot_auto_gui.rightClick(x=coordinate[0], y=coordinate[1])
             elif action == "key_down_windll":
                 self.marbot_auto_gui.keyDown(text)
+                return ToolResult(output=f"Key down '{text}'", base_type="key")
             elif action == "key_up_windll":
                 self.marbot_auto_gui.keyUp(text)
-            return ToolResult(output=f"Performed dll action:{action}")
+                return ToolResult(output=f"Key up '{text}'", base_type="key")
+
+            return ToolResult(output=f"Performed dll action:{action}", base_type="unknown")
 
-        raise ToolError(f"Invalid action: {action}")
+        raise ToolError(output=f"Invalid action: {action}", base_type="error")
 
 
     async def screenshot(self):
@@ -486,9 +498,9 @@ class ComputerTool(BaseAnthropicTool):
 
         if path.exists():
             # Return a ToolResult instance instead of a dictionary
-            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
+            return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode(), base_type="screenshot")
 
-        raise ToolError(f"Failed to take screenshot: {path} does not exist.")
+        raise ToolError(output=f"Failed to take screenshot: {path} does not exist.", base_type="error")
 
     def padding_image(self, screenshot):
         """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
@@ -500,17 +512,17 @@ class ComputerTool(BaseAnthropicTool):
         padding_image.paste(screenshot, (0, 0))
         return padding_image
 
-    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
-        """Run a shell command and return the output, error, and optionally a screenshot."""
-        _, stdout, stderr = await run(command)
-        base64_image = None
+    # async def shell(self, command: str, take_screenshot=True) -> ToolResult:
+    #     """Run a shell command and return the output, error, and optionally a screenshot."""
+    #     _, stdout, stderr = await run(command)
+    #     base64_image = None
 
-        if take_screenshot:
-            # delay to let things settle before taking a screenshot
-            await asyncio.sleep(self._screenshot_delay)
-            base64_image = (await self.screenshot()).base64_image
+    #     if take_screenshot:
+    #         # delay to let things settle before taking a screenshot
+    #         await asyncio.sleep(self._screenshot_delay)
+    #         base64_image = (await self.screenshot()).base64_image
 
-        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
+    #     return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
 
     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
         """Scale coordinates to a target maximum resolution."""
@@ -538,7 +550,7 @@ class ComputerTool(BaseAnthropicTool):
         y_scaling_factor = target_dimension["height"] / self.height
         if source == ScalingSource.API:
             if x > self.width or y > self.height:
-                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
+                raise ToolError(output=f"Coordinates {x}, {y} are out of bounds", base_type="error")
             # scale up
             return round(x / x_scaling_factor), round(y / y_scaling_factor)
         # scale down
@@ -2,15 +2,13 @@ import argparse
 import time
 import json
 import platform
-from typing import Callable
-from collections.abc import Callable
 import uuid
 import datetime
 from datetime import datetime, timedelta, timezone
 
 from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.gui_capture import get_screenshot
-from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.oai import encode_image
+from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import encode_image, is_image_path
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.icon_detection.icon_detection import get_screen_resize_factor
 from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
 from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
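
The rewritten import pulls is_image_path from llm_utils alongside encode_image; its implementation is not part of this diff. A hypothetical minimal stand-in with the behavior the new call site below needs:

def is_image_path(path: str) -> bool:
    # hypothetical: treat common raster-image extensions as image paths
    return isinstance(path, str) and path.lower().endswith(
        (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp"))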
@@ -41,7 +39,7 @@ def simple_teachmode_sampling_loop(
     if "star_rail" in user_id or "star_rail" in user_id:
         full_screen_game_mode = 1
 
-    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id:
+    if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id or "offical" in user_id:
         full_screen_game_mode = 2
 
     print(f"Full Screen Game Mode: {full_screen_game_mode}")
@@ -53,7 +51,7 @@ def simple_teachmode_sampling_loop(
     timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
 
     step_count = 1
-    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:4]}"
+    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"
 
     print("[simple_teachmode_sampling_loop] starting task: ", task)
     print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")
@@ -68,6 +66,10 @@ def simple_teachmode_sampling_loop(
 
         uia_meta, sc_path = get_screenshot_external_cmd(selected_screen=selected_screen,
                                                         capture_uia_data=full_screen_game_mode==0)
+
+        if is_image_path(sc_path):
+            screenshot_message = {"role": "user", "content": sc_path, "type": "image"}
+            yield screenshot_message
 
         payload = {
             "task_id": unique_task_id,
@@ -102,27 +104,40 @@ def simple_teachmode_sampling_loop(
 
         try:
             step_plan = infer_server_response["generated_plan"]
-            step_info = infer_server_response["generated_plan"]["step_info"]
+            step_reasoning = step_plan["reasoning"]
+            step_info = step_plan["step_info"]
             step_action = infer_server_response["generated_action"]["content"]
             step_traj_idx = infer_server_response["current_traj_step"]
 
         except Exception as e:
            print("Error parsing generated_action content:", e)
            continue
+
+        plan_message = {"role": "assistant", "content": step_reasoning, "type": "text"}
+        yield plan_message
 
         if step_action.get("action") == "STOP":
             final_sc, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)
-            action_history = []  # reset action history
-            break
 
-        # action_history.append(f"Executing Step: {step_count} - Trajectory Step: {step_traj_idx} - Plan: {step_plan} - Action: {step_action};\n")
+            final_message = {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
+            yield final_message
+
+            final_sc_message = {"role": "user", "content": final_sc_path, "type": "image"}
+            yield final_sc_message
+
+            # reset action history
+            action_history = []
+            break
 
         action_history.append(f"Executing guidance trajectory step [{step_traj_idx}], Plan: {step_info}, Action: {step_action};\n")
 
-        for message in executor({"role": "assistant", "content": step_action}):
-            yield message
+        for exec_message in executor({"role": "assistant", "content": step_action}):
+            yield exec_message
 
         step_count += 1
+
+        # reset action history
+        action_history = []
 
 
 
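End to end, simple_teachmode_sampling_loop is now a generator of typed messages: screenshots, plan reasoning, executor actions, and a completion notice. A driving sketch, assuming keyword arguments matching the shared-state fields seen earlier (the exact signature is not shown in this diff):

for message in simple_teachmode_sampling_loop(
        model="teach-mode-gpt-4o",
        task="Following the instructions to complete the task.",
        selected_screen=0,
        user_id="hero_cases",
        trace_id="build_scroll_combat"):
    if message["type"] == "image":
        print("screenshot at:", message["content"])
    else:
        print(f"[{message['type']}]", message["content"])
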
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: computer-use-ootb-internal
-Version: 0.0.107
+Version: 0.0.109
 Summary: Computer Use OOTB
 Author-email: Siyuan Hu <siyuan.hu.sg@gmail.com>
 Requires-Python: >=3.11