khoj-1.41.1.dev97-py3-none-any.whl → khoj-1.41.1.dev142-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +3 -0
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
  6. khoj/interface/compiled/_next/static/chunks/5477-b91e9926cfc3095c.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-774c78ff0f55a228.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-4454891c5007b870.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/app/chat/{page-7e780dc11eb5e5d3.js → page-5a2559825b4d5def.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-f7a0286dfc31ad6b.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
  15. khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-f1a7f278c89e09b6.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-5d9134d4a97f8834.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-32cd0ceb9ffbd777.js} +1 -1
  19. khoj/interface/compiled/_next/static/chunks/{webpack-21f76f7f59582bc7.js → webpack-952bc0d41769db77.js} +1 -1
  20. khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
  21. khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
  22. khoj/interface/compiled/agents/index.html +2 -2
  23. khoj/interface/compiled/agents/index.txt +2 -2
  24. khoj/interface/compiled/automations/index.html +2 -2
  25. khoj/interface/compiled/automations/index.txt +3 -3
  26. khoj/interface/compiled/chat/index.html +2 -2
  27. khoj/interface/compiled/chat/index.txt +2 -2
  28. khoj/interface/compiled/index.html +2 -2
  29. khoj/interface/compiled/index.txt +2 -2
  30. khoj/interface/compiled/search/index.html +2 -2
  31. khoj/interface/compiled/search/index.txt +2 -2
  32. khoj/interface/compiled/settings/index.html +2 -2
  33. khoj/interface/compiled/settings/index.txt +4 -4
  34. khoj/interface/compiled/share/chat/index.html +2 -2
  35. khoj/interface/compiled/share/chat/index.txt +2 -2
  36. khoj/processor/conversation/anthropic/anthropic_chat.py +9 -10
  37. khoj/processor/conversation/anthropic/utils.py +30 -7
  38. khoj/processor/conversation/google/gemini_chat.py +10 -10
  39. khoj/processor/conversation/google/utils.py +20 -12
  40. khoj/processor/conversation/offline/chat_model.py +2 -7
  41. khoj/processor/conversation/openai/gpt.py +9 -10
  42. khoj/processor/conversation/utils.py +177 -53
  43. khoj/processor/operator/README.md +59 -0
  44. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  45. khoj/processor/operator/grounding_agent.py +229 -175
  46. khoj/processor/operator/grounding_agent_uitars.py +61 -50
  47. khoj/processor/operator/operator_actions.py +48 -0
  48. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  49. khoj/processor/operator/operator_agent_base.py +45 -14
  50. khoj/processor/operator/operator_agent_binary.py +125 -57
  51. khoj/processor/operator/operator_agent_openai.py +183 -75
  52. khoj/processor/operator/operator_environment_base.py +11 -1
  53. khoj/processor/operator/operator_environment_browser.py +5 -3
  54. khoj/processor/operator/operator_environment_computer.py +658 -0
  55. khoj/routers/api_chat.py +125 -43
  56. khoj/routers/api_model.py +3 -3
  57. khoj/routers/helpers.py +13 -18
  58. khoj/routers/research.py +57 -23
  59. khoj/utils/constants.py +4 -4
  60. khoj/utils/helpers.py +12 -15
  61. khoj/utils/rawconfig.py +1 -0
  62. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
  63. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +74 -72
  64. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  65. khoj/interface/compiled/_next/static/chunks/5477-77ce5c6f468d6c25.js +0 -1
  66. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
  67. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
  68. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  69. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  70. khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
  71. /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
  72. /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
  73. /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
  74. /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
  75. /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
  76. /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
  77. /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
  78. /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
  79. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
  80. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
  81. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
khoj/processor/operator/grounding_agent.py
@@ -1,5 +1,6 @@
 import json
 import logging
+from textwrap import dedent

 from openai import AzureOpenAI, OpenAI
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
@@ -8,7 +9,7 @@ from khoj.database.models import ChatModel
 from khoj.processor.conversation.utils import construct_structured_message
 from khoj.processor.operator.operator_actions import *
 from khoj.processor.operator.operator_agent_base import AgentActResult
-from khoj.processor.operator.operator_environment_base import EnvState
+from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
 from khoj.utils.helpers import get_chat_usage_metrics

 logger = logging.getLogger(__name__)
@@ -18,6 +19,7 @@ class GroundingAgent:
     def __init__(
         self,
         model: ChatModel,
+        environment_type: EnvironmentType,
         client: OpenAI | AzureOpenAI,
         max_iterations: int,
         tracer: dict = None,
@@ -26,9 +28,211 @@ class GroundingAgent:
         self.client = client
         self.max_iterations = max_iterations
         self.tracer = tracer
+        self.environment_type = environment_type
+        self.action_tools = self.get_tools(self.environment_type)

-        # Define tools for the grounding LLM (OpenAI format)
-        self.action_tools = [
+    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
+        """Call the grounding LLM to get the next action based on the current state and instruction."""
+        # Format the message for the API call
+        messages_for_api = self._format_message_for_api(instruction, current_state)
+        try:
+            grounding_response: ChatCompletion = await self.client.chat.completions.create(
+                messages=messages_for_api,
+                model=self.model.name,
+                tools=self.action_tools,
+                tool_choice="required",
+                temperature=0.0,  # Grounding should be precise
+                max_completion_tokens=1000,  # Allow for thoughts + actions
+            )
+            if not isinstance(grounding_response, ChatCompletion):
+                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
+            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
+
+            # Parse tool calls
+            grounding_message = grounding_response.choices[0].message
+            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
+
+            # Update usage by grounding model
+            self.tracer["usage"] = get_chat_usage_metrics(
+                self.model.name,
+                input_tokens=grounding_response.usage.prompt_tokens,
+                output_tokens=grounding_response.usage.completion_tokens,
+                usage=self.tracer.get("usage"),
+            )
+        except Exception as e:
+            logger.error(f"Error calling Grounding LLM: {e}")
+            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
+            actions = []
+
+        return rendered_response, actions
+
+    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
+        """Format the message for the API call."""
+        # Construct grounding LLM input (using only the latest user prompt + image)
+        # We don't pass the full history here, as grounding depends on the *current* state + NL action
+        grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
+        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
+        grounding_messages_content = construct_structured_message(
+            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+        )
+        return [{"role": "user", "content": grounding_messages_content}]
+
+    def _parse_action(
+        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
+    ) -> tuple[str, list[OperatorAction]]:
+        """Parse the tool calls from the grounding LLM response and convert them to action objects."""
+        actions: List[OperatorAction] = []
+        action_results: List[dict] = []
+
+        if grounding_message.tool_calls:
+            rendered_parts = []
+            for tool_call in grounding_message.tool_calls:
+                function_name = tool_call.function.name
+                try:
+                    arguments = json.loads(tool_call.function.arguments)
+                    action_to_run: Optional[OperatorAction] = None
+                    action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
+
+                    if function_name == "click":
+                        action_to_run = ClickAction(**arguments)
+                    elif function_name == "left_double":
+                        action_to_run = DoubleClickAction(**arguments)
+                    elif function_name == "right_single":
+                        action_to_run = ClickAction(button="right", **arguments)
+                    elif function_name == "type":
+                        content = arguments.get("content")
+                        action_to_run = TypeAction(text=content)
+                    elif function_name == "scroll":
+                        direction = arguments.get("direction", "down")
+                        amount = 3
+                        action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
+                    elif function_name == "hotkey":
+                        action_to_run = KeypressAction(**arguments)
+                    elif function_name == "goto":
+                        action_to_run = GotoAction(**arguments)
+                    elif function_name == "back":
+                        action_to_run = BackAction(**arguments)
+                    elif function_name == "wait":
+                        action_to_run = WaitAction(**arguments)
+                    elif function_name == "screenshot":
+                        action_to_run = ScreenshotAction(**arguments)
+                    elif function_name == "drag":
+                        # Need to convert list of dicts to list of Point objects
+                        path_dicts = arguments.get("path", [])
+                        path_points = [Point(**p) for p in path_dicts]
+                        if path_points:
+                            action_to_run = DragAction(path=path_points)
+                        else:
+                            logger.warning(f"Drag action called with empty path: {arguments}")
+                            action_render_str += " [Skipped - empty path]"
+                    elif function_name == "finished":
+                        action_to_run = None
+                    else:
+                        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
+                        action_render_str += " [Unhandled]"
+
+                    if action_to_run:
+                        actions.append(action_to_run)
+                        action_results.append(
+                            {
+                                "type": "tool_result",
+                                "tool_call_id": tool_call.id,
+                                "content": None,  # Updated after environment step
+                            }
+                        )
+                    rendered_parts.append(action_render_str)
+                except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
+                    logger.error(
+                        f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
+                    )
+                    rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
+            rendered_response = "\n- ".join(rendered_parts)
+        else:
+            # Grounding LLM responded but didn't call a tool
+            logger.warning("Grounding LLM did not produce a tool call.")
+            rendered_response = f"{grounding_message.content or 'No action required.'}"
+
+        # Render the response
+        return rendered_response, actions
+
+    def get_instruction(self, instruction: str, environment_type: EnvironmentType) -> str:
+        """
+        Get the instruction for the agent based on the environment type.
+        """
+        UITARS_COMPUTER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+        """
+        UITARS_BROWSER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
+        You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
+        Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
+        """
+
+        UITARS_USR_COMPUTER_PROMPT_THOUGHT = f"""
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+        hotkey(key='')
+        type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+        scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+        wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+
+        ## Note
+        - Use English in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+        UITARS_USR_BROWSER_PROMPT_THOUGHT = f"""
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+        hotkey(key='')
+        type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+        scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+        wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+        goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
+        back() # Use this to go back to the previous page.
+
+        ## Note
+        - Use English in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_BROWSER_PROMPT_THOUGHT).lstrip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_COMPUTER_PROMPT_THOUGHT).lstrip()
+        else:
+            raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
+
+    def get_tools(self, environment_type: EnvironmentType) -> list[dict]:
+        """Get tools for the grounding LLM, in OpenAI API tool format"""
+        tools = [
             {
                 "type": "function",
                 "function": {
@@ -163,182 +367,32 @@ class GroundingAgent:
                     },
                 },
             },
-            {
-                "type": "function",
-                "function": {
-                    "name": "goto",
-                    "description": "Navigate to a specific URL.",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
-                        "required": ["url"],
-                    },
-                },
-            },
-            {
-                "type": "function",
-                "function": {
-                    "name": "back",
-                    "description": "navigate back to the previous page.",
-                    "parameters": {"type": "object", "properties": {}},
-                },
-            },
-        ]
-
-    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
-        """Call the grounding LLM to get the next action based on the current state and instruction."""
-        # Format the message for the API call
-        messages_for_api = self._format_message_for_api(instruction, current_state)
-        try:
-            grounding_response: ChatCompletion = await self.client.chat.completions.create(
-                messages=messages_for_api,
-                model=self.model.name,
-                tools=self.action_tools,
-                tool_choice="required",
-                temperature=0.0,  # Grounding should be precise
-                max_completion_tokens=1000,  # Allow for thoughts + actions
-            )
-            if not isinstance(grounding_response, ChatCompletion):
-                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
-            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
-
-            # Parse tool calls
-            grounding_message = grounding_response.choices[0].message
-            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
-
-            # Update usage by grounding model
-            self.tracer["usage"] = get_chat_usage_metrics(
-                self.model.name,
-                input_tokens=grounding_response.usage.prompt_tokens,
-                output_tokens=grounding_response.usage.completion_tokens,
-                usage=self.tracer.get("usage"),
-            )
-        except Exception as e:
-            logger.error(f"Error calling Grounding LLM: {e}")
-            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
-            actions = []
-
-        return rendered_response, actions
-
-    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
-        """Format the message for the API call."""
-        grounding_user_prompt = f"""
-You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
-You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
-Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
-
-## Output Format
-```
-Thought: ...
-Action: ...
-```
-
-## Action Space
-
-click(start_box='<|box_start|>(x1,y1)<|box_end|>')
-left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
-right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
-drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
-hotkey(key='')
-type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
-scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
-wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
-goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
-back() # Use this to go back to the previous page.
-
-## Note
-- Use English in `Thought` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
-
-## User Instruction
-{instruction}
-""".lstrip()
-
-        # Construct grounding LLM input (using only the latest user prompt + image)
-        # We don't pass the full history here, as grounding depends on the *current* state + NL action
-        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
-        grounding_messages_content = construct_structured_message(
-            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
-        )
-        return [{"role": "user", "content": grounding_messages_content}]
-
-    def _parse_action(
-        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
-    ) -> tuple[str, list[OperatorAction]]:
-        """Parse the tool calls from the grounding LLM response and convert them to action objects."""
-        actions: List[OperatorAction] = []
-        action_results: List[dict] = []
-
-        if grounding_message.tool_calls:
-            rendered_parts = []
-            for tool_call in grounding_message.tool_calls:
-                function_name = tool_call.function.name
-                try:
-                    arguments = json.loads(tool_call.function.arguments)
-                    action_to_run: Optional[OperatorAction] = None
-                    action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
-
-                    if function_name == "click":
-                        action_to_run = ClickAction(**arguments)
-                    elif function_name == "left_double":
-                        action_to_run = DoubleClickAction(**arguments)
-                    elif function_name == "right_single":
-                        action_to_run = ClickAction(button="right", **arguments)
-                    elif function_name == "type":
-                        content = arguments.get("content")
-                        action_to_run = TypeAction(text=content)
-                    elif function_name == "scroll":
-                        direction = arguments.get("direction", "down")
-                        amount = 3
-                        action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
-                    elif function_name == "hotkey":
-                        action_to_run = KeypressAction(**arguments)
-                    elif function_name == "goto":
-                        action_to_run = GotoAction(**arguments)
-                    elif function_name == "back":
-                        action_to_run = BackAction(**arguments)
-                    elif function_name == "wait":
-                        action_to_run = WaitAction(**arguments)
-                    elif function_name == "screenshot":
-                        action_to_run = ScreenshotAction(**arguments)
-                    elif function_name == "drag":
-                        # Need to convert list of dicts to list of Point objects
-                        path_dicts = arguments.get("path", [])
-                        path_points = [Point(**p) for p in path_dicts]
-                        if path_points:
-                            action_to_run = DragAction(path=path_points)
-                        else:
-                            logger.warning(f"Drag action called with empty path: {arguments}")
-                            action_render_str += " [Skipped - empty path]"
-                    elif function_name == "finished":
-                        action_to_run = None
-                    else:
-                        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
-                        action_render_str += " [Unhandled]"
-
-                    if action_to_run:
-                        actions.append(action_to_run)
-                        action_results.append(
-                            {
-                                "type": "tool_result",
-                                "tool_call_id": tool_call.id,
-                                "content": None,  # Updated after environment step
-                            }
-                        )
-                    rendered_parts.append(action_render_str)
-                except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
-                    logger.error(
-                        f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
-                    )
-                    rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
-            rendered_response = "\n- ".join(rendered_parts)
-        else:
-            # Grounding LLM responded but didn't call a tool
-            logger.warning("Grounding LLM did not produce a tool call.")
-            rendered_response = f"{grounding_message.content or 'No action required.'}"
-
-        # Render the response
-        return rendered_response, actions
+        ]
+        if environment_type == EnvironmentType.BROWSER:
+            tools += [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "goto",
+                        "description": "Navigate to a specific URL.",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
+                            "required": ["url"],
+                        },
+                    },
+                },
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "back",
+                        "description": "navigate back to the previous page.",
+                        "parameters": {"type": "object", "properties": {}},
+                    },
+                },
+            ]
+
+        return tools

     def reset(self):
         """Reset the agent state."""
khoj/processor/operator/grounding_agent_uitars.py
@@ -10,15 +10,16 @@ import logging
 import math
 import re
 from io import BytesIO
+from textwrap import dedent
 from typing import Any, List

 import numpy as np
-from openai import AzureOpenAI, OpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 from PIL import Image

 from khoj.processor.operator.operator_actions import *
-from khoj.processor.operator.operator_environment_base import EnvState
+from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
 from khoj.utils.helpers import get_chat_usage_metrics

 logger = logging.getLogger(__name__)
@@ -35,29 +36,8 @@ class GroundingAgentUitars:
     MAX_PIXELS = 16384 * 28 * 28
     MAX_RATIO = 200

-    UITARS_USR_PROMPT_THOUGHT = """
-    You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
-    You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
-    Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
-
-    ## Output Format
-    ```
-    Thought: ...
-    Action: ...
-    ```
-
-    ## Action Space
-    {action_space}
-
-    ## Note
-    - Use {language} in `Thought` part.
-    - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
-
-    ## User Instruction
-    {instruction}
-    """
-
-    UITARS_NORMAL_ACTION_SPACE = """
+    UITARS_NORMAL_ACTION_SPACE = dedent(
+        """
         click(start_box='<|box_start|>(x1,y1)<|box_end|>')
         left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
         right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
@@ -67,14 +47,15 @@ class GroundingAgentUitars:
         scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
         wait() #Sleep for 5s and take a screenshot to check for any changes.
         finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
-    """.lstrip()
+        """
+    ).lstrip()

     def __init__(
         self,
         model_name: str,
-        client: OpenAI | AzureOpenAI,
+        environment_type: EnvironmentType,
+        client: AsyncOpenAI | AsyncAzureOpenAI,
         max_iterations=50,
-        environment_type: Literal["computer", "web"] = "computer",
         runtime_conf: dict = {
             "infer_mode": "qwen25vl_normal",
             "prompt_style": "qwen25vl_normal",
@@ -94,7 +75,7 @@ class GroundingAgentUitars:
         self.model_name = model_name
         self.client = client
         self.tracer = tracer
-        self.environment_type = environment_type
+        self.environment = environment_type

         self.max_iterations = max_iterations
         self.runtime_conf = runtime_conf
@@ -116,7 +97,7 @@ class GroundingAgentUitars:
         self.history_images: list[bytes] = []
         self.history_responses: list[str] = []

-        self.prompt_template = self.UITARS_USR_PROMPT_THOUGHT
+        self.prompt_template = self.get_instruction(self.environment)
         self.prompt_action_space = self.UITARS_NORMAL_ACTION_SPACE

         if "history_n" in self.runtime_conf:
@@ -126,11 +107,11 @@ class GroundingAgentUitars:

         self.cur_callusr_count = 0

-    async def act(self, instruction: str, env_state: EnvState) -> tuple[str, list[OperatorAction]]:
+    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
         """
         Suggest the next action(s) based on the instruction and current environment.
         """
-        messages = self._format_messages_for_api(instruction, env_state)
+        messages = self._format_messages_for_api(instruction, current_state)

         recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
         origin_resized_height = recent_screenshot.height
@@ -145,9 +126,11 @@ class GroundingAgentUitars:
         try_times = 3
         while not parsed_responses:
             if try_times <= 0:
-                print(f"Reach max retry times to fetch response from client, as error flag.")
+                logger.warning(f"Reach max retry times to fetch response from client, as error flag.")
                 return "client error\nFAIL", []
             try:
+                message_content = "\n".join([msg["content"][0].get("text") or "[image]" for msg in messages])
+                logger.debug(f"User message content: {message_content}")
                 response: ChatCompletion = await self.client.chat.completions.create(
                     model="ui-tars",
                     messages=messages,
@@ -228,20 +211,9 @@ class GroundingAgentUitars:
             self.actions.append(actions)
             return f"{prediction}\nFAIL", []

-        if self.environment_type == "web":
-            actions.extend(
-                self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
-            )
-        else:
-            pass
-            # TODO: Add PyautoguiAction when enable computer environment
-            # actions.append(
-            #     PyautoguiAction(code=
-            #         self.parsing_response_to_pyautogui_code(
-            #             parsed_response, obs_image_height, obs_image_width, self.input_swap
-            #         )
-            #     )
-            # )
+        actions.extend(
+            self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
+        )

         self.actions.append(actions)
@@ -252,13 +224,52 @@ class GroundingAgentUitars:

         return prediction or "", actions

-    def _format_messages_for_api(self, instruction: str, env_state: EnvState):
+    def get_instruction(self, environment_type: EnvironmentType) -> str:
+        """
+        Get the instruction for the agent based on the environment type.
+        """
+        UITARS_COMPUTER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+        """
+        UITARS_BROWSER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
+        You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
+        """
+
+        UITARS_USR_PROMPT_THOUGHT = """
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        {action_space}
+
+        ## Note
+        - Use {language} in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
+        else:
+            raise ValueError(f"Unsupported environment type: {environment_type}")
+
+    def _format_messages_for_api(self, instruction: str, current_state: EnvState):
         assert len(self.observations) == len(self.actions) and len(self.actions) == len(
             self.thoughts
         ), "The number of observations and actions should be the same."

-        self.history_images.append(base64.b64decode(env_state.screenshot))
-        self.observations.append({"screenshot": env_state.screenshot, "accessibility_tree": None})
+        self.history_images.append(base64.b64decode(current_state.screenshot))
+        self.observations.append({"screenshot": current_state.screenshot, "accessibility_tree": None})

         user_prompt = self.prompt_template.format(
             instruction=instruction, action_space=self.prompt_action_space, language=self.language
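
The hunk above ends where the environment-specific template is consumed: _format_messages_for_api() formats prompt_template with the action space, language, and user instruction. A standalone sketch of that substitution, with stand-in values for all three placeholders (the template text here is abridged for illustration, not verbatim from the diff):

    # Illustrative sketch: mirrors the prompt_template.format(...) call shown
    # at the end of the hunk above, outside the GroundingAgentUitars class.
    from textwrap import dedent

    prompt_template = dedent(
        """
        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate.

        ## Action Space
        {action_space}

        ## Note
        - Use {language} in `Thought` part.

        ## User Instruction
        {instruction}
        """
    ).lstrip()

    user_prompt = prompt_template.format(
        instruction="Open the pricing page and report the monthly plan cost",
        action_space="click(start_box='<|box_start|>(x1,y1)<|box_end|>')",
        language="English",
    )
    print(user_prompt)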