PyPI - khoj - Versions diffs - 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl - Mend

khoj 1.41.1.dev107py3-none-any.whl → 1.41.1.dev142py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

khoj/processor/operator/operator_agent_binary.py CHANGED Viewed

@@ -1,21 +1,24 @@
 import json
 import logging
 from datetime import datetime
+from textwrap import dedent
 from typing import List, Optional
-from openai.types.chat import ChatCompletion
 from khoj.database.models import ChatModel
-from khoj.processor.conversation.utils import construct_structured_message
+from khoj.processor.conversation.utils import (
+    AgentMessage,
+    OperatorRun,
+    construct_structured_message,
+)
 from khoj.processor.operator.grounding_agent import GroundingAgent
 from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars
 from khoj.processor.operator.operator_actions import *
-from khoj.processor.operator.operator_agent_base import (
-    AgentActResult,
-    AgentMessage,
-    OperatorAgent,
+from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
+from khoj.processor.operator.operator_environment_base import (
+    EnvironmentType,
+    EnvState,
+    EnvStepResult,
 )
-from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
 from khoj.routers.helpers import send_message_to_model_wrapper
 from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
@@ -27,7 +30,7 @@ class BinaryOperatorAgent(OperatorAgent):
     """
     An OperatorAgent that uses two LLMs:
     1. Reasoning LLM: Determines the next high-level action based on the objective and current visual reasoning trajectory.
-    2. Grounding LLM: Converts the high-level action into specific, executable browser actions.
+    2. Grounding LLM: Converts the high-level action into specific actions executable on the environment.
     """
     def __init__(
@@ -35,10 +38,23 @@ class BinaryOperatorAgent(OperatorAgent):
         query: str,
         reasoning_model: ChatModel,
         grounding_model: ChatModel,
+        environment_type: EnvironmentType,
         max_iterations: int,
-        tracer: dict,
+        max_context: int,
+        chat_history: List[AgentMessage] = [],
+        previous_trajectory: Optional[OperatorRun] = None,
+        tracer: dict = {},
     ):
-        super().__init__(query, reasoning_model, max_iterations, tracer)  # Use reasoning model for primary tracking
+        super().__init__(
+            query,
+            reasoning_model,
+            environment_type,
+            max_iterations,
+            max_context,
+            chat_history,
+            previous_trajectory,
+            tracer,
+        )  # Use reasoning model for primary tracking
         self.reasoning_model = reasoning_model
         self.grounding_model = grounding_model
         # Initialize openai api compatible client for grounding model
@@ -49,10 +65,12 @@ class BinaryOperatorAgent(OperatorAgent):
         self.grounding_agent: GroundingAgent | GroundingAgentUitars = None
         if "ui-tars-1.5" in grounding_model.name:
             self.grounding_agent = GroundingAgentUitars(
-                grounding_model.name, grounding_client, max_iterations, environment_type="web", tracer=tracer
+                grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
             )
         else:
-            self.grounding_agent = GroundingAgent(grounding_model.name, grounding_client, max_iterations, tracer=tracer)
+            self.grounding_agent = GroundingAgent(
+                grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
+            )
     async def act(self, current_state: EnvState) -> AgentActResult:
         """
@@ -84,48 +102,7 @@ class BinaryOperatorAgent(OperatorAgent):
         """
         Uses the reasoning LLM to determine the next high-level action based on the operation trajectory.
         """
-        reasoning_system_prompt = f"""
-# Introduction
-* You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
-* You are given the user's query and screenshots of the browser's state transitions.
-* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
-* The current URL is {current_state.url}.
-# Your Task
-* First look at the screenshots carefully to notice all pertinent information.
-* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
-* Make sure you scroll down to see everything before deciding something isn't available.
-* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
-* Use your creativity to find alternate ways to make progress if you get stuck at any point.
-# Tool AI Capabilities
-* The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
-* It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
-* It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.
-# IMPORTANT
-* You are allowed upto {self.max_iterations} iterations to complete the task.
-* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
-* To navigate back to the previous page, end your response with "BACK" (without quotes).
-* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
-# Examples
-## Example 1
-GOTO https://example.com
-## Example 2
-click the blue login button located at the top right corner
-## Example 3
-scroll down the page
-## Example 4
-type the username example@email.com into the input field labeled Username
-## Example 5
-DONE
-# Instructions
-Now describe a single high-level action to take next to progress towards the user's goal in detail.
-Focus on the visual action and provide all necessary context.
-""".strip()
+        reasoning_system_prompt = self.get_instruction(self.environment_type, current_state)
         if is_none_or_empty(self.messages):
             query_text = f"**Main Objective**: {self.query}"
             query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"]
@@ -259,7 +236,8 @@ Focus on the visual action and provide all necessary context.
             action_results_content.extend(action_result["content"])
         self.messages.append(AgentMessage(role="environment", content=action_results_content))
-    async def summarize(self, summarize_prompt: str, env_state: EnvState) -> str:
+    async def summarize(self, env_state: EnvState, summarize_prompt: str = None) -> str:
+        summarize_prompt = summarize_prompt or self.summarize_prompt
         conversation_history = {"chat": self._format_message_for_api(self.messages)}
         try:
             summary = await send_message_to_model_wrapper(
@@ -282,7 +260,7 @@ Focus on the visual action and provide all necessary context.
         return summary
-    def compile_response(self, response_content: str | List) -> str:
+    def _compile_response(self, response_content: str | List) -> str:
         """Compile response content into a string, handling OpenAI message structures."""
         if isinstance(response_content, str):
             return response_content
@@ -330,6 +308,96 @@ Focus on the visual action and provide all necessary context.
         ]
         return formatted_messages
+    def get_instruction(self, environment_type: EnvironmentType, env_state: EnvState) -> str:
+        """
+        Build the system instruction for the reasoning agent.
+        Args:
+            environment_type: Environment being operated. Selects the browser or computer prompt.
+            env_state: Current environment state. Only its url is read, and only for the browser prompt.
+        Returns:
+            The dedented prompt with surrounding whitespace stripped.
+        Raises:
+            ValueError: If environment_type is neither BROWSER nor COMPUTER.
+        """
+        # Compose the date by hand instead of using the `%-d` directive.
+        # `%-d` is a platform C library extension and raises ValueError on Windows.
+        today = datetime.today()
+        current_date = f"{today.strftime('%A, %B')} {today.day}, {today.year}"
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(
+                f"""
+                # Introduction
+                * You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
+                * You are given the user's query and screenshots of the browser's state transitions.
+                * The current date is {current_date}.
+                * The current URL is {env_state.url}.
+                # Your Task
+                * First look at the screenshots carefully to notice all pertinent information.
+                * Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
+                * Make sure you scroll down to see everything before deciding something isn't available.
+                * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+                * Use your creativity to find alternate ways to make progress if you get stuck at any point.
+                # Tool AI Capabilities
+                * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
+                * It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
+                * It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.
+                # IMPORTANT
+                * You are allowed upto {self.max_iterations} iterations to complete the task.
+                * To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
+                * To navigate back to the previous page, end your response with "BACK" (without quotes).
+                * Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
+                # Examples
+                ## Example 1
+                GOTO https://example.com
+                ## Example 2
+                click the blue login button located at the top right corner
+                ## Example 3
+                scroll down the page
+                ## Example 4
+                type the username example@email.com into the input field labeled Username
+                ## Example 5
+                DONE
+                # Instructions
+                Now describe a single high-level action to take next to progress towards the user's goal in detail.
+                Focus on the visual action and provide all necessary context.
+                """
+            ).strip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            # The computer prompt has no GOTO/BACK commands and does not report a current URL.
+            return dedent(
+                f"""
+                # Introduction
+                * You are Khoj, a smart and resourceful computer assistant. You help the user accomplish their task using a computer.
+                * You are given the user's query and screenshots of the computer's state transitions.
+                * The current date is {current_date}.
+                # Your Task
+                * First look at the screenshots carefully to notice all pertinent information.
+                * Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
+                * Make sure you scroll down to see everything before deciding something isn't available.
+                * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+                * Use your creativity to find alternate ways to make progress if you get stuck at any point.
+                # Tool AI Capabilities
+                * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
+                * It can interact with the computer with these actions: click, right click, double click, type, scroll, drag and wait.
+                # IMPORTANT
+                * You are allowed upto {self.max_iterations} iterations to complete the task.
+                * Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
+                # Examples
+                ## Example 1
+                type https://example.com into the address bar and press Enter
+                ## Example 2
+                click the blue login button located at the top right corner
+                ## Example 3
+                scroll down the page
+                ## Example 4
+                type the username example@email.com into the input field labeled Username
+                ## Example 5
+                DONE
+                # Instructions
+                Now describe a single high-level action to take next to progress towards the user's goal in detail.
+                Focus on the visual action and provide all necessary context.
+                """
+            ).strip()
+        else:
+            raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
     def reset(self):
         """Reset the agent state."""
         super().reset()

khoj/processor/operator/operator_agent_openai.py CHANGED Viewed

@@ -1,18 +1,22 @@
 import json
 import logging
+import platform
 from copy import deepcopy
 from datetime import datetime
+from textwrap import dedent
 from typing import List, Optional, cast
 from openai.types.responses import Response, ResponseOutputItem
+from khoj.database.models import ChatModel
+from khoj.processor.conversation.utils import AgentMessage
 from khoj.processor.operator.operator_actions import *
-from khoj.processor.operator.operator_agent_base import (
-    AgentActResult,
-    AgentMessage,
-    OperatorAgent,
+from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
+from khoj.processor.operator.operator_environment_base import (
+    EnvironmentType,
+    EnvState,
+    EnvStepResult,
 )
-from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
 from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
 logger = logging.getLogger(__name__)
@@ -21,80 +25,18 @@ logger = logging.getLogger(__name__)
 # --- OpenAI Operator Agent ---
 class OpenAIOperatorAgent(OperatorAgent):
     async def act(self, current_state: EnvState) -> AgentActResult:
-        client = get_openai_async_client(
-            self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
-        )
         safety_check_prefix = "Say 'continue' after resolving the following safety checks to proceed:"
         safety_check_message = None
         actions: List[OperatorAction] = []
         action_results: List[dict] = []
         self._commit_trace()  # Commit trace before next action
-        system_prompt = f"""<SYSTEM_CAPABILITY>
-* You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
-* You operate a single Chromium browser page using Playwright.
-* You cannot access the OS or filesystem.
-* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
-* You can use the additional back() and goto() functions to navigate the browser.
-* Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
-* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
-* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
-* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
-* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
-* The current URL is {current_state.url}.
-</SYSTEM_CAPABILITY>
-<IMPORTANT>
-* You are allowed upto {self.max_iterations} iterations to complete the task.
-* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
-</IMPORTANT>
-"""
-        tools = [
-            {
-                "type": "computer_use_preview",
-                "display_width": 1024,  # TODO: Get from env
-                "display_height": 768,  # TODO: Get from env
-                "environment": "browser",
-            },
-            {
-                "type": "function",
-                "name": "back",
-                "description": "Go back to the previous page.",
-                "parameters": {},
-            },
-            {
-                "type": "function",
-                "name": "goto",
-                "description": "Go to a specific URL.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "url": {
-                            "type": "string",
-                            "description": "Fully qualified URL to navigate to.",
-                        },
-                    },
-                    "additionalProperties": False,
-                    "required": ["url"],
-                },
-            },
-        ]
+        system_prompt = self.get_instructions(self.environment_type, current_state)
+        tools = self.get_tools(self.environment_type, current_state)
         if is_none_or_empty(self.messages):
             self.messages = [AgentMessage(role="user", content=self.query)]
-        messages_for_api = self._format_message_for_api(self.messages)
-        response: Response = await client.responses.create(
-            model="computer-use-preview",
-            input=messages_for_api,
-            instructions=system_prompt,
-            tools=tools,
-            parallel_tool_calls=False,  # Keep sequential for now
-            max_output_tokens=4096,  # TODO: Make configurable?
-            truncation="auto",
-        )
-        logger.debug(f"Openai response: {response.model_dump_json()}")
-        self.messages += [AgentMessage(role="environment", content=response.output)]
+        response = await self._call_model(self.vision_model, system_prompt, tools)
+        self.messages += [AgentMessage(role="assistant", content=response.output)]
         rendered_response = self._render_response(response.output, current_state.screenshot)
         last_call_id = None
@@ -174,6 +116,9 @@ class OpenAIOperatorAgent(OperatorAgent):
                         "summary": [],
                     }
                 )
+            else:
+                logger.warning(f"Unsupported response block type: {block.type}")
+                content = f"Unsupported response block type: {block.type}"
             if action_to_run or content:
                 actions.append(action_to_run)
             if action_to_run or content:
@@ -220,6 +165,10 @@ class OpenAIOperatorAgent(OperatorAgent):
             elif action_result["type"] == "reasoning":
                 items_to_pop.append(idx)  # Mark placeholder reasoning action result for removal
                 continue
+            elif action_result["type"] == "computer_call" and action_result["status"] == "in_progress":
+                if isinstance(result_content, dict):
+                    result_content["status"] = "completed"  # Mark in-progress actions as completed
+                action_result["output"] = result_content
             else:
                 # Add text data
                 action_result["output"] = result_content
@@ -229,11 +178,45 @@ class OpenAIOperatorAgent(OperatorAgent):
         self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
+    async def summarize(self, current_state: EnvState, summarize_prompt: Optional[str] = None) -> str:
+        """
+        Ask the model to summarize the operation trajectory recorded in self.messages.
+        Args:
+            current_state: Current environment state. Not read by this implementation.
+            summarize_prompt: Prompt to summarize with. Falls back to self.summarize_prompt when empty.
+        Returns:
+            The compiled text of the model's summary, or a fixed notice when there is nothing to summarize.
+        """
+        # Check for an empty trajectory before appending anything.
+        # Done after the appends, this check could never be true and the model was still called.
+        if not self.messages:
+            return "No actions to summarize."
+        summarize_prompt = summarize_prompt or self.summarize_prompt
+        self.messages.append(AgentMessage(role="user", content=summarize_prompt))
+        # No tools are passed, so _call_model uses the configured chat model rather than computer-use-preview.
+        response = await self._call_model(self.vision_model, summarize_prompt, [])
+        self.messages += [AgentMessage(role="assistant", content=response.output)]
+        return self._compile_response(self.messages[-1].content)
+    async def _call_model(self, model: ChatModel, system_prompt, tools) -> Response:
+        """
+        Send the agent's message history to the OpenAI Responses API and return the raw response.
+        Requests with tools always go to the computer-use-preview model.
+        Requests without tools go to the given model by name.
+        """
+        openai_client = get_openai_async_client(model.ai_model_api.api_key, model.ai_model_api.api_base_url)
+        target_model = "computer-use-preview" if tools else model.name
+        # Format messages for OpenAI API
+        api_input = self._format_message_for_api(self.messages)
+        if target_model != "computer-use-preview":
+            # Other models get the history flattened into plain chat messages
+            api_input = self._format_messages_for_summary(api_input)
+        request_args = {
+            "model": target_model,
+            "input": api_input,
+            "instructions": system_prompt,
+            "tools": tools,
+            "parallel_tool_calls": False,
+            "truncation": "auto",
+        }
+        response: Response = await openai_client.responses.create(**request_args)
+        logger.debug(f"Openai response: {response.model_dump_json()}")
+        return response
     def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
         """Format the message for OpenAI API."""
         formatted_messages: list = []
         for message in messages:
-            if message.role == "environment":
+            if message.role == "assistant":
                 if isinstance(message.content, list):
                     # Remove reasoning message if not followed by computer call
                     if (
@@ -252,18 +235,23 @@ class OpenAIOperatorAgent(OperatorAgent):
                         message.content.pop(0)
                     formatted_messages.extend(message.content)
                 else:
-                    logger.warning(f"Expected message content list from environment, got {type(message.content)}")
+                    logger.warning(f"Expected message content list from assistant, got {type(message.content)}")
+            elif message.role == "environment":
+                formatted_messages.extend(message.content)
             else:
+                if isinstance(message.content, list):
+                    message.content = "\n".join([part["text"] for part in message.content if part["type"] == "text"])
                 formatted_messages.append(
                     {
                         "role": message.role,
                         "content": message.content,
                     }
                 )
         return formatted_messages
-    def compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
-        """Compile the response from model into a single string."""
+    def _compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
+        """Compile the response from model into a single string for prompt tracing."""
         # Handle case where response content is a string.
         # This is the case when response content is a user query
         if isinstance(response_content, str):
@@ -347,3 +335,123 @@ class OpenAIOperatorAgent(OperatorAgent):
         }
         return render_payload
+    def get_instructions(self, environment_type: EnvironmentType, current_state: EnvState) -> str:
+        """
+        Return system instructions for the OpenAI operator.
+        Args:
+            environment_type: Environment being operated. Selects the browser or computer prompt.
+            current_state: Current environment state. Only its url is read, and only for the browser prompt.
+        Returns:
+            The dedented prompt with leading whitespace stripped.
+        Raises:
+            ValueError: If environment_type is neither BROWSER nor COMPUTER.
+        """
+        # Compose the date by hand instead of using the `%-d` directive.
+        # `%-d` is a platform C library extension and raises ValueError on Windows.
+        today = datetime.today()
+        current_date = f"{today.strftime('%A, %B')} {today.day}, {today.year}"
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(
+                f"""
+                <SYSTEM_CAPABILITY>
+                * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
+                * You operate a single Chromium browser page using Playwright.
+                * You cannot access the OS or filesystem.
+                * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
+                * You can use the additional back() and goto() functions to navigate the browser.
+                * Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
+                * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
+                * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
+                * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+                * The current date is {current_date}.
+                * The current URL is {current_state.url}.
+                </SYSTEM_CAPABILITY>
+                <IMPORTANT>
+                * You are allowed upto {self.max_iterations} iterations to complete the task.
+                * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
+                </IMPORTANT>
+                """
+            ).lstrip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            # The computer prompt does not mention back()/goto() and does not report a current URL.
+            return dedent(
+                f"""
+                <SYSTEM_CAPABILITY>
+                * You are Khoj, a smart computer operating assistant. You help the users accomplish their tasks using a computer.
+                * You can interact with the computer to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
+                * When viewing a document or webpage it can be helpful to zoom out or scroll down to ensure you see everything before deciding something isn't available.
+                * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
+                * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+                * You are allowed upto {self.max_iterations} iterations to complete the task.
+                </SYSTEM_CAPABILITY>
+                <CONTEXT>
+                * The current date is {current_date}.
+                </CONTEXT>
+                """
+            ).lstrip()
+        else:
+            raise ValueError(f"Unsupported environment type: {environment_type}")
+    def get_tools(self, environment_type: EnvironmentType, current_state: EnvState) -> list[dict]:
+        """
+        Return the tools available for the OpenAI operator.
+        The computer use tool is always included, sized from the current state's width and height.
+        The back and goto navigation functions are added only for the browser environment.
+        """
+        # TODO: Get OS info from the environment (e.g. map platform.system() to "mac", "windows" or "linux").
+        # For now, assume Linux as the environment OS when operating a computer.
+        environment_os = "linux" if environment_type == EnvironmentType.COMPUTER else "browser"
+        computer_tool = {
+            "type": "computer_use_preview",
+            "display_width": current_state.width,
+            "display_height": current_state.height,
+            "environment": environment_os,
+        }
+        if environment_type != EnvironmentType.BROWSER:
+            return [computer_tool]
+        # Browser-only navigation helpers exposed as function tools
+        back_tool = {
+            "type": "function",
+            "name": "back",
+            "description": "Go back to the previous page.",
+            "parameters": {},
+        }
+        url_property = {
+            "type": "string",
+            "description": "Fully qualified URL to navigate to.",
+        }
+        goto_tool = {
+            "type": "function",
+            "name": "goto",
+            "description": "Go to a specific URL.",
+            "parameters": {
+                "type": "object",
+                "properties": {"url": url_property},
+                "additionalProperties": False,
+                "required": ["url"],
+            },
+        }
+        return [computer_tool, back_tool, goto_tool]
+    def _format_messages_for_summary(self, formatted_messages: List[dict]) -> List[dict]:
+        """
+        Convert API formatted messages into plain chat messages for non computer use AI models.
+        Args:
+            formatted_messages: Items produced by _format_message_for_api. Rewritten in place.
+        Returns:
+            The same list object, holding only role/content chat messages.
+            Items that compile to empty text (e.g. reasoning items) are dropped.
+        """
+        summary_messages: List[dict] = []
+        for msg in formatted_messages:
+            if isinstance(msg, dict) and "content" in msg:
+                # Already a regular chat message
+                summary_messages.append(msg)
+            elif isinstance(msg, dict) and "output" in msg:
+                output = msg["output"]
+                if isinstance(output, dict):
+                    # Drop current_url from output as not supported for non computer operations.
+                    # Build a copy rather than deleting in place: the item is shared with the
+                    # stored message history, which later computer use calls still send as is.
+                    output = {key: value for key, value in output.items() if key != "current_url"}
+                # Non-dict outputs (e.g. plain text) are passed through untouched
+                summary_messages.append({"role": "user", "content": [output]})
+            elif isinstance(msg, str):
+                summary_messages.append({"role": "user", "content": [{"type": "input_text", "text": msg}]})
+            else:
+                text = self._compile_response([msg])
+                # Skip items with no text to show to non-computer use models
+                if text:
+                    summary_messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [{"type": "output_text", "text": text}],
+                        }
+                    )
+        # Keep the original contract of rewriting the passed list in place
+        formatted_messages[:] = summary_messages
+        return formatted_messages

khoj/processor/operator/operator_environment_base.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from enum import Enum
 from typing import Literal, Optional
 from pydantic import BaseModel
@@ -6,9 +7,18 @@ from pydantic import BaseModel
 from khoj.processor.operator.operator_actions import OperatorAction
+class EnvironmentType(Enum):
+    """
+    Type of environment to operate.
+    Operator agents compare against these members to select environment-specific prompts and tools.
+    """
+    # Operator agents use computer prompts and the computer use tool alone for this type
+    COMPUTER = "computer"
+    # Operator agents use browser prompts for this type; the OpenAI agent also adds back/goto tools
+    BROWSER = "browser"
 class EnvState(BaseModel):
-    url: str
+    height: int
+    width: int
     screenshot: Optional[str] = None
+    url: Optional[str] = None
 class EnvStepResult(BaseModel):

khoj/processor/operator/operator_environment_browser.py CHANGED Viewed

@@ -5,7 +5,7 @@ import logging
 import os
 from typing import Optional, Set, Union
-from khoj.processor.operator.operator_actions import OperatorAction, Point
+from khoj.processor.operator.operator_actions import DragAction, OperatorAction, Point
 from khoj.processor.operator.operator_environment_base import (
     Environment,
     EnvState,
@@ -124,10 +124,10 @@ class BrowserEnvironment(Environment):
     async def get_state(self) -> EnvState:
         if not self.page or self.page.is_closed():
-            return EnvState(url="about:blank", screenshot=None)
+            return EnvState(url="about:blank", screenshot=None, height=self.height, width=self.width)
         url = self.page.url
         screenshot = await self._get_screenshot()
-        return EnvState(url=url, screenshot=screenshot)
+        return EnvState(url=url, screenshot=screenshot, height=self.height, width=self.width)
     async def step(self, action: OperatorAction) -> EnvStepResult:
         if not self.page or self.page.is_closed():
@@ -246,6 +246,8 @@ class BrowserEnvironment(Environment):
                     logger.debug(f"Action: {action.type} to ({x},{y})")
                 case "drag":
+                    if not isinstance(action, DragAction):
+                        raise TypeError(f"Invalid action type for drag")
                     path = action.path
                     if not path:
                         error = "Missing path for drag action"

khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl

khoj 1.41.1.dev107py3-none-any.whl → 1.41.1.dev142py3-none-any.whl