PyPI - khoj - Versions diffs - 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl - Mend

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

khoj/processor/operator/operate_browser.py ADDED Viewed

@@ -0,0 +1,165 @@
+import asyncio
+import json
+import logging
+import os
+from typing import Callable, List, Optional
+from khoj.database.adapters import AgentAdapters, ConversationAdapters
+from khoj.database.models import Agent, ChatModel, KhojUser
+from khoj.processor.operator.operator_actions import *
+from khoj.processor.operator.operator_agent_anthropic import AnthropicOperatorAgent
+from khoj.processor.operator.operator_agent_base import OperatorAgent
+from khoj.processor.operator.operator_agent_binary import BinaryOperatorAgent
+from khoj.processor.operator.operator_agent_openai import OpenAIOperatorAgent
+from khoj.processor.operator.operator_environment_base import EnvStepResult
+from khoj.processor.operator.operator_environment_browser import BrowserEnvironment
+from khoj.routers.helpers import ChatEvent
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import LocationData
+logger = logging.getLogger(__name__)
+# --- Browser Operator Function ---
+async def operate_browser(
+    query: str,
+    user: KhojUser,
+    conversation_log: dict,
+    location_data: LocationData,
+    send_status_func: Optional[Callable] = None,
+    query_images: Optional[List[str]] = None,  # TODO: Handle query images
+    agent: Agent = None,
+    query_files: str = None,  # TODO: Handle query files
+    cancellation_event: Optional[asyncio.Event] = None,
+    tracer: Optional[dict] = None,
+):
+    """Run an operator agent against a browser environment to work on `query`.
+    Async generator. While the agent works it yields `{ChatEvent.STATUS: event}` dicts for
+    every event produced by `send_status_func`. Its last item is a dict with the keys
+    `query`, `result` and `webpages` (one `{"link", "snippet"}` entry per visited url).
+    Args:
+        query: Task handed to the operator agent; also embedded in the summarize prompt.
+        user: Passed to the chat model lookups.
+        conversation_log: Currently unused by this function.
+        location_data: Currently unused by this function.
+        send_status_func: Optional callable; each call must return an async iterable of events.
+        query_images: Currently unused (see TODO).
+        agent: Optional agent; when set its chat model is looked up and preferred.
+        query_files: Currently unused (see TODO).
+        cancellation_event: Optional event; once set, the operator loop stops at its next check.
+        tracer: Dict handed to the operator agent. A fresh dict is created when omitted.
+    Raises:
+        ValueError: If no vision enabled chat model is found, or if the binary operator agent
+            is needed but no supported visual grounding model is configured.
+    """
+    # A `{}` default would be a single dict shared by every call; build one per call instead
+    if tracer is None:
+        tracer = {}
+    response, summary_message, user_input_message = None, None, None
+    cancelled = False  # Set when the loop stops because cancellation_event fired
+    environment: Optional[BrowserEnvironment] = None
+    # Get the agent chat model
+    agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user) if agent else None
+    reasoning_model: ChatModel = await ConversationAdapters.aget_default_chat_model(user, agent_chat_model)
+    # Fall back to a vision enabled model when the default is missing or not vision enabled
+    if not reasoning_model or not reasoning_model.vision_enabled:
+        reasoning_model = await ConversationAdapters.aget_vision_enabled_config()
+    if not reasoning_model:
+        raise ValueError("No vision enabled chat model found. Configure a vision chat model to operate browser.")
+    # Initialize Agent
+    max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 40))
+    operator_model_type = is_operator_model(reasoning_model.name)
+    operator_agent: OperatorAgent
+    if operator_model_type == ChatModel.ModelType.OPENAI:
+        operator_agent = OpenAIOperatorAgent(query, reasoning_model, max_iterations, tracer)
+    elif operator_model_type == ChatModel.ModelType.ANTHROPIC:
+        operator_agent = AnthropicOperatorAgent(query, reasoning_model, max_iterations, tracer)
+    else:
+        # Any other reasoning model is paired with a separate visual grounding model
+        grounding_model_name = "ui-tars-1.5"
+        grounding_model = await ConversationAdapters.aget_chat_model_by_name(grounding_model_name)
+        if (
+            not grounding_model
+            or not grounding_model.vision_enabled
+            or grounding_model.model_type != ChatModel.ModelType.OPENAI
+        ):
+            raise ValueError("No supported visual grounding model for binary operator agent found.")
+        operator_agent = BinaryOperatorAgent(query, reasoning_model, grounding_model, max_iterations, tracer)
+    # Initialize Environment
+    if send_status_func:
+        async for event in send_status_func("**Launching Browser**"):
+            yield {ChatEvent.STATUS: event}
+    environment = BrowserEnvironment()
+    await environment.start(width=1024, height=768)
+    # Start Operator Loop
+    try:
+        summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."
+        task_completed = False
+        iterations = 0
+        with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger):
+            while iterations < max_iterations and not task_completed:
+                if cancellation_event and cancellation_event.is_set():
+                    logger.debug("Browser operator cancelled by client disconnect")
+                    cancelled = True
+                    break
+                iterations += 1
+                # 1. Get current environment state
+                browser_state = await environment.get_state()
+                # 2. Agent decides action(s)
+                agent_result = await operator_agent.act(browser_state)
+                # 3. Execute actions in the environment
+                env_steps: List[EnvStepResult] = []
+                for action in agent_result.actions:
+                    if cancellation_event and cancellation_event.is_set():
+                        # Only skips the remaining actions; the check at the top of the loop ends the run
+                        logger.debug("Browser operator cancelled by client disconnect")
+                        break
+                    # Handle request for user action and break the loop
+                    if isinstance(action, RequestUserAction):
+                        user_input_message = action.request
+                        if send_status_func:
+                            async for event in send_status_func(f"**Requesting User Input**:\n{action.request}"):
+                                yield {ChatEvent.STATUS: event}
+                        break
+                    env_step = await environment.step(action)
+                    env_steps.append(env_step)
+                # Render status update
+                # Prefer the screenshot from the last executed step, else the one taken before acting
+                latest_screenshot = f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else browser_state.screenshot}"
+                render_payload = agent_result.rendered_response
+                render_payload["image"] = latest_screenshot
+                render_content = f"**Action**: {json.dumps(render_payload)}"
+                if send_status_func:
+                    async for event in send_status_func(f"**Operating Browser**:\n{render_content}"):
+                        yield {ChatEvent.STATUS: event}
+                # Check if termination conditions are met
+                task_completed = not agent_result.actions  # No actions requested by agent
+                trigger_iteration_limit = iterations == max_iterations
+                if user_input_message:
+                    logger.info(f"User input requested: {user_input_message}")
+                    break
+                if task_completed or trigger_iteration_limit:
+                    # Summarize results of operator run on last iteration
+                    operator_agent.add_action_results(env_steps, agent_result)
+                    summary_message = await operator_agent.summarize(summarize_prompt, browser_state)
+                    logger.info(f"Task completed: {task_completed}, Iteration limit: {trigger_iteration_limit}")
+                    break
+                # 4. Update agent on the results of its action on the environment
+                operator_agent.add_action_results(env_steps, agent_result)
+            # Determine final response message
+            if user_input_message:
+                response = user_input_message
+            elif task_completed:
+                response = summary_message
+            elif cancelled:
+                # No summary exists for a cancelled run, so do not report it as an iteration limit
+                response = "Operator run was cancelled before the task could be completed."
+            else:  # Hit iteration limit
+                response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
+    finally:
+        if environment and not user_input_message:  # Don't close browser if user input required
+            await environment.close()
+        if operator_agent:
+            operator_agent.reset()
+    yield {
+        "query": query,
+        "result": user_input_message or response,
+        "webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
+    }
+def is_operator_model(model: str) -> ChatModel.ModelType | None:
+    """Return the model type of the first known operator model prefix `model` starts with, else None."""
+    # Model name prefix -> model type. Prefixes are tried in insertion order.
+    prefix_to_model_type = {
+        "gpt-4o": ChatModel.ModelType.OPENAI,
+        "claude-3-7-sonnet": ChatModel.ModelType.ANTHROPIC,
+        "claude-sonnet-4": ChatModel.ModelType.ANTHROPIC,
+        "claude-opus-4": ChatModel.ModelType.ANTHROPIC,
+        "ui-tars-1.5": ChatModel.ModelType.OFFLINE,
+    }
+    matching_types = (
+        model_type for prefix, model_type in prefix_to_model_type.items() if model.startswith(prefix)
+    )
+    return next(matching_types, None)  # type: ignore[return-value]

khoj/processor/operator/operator_actions.py ADDED Viewed

@@ -0,0 +1,149 @@
+# --- Standardized Action Models ---
+from typing import List, Literal, Optional, Union
+from pydantic import BaseModel
+class Point(BaseModel):
+    """A pair of float coordinates; used as the element type of `DragAction.path`."""
+    x: float
+    y: float
+class BaseAction(BaseModel):
+    """Common base of all action models; subclasses pin `type` to a fixed literal tag."""
+    type: str
+class ClickAction(BaseAction):
+    """Action tagged "click": x/y coordinates, a mouse button and optional modifiers."""
+    type: Literal["click"] = "click"
+    x: float
+    y: float
+    button: Literal["left", "right", "middle", "wheel"] = "left"
+    # Optional so that an explicit `modifiers=None` validates, matching the None default.
+    # NOTE(review): the expected format of the modifiers string is not shown in this file.
+    modifiers: Optional[str] = None
+class DoubleClickAction(BaseAction):
+    """Action tagged "double_click" carrying x/y coordinates."""
+    type: Literal["double_click"] = "double_click"
+    x: float
+    y: float
+class TripleClickAction(BaseAction):
+    """Action tagged "triple_click" carrying x/y coordinates."""
+    type: Literal["triple_click"] = "triple_click"
+    x: float
+    y: float
+class ScrollAction(BaseAction):
+    """Action tagged "scroll"; every field is optional.
+    NOTE(review): the model carries both scroll_x/scroll_y and scroll_direction/scroll_amount;
+    how the two forms are interpreted is decided by the consumer, not shown here.
+    """
+    type: Literal["scroll"] = "scroll"
+    x: Optional[float] = None
+    y: Optional[float] = None
+    scroll_x: Optional[float] = None
+    scroll_y: Optional[float] = None
+    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None
+    scroll_amount: Optional[float] = 2.0  # Defaults to 2.0; unit not defined in this file
+class KeypressAction(BaseAction):
+    """Action tagged "keypress" carrying a list of key names."""
+    type: Literal["keypress"] = "keypress"
+    keys: List[str]  # Standardized on list of keys
+class TypeAction(BaseAction):
+    """Action tagged "type" carrying the text string."""
+    type: Literal["type"] = "type"
+    text: str
+class WaitAction(BaseAction):
+    """Action tagged "wait" with a duration that defaults to 1.0."""
+    type: Literal["wait"] = "wait"
+    duration: float = 1.0  # presumably seconds — confirm against the environment implementation
+class ScreenshotAction(BaseAction):
+    """Action tagged "screenshot"; carries no fields beyond the tag."""
+    type: Literal["screenshot"] = "screenshot"
+class MoveAction(BaseAction):
+    """Action tagged "move" carrying x/y coordinates."""
+    type: Literal["move"] = "move"
+    x: float
+    y: float
+class DragAction(BaseAction):
+    """Action tagged "drag" carrying a list of `Point` objects as its path."""
+    type: Literal["drag"] = "drag"
+    path: List[Point]
+class MouseDownAction(BaseAction):
+    """Action tagged "mouse_down" for one of the left/right/middle buttons (default "left")."""
+    type: Literal["mouse_down"] = "mouse_down"
+    button: Literal["left", "right", "middle"] = "left"
+class MouseUpAction(BaseAction):
+    """Action tagged "mouse_up" for one of the left/right/middle buttons (default "left")."""
+    type: Literal["mouse_up"] = "mouse_up"
+    button: Literal["left", "right", "middle"] = "left"
+class HoldKeyAction(BaseAction):
+    """Action tagged "hold_key": a key combination string plus a duration (default 1.0)."""
+    type: Literal["hold_key"] = "hold_key"
+    text: str  # xdotool style key combination string
+    duration: float = 1.0  # presumably seconds — confirm against the environment implementation
+class KeyUpAction(BaseAction):
+    """Action tagged "key_up" carrying a single key name."""
+    type: Literal["key_up"] = "key_up"
+    key: str
+class KeyDownAction(BaseAction):
+    """Action tagged "key_down" carrying a single key name."""
+    type: Literal["key_down"] = "key_down"
+    key: str
+class CursorPositionAction(BaseAction):
+    """Action tagged "cursor_position"; carries no fields beyond the tag."""
+    type: Literal["cursor_position"] = "cursor_position"
+class GotoAction(BaseAction):
+    """Action tagged "goto" carrying a url string."""
+    type: Literal["goto"] = "goto"
+    url: str
+class BackAction(BaseAction):
+    """Action tagged "back"; carries no fields beyond the tag."""
+    type: Literal["back"] = "back"
+class RequestUserAction(BaseAction):
+    """Request user action to confirm or provide input.
+    `request` holds the message to put to the user.
+    """
+    type: Literal["request_user"] = "request_user"
+    request: str
+class NoopAction(BaseAction):
+    """No operation action; carries no fields beyond the "noop" tag."""
+    type: Literal["noop"] = "noop"
+# Type alias covering every concrete action model in this module (BaseAction itself and
+# Point are not members). Each member pins `type` to a distinct literal tag.
+OperatorAction = Union[
+    ClickAction,
+    DoubleClickAction,
+    TripleClickAction,
+    ScrollAction,
+    KeypressAction,
+    TypeAction,
+    WaitAction,
+    ScreenshotAction,
+    MoveAction,
+    DragAction,
+    MouseDownAction,
+    MouseUpAction,
+    HoldKeyAction,
+    KeyDownAction,
+    KeyUpAction,
+    CursorPositionAction,
+    GotoAction,
+    BackAction,
+    RequestUserAction,
+    NoopAction,
+]

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl