PyPI - khoj - Versions diffs - 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl - Mend

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

khoj/processor/conversation/prompts.py CHANGED Viewed

@@ -736,7 +736,7 @@ Create a multi-step plan and intelligently iterate on the plan based on the retr
 - Ask highly diverse, detailed queries to the tool AIs, one tool AI at a time, to discover required information or run calculations. Their response will be shown to you in the next iteration.
 - Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
 - Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
-- Ensure that all required context is passed to the tool AIs for successful execution. They only know the context provided in your query.
+- Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
 - Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
 - You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to answer the user's question.
 - Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
@@ -766,7 +766,7 @@ You decide which of the tool AIs listed below would you use to answer the user's
 {tools}
-Your response should always be a valid JSON object. Do not say anything else.
+Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
 Response format:
 {{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
 """.strip()
@@ -1119,6 +1119,16 @@ terrarium_sandbox_context = """
 - The sandbox has access to only the standard library and the matplotlib, pandas, numpy, scipy, bs5 and sympy packages. The requests, torch, catboost, tensorflow, rdkit and tkinter packages are not available.
 """.strip()
+operator_execution_context = PromptTemplate.from_template(
+    """
+Use the results of operating a web browser to inform your response.
+Browser Operation Results:
+{operator_results}
+""".strip()
+)
 # Automations
 # --
 crontime_prompt = PromptTemplate.from_template(
@@ -1371,6 +1381,7 @@ help_message = PromptTemplate.from_template(
 - **/online**: Chat using the internet as a source of information.
 - **/image**: Generate an image based on your message.
 - **/research**: Go deeper in a topic for more accurate, in-depth responses.
+- **/operator**: Use a web browser to execute actions and search for information.
 - **/help**: Show this help message.
 You are using the **{model}** model on the **{device}**.

khoj/processor/conversation/utils.py CHANGED Viewed

@@ -73,6 +73,10 @@ model_to_prompt_size = {
     "claude-3-7-sonnet-20250219": 60000,
     "claude-3-7-sonnet-latest": 60000,
     "claude-3-5-haiku-20241022": 60000,
+    "claude-sonnet-4": 60000,
+    "claude-sonnet-4-20250514": 60000,
+    "claude-opus-4": 60000,
+    "claude-opus-4-20250514": 60000,
     # Offline Models
     "bartowski/Qwen2.5-14B-Instruct-GGUF": 20000,
     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
@@ -91,6 +95,7 @@ class InformationCollectionIteration:
         context: list = None,
         onlineContext: dict = None,
         codeContext: dict = None,
+        operatorContext: dict[str, str] = None,
         summarizedResult: str = None,
         warning: str = None,
     ):
@@ -99,6 +104,7 @@ class InformationCollectionIteration:
         self.context = context
         self.onlineContext = onlineContext
         self.codeContext = codeContext
+        self.operatorContext = operatorContext
         self.summarizedResult = summarizedResult
         self.warning = warning
@@ -187,6 +193,9 @@ def construct_tool_chat_history(
         ConversationCommand.Code: (
             lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
         ),
+        ConversationCommand.Operator: (
+            lambda iteration: list(iteration.operatorContext.keys()) if iteration.operatorContext else []
+        ),
     }
     for iteration in previous_iterations:
         # If a tool is provided use the inferred query extractor for that tool if available
@@ -265,6 +274,7 @@ async def save_to_conversation_log(
     compiled_references: List[Dict[str, Any]] = [],
     online_results: Dict[str, Any] = {},
     code_results: Dict[str, Any] = {},
+    operator_results: Dict[str, str] = {},
     inferred_queries: List[str] = [],
     intent_type: str = "remember",
     client_application: ClientApplication = None,
@@ -291,6 +301,7 @@ async def save_to_conversation_log(
         "intent": {"inferred-queries": inferred_queries, "type": intent_type},
         "onlineContext": online_results,
         "codeContext": code_results,
+        "operatorContext": operator_results,
         "automationId": automation_id,
         "trainOfThought": train_of_thought,
         "turnId": turn_id,
@@ -380,7 +391,7 @@ def gather_raw_query_files(
 def generate_chatml_messages_with_context(
-    user_message,
+    user_message: str,
     system_message: str = None,
     conversation_log={},
     model_name="gpt-4o-mini",
@@ -447,6 +458,11 @@ def generate_chatml_messages_with_context(
         if not is_none_or_empty(chat.get("codeContext")):
             message_context += f"{prompts.code_executed_context.format(code_results=chat.get('codeContext'))}"
+        if not is_none_or_empty(chat.get("operatorContext")):
+            message_context += (
+                f"{prompts.operator_execution_context.format(operator_results=chat.get('operatorContext'))}"
+            )
         if not is_none_or_empty(message_context):
             reconstructed_context_message = ChatMessage(content=message_context, role="user")
             chatml_messages.insert(0, reconstructed_context_message)
@@ -685,8 +701,9 @@ def clean_code_python(code: str):
 def load_complex_json(json_str):
     """
-    Preprocess a raw JSON string to escape unescaped double quotes within value strings,
-    while preserving the JSON structure and already escaped quotes.
+    Preprocess a raw JSON string to
+    - escape unescaped double quotes within value strings while preserving the JSON structure and already escaped quotes.
+    - remove suffix after the first valid JSON object,
     """
     def replace_unescaped_quotes(match):
@@ -714,9 +731,20 @@ def load_complex_json(json_str):
     for loads in json_loaders_to_try:
         try:
             return loads(processed)
-        except (json.JSONDecodeError, pyjson5.Json5Exception) as e:
-            errors.append(f"{type(e).__name__}: {str(e)}")
+        except (json.JSONDecodeError, pyjson5.Json5Exception) as e_load:
+            loader_name = loads.__name__
+            errors.append(f"{loader_name} (initial parse): {type(e_load).__name__}: {str(e_load)}")
+            # Handle plain text suffixes by slicing at error position
+            if hasattr(e_load, "pos") and 0 < e_load.pos < len(processed):
+                try:
+                    sliced = processed[: e_load.pos].strip()
+                    if sliced:
+                        return loads(sliced)
+                except Exception as e_slice:
+                    errors.append(
+                        f"{loader_name} after slice at {e_load.pos}: {type(e_slice).__name__}: {str(e_slice)}"
+                    )
     # If all loaders fail, raise the aggregated error
     raise ValueError(
         f"Failed to load JSON with errors: {'; '.join(errors)}\n\n"

khoj/processor/operator/grounding_agent.py ADDED Viewed

@@ -0,0 +1,345 @@
+import json
+import logging
+from openai import AzureOpenAI, OpenAI
+from openai.types.chat import ChatCompletion, ChatCompletionMessage
+from khoj.database.models import ChatModel
+from khoj.processor.conversation.utils import construct_structured_message
+from khoj.processor.operator.operator_actions import *
+from khoj.processor.operator.operator_agent_base import AgentActResult
+from khoj.processor.operator.operator_environment_base import EnvState
+from khoj.utils.helpers import get_chat_usage_metrics
+logger = logging.getLogger(__name__)
+class GroundingAgent:
+    def __init__(
+        self,
+        model: ChatModel,
+        client: OpenAI | AzureOpenAI,
+        max_iterations: int,
+        tracer: dict = None,
+    ):
+        self.model = model
+        self.client = client
+        self.max_iterations = max_iterations
+        self.tracer = tracer
+        # Define tools for the grounding LLM (OpenAI format)
+        self.action_tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "click",
+                    "description": "Click on a specific coordinate.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "x": {"type": "integer", "description": "X coordinate"},
+                            "y": {"type": "integer", "description": "Y coordinate"},
+                        },
+                        "required": ["x", "y"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "left_double",
+                    "description": "Double click on a specific coordinate.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "x": {"type": "integer", "description": "X coordinate"},
+                            "y": {"type": "integer", "description": "Y coordinate"},
+                        },
+                        "required": ["x", "y"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "right_single",
+                    "description": "Right click on a specific coordinate.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "x": {"type": "integer", "description": "X coordinate"},
+                            "y": {"type": "integer", "description": "Y coordinate"},
+                        },
+                        "required": ["x", "y"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "drag",
+                    "description": "Perform a drag-and-drop operation along a path.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "path": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {
+                                        "x": {"type": "integer"},
+                                        "y": {"type": "integer"},
+                                    },
+                                    "required": ["x", "y"],
+                                },
+                                "description": "List of points (x, y coordinates) defining the drag path.",
+                            }
+                        },
+                        "required": ["path"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "hotkey",
+                    "description": "Press a key or key combination.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "keys": {
+                                "type": "array",
+                                "items": {"type": "string"},
+                                "description": "List of keys to press (e.g., ['Control', 'a'], ['Enter'])",
+                            }
+                        },
+                        "required": ["keys"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "type",
+                    "description": "Type text, usually into a focused input field.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"content": {"type": "string", "description": "Text to type"}},
+                        "required": ["content"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "scroll",
+                    "description": "Scroll the page.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "x": {"type": "integer", "description": "X coordinate to scroll from"},
+                            "y": {"type": "integer", "description": "Y coordinate to scroll from"},
+                            "direction": {
+                                "type": "string",
+                                "enum": ["up", "down", "left", "right"],
+                                "default": "down",
+                            },
+                        },
+                        "required": [],  # None is strictly required
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "wait",
+                    "description": "Pause execution for a specified duration.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "duration": {"type": "number", "description": "Duration in seconds", "default": 1.0}
+                        },
+                        "required": [],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "goto",
+                    "description": "Navigate to a specific URL.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
+                        "required": ["url"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "back",
+                    "description": "navigate back to the previous page.",
+                    "parameters": {"type": "object", "properties": {}},
+                },
+            },
+        ]
+    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
+        """Call the grounding LLM to get the next action based on the current state and instruction."""
+        # Format the message for the API call
+        messages_for_api = self._format_message_for_api(instruction, current_state)
+        try:
+            grounding_response: ChatCompletion = await self.client.chat.completions.create(
+                messages=messages_for_api,
+                model=self.model.name,
+                tools=self.action_tools,
+                tool_choice="required",
+                temperature=0.0,  # Grounding should be precise
+                max_completion_tokens=1000,  # Allow for thoughts + actions
+            )
+            if not isinstance(grounding_response, ChatCompletion):
+                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
+            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
+            # Parse tool calls
+            grounding_message = grounding_response.choices[0].message
+            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
+            # Update usage by grounding model
+            self.tracer["usage"] = get_chat_usage_metrics(
+                self.model.name,
+                input_tokens=grounding_response.usage.prompt_tokens,
+                output_tokens=grounding_response.usage.completion_tokens,
+                usage=self.tracer.get("usage"),
+            )
+        except Exception as e:
+            logger.error(f"Error calling Grounding LLM: {e}")
+            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
+            actions = []
+        return rendered_response, actions
+    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
+        """Format the message for the API call."""
+        grounding_user_prompt = f"""
+You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
+You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
+Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
+## Output Format
+```
+Thought: ...
+Action: ...
+```
+## Action Space
+click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+hotkey(key='')
+type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
+back() # Use this to go back to the previous page.
+## Note
+- Use English in `Thought` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+## User Instruction
+{instruction}
+""".lstrip()
+        # Construct grounding LLM input (using only the latest user prompt + image)
+        # We don't pass the full history here, as grounding depends on the *current* state + NL action
+        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
+        grounding_messages_content = construct_structured_message(
+            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+        )
+        return [{"role": "user", "content": grounding_messages_content}]
+    def _parse_action(
+        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
+    ) -> tuple[str, list[OperatorAction]]:
+        """Parse the tool calls from the grounding LLM response and convert them to action objects."""
+        actions: List[OperatorAction] = []
+        action_results: List[dict] = []
+        if grounding_message.tool_calls:
+            rendered_parts = []
+            for tool_call in grounding_message.tool_calls:
+                function_name = tool_call.function.name
+                try:
+                    arguments = json.loads(tool_call.function.arguments)
+                    action_to_run: Optional[OperatorAction] = None
+                    action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
+                    if function_name == "click":
+                        action_to_run = ClickAction(**arguments)
+                    elif function_name == "left_double":
+                        action_to_run = DoubleClickAction(**arguments)
+                    elif function_name == "right_single":
+                        action_to_run = ClickAction(button="right", **arguments)
+                    elif function_name == "type":
+                        content = arguments.get("content")
+                        action_to_run = TypeAction(text=content)
+                    elif function_name == "scroll":
+                        direction = arguments.get("direction", "down")
+                        amount = 3
+                        action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
+                    elif function_name == "hotkey":
+                        action_to_run = KeypressAction(**arguments)
+                    elif function_name == "goto":
+                        action_to_run = GotoAction(**arguments)
+                    elif function_name == "back":
+                        action_to_run = BackAction(**arguments)
+                    elif function_name == "wait":
+                        action_to_run = WaitAction(**arguments)
+                    elif function_name == "screenshot":
+                        action_to_run = ScreenshotAction(**arguments)
+                    elif function_name == "drag":
+                        # Need to convert list of dicts to list of Point objects
+                        path_dicts = arguments.get("path", [])
+                        path_points = [Point(**p) for p in path_dicts]
+                        if path_points:
+                            action_to_run = DragAction(path=path_points)
+                        else:
+                            logger.warning(f"Drag action called with empty path: {arguments}")
+                            action_render_str += " [Skipped - empty path]"
+                    elif function_name == "finished":
+                        action_to_run = None
+                    else:
+                        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
+                        action_render_str += " [Unhandled]"
+                    if action_to_run:
+                        actions.append(action_to_run)
+                        action_results.append(
+                            {
+                                "type": "tool_result",
+                                "tool_call_id": tool_call.id,
+                                "content": None,  # Updated after environment step
+                            }
+                        )
+                        rendered_parts.append(action_render_str)
+                except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
+                    logger.error(
+                        f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
+                    )
+                    rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
+            rendered_response = "\n- ".join(rendered_parts)
+        else:
+            # Grounding LLM responded but didn't call a tool
+            logger.warning("Grounding LLM did not produce a tool call.")
+            rendered_response = f"{grounding_message.content or 'No action required.'}"
+        # Render the response
+        return rendered_response, actions
+    def reset(self):
+        """Reset the agent state."""
+        pass

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl