PyPI - khoj - Versions diffs - 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl - Mend

khoj 1.41.1.dev107py3-none-any.whl → 1.41.1.dev142py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

khoj/processor/conversation/google/utils.py CHANGED Viewed

@@ -21,6 +21,7 @@ from tenacity import (
 )
 from khoj.processor.conversation.utils import (
+    ResponseWithThought,
     commit_conversation_trace,
     get_image_from_base64,
     get_image_from_url,
@@ -102,7 +103,7 @@ def gemini_completion_with_backoff(
         client = get_gemini_client(api_key, api_base_url)
         gemini_clients[api_key] = client
-    formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
+    formatted_messages, system_instruction = format_messages_for_gemini(messages, system_prompt)
     # format model response schema
     response_schema = None
@@ -110,12 +111,12 @@ def gemini_completion_with_backoff(
         response_schema = clean_response_schema(model_kwargs["response_schema"])
     thinking_config = None
-    if deepthought and model_name.startswith("gemini-2-5"):
+    if deepthought and model_name.startswith("gemini-2.5"):
         thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
     seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
     config = gtypes.GenerateContentConfig(
-        system_instruction=system_prompt,
+        system_instruction=system_instruction,
         temperature=temperature,
         thinking_config=thinking_config,
         max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
@@ -178,21 +179,21 @@ async def gemini_chat_completion_with_backoff(
     model_kwargs=None,
     deepthought=False,
     tracer: dict = {},
-) -> AsyncGenerator[str, None]:
+) -> AsyncGenerator[ResponseWithThought, None]:
     client = gemini_clients.get(api_key)
     if not client:
         client = get_gemini_client(api_key, api_base_url)
         gemini_clients[api_key] = client
-    formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
+    formatted_messages, system_instruction = format_messages_for_gemini(messages, system_prompt)
     thinking_config = None
-    if deepthought and model_name.startswith("gemini-2-5"):
-        thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
+    if deepthought and model_name.startswith("gemini-2.5"):
+        thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI, include_thoughts=True)
     seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
     config = gtypes.GenerateContentConfig(
-        system_instruction=system_prompt,
+        system_instruction=system_instruction,
         temperature=temperature,
         thinking_config=thinking_config,
         max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
@@ -216,18 +217,25 @@ async def gemini_chat_completion_with_backoff(
             logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
         # Keep track of the last chunk for usage data
         final_chunk = chunk
-        # Handle streamed response chunk
+        # handle safety, rate-limit, other finish reasons
         stop_message, stopped = handle_gemini_response(chunk.candidates, chunk.prompt_feedback)
-        message = stop_message or chunk.text
-        aggregated_response += message
-        yield message
         if stopped:
+            yield ResponseWithThought(response=stop_message)
             logger.warning(
                 f"LLM Response Prevented for {model_name}: {stop_message}.\n"
                 + f"Last Message by {messages[-1].role}: {messages[-1].content}"
             )
             break
+        # emit thought vs response parts
+        for part in chunk.candidates[0].content.parts:
+            if part.text:
+                aggregated_response += part.text
+                yield ResponseWithThought(response=part.text)
+            if part.thought:
+                yield ResponseWithThought(thought=part.text)
     # Calculate cost of chat
     input_tokens = final_chunk.usage_metadata.prompt_token_count or 0 if final_chunk else 0
     output_tokens = final_chunk.usage_metadata.candidates_token_count or 0 if final_chunk else 0

khoj/processor/conversation/offline/chat_model.py CHANGED Viewed

@@ -16,6 +16,7 @@ from khoj.processor.conversation.offline.utils import download_model
 from khoj.processor.conversation.utils import (
     clean_json,
     commit_conversation_trace,
+    construct_question_history,
     generate_chatml_messages_with_context,
     messages_to_print,
 )
@@ -64,13 +65,7 @@ def extract_questions_offline(
     username = prompts.user_name.format(name=user.get_full_name()) if user and user.get_full_name() else ""
     # Extract Past User Message and Inferred Questions from Conversation Log
-    chat_history = ""
-    if use_history:
-        for chat in conversation_log.get("chat", [])[-4:]:
-            if chat["by"] == "khoj":
-                chat_history += f"Q: {chat['intent']['query']}\n"
-                chat_history += f"Khoj: {chat['message']}\n\n"
+    chat_history = construct_question_history(conversation_log, include_query=False) if use_history else ""
     # Get dates relative to today for prompt creation
     today = datetime.today()

khoj/processor/conversation/openai/gpt.py CHANGED Viewed

@@ -17,8 +17,10 @@ from khoj.processor.conversation.openai.utils import (
 )
 from khoj.processor.conversation.utils import (
     JsonSupport,
+    OperatorRun,
     ResponseWithThought,
     clean_json,
+    construct_question_history,
     construct_structured_message,
     generate_chatml_messages_with_context,
     messages_to_print,
@@ -55,13 +57,7 @@ def extract_questions(
     username = prompts.user_name.format(name=user.get_full_name()) if user and user.get_full_name() else ""
     # Extract Past User Message and Inferred Questions from Conversation Log
-    chat_history = "".join(
-        [
-            f'Q: {chat["intent"]["query"]}\nKhoj: {{"queries": {chat["intent"].get("inferred-queries") or list([chat["intent"]["query"]])}}}\nA: {chat["message"]}\n\n'
-            for chat in conversation_log.get("chat", [])[-4:]
-            if chat["by"] == "khoj" and "to-image" not in chat["intent"].get("type")
-        ]
-    )
+    chat_history = construct_question_history(conversation_log)
     # Get dates relative to today for prompt creation
     today = datetime.today()
@@ -169,7 +165,7 @@ async def converse_openai(
     references: list[dict],
     online_results: Optional[Dict[str, Dict]] = None,
     code_results: Optional[Dict[str, Dict]] = None,
-    operator_results: Optional[Dict[str, str]] = None,
+    operator_results: Optional[List[OperatorRun]] = None,
     conversation_log={},
     model: str = "gpt-4o-mini",
     api_key: Optional[str] = None,
@@ -242,8 +238,11 @@ async def converse_openai(
             f"{prompts.code_executed_context.format(code_results=truncate_code_context(code_results))}\n\n"
         )
     if not is_none_or_empty(operator_results):
+        operator_content = [
+            {"query": oc.query, "response": oc.response, "webpages": oc.webpages} for oc in operator_results
+        ]
         context_message += (
-            f"{prompts.operator_execution_context.format(operator_results=yaml_dump(operator_results))}\n\n"
+            f"{prompts.operator_execution_context.format(operator_results=yaml_dump(operator_content))}\n\n"
         )
     context_message = context_message.strip()

khoj/processor/conversation/utils.py CHANGED Viewed

@@ -10,7 +10,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 import PIL.Image
 import pyjson5
@@ -20,6 +20,7 @@ import yaml
 from langchain_core.messages.chat import ChatMessage
 from llama_cpp import LlamaTokenizer
 from llama_cpp.llama import Llama
+from pydantic import BaseModel
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 from khoj.database.adapters import ConversationAdapters
@@ -73,9 +74,9 @@ model_to_prompt_size = {
     "claude-3-7-sonnet-20250219": 60000,
     "claude-3-7-sonnet-latest": 60000,
     "claude-3-5-haiku-20241022": 60000,
-    "claude-sonnet-4": 60000,
+    "claude-sonnet-4-0": 60000,
     "claude-sonnet-4-20250514": 60000,
-    "claude-opus-4": 60000,
+    "claude-opus-4-0": 60000,
     "claude-opus-4-20250514": 60000,
     # Offline Models
     "bartowski/Qwen2.5-14B-Instruct-GGUF": 20000,
@@ -87,7 +88,49 @@ model_to_prompt_size = {
 model_to_tokenizer: Dict[str, str] = {}
-class InformationCollectionIteration:
+class AgentMessage(BaseModel):
+    role: Literal["user", "assistant", "system", "environment"]
+    content: Union[str, List]
+class OperatorRun:
+    def __init__(
+        self,
+        query: str,
+        trajectory: list[AgentMessage] | list[dict] = None,
+        response: str = None,
+        webpages: list[dict] = None,
+    ):
+        self.query = query
+        self.response = response
+        self.webpages = webpages or []
+        self.trajectory: list[AgentMessage] = []
+        if trajectory:
+            for item in trajectory:
+                if isinstance(item, dict):
+                    self.trajectory.append(AgentMessage(**item))
+                elif hasattr(item, "role") and hasattr(item, "content"):  # Heuristic for AgentMessage like object
+                    self.trajectory.append(item)
+                else:
+                    logger.warning(f"Unexpected item type in trajectory: {type(item)}")
+    def to_dict(self) -> dict:
+        # Ensure AgentMessage instances in trajectory are also dicts
+        serialized_trajectory = []
+        for msg in self.trajectory:
+            if hasattr(msg, "model_dump"):  # Check if it's a Pydantic model
+                serialized_trajectory.append(msg.model_dump())
+            elif isinstance(msg, dict):
+                serialized_trajectory.append(msg)  # Already a dict
+        return {
+            "query": self.query,
+            "response": self.response,
+            "trajectory": serialized_trajectory,
+            "webpages": self.webpages,
+        }
+class ResearchIteration:
     def __init__(
         self,
         tool: str,
@@ -95,7 +138,7 @@ class InformationCollectionIteration:
         context: list = None,
         onlineContext: dict = None,
         codeContext: dict = None,
-        operatorContext: dict[str, str] = None,
+        operatorContext: dict | OperatorRun = None,
         summarizedResult: str = None,
         warning: str = None,
     ):
@@ -104,13 +147,18 @@ class InformationCollectionIteration:
         self.context = context
         self.onlineContext = onlineContext
         self.codeContext = codeContext
-        self.operatorContext = operatorContext
+        self.operatorContext = OperatorRun(**operatorContext) if isinstance(operatorContext, dict) else operatorContext
         self.summarizedResult = summarizedResult
         self.warning = warning
+    def to_dict(self) -> dict:
+        data = vars(self).copy()
+        data["operatorContext"] = self.operatorContext.to_dict() if self.operatorContext else None
+        return data
 def construct_iteration_history(
-    previous_iterations: List[InformationCollectionIteration],
+    previous_iterations: List[ResearchIteration],
     previous_iteration_prompt: str,
     query: str = None,
 ) -> list[dict]:
@@ -143,11 +191,8 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
     chat_history = ""
     for chat in conversation_history.get("chat", [])[-n:]:
         if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
-            chat_history += f"User: {chat['intent']['query']}\n"
             if chat["intent"].get("inferred-queries"):
                 chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
             chat_history += f"{agent_name}: {chat['message']}\n\n"
         elif chat["by"] == "khoj" and chat.get("images"):
             chat_history += f"User: {chat['intent']['query']}\n"
@@ -156,6 +201,7 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
             chat_history += f"User: {chat['intent']['query']}\n"
             chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
         elif chat["by"] == "you":
+            chat_history += f"User: {chat['message']}\n"
             raw_query_files = chat.get("queryFiles")
             if raw_query_files:
                 query_files: Dict[str, str] = {}
@@ -168,8 +214,74 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
     return chat_history
+def construct_question_history(
+    conversation_log: dict,
+    include_query: bool = True,
+    lookback: int = 6,
+    query_prefix: str = "Q",
+    agent_name: str = "Khoj",
+) -> str:
+    """
+    Constructs a chat history string formatted for query extraction purposes.
+    """
+    history_parts = ""
+    original_query = None
+    for chat in conversation_log.get("chat", [])[-lookback:]:
+        if chat["by"] == "you":
+            original_query = chat.get("message")
+            history_parts += f"{query_prefix}: {original_query}\n"
+        if chat["by"] == "khoj":
+            if original_query is None:
+                continue
+            message = chat.get("message", "")
+            inferred_queries_list = chat.get("intent", {}).get("inferred-queries")
+            # Ensure inferred_queries_list is a list, defaulting to the original query in a list
+            if not inferred_queries_list:
+                inferred_queries_list = [original_query]
+            # If it's a string (though unlikely based on usage), wrap it in a list
+            elif isinstance(inferred_queries_list, str):
+                inferred_queries_list = [inferred_queries_list]
+            if include_query:
+                # Ensure 'type' exists and is a string before checking 'to-image'
+                intent_type = chat.get("intent", {}).get("type", "")
+                if "to-image" not in intent_type:
+                    history_parts += f'{agent_name}: {{"queries": {inferred_queries_list}}}\n'
+                    history_parts += f"A: {message}\n\n"
+            else:
+                history_parts += f"{agent_name}: {message}\n\n"
+            # Reset original_query for the next turn
+            original_query = None
+    return history_parts
+def construct_chat_history_for_operator(conversation_history: dict, n: int = 6) -> list[AgentMessage]:
+    """
+    Construct chat history for operator agent in conversation log.
+    Only include last n completed turns (i.e with user and khoj message).
+    """
+    chat_history: list[AgentMessage] = []
+    user_message: Optional[AgentMessage] = None
+    for chat in conversation_history.get("chat", []):
+        if len(chat_history) >= n:
+            break
+        if chat["by"] == "you" and chat.get("message"):
+            content = [{"type": "text", "text": chat["message"]}]
+            for file in chat.get("queryFiles", []):
+                content += [{"type": "text", "text": f'## File: {file["name"]}\n\n{file["content"]}'}]
+            user_message = AgentMessage(role="user", content=content)
+        elif chat["by"] == "khoj" and chat.get("message"):
+            chat_history += [user_message, AgentMessage(role="assistant", content=chat["message"])]
+    return chat_history
 def construct_tool_chat_history(
-    previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None
+    previous_iterations: List[ResearchIteration], tool: ConversationCommand = None
 ) -> Dict[str, list]:
     """
     Construct chat history from previous iterations for a specific tool
@@ -178,8 +290,8 @@ def construct_tool_chat_history(
     If no tool is provided inferred query for all tools used are added.
     """
     chat_history: list = []
-    base_extractor: Callable[[InformationCollectionIteration], List[str]] = lambda x: []
-    extract_inferred_query_map: Dict[ConversationCommand, Callable[[InformationCollectionIteration], List[str]]] = {
+    base_extractor: Callable[[ResearchIteration], List[str]] = lambda iteration: []
+    extract_inferred_query_map: Dict[ConversationCommand, Callable[[ResearchIteration], List[str]]] = {
         ConversationCommand.Notes: (
             lambda iteration: [c["query"] for c in iteration.context] if iteration.context else []
         ),
@@ -192,9 +304,6 @@ def construct_tool_chat_history(
         ConversationCommand.Code: (
             lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
         ),
-        ConversationCommand.Operator: (
-            lambda iteration: list(iteration.operatorContext.keys()) if iteration.operatorContext else []
-        ),
     }
     for iteration in previous_iterations:
         # If a tool is provided use the inferred query extractor for that tool if available
@@ -273,7 +382,7 @@ async def save_to_conversation_log(
     compiled_references: List[Dict[str, Any]] = [],
     online_results: Dict[str, Any] = {},
     code_results: Dict[str, Any] = {},
-    operator_results: Dict[str, str] = {},
+    operator_results: List[OperatorRun] = None,
     inferred_queries: List[str] = [],
     intent_type: str = "remember",
     client_application: ClientApplication = None,
@@ -284,7 +393,7 @@ async def save_to_conversation_log(
     generated_images: List[str] = [],
     raw_generated_files: List[FileAttachment] = [],
     generated_mermaidjs_diagram: str = None,
-    research_results: Optional[List[InformationCollectionIteration]] = None,
+    research_results: Optional[List[ResearchIteration]] = None,
     train_of_thought: List[Any] = [],
     tracer: Dict[str, Any] = {},
 ):
@@ -301,8 +410,8 @@ async def save_to_conversation_log(
         "intent": {"inferred-queries": inferred_queries, "type": intent_type},
         "onlineContext": online_results,
         "codeContext": code_results,
-        "operatorContext": operator_results,
-        "researchContext": [vars(r) for r in research_results] if research_results and not chat_response else None,
+        "operatorContext": [o.to_dict() for o in operator_results] if operator_results and not chat_response else None,
+        "researchContext": [r.to_dict() for r in research_results] if research_results and not chat_response else None,
         "automationId": automation_id,
         "trainOfThought": train_of_thought,
         "turnId": turn_id,
@@ -459,10 +568,12 @@ def generate_chatml_messages_with_context(
             ]
         if not is_none_or_empty(chat.get("operatorContext")):
+            operator_context = chat.get("operatorContext")
+            operator_content = "\n\n".join([f'## Task: {oc["query"]}\n{oc["response"]}\n' for oc in operator_context])
             message_context += [
                 {
                     "type": "text",
-                    "text": f"{prompts.operator_execution_context.format(operator_results=chat.get('operatorContext'))}",
+                    "text": f"{prompts.operator_execution_context.format(operator_results=operator_content)}",
                 }
             ]

khoj/processor/operator/README.md ADDED Viewed

@@ -0,0 +1,59 @@
+# Khoj Operator (Experimental)
+## Overview
+Give Khoj its own computer to operate in a transparent, controlled manner. Accomplish tasks that require visual browsing, file editing and terminal access. Operator with research mode can work for 30+ minutes to accomplish more substantial tasks like feature development, travel planning, shopping etc.
+## Setup
+### Prerequisites
+- Docker and Docker Compose installed
+- Anthropic API key (required - only Anthropic models currently enabled)
+### Installation Steps
+1. Download the Khoj docker-compose.yml file
+    ```shell
+    mkdir ~/.khoj && cd ~/.khoj
+    wget https://raw.githubusercontent.com/khoj-ai/khoj/master/docker-compose.yml
+    ```
+2. Configure environment variables in `docker-compose.yml`
+    - Set `ANTHROPIC_API_KEY` to your [Anthropic API key](https://console.anthropic.com/settings/keys)
+    - Uncomment `KHOJ_OPERATOR_ENABLED=True` to enable the operator tool
+3. Start Khoj services
+    ```shell
+    docker-compose up
+    ```
+4. Access the web app at http://localhost:42110
+   Ensure you're using a Claude 3.7+ model on your [settings page](http://localhost:42110/settings)
+## Usage
+Use the `/operator` command or ask Khoj in normal or research mode to use the operator tool to have it operate its computer:
+**Examples:**
+- `/operator Find flights from Bangkok to Mexico City with no US layover`
+- `/research Clone the khoj repo and tell me how the operator tool is implemented`
+## Supported Models
+Currently enables **only Anthropic models**:
+- Claude Sonnet 4
+- Claude 3.7 Sonnet
+- Claude Opus 4
+*Note: OpenAI and other operator models are disabled while in development.*
+## Capabilities
+The operator can:
+- **Computer Control**: Take screenshots, click, type, navigate desktop
+- **File Operations**: Create, edit, and manage files
+- **Terminal Access**: Execute bash commands and scripts
+- **Web Browsing**: Navigate websites and documents, and extract information
+## Architecture
+- **Environments**: Operator Computer and Browser environments
+- **Models**: Enable Vision Language Models (VLM) to operate computer
+- **Execution**: Containerize computer environment for security and isolation

khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl

khoj 1.41.1.dev107py3-none-any.whl → 1.41.1.dev142py3-none-any.whl