khoj 1.42.8.dev6__py3-none-any.whl → 1.42.9.dev16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/app/agents/layout-2e626327abfbe612.js +1 -0
  4. khoj/interface/compiled/_next/static/chunks/app/agents/{page-9a4610474cd59a71.js → page-0006674668eb5a4d.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/app/automations/{page-f7bb9d777b7745d4.js → page-4c465cde2d14cb52.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/chat/layout-d6acbba22ccac0ff.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/{page-ef738950ea1babc3.js → page-9967631715682f3c.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/{page-2b3056cba8aa96ce.js → page-6e91caf9bc0c8aba.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/search/layout-94c76c3a41db42a2.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/search/{page-4885df3cd175c957.js → page-883b7d8d2e3abe3e.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/settings/{page-8be3b35178abf2ec.js → page-95e994ddac31473f.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-95998f0bdc22bb13.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-4a4b0c0f4749c2b2.js → page-8c8c175f7f212b03.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/{webpack-15412ee214acd999.js → webpack-4bf3eab7681a1206.js} +1 -1
  15. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  16. khoj/interface/compiled/_next/static/css/440ae0f0f650dc35.css +1 -0
  17. khoj/interface/compiled/_next/static/css/bd2071cad2ecf293.css +1 -0
  18. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +1 -0
  19. khoj/interface/compiled/agents/index.html +2 -2
  20. khoj/interface/compiled/agents/index.txt +2 -2
  21. khoj/interface/compiled/automations/index.html +2 -2
  22. khoj/interface/compiled/automations/index.txt +3 -3
  23. khoj/interface/compiled/chat/index.html +2 -2
  24. khoj/interface/compiled/chat/index.txt +2 -2
  25. khoj/interface/compiled/index.html +2 -2
  26. khoj/interface/compiled/index.txt +2 -2
  27. khoj/interface/compiled/search/index.html +2 -2
  28. khoj/interface/compiled/search/index.txt +2 -2
  29. khoj/interface/compiled/settings/index.html +2 -2
  30. khoj/interface/compiled/settings/index.txt +4 -4
  31. khoj/interface/compiled/share/chat/index.html +2 -2
  32. khoj/interface/compiled/share/chat/index.txt +2 -2
  33. khoj/processor/conversation/anthropic/anthropic_chat.py +11 -2
  34. khoj/processor/conversation/anthropic/utils.py +90 -103
  35. khoj/processor/conversation/google/gemini_chat.py +4 -1
  36. khoj/processor/conversation/google/utils.py +80 -18
  37. khoj/processor/conversation/offline/chat_model.py +3 -3
  38. khoj/processor/conversation/openai/gpt.py +13 -38
  39. khoj/processor/conversation/openai/utils.py +113 -12
  40. khoj/processor/conversation/prompts.py +17 -35
  41. khoj/processor/conversation/utils.py +128 -57
  42. khoj/processor/operator/grounding_agent.py +1 -1
  43. khoj/processor/operator/operator_agent_binary.py +4 -3
  44. khoj/processor/tools/online_search.py +18 -0
  45. khoj/processor/tools/run_code.py +1 -1
  46. khoj/routers/api_chat.py +1 -1
  47. khoj/routers/helpers.py +293 -26
  48. khoj/routers/research.py +169 -155
  49. khoj/utils/helpers.py +284 -8
  50. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/METADATA +1 -1
  51. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/RECORD +62 -62
  52. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
  53. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
  54. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  55. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  56. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +0 -1
  57. khoj/interface/compiled/_next/static/css/76c658ee459140a9.css +0 -1
  58. khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +0 -1
  59. khoj/interface/compiled/_next/static/css/e6da1287d41f5409.css +0 -1
  60. /khoj/interface/compiled/_next/static/chunks/{1327-1a9107b9a2a04a98.js → 1327-3b1a41af530fa8ee.js} +0 -0
  61. /khoj/interface/compiled/_next/static/chunks/{1915-5c6508f6ebb62a30.js → 1915-fbfe167c84ad60c5.js} +0 -0
  62. /khoj/interface/compiled/_next/static/chunks/{2117-080746c8e170c81a.js → 2117-e78b6902ad6f75ec.js} +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{2939-4af3fd24b8ffc9ad.js → 2939-4d4084c5b888b960.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{4447-cd95608f8e93e711.js → 4447-d6cf93724d57e34b.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{8667-50b03a89e82e0ba7.js → 8667-4b7790573b08c50d.js} +0 -0
  66. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_buildManifest.js +0 -0
  67. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_ssgManifest.js +0 -0
  68. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/WHEEL +0 -0
  69. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/entry_points.txt +0 -0
  70. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/openai/utils.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 from copy import deepcopy
@@ -9,6 +10,7 @@ from urllib.parse import urlparse
 import httpx
 import openai
 from langchain_core.messages.chat import ChatMessage
+from openai.lib._pydantic import _ensure_strict_json_schema
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
     ChatCompletionStreamEvent,
@@ -20,6 +22,7 @@ from openai.types.chat.chat_completion_chunk import (
     Choice,
     ChoiceDelta,
 )
+from pydantic import BaseModel
 from tenacity import (
     before_sleep_log,
     retry,
@@ -30,11 +33,13 @@ from tenacity import (
 )

 from khoj.processor.conversation.utils import (
-    JsonSupport,
     ResponseWithThought,
+    StructuredOutputSupport,
+    ToolCall,
     commit_conversation_trace,
 )
 from khoj.utils.helpers import (
+    ToolDefinition,
     convert_image_data_uri,
     get_chat_usage_metrics,
     get_openai_async_client,
@@ -72,7 +77,7 @@ def completion_with_backoff(
     deepthought: bool = False,
     model_kwargs: dict = {},
     tracer: dict = {},
-) -> str:
+) -> ResponseWithThought:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_clients.get(client_key)
     if not client:
@@ -117,6 +122,9 @@ def completion_with_backoff(
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

+    tool_ids = []
+    tool_calls: list[ToolCall] = []
+    thoughts = ""
     aggregated_response = ""
     if stream:
         with client.beta.chat.completions.stream(
@@ -130,7 +138,16 @@ def completion_with_backoff(
                 if chunk.type == "content.delta":
                     aggregated_response += chunk.delta
                 elif chunk.type == "thought.delta":
-                    pass
+                    thoughts += chunk.delta
+                elif chunk.type == "chunk" and chunk.chunk.choices and chunk.chunk.choices[0].delta.tool_calls:
+                    tool_ids += [tool_call.id for tool_call in chunk.chunk.choices[0].delta.tool_calls]
+                elif chunk.type == "tool_calls.function.arguments.done":
+                    tool_calls += [ToolCall(name=chunk.name, args=json.loads(chunk.arguments), id=None)]
+        if tool_calls:
+            tool_calls = [
+                ToolCall(name=chunk.name, args=chunk.args, id=tool_id) for chunk, tool_id in zip(tool_calls, tool_ids)
+            ]
+            aggregated_response = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
     else:
         # Non-streaming chat completion
         chunk = client.beta.chat.completions.parse(
@@ -164,7 +181,7 @@ def completion_with_backoff(
     if is_promptrace_enabled():
         commit_conversation_trace(messages, aggregated_response, tracer)

-    return aggregated_response
+    return ResponseWithThought(text=aggregated_response, thought=thoughts)


 @retry(
@@ -190,6 +207,7 @@ async def chat_completion_with_backoff(
     deepthought=False,
     model_kwargs: dict = {},
     tracer: dict = {},
+    tools=None,
 ) -> AsyncGenerator[ResponseWithThought, None]:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_async_clients.get(client_key)
@@ -258,6 +276,8 @@ async def chat_completion_with_backoff(
     read_timeout = 300 if is_local_api(api_base_url) else 60
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
+    if tools:
+        model_kwargs["tools"] = tools

     aggregated_response = ""
     final_chunk = None
@@ -277,7 +297,7 @@ async def chat_completion_with_backoff(
             raise ValueError("No response by model.")
         aggregated_response = response.choices[0].message.content
         final_chunk = response
-        yield ResponseWithThought(response=aggregated_response)
+        yield ResponseWithThought(text=aggregated_response)
     else:
         async for chunk in stream_processor(response):
             # Log the time taken to start response
@@ -293,8 +313,8 @@ async def chat_completion_with_backoff(
             response_chunk: ResponseWithThought = None
             response_delta = chunk.choices[0].delta
             if response_delta.content:
-                response_chunk = ResponseWithThought(response=response_delta.content)
-                aggregated_response += response_chunk.response
+                response_chunk = ResponseWithThought(text=response_delta.content)
+                aggregated_response += response_chunk.text
             elif response_delta.thought:
                 response_chunk = ResponseWithThought(thought=response_delta.thought)
             if response_chunk:
@@ -327,16 +347,16 @@ async def chat_completion_with_backoff(
         commit_conversation_trace(messages, aggregated_response, tracer)


-def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:
+def get_structured_output_support(model_name: str, api_base_url: str = None) -> StructuredOutputSupport:
     if model_name.startswith("deepseek-reasoner"):
-        return JsonSupport.NONE
+        return StructuredOutputSupport.NONE
     if api_base_url:
         host = urlparse(api_base_url).hostname
         if host and host.endswith(".ai.azure.com"):
-            return JsonSupport.OBJECT
+            return StructuredOutputSupport.OBJECT
         if host == "api.deepinfra.com":
-            return JsonSupport.OBJECT
-    return JsonSupport.SCHEMA
+            return StructuredOutputSupport.OBJECT
+    return StructuredOutputSupport.TOOL


 def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
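Per the renamed helper above, plain OpenAI-compatible endpoints now default to native tool calling rather than JSON schema mode. A quick sketch of the mapping, using the function exactly as diffed:

# Sketch of the new defaults per get_structured_output_support above.
from khoj.processor.conversation.openai.utils import get_structured_output_support
from khoj.processor.conversation.utils import StructuredOutputSupport

assert get_structured_output_support("gpt-4o") == StructuredOutputSupport.TOOL
assert get_structured_output_support("llama-3", "https://api.deepinfra.com/v1/openai") == StructuredOutputSupport.OBJECT
assert get_structured_output_support("deepseek-reasoner") == StructuredOutputSupport.NONE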
@@ -345,6 +365,43 @@ def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
     """
     formatted_messages = []
     for message in deepcopy(messages):
+        # Handle tool call and tool result message types
+        message_type = message.additional_kwargs.get("message_type")
+        if message_type == "tool_call":
+            # Convert tool_call to OpenAI function call format
+            content = []
+            for part in message.content:
+                content.append(
+                    {
+                        "type": "function",
+                        "id": part.get("id"),
+                        "function": {
+                            "name": part.get("name"),
+                            "arguments": json.dumps(part.get("input", part.get("args", {}))),
+                        },
+                    }
+                )
+            formatted_messages.append(
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "tool_calls": content,
+                }
+            )
+            continue
+        if message_type == "tool_result":
+            # Convert tool_result to OpenAI tool result format
+            # Each part is a result for a tool call
+            for part in message.content:
+                formatted_messages.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": part.get("id") or part.get("tool_use_id"),
+                        "name": part.get("name"),
+                        "content": part.get("content"),
+                    }
+                )
+            continue
         if isinstance(message.content, list) and not is_openai_api(api_base_url):
             assistant_texts = []
             has_images = False
@@ -708,3 +765,47 @@ def add_qwen_no_think_tag(formatted_messages: List[dict]) -> None:
             if isinstance(content_part, dict) and content_part.get("type") == "text":
                 content_part["text"] += " /no_think"
                 break
+
+
+def to_openai_tools(tools: List[ToolDefinition]) -> List[Dict] | None:
+    "Transform tool definitions from standard format to OpenAI format."
+    openai_tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": tool.name,
+                "description": tool.description,
+                "parameters": clean_response_schema(tool.schema),
+            },
+        }
+        for tool in tools
+    ]
+
+    return openai_tools or None
+
+
+def clean_response_schema(schema: BaseModel | dict) -> dict:
+    """
+    Format response schema to be compatible with OpenAI API.
+
+    Clean the response schema by removing unsupported fields.
+    """
+    # Normalize schema to OpenAI compatible JSON schema format
+    schema_json = schema if isinstance(schema, dict) else schema.model_json_schema()
+    schema_json = _ensure_strict_json_schema(schema_json, path=(), root=schema_json)
+
+    # Recursively drop unsupported fields from schema passed to OpenAI API
+    # See https://platform.openai.com/docs/guides/structured-outputs#supported-schemas
+    fields_to_exclude = ["minItems", "maxItems"]
+    if isinstance(schema_json, dict) and isinstance(schema_json.get("properties"), dict):
+        for _, prop_value in schema_json["properties"].items():
+            if isinstance(prop_value, dict):
+                # Remove specified fields from direct properties
+                for field in fields_to_exclude:
+                    prop_value.pop(field, None)
+                # Recursively remove specified fields from child properties
+                if "items" in prop_value and isinstance(prop_value["items"], dict):
+                    clean_response_schema(prop_value["items"])

+    # Return cleaned schema
+    return schema_json
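Taken together, these two helpers turn a pydantic model into the strict JSON schema payload OpenAI function calling expects. A minimal usage sketch; ToolDefinition's exact constructor in khoj.utils.helpers is assumed to take the name, description, and schema fields that to_openai_tools reads:

from pydantic import BaseModel

from khoj.processor.conversation.openai.utils import to_openai_tools
from khoj.utils.helpers import ToolDefinition


class SearchArgs(BaseModel):
    query: str
    max_results: int = 5


# Assumed constructor signature, mirroring the fields the helper reads.
tools = [ToolDefinition(name="semantic_search", description="Search the user's documents.", schema=SearchArgs)]
payload = to_openai_tools(tools)
# payload[0]["function"]["parameters"] holds the strict JSON schema, with fields
# OpenAI structured outputs reject (e.g. minItems/maxItems) stripped by clean_response_schema.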
khoj/processor/conversation/prompts.py
@@ -667,33 +667,37 @@ Here's some additional context about you:

 plan_function_execution = PromptTemplate.from_template(
     """
-You are Khoj, a smart, creative and methodical researcher. Use the provided tool AIs to investigate information to answer query.
-Create a multi-step plan and intelligently iterate on the plan based on the retrieved information to find the requested information.
+You are Khoj, a smart, creative and meticulous researcher. Use the provided tool AIs to accomplish the task assigned to you.
+Create a multi-step plan and intelligently iterate on the plan to complete the task.
 {personality_context}

 # Instructions
-- Ask highly diverse, detailed queries to the tool AIs, one tool AI at a time, to discover required information or run calculations. Their response will be shown to you in the next iteration.
+- Provide highly diverse, detailed requests to the tool AIs, one tool AI at a time, to gather information, perform actions etc. Their response will be shown to you in the next iteration.
 - Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
 - Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
 - Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
 - Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
-- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to answer the user's question.
-- Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
+- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to accomplish the task assigned to you. Only stop when you have completed the task.

 # Examples
-Assuming you can search the user's notes and the internet.
+Assuming you can search the user's files and the internet.
 - When the user asks for the population of their hometown
-  1. Try look up their hometown in their notes. Ask the note search AI to search for their birth certificate, childhood memories, school, resume etc.
-  2. If not found in their notes, try infer their hometown from their online social media profiles. Ask the online search AI to look for {username}'s biography, school, resume on linkedin, facebook, website etc.
-  3. Only then try find the latest population of their hometown by reading official websites with the help of the online search and web page reading AI.
+  1. Try look up their hometown in their notes. Ask the semantic search AI to search for their birth certificate, childhood memories, school, resume etc.
+  2. Use the other document retrieval tools to build on the semantic search results, fill in the gaps, add more details or confirm your hypothesis.
+  3. If not found in their notes, try infer their hometown from their online social media profiles. Ask the online search AI to look for {username}'s biography, school, resume on linkedin, facebook, website etc.
+  4. Only then try find the latest population of their hometown by reading official websites with the help of the online search and web page reading AI.
 - When the user asks for their computer's specs
-  1. Try find their computer model in their notes.
+  1. Try find their computer model in their documents.
   2. Now find webpages with their computer model's spec online.
   3. Ask the webpage tool AI to extract the required information from the relevant webpages.
 - When the user asks what clothes to carry for their upcoming trip
-  1. Find the itinerary of their upcoming trip in their notes.
+  1. Use the semantic search tool to find the itinerary of their upcoming trip in their documents.
   2. Next find the weather forecast at the destination online.
-  3. Then find if they mentioned what clothes they own in their notes.
+  3. Then combine the semantic search, regex search, view file and list files tools to find if all the clothes they own in their files.
+- When the user asks you to summarize their expenses in a particular month
+  1. Combine the semantic search and regex search tool AI to find all transactions in the user's documents for that month.
+  2. Use the view file tool to read the line ranges in the matched files
+  3. Finally summarize the expenses

 # Background Context
 - Current Date: {day_of_week}, {current_date}
@@ -701,31 +705,9 @@ Assuming you can search the user's notes and the internet.
 - User Name: {username}

 # Available Tool AIs
-You decide which of the tool AIs listed below would you use to answer the user's question. You **only** have access to the following tool AIs:
+You decide which of the tool AIs listed below would you use to accomplish the user assigned task. You **only** have access to the following tool AIs:

 {tools}
-
-Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
-Response format:
-{{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
-""".strip()
-)
-
-plan_function_execution_next_tool = PromptTemplate.from_template(
-    """
-Given the results of your previous iterations, which tool AI will you use next to answer the target query?
-
-# Target Query:
-{query}
-""".strip()
-)
-
-previous_iteration = PromptTemplate.from_template(
-    """
-# Iteration {index}:
-- tool: {tool}
-- query: {query}
-- result: {result}
 """.strip()
 )

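The deleted response-format block and the removed plan_function_execution_next_tool and previous_iteration templates were the prompt-level JSON protocol for picking the next tool. With native tool calling, the same information now arrives as a ToolCall (defined in the conversation utils diff below). An illustrative before/after sketch with made-up values:

import json

from khoj.processor.conversation.utils import ToolCall

# Before: the model's text reply was the protocol and had to be parsed.
old_reply = json.loads('{"scratchpad": "check notes first", "tool": "notes", "query": "hometown"}')

# After: the OpenAI adapter aggregates native tool calls into ToolCall objects.
new_reply = ToolCall(name="notes", args={"query": "hometown"}, id="call_0")
assert old_reply["tool"] == new_reply.name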
khoj/processor/conversation/utils.py
@@ -10,7 +10,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 import PIL.Image
 import pyjson5
@@ -137,60 +137,83 @@ class OperatorRun:
     }


+class ToolCall:
+    def __init__(self, name: str, args: dict, id: str):
+        self.name = name
+        self.args = args
+        self.id = id
+
+
 class ResearchIteration:
     def __init__(
         self,
-        tool: str,
-        query: str,
+        query: ToolCall | dict | str,
         context: list = None,
         onlineContext: dict = None,
         codeContext: dict = None,
         operatorContext: dict | OperatorRun = None,
         summarizedResult: str = None,
         warning: str = None,
+        raw_response: list = None,
     ):
-        self.tool = tool
-        self.query = query
+        self.query = ToolCall(**query) if isinstance(query, dict) else query
        self.context = context
        self.onlineContext = onlineContext
        self.codeContext = codeContext
        self.operatorContext = OperatorRun(**operatorContext) if isinstance(operatorContext, dict) else operatorContext
        self.summarizedResult = summarizedResult
        self.warning = warning
+        self.raw_response = raw_response

     def to_dict(self) -> dict:
         data = vars(self).copy()
+        data["query"] = self.query.__dict__ if isinstance(self.query, ToolCall) else self.query
         data["operatorContext"] = self.operatorContext.to_dict() if self.operatorContext else None
         return data


 def construct_iteration_history(
     previous_iterations: List[ResearchIteration],
-    previous_iteration_prompt: str,
     query: str = None,
+    query_images: List[str] = None,
+    query_files: str = None,
 ) -> list[ChatMessageModel]:
     iteration_history: list[ChatMessageModel] = []
-    previous_iteration_messages: list[dict] = []
-    for idx, iteration in enumerate(previous_iterations):
-        iteration_data = previous_iteration_prompt.format(
-            tool=iteration.tool,
-            query=iteration.query,
-            result=iteration.summarizedResult,
-            index=idx + 1,
-        )
-
-        previous_iteration_messages.append({"type": "text", "text": iteration_data})
+    query_message_content = construct_structured_message(query, query_images, attached_file_context=query_files)
+    if query_message_content:
+        iteration_history.append(ChatMessageModel(by="you", message=query_message_content))

-    if previous_iteration_messages:
-        if query:
-            iteration_history.append(ChatMessageModel(by="you", message=query))
-        iteration_history.append(
+    for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            iteration_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+        iteration_history += [
             ChatMessageModel(
                 by="khoj",
-                intent=Intent(type="remember", query=query),
-                message=previous_iteration_messages,
-            )
-        )
+                message=iteration.raw_response or [iteration.query.__dict__],
+                intent=Intent(type="tool_call", query=query),
+            ),
+            ChatMessageModel(
+                by="you",
+                intent=Intent(type="tool_result"),
+                message=[
+                    {
+                        "type": "tool_result",
+                        "id": iteration.query.id,
+                        "name": iteration.query.name,
+                        "content": iteration.summarizedResult,
+                    }
+                ],
+            ),
+        ]
+
     return iteration_history

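So a ResearchIteration now keys on a ToolCall rather than separate tool and query strings, and rehydrates dict queries on load. A small round-trip sketch, assuming the classes above are importable; values are made up:

from khoj.processor.conversation.utils import ResearchIteration

call = {"name": "semantic_search", "args": {"query": "trip itinerary"}, "id": "call_1"}
iteration = ResearchIteration(query=call, summarizedResult="Found itinerary in notes/trips.md")
assert iteration.query.name == "semantic_search"  # dict query rehydrates into a ToolCall
assert iteration.to_dict()["query"] == call  # to_dict flattens the ToolCall back to a plain dict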
@@ -302,33 +325,44 @@ def construct_tool_chat_history(
         ConversationCommand.Notes: (
             lambda iteration: [c["query"] for c in iteration.context] if iteration.context else []
         ),
-        ConversationCommand.Online: (
+        ConversationCommand.SearchWeb: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.Webpage: (
+        ConversationCommand.ReadWebpage: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.Code: (
+        ConversationCommand.RunCode: (
             lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
         ),
     }
     for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            chat_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+
         # If a tool is provided use the inferred query extractor for that tool if available
         # If no tool is provided, use inferred query extractor for the tool used in the iteration
         # Fallback to base extractor if the tool does not have an inferred query extractor
         inferred_query_extractor = extract_inferred_query_map.get(
-            tool or ConversationCommand(iteration.tool), base_extractor
+            tool or ConversationCommand(iteration.query.name), base_extractor
         )
         chat_history += [
             ChatMessageModel(
                 by="you",
-                message=iteration.query,
+                message=yaml.dump(iteration.query.args, default_flow_style=False),
             ),
             ChatMessageModel(
                 by="khoj",
                 intent=Intent(
                     type="remember",
-                    query=iteration.query,
+                    query=yaml.dump(iteration.query.args, default_flow_style=False),
                     inferred_queries=inferred_query_extractor(iteration),
                     memory_type="notes",
                 ),
@@ -481,28 +515,32 @@ Khoj: "{chat_response}"

 def construct_structured_message(
     message: list[dict] | str,
-    images: list[str],
-    model_type: str,
-    vision_enabled: bool,
+    images: list[str] = None,
+    model_type: str = None,
+    vision_enabled: bool = True,
     attached_file_context: str = None,
 ):
     """
-    Format messages into appropriate multimedia format for supported chat model types
+    Format messages into appropriate multimedia format for supported chat model types.
+
+    Assume vision is enabled and chat model provider supports messages in chatml format, unless specified otherwise.
     """
-    if model_type in [
+    if not model_type or model_type in [
         ChatModel.ModelType.OPENAI,
         ChatModel.ModelType.GOOGLE,
         ChatModel.ModelType.ANTHROPIC,
     ]:
-        constructed_messages: List[dict[str, Any]] = (
-            [{"type": "text", "text": message}] if isinstance(message, str) else message
-        )
-
+        constructed_messages: List[dict[str, Any]] = []
+        if not is_none_or_empty(message):
+            constructed_messages += [{"type": "text", "text": message}] if isinstance(message, str) else message
+        # Drop image message passed by caller if chat model does not have vision enabled
+        if not vision_enabled:
+            constructed_messages = [m for m in constructed_messages if m.get("type") != "image_url"]
         if not is_none_or_empty(attached_file_context):
-            constructed_messages.append({"type": "text", "text": attached_file_context})
+            constructed_messages += [{"type": "text", "text": attached_file_context}]
         if vision_enabled and images:
             for image in images:
-                constructed_messages.append({"type": "image_url", "image_url": {"url": image}})
+                constructed_messages += [{"type": "image_url", "image_url": {"url": image}}]
         return constructed_messages

     message = message if isinstance(message, str) else "\n\n".join(m["text"] for m in message)
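With every parameter except the message now optional, callers can build chatml-style content without naming a model type. A usage sketch with illustrative values:

# model_type omitted, so chatml format is assumed; vision defaults to enabled.
from khoj.processor.conversation.utils import construct_structured_message

parts = construct_structured_message(
    "What does my trip itinerary say?",
    images=["data:image/webp;base64,..."],
    attached_file_context="notes/trips.md: Day 1 ...",
)
# -> a text part, then the attached file text part, then an image_url part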
@@ -638,7 +676,11 @@ def generate_chatml_messages_with_context(
             chat_message, chat.images if role == "user" else [], model_type, vision_enabled
         )

-        reconstructed_message = ChatMessage(content=message_content, role=role)
+        reconstructed_message = ChatMessage(
+            content=message_content,
+            role=role,
+            additional_kwargs={"message_type": chat.intent.type if chat.intent else None},
+        )
         chatml_messages.insert(0, reconstructed_message)

         if len(chatml_messages) >= 3 * lookback_turns:
@@ -737,10 +779,21 @@ def count_tokens(
     message_content_parts: list[str] = []
     # Collate message content into single string to ease token counting
     for part in message_content:
-        if isinstance(part, dict) and part.get("type") == "text":
-            message_content_parts.append(part["text"])
-        elif isinstance(part, dict) and part.get("type") == "image_url":
+        if isinstance(part, dict) and part.get("type") == "image_url":
             image_count += 1
+        elif isinstance(part, dict) and part.get("type") == "text":
+            message_content_parts.append(part["text"])
+        elif isinstance(part, dict) and hasattr(part, "model_dump"):
+            message_content_parts.append(json.dumps(part.model_dump()))
+        elif isinstance(part, dict) and hasattr(part, "__dict__"):
+            message_content_parts.append(json.dumps(part.__dict__))
+        elif isinstance(part, dict):
+            # If part is a dict but not a recognized type, convert to JSON string
+            try:
+                message_content_parts.append(json.dumps(part))
+            except (TypeError, ValueError) as e:
+                logger.warning(f"Failed to serialize part {part} to JSON: {e}. Skipping.")
+                image_count += 1  # Treat as an image/binary if serialization fails
         elif isinstance(part, str):
             message_content_parts.append(part)
         else:
@@ -753,6 +806,15 @@ def count_tokens(
     return len(encoder.encode(json.dumps(message_content)))


+def count_total_tokens(messages: list[ChatMessage], encoder, system_message: Optional[ChatMessage]) -> Tuple[int, int]:
+    """Count total tokens in messages including system message"""
+    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
+    message_tokens = sum([count_tokens(message.content, encoder) for message in messages])
+    # Reserves 4 tokens to demarcate each message (e.g <|im_start|>user, <|im_end|>, <|endoftext|> etc.)
+    total_tokens = message_tokens + system_message_tokens + 4 * len(messages)
+    return total_tokens, system_message_tokens
+
+
 def truncate_messages(
     messages: list[ChatMessage],
     max_prompt_size: int,
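The new helper centralizes the arithmetic the truncation loop previously repeated inline. For example, messages totalling 100 tokens plus a 20-token system message across 3 messages count as 100 + 20 + 4 × 3 = 132 tokens:

# Sketch of the reservation math used by count_total_tokens.
message_tokens, system_tokens, n_messages = 100, 20, 3
total_tokens = message_tokens + system_tokens + 4 * n_messages  # 132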
@@ -771,23 +833,30 @@ def truncate_messages(
         break

     # Drop older messages until under max supported prompt size by model
-    # Reserves 4 tokens to demarcate each message (e.g <|im_start|>user, <|im_end|>, <|endoftext|> etc.)
-    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
-    tokens = sum([count_tokens(message.content, encoder) for message in messages])
-    total_tokens = tokens + system_message_tokens + 4 * len(messages)
+    total_tokens, system_message_tokens = count_total_tokens(messages, encoder, system_message)

     while total_tokens > max_prompt_size and (len(messages) > 1 or len(messages[0].content) > 1):
-        if len(messages[-1].content) > 1:
+        # If the last message has more than one content part, pop the oldest content part.
+        # For tool calls, the whole message should dropped, assistant's tool call content being truncated annoys AI APIs.
+        if len(messages[-1].content) > 1 and messages[-1].additional_kwargs.get("message_type") != "tool_call":
             # The oldest content part is earlier in content list. So pop from the front.
             messages[-1].content.pop(0)
+        # Otherwise, pop the last message if it has only one content part or is a tool call.
         else:
             # The oldest message is the last one. So pop from the back.
-            messages.pop()
-            tokens = sum([count_tokens(message.content, encoder) for message in messages])
-            total_tokens = tokens + system_message_tokens + 4 * len(messages)
+            dropped_message = messages.pop()
+            # Drop tool result pair of tool call, if tool call message has been removed
+            if (
+                dropped_message.additional_kwargs.get("message_type") == "tool_call"
+                and messages
+                and messages[-1].additional_kwargs.get("message_type") == "tool_result"
+            ):
+                messages.pop()
+
+        total_tokens, _ = count_total_tokens(messages, encoder, system_message)

     # Truncate current message if still over max supported prompt size by model
-    total_tokens = tokens + system_message_tokens + 4 * len(messages)
+    total_tokens, _ = count_total_tokens(messages, encoder, system_message)
     if total_tokens > max_prompt_size:
         # At this point, a single message with a single content part of type dict should remain
         assert (
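One consequence of the new rule, sketched with hypothetical messages (newest first, as truncate_messages expects): when the oldest message is a tool call it is dropped whole, and its paired tool result goes with it, so the API never sees an orphaned result.

# Hypothetical messages illustrating the tool call/result pairing rule.
from langchain_core.messages.chat import ChatMessage

messages = [
    ChatMessage(role="user", content=[{"type": "text", "text": "latest turn"}]),
    ChatMessage(
        role="user",
        content=[{"type": "tool_result", "id": "call_9", "content": "..."}],
        additional_kwargs={"message_type": "tool_result"},
    ),
    ChatMessage(
        role="assistant",
        content=[{"type": "function", "id": "call_9"}],
        additional_kwargs={"message_type": "tool_call"},
    ),
]
# The oldest message (last) is a tool call: pop it whole instead of truncating
# its content parts, then pop its now-orphaned tool result neighbor too.
dropped = messages.pop()
if (
    dropped.additional_kwargs.get("message_type") == "tool_call"
    and messages
    and messages[-1].additional_kwargs.get("message_type") == "tool_result"
):
    messages.pop()
assert len(messages) == 1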
@@ -1149,13 +1218,15 @@ def messages_to_print(messages: list[ChatMessage], max_length: int = 70) -> str:
     return "\n".join([f"{json.dumps(safe_serialize(message.content))[:max_length]}..." for message in messages])


-class JsonSupport(int, Enum):
+class StructuredOutputSupport(int, Enum):
     NONE = 0
     OBJECT = 1
     SCHEMA = 2
+    TOOL = 3


 class ResponseWithThought:
-    def __init__(self, response: str = None, thought: str = None):
-        self.response = response
+    def __init__(self, text: str = None, thought: str = None, raw_content: list = None):
+        self.text = text
         self.thought = thought
+        self.raw_content = raw_content
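Call sites migrate from the old response= keyword to text=, with raw_content available for provider-native message parts. A minimal sketch with made-up values:

from khoj.processor.conversation.utils import ResponseWithThought

reply = ResponseWithThought(text="Paris", thought="User asked for France's capital.")
assert reply.text == "Paris" and reply.raw_content is None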
khoj/processor/operator/grounding_agent.py
@@ -73,7 +73,7 @@ class GroundingAgent:
         grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
         screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
         grounding_messages_content = construct_structured_message(
-            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+            grounding_user_prompt, screenshots, self.model.model_type, vision_enabled=True
         )
         return [{"role": "user", "content": grounding_messages_content}]