khoj 1.42.3.dev1__py3-none-any.whl → 1.42.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (61)
  1. khoj/database/models/__init__.py +3 -3
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/2327-916342b58294de9c.js +1 -0
  4. khoj/interface/compiled/_next/static/chunks/7127-a6dc754bce8b6855.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  6. khoj/interface/compiled/_next/static/chunks/app/agents/{page-e18e67cff45758c8.js → page-2fac1d5ac7192e73.js} +1 -1
  7. khoj/interface/compiled/_next/static/chunks/app/automations/{page-1c2280ae9678b4ce.js → page-ef89ac958e78aa81.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/app/chat/page-0b31c505ddbff52d.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/{page-a4b97dd0c2a70cfb.js → page-45ae5e99e8a61821.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/app/search/{page-44072d929427ee56.js → page-afb5e7ed13d221c1.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/settings/{page-4e8fdd30a3238357.js → page-8fb6cc97be8774a7.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
  15. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-27560c92db5fc2d9.js → page-da90c78180a86040.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/{webpack-b61b76223325589e.js → webpack-1c900156837baf90.js} +1 -1
  17. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  18. khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +1 -0
  19. khoj/interface/compiled/_next/static/css/{e1bf03aa79521f86.css → 9a460202d29476e5.css} +1 -1
  20. khoj/interface/compiled/agents/index.html +2 -2
  21. khoj/interface/compiled/agents/index.txt +2 -2
  22. khoj/interface/compiled/automations/index.html +2 -2
  23. khoj/interface/compiled/automations/index.txt +3 -3
  24. khoj/interface/compiled/chat/index.html +2 -2
  25. khoj/interface/compiled/chat/index.txt +2 -2
  26. khoj/interface/compiled/index.html +2 -2
  27. khoj/interface/compiled/index.txt +2 -2
  28. khoj/interface/compiled/search/index.html +2 -2
  29. khoj/interface/compiled/search/index.txt +2 -2
  30. khoj/interface/compiled/settings/index.html +2 -2
  31. khoj/interface/compiled/settings/index.txt +4 -4
  32. khoj/interface/compiled/share/chat/index.html +2 -2
  33. khoj/interface/compiled/share/chat/index.txt +2 -2
  34. khoj/processor/conversation/openai/utils.py +115 -42
  35. khoj/processor/conversation/utils.py +16 -15
  36. khoj/processor/image/generate.py +3 -3
  37. khoj/routers/api_agents.py +1 -1
  38. khoj/routers/api_chat.py +6 -2
  39. khoj/utils/constants.py +7 -6
  40. {khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/METADATA +2 -2
  41. {khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/RECORD +52 -52
  42. khoj/interface/compiled/_next/static/chunks/2327-f03b2a77f67b8f8c.js +0 -1
  43. khoj/interface/compiled/_next/static/chunks/7127-79a3af5138960272.js +0 -1
  44. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  45. khoj/interface/compiled/_next/static/chunks/app/chat/layout-d5ae861e1ade9d08.js +0 -1
  46. khoj/interface/compiled/_next/static/chunks/app/chat/page-2714aec91bd8f3ea.js +0 -1
  47. khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +0 -1
  48. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
  49. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +0 -1
  50. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
  51. /khoj/interface/compiled/_next/static/{tl-RqF8W5lpwWPPHYumnV → Cv2JBC6ve4VZFIpF82jO8}/_buildManifest.js +0 -0
  52. /khoj/interface/compiled/_next/static/{tl-RqF8W5lpwWPPHYumnV → Cv2JBC6ve4VZFIpF82jO8}/_ssgManifest.js +0 -0
  53. /khoj/interface/compiled/_next/static/chunks/{1915-1943ee8a628b893c.js → 1915-ab4353eaca76f690.js} +0 -0
  54. /khoj/interface/compiled/_next/static/chunks/{2117-056a00add390772b.js → 2117-3537ef9986be74d3.js} +0 -0
  55. /khoj/interface/compiled/_next/static/chunks/{4363-e6ac2203564d1a3b.js → 4363-4efaf12abe696251.js} +0 -0
  56. /khoj/interface/compiled/_next/static/chunks/{4447-e038b251d626c340.js → 4447-5d44807c40355b1a.js} +0 -0
  57. /khoj/interface/compiled/_next/static/chunks/{8667-8136f74e9a086fca.js → 8667-adbe6017a66cef10.js} +0 -0
  58. /khoj/interface/compiled/_next/static/chunks/{9259-640fdd77408475df.js → 9259-d8bcd9da9e80c81e.js} +0 -0
  59. {khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/WHEEL +0 -0
  60. {khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/entry_points.txt +0 -0
  61. {khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/openai/utils.py CHANGED
@@ -14,6 +14,7 @@ from openai.lib.streaming.chat import (
     ChatCompletionStreamEvent,
     ContentDeltaEvent,
 )
+from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk,
     Choice,
@@ -78,7 +79,11 @@ def completion_with_backoff(
         client = get_openai_client(openai_api_key, api_base_url)
         openai_clients[client_key] = client

+    stream = not is_non_streaming_model(model_name, api_base_url)
     stream_processor = default_stream_processor
+    if stream:
+        model_kwargs["stream_options"] = {"include_usage": True}
+
     formatted_messages = format_message_for_api(messages, api_base_url)

     # Tune reasoning models arguments
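The `stream_options` move above is easy to miss: with `{"include_usage": True}`, the OpenAI API appends a final chunk whose `choices` list is empty and whose `usage` field carries the token counts, while non-streaming requests reject `stream_options` outright, hence the new `if stream:` guard. A minimal sketch of that behavior (model name and messages are placeholders, not from this diff):

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
messages = [{"role": "user", "content": "Say hi"}]
for chunk in client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder model
    messages=messages,
    stream=True,
    stream_options={"include_usage": True},
):
    if chunk.usage:  # final bookkeeping chunk: empty choices, token counts attached
        print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)
```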
@@ -105,27 +110,37 @@ def completion_with_backoff(
         stream_processor = partial(in_stream_thought_processor, thought_tag="think")
         # Reasoning is enabled by default. Disable when deepthought is False.
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
-        if not deepthought and len(formatted_messages) > 0:
-            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
+        if not deepthought:
+            add_qwen_no_think_tag(formatted_messages)

     read_timeout = 300 if is_local_api(api_base_url) else 60
-    model_kwargs["stream_options"] = {"include_usage": True}
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

     aggregated_response = ""
-    with client.beta.chat.completions.stream(
-        messages=formatted_messages,  # type: ignore
-        model=model_name,
-        temperature=temperature,
-        timeout=httpx.Timeout(30, read=read_timeout),
-        **model_kwargs,
-    ) as chat:
-        for chunk in stream_processor(chat):
-            if chunk.type == "content.delta":
-                aggregated_response += chunk.delta
-            elif chunk.type == "thought.delta":
-                pass
+    if stream:
+        with client.beta.chat.completions.stream(
+            messages=formatted_messages,  # type: ignore
+            model=model_name,
+            temperature=temperature,
+            timeout=httpx.Timeout(30, read=read_timeout),
+            **model_kwargs,
+        ) as chat:
+            for chunk in stream_processor(chat):
+                if chunk.type == "content.delta":
+                    aggregated_response += chunk.delta
+                elif chunk.type == "thought.delta":
+                    pass
+    else:
+        # Non-streaming chat completion
+        chunk = client.beta.chat.completions.parse(
+            messages=formatted_messages,  # type: ignore
+            model=model_name,
+            temperature=temperature,
+            timeout=httpx.Timeout(30, read=read_timeout),
+            **model_kwargs,
+        )
+        aggregated_response = chunk.choices[0].message.content

     # Calculate cost of chat
     input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
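Note how the non-streaming branch reuses the `chunk` name: both the last stream event and the parsed `ChatCompletion` expose a `usage` attribute, so the cost accounting below works unchanged for either path. Roughly, using attribute names from the OpenAI SDK's `CompletionUsage`:

```python
# Sketch of the shared usage extraction both branches feed into.
usage = getattr(chunk, "usage", None)
input_tokens = usage.prompt_tokens if usage else 0
output_tokens = usage.completion_tokens if usage else 0
```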
@@ -182,7 +197,11 @@ async def chat_completion_with_backoff(
         client = get_openai_async_client(openai_api_key, api_base_url)
         openai_async_clients[client_key] = client

+    stream = not is_non_streaming_model(model_name, api_base_url)
     stream_processor = adefault_stream_processor
+    if stream:
+        model_kwargs["stream_options"] = {"include_usage": True}
+
     formatted_messages = format_message_for_api(messages, api_base_url)

     # Configure thinking for openai reasoning models
@@ -225,12 +244,10 @@ async def chat_completion_with_backoff(
         stream_processor = partial(ain_stream_thought_processor, thought_tag="think")
         # Reasoning is enabled by default. Disable when deepthought is False.
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
-        if not deepthought and len(formatted_messages) > 0:
-            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
+        if not deepthought:
+            add_qwen_no_think_tag(formatted_messages)

-    stream = True
     read_timeout = 300 if is_local_api(api_base_url) else 60
-    model_kwargs["stream_options"] = {"include_usage": True}
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

@@ -238,7 +255,7 @@ async def chat_completion_with_backoff(
     final_chunk = None
     response_started = False
     start_time = perf_counter()
-    chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
+    response: openai.AsyncStream[ChatCompletionChunk] | ChatCompletion = await client.chat.completions.create(
         messages=formatted_messages,  # type: ignore
         model=model_name,
         stream=stream,
@@ -246,26 +263,34 @@ async def chat_completion_with_backoff(
         timeout=httpx.Timeout(30, read=read_timeout),
         **model_kwargs,
     )
-    async for chunk in stream_processor(chat_stream):
-        # Log the time taken to start response
-        if not response_started:
-            response_started = True
-            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
-        # Keep track of the last chunk for usage data
-        final_chunk = chunk
-        # Skip empty chunks
-        if len(chunk.choices) == 0:
-            continue
-        # Handle streamed response chunk
-        response_chunk: ResponseWithThought = None
-        response_delta = chunk.choices[0].delta
-        if response_delta.content:
-            response_chunk = ResponseWithThought(response=response_delta.content)
-            aggregated_response += response_chunk.response
-        elif response_delta.thought:
-            response_chunk = ResponseWithThought(thought=response_delta.thought)
-        if response_chunk:
-            yield response_chunk
+    if not stream:
+        # If not streaming, we can return the response directly
+        if len(response.choices) == 0 or not response.choices[0].message:
+            raise ValueError("No response by model.")
+        aggregated_response = response.choices[0].message.content
+        final_chunk = response
+        yield ResponseWithThought(response=aggregated_response)
+    else:
+        async for chunk in stream_processor(response):
+            # Log the time taken to start response
+            if not response_started:
+                response_started = True
+                logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+            # Keep track of the last chunk for usage data
+            final_chunk = chunk
+            # Skip empty chunks
+            if len(chunk.choices) == 0:
+                continue
+            # Handle streamed response chunk
+            response_chunk: ResponseWithThought = None
+            response_delta = chunk.choices[0].delta
+            if response_delta.content:
+                response_chunk = ResponseWithThought(response=response_delta.content)
+                aggregated_response += response_chunk.response
+            elif response_delta.thought:
+                response_chunk = ResponseWithThought(thought=response_delta.thought)
+            if response_chunk:
+                yield response_chunk

     # Calculate cost of chat after stream finishes
     input_tokens, output_tokens, cost = 0, 0, 0
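With this change, `chat_completion_with_backoff` stays an async generator in both modes: a non-streaming completion is simply yielded as a single `ResponseWithThought`. A hedged caller-side sketch (the real call sites pass more arguments than shown here):

```python
# Hypothetical consumer of the async generator; assumes this module's
# chat_completion_with_backoff is importable and kwargs are valid for it.
async def collect_text(**kwargs) -> str:
    text = ""
    async for part in chat_completion_with_backoff(**kwargs):
        if part.response:  # thought-only chunks carry no response text
            text += part.response
    return text
```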
@@ -312,11 +337,29 @@ def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> Li
     """
     formatted_messages = []
     for message in deepcopy(messages):
-        # Convert images to PNG format if message to be sent to non OpenAI API
         if isinstance(message.content, list) and not is_openai_api(api_base_url):
-            for part in message.content:
+            assistant_texts = []
+            has_images = False
+            for idx, part in enumerate(message.content):
+                # Convert images to PNG format if message to be sent to non OpenAI API
                 if part.get("type") == "image_url":
+                    has_images = True
                     part["image_url"]["url"] = convert_image_data_uri(part["image_url"]["url"], target_format="png")
+                # Deepinfra API does not support text content list in assistant messages
+                # So we merge text content list into a single text string
+                if (
+                    part.get("type") == "text"
+                    and message.role == "assistant"
+                    and api_base_url.startswith("https://api.deepinfra.com/v1")
+                ):
+                    assistant_texts += [part["text"]]
+                    message.content.pop(idx)
+            if assistant_texts:
+                assistant_texts_str = "\n\n".join(assistant_texts)
+                if has_images:
+                    message.content += [{"type": "text", "text": assistant_texts_str}]
+                else:
+                    message.content = assistant_texts_str
         formatted_messages.append({"role": message.role, "content": message.content})

     return formatted_messages
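A toy walk-through of the Deepinfra merge, with made-up message parts. One caveat worth knowing about the code above: `pop(idx)` inside `enumerate` shifts later indices, so consecutive text parts can be skipped; a filter-then-join sketch sidesteps that:

```python
# Assumed assistant message content; illustrative, not from the khoj codebase.
content = [
    {"type": "text", "text": "First draft."},
    {"type": "text", "text": "Second draft."},
]
texts = [p["text"] for p in content if p.get("type") == "text"]
merged = "\n\n".join(texts)  # -> "First draft.\n\nSecond draft."
```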
@@ -336,6 +379,14 @@ def is_openai_reasoning_model(model_name: str, api_base_url: str = None) -> bool
     return model_name.startswith("o") and is_openai_api(api_base_url)


+def is_non_streaming_model(model_name: str, api_base_url: str = None) -> bool:
+    """
+    Check if model response should not be streamed.
+    """
+    # Some OpenAI models require biometrics to stream. Avoid streaming their responses.
+    return model_name in ["o3", "o3-pro"] and is_openai_api(api_base_url)
+
+
 def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
     Check if the model is a Twitter reasoning model
@@ -627,3 +678,25 @@ async def ain_stream_thought_processor(
         elif mode == "message":
             chunk.choices[0].delta.content = buf
         yield chunk
+
+
+def add_qwen_no_think_tag(formatted_messages: List[dict]) -> None:
+    """
+    Add /no_think tag to the last message content if it is a user message.
+    This is used to disable reasoning in Qwen models when deepthought is False.
+    """
+    if len(formatted_messages) > 0 and formatted_messages[-1]["role"] == "user":
+        last_message = formatted_messages[-1]
+        if isinstance(last_message["content"], str):
+            # Append /no_think to the last message content
+            formatted_messages[-1]["content"] = last_message["content"] + " /no_think"
+        elif isinstance(last_message["content"], list) and len(last_message["content"]) > 0:
+            # Append /no_think to the last content part
+            if isinstance(last_message["content"][-1], str):
+                last_message["content"][-1] = last_message["content"][-1] + " /no_think"
+            else:
+                # Find last content part of type text and append /no_think to "text" part
+                for content_part in reversed(last_message["content"]):
+                    if isinstance(content_part, dict) and content_part.get("type") == "text":
+                        content_part["text"] += " /no_think"
+                        break
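The helper's effect on the two message shapes it handles, as a quick illustrative check (the messages are made up):

```python
# Plain string content: the tag is appended to the string.
msgs = [{"role": "user", "content": "Summarize my notes"}]
add_qwen_no_think_tag(msgs)
assert msgs[-1]["content"] == "Summarize my notes /no_think"

# Content parts list: the tag lands on the last "text" part.
msgs = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
add_qwen_no_think_tag(msgs)
assert msgs[-1]["content"][-1]["text"] == "Hi /no_think"
```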
khoj/processor/conversation/utils.py CHANGED
@@ -62,14 +62,15 @@ model_to_prompt_size = {
     "gpt-4.1": 60000,
     "gpt-4.1-mini": 120000,
     "gpt-4.1-nano": 120000,
-    "o1": 20000,
-    "o3": 30000,
-    "o1-mini": 60000,
-    "o3-mini": 60000,
-    "o4-mini": 60000,
+    "o1-mini": 90000,
+    "o1": 30000,
+    "o3-mini": 90000,
+    "o3": 60000,
+    "o3-pro": 30000,
+    "o4-mini": 90000,
     # Google Models
-    "gemini-2.5-flash-preview-04-17": 120000,
-    "gemini-2.5-pro-preview-03-25": 60000,
+    "gemini-2.5-flash-preview-05-20": 120000,
+    "gemini-2.5-pro-preview-06-05": 60000,
     "gemini-2.0-flash": 120000,
     "gemini-2.0-flash-lite": 120000,
     "gemini-1.5-flash": 120000,
@@ -186,7 +187,7 @@ def construct_iteration_history(
         iteration_history.append(
             ChatMessageModel(
                 by="khoj",
-                intent={"type": "remember", "query": query},
+                intent=Intent(type="remember", query=query),
                 message=previous_iteration_messages,
             )
         )
@@ -196,16 +197,16 @@ def construct_chat_history(chat_history: list[ChatMessageModel], n: int = 4, agent_name="AI") -> str:
     chat_history_str = ""
     for chat in chat_history[-n:]:
-        if chat.by == "khoj" and chat.intent.type in ["remember", "reminder", "summarize"]:
-            if chat.intent.inferred_queries:
-                chat_history_str += f'{agent_name}: {{"queries": {chat.intent.inferred_queries}}}\n'
+        intent_type = chat.intent.type if chat.intent and chat.intent.type else ""
+        inferred_queries = chat.intent.inferred_queries if chat.intent else None
+        if chat.by == "khoj" and intent_type in ["remember", "reminder", "summarize"]:
+            if inferred_queries:
+                chat_history_str += f'{agent_name}: {{"queries": {inferred_queries}}}\n'
             chat_history_str += f"{agent_name}: {chat.message}\n\n"
         elif chat.by == "khoj" and chat.images:
-            chat_history_str += f"User: {chat.intent.query}\n"
             chat_history_str += f"{agent_name}: [generated image redacted for space]\n"
-        elif chat.by == "khoj" and ("excalidraw" in chat.intent.type):
-            chat_history_str += f"User: {chat.intent.query}\n"
-            chat_history_str += f"{agent_name}: {chat.intent.inferred_queries[0]}\n"
+        elif chat.by == "khoj" and ("excalidraw" in intent_type):
+            chat_history_str += f"{agent_name}: {inferred_queries[0]}\n"
         elif chat.by == "you":
             chat_history_str += f"User: {chat.message}\n"
             raw_query_files = chat.queryFiles
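The hoisted `intent_type`/`inferred_queries` locals are the substance of this hunk: `chat.intent` can be `None` on some messages, and the old dotted access raised `AttributeError`. A stripped-down reproduction (stand-in class, not the real model):

```python
class Msg:  # hypothetical stand-in for ChatMessageModel
    intent = None

chat = Msg()
# Old pattern: chat.intent.type -> AttributeError when intent is None.
intent_type = chat.intent.type if chat.intent and chat.intent.type else ""  # safe ""
```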
khoj/processor/image/generate.py CHANGED
@@ -53,11 +53,11 @@ async def text_to_image(
     text2image_model = text_to_image_config.model_name
     chat_history_str = ""
     for chat in chat_history[-4:]:
-        if chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
-            chat_history_str += f"Q: {chat.intent.query or ''}\n"
+        if chat.by == "you":
+            chat_history_str += f"Q: {chat.message}\n"
+        elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
             chat_history_str += f"A: {chat.message}\n"
         elif chat.by == "khoj" and chat.images:
-            chat_history_str += f"Q: {chat.intent.query}\n"
             chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"

     if send_status_func:
khoj/routers/api_agents.py CHANGED
@@ -62,7 +62,7 @@ async def all_agents(
     for agent in agents:
         files = agent.fileobject_set.all()
         file_names = [file.file_name for file in files]
-        agent_chat_model = await AgentAdapters.aget_agent_chat_model(default_agent, user)
+        agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user)
         agent_packet = {
             "slug": agent.slug,
             "name": agent.name,
khoj/routers/api_chat.py CHANGED
@@ -960,7 +960,11 @@ async def chat(
         online_results = {key: val.model_dump() for key, val in last_message.onlineContext.items() or []}
         code_results = {key: val.model_dump() for key, val in last_message.codeContext.items() or []}
         compiled_references = [ref.model_dump() for ref in last_message.context or []]
-        research_results = [ResearchIteration(**iter_dict) for iter_dict in last_message.researchContext or []]
+        research_results = [
+            ResearchIteration(**iter_dict)
+            for iter_dict in last_message.researchContext or []
+            if iter_dict.get("summarizedResult")
+        ]
         operator_results = [OperatorRun(**iter_dict) for iter_dict in last_message.operatorContext or []]
         train_of_thought = [thought.model_dump() for thought in last_message.trainOfThought or []]
         # Drop the interrupted message from conversation history
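The comprehension now skips research iterations that never produced a `summarizedResult` (for example, runs cut off mid-iteration), so resuming an interrupted chat does not replay half-finished steps. With toy dicts (only `summarizedResult` is a real key from this diff):

```python
research_context = [
    {"summarizedResult": "found the answer"},
    {"summarizedResult": None},  # interrupted iteration: dropped on resume
]
kept = [d for d in research_context if d.get("summarizedResult")]
assert len(kept) == 1
```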
@@ -1011,7 +1015,7 @@ async def chat(
             user=user,
             query=defiltered_query,
             conversation_id=conversation_id,
-            conversation_history=conversation.messages,
+            conversation_history=chat_history,
             previous_iterations=list(research_results),
             query_images=uploaded_images,
             agent=agent,
khoj/utils/constants.py CHANGED
@@ -17,8 +17,8 @@ default_offline_chat_models = [
     "bartowski/gemma-2-2b-it-GGUF",
     "bartowski/Qwen2.5-14B-Instruct-GGUF",
 ]
-default_openai_chat_models = ["gpt-4o-mini", "gpt-4.1"]
-default_gemini_chat_models = ["gemini-2.0-flash", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-05-06"]
+default_openai_chat_models = ["gpt-4o-mini", "gpt-4.1", "o3", "o4-mini"]
+default_gemini_chat_models = ["gemini-2.0-flash", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-preview-06-05"]
 default_anthropic_chat_models = ["claude-sonnet-4-0", "claude-3-5-haiku-latest"]

 empty_config = {
@@ -41,10 +41,11 @@ model_to_cost: Dict[str, Dict[str, float]] = {
     "gpt-4.1": {"input": 2.00, "output": 8.00},
     "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
     "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
-    "o1": {"input": 15.0, "output": 60.00},
-    "o3": {"input": 10.0, "output": 40.00},
     "o1-mini": {"input": 3.0, "output": 12.0},
+    "o1": {"input": 15.0, "output": 60.00},
     "o3-mini": {"input": 1.10, "output": 4.40},
+    "o3": {"input": 2.0, "output": 8.00},
+    "o3-pro": {"input": 20.0, "output": 80.00},
     "o4-mini": {"input": 1.10, "output": 4.40},
     # Gemini Pricing: https://ai.google.dev/pricing
     "gemini-1.5-flash": {"input": 0.075, "output": 0.30},
@@ -53,8 +54,8 @@ model_to_cost: Dict[str, Dict[str, float]] = {
     "gemini-1.5-pro-002": {"input": 1.25, "output": 5.00},
     "gemini-2.0-flash": {"input": 0.10, "output": 0.40},
     "gemini-2.0-flash-lite": {"input": 0.0075, "output": 0.30},
-    "gemini-2.5-flash-preview-04-17": {"input": 0.15, "output": 0.60, "thought": 3.50},
-    "gemini-2.5-pro-preview-03-25": {"input": 1.25, "output": 10.0},
+    "gemini-2.5-flash-preview-05-20": {"input": 0.15, "output": 0.60, "thought": 3.50},
+    "gemini-2.5-pro-preview-06-05": {"input": 1.25, "output": 10.0},
     # Anthropic Pricing: https://www.anthropic.com/pricing#anthropic-api
     "claude-3-5-haiku-20241022": {"input": 1.0, "output": 5.0, "cache_read": 0.08, "cache_write": 1.0},
     "claude-3-5-haiku@20241022": {"input": 1.0, "output": 5.0, "cache_read": 0.08, "cache_write": 1.0},
{khoj-1.42.3.dev1.dist-info → khoj-1.42.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: khoj
-Version: 1.42.3.dev1
+Version: 1.42.4
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
@@ -53,7 +53,7 @@ Requires-Dist: magika~=0.5.1
 Requires-Dist: markdown-it-py~=3.0.0
 Requires-Dist: markdownify~=0.11.6
 Requires-Dist: openai-whisper>=20231117
-Requires-Dist: openai>=1.0.0
+Requires-Dist: openai>=1.86.0
 Requires-Dist: pgvector==0.2.4
 Requires-Dist: phonenumbers==8.13.27
 Requires-Dist: pillow~=10.0.0