khoj 1.41.1.dev4__py3-none-any.whl → 1.41.1.dev16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/2327-aa22697ed9c8d54a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-996513ae80f8720c.js → page-ceeb9a91edea74ce.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-2320231573aa9a49.js → page-e3cb78747ab98cc7.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-6e81dbf18637a86e.js → page-7e780dc11eb5e5d3.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-d9a2e44bbcf49f82.js → page-a4053e1bb578b2ce.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-31452bbda0e0a56f.js → page-8973da2f4c076fe1.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-fdb72b15ca908b43.js → page-375136dbb400525b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-5b7cb35d835af900.js → page-384b54fc953b18f2.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-b5922a670d3076e8.js → webpack-05ff3cbe22520b30.js} +1 -1
- khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +1 -0
- khoj/interface/compiled/_next/static/css/{0db53bacf81896f5.css → fca983d49c3dd1a3.css} +1 -1
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +4 -2
- khoj/processor/conversation/anthropic/utils.py +14 -3
- khoj/processor/conversation/openai/gpt.py +4 -2
- khoj/processor/conversation/openai/utils.py +334 -23
- khoj/processor/conversation/utils.py +7 -0
- khoj/routers/api_chat.py +87 -25
- khoj/routers/helpers.py +54 -119
- khoj/routers/research.py +7 -0
- {khoj-1.41.1.dev4.dist-info → khoj-1.41.1.dev16.dist-info}/METADATA +1 -1
- {khoj-1.41.1.dev4.dist-info → khoj-1.41.1.dev16.dist-info}/RECORD +45 -45
- khoj/interface/compiled/_next/static/chunks/2327-c99ead647a0ee901.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +0 -1
- khoj/interface/compiled/_next/static/css/55d4a822f8d94b67.css +0 -1
- /khoj/interface/compiled/_next/static/{jbvFiURrQG-AB37JAwuIG → h-E6l3I7yBCfhaSWaXDb_}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{jbvFiURrQG-AB37JAwuIG → h-E6l3I7yBCfhaSWaXDb_}/_ssgManifest.js +0 -0
- {khoj-1.41.1.dev4.dist-info → khoj-1.41.1.dev16.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev4.dist-info → khoj-1.41.1.dev16.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev4.dist-info → khoj-1.41.1.dev16.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/openai/utils.py
CHANGED
@@ -1,12 +1,21 @@
 import logging
 import os
+from functools import partial
 from time import perf_counter
-from typing import AsyncGenerator, Dict, List
+from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union
 from urllib.parse import urlparse
 
 import openai
-from openai.
-
+from openai.lib.streaming.chat import (
+    ChatCompletionStream,
+    ChatCompletionStreamEvent,
+    ContentDeltaEvent,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChatCompletionChunk,
+    Choice,
+    ChoiceDelta,
+)
 from tenacity import (
     before_sleep_log,
     retry,
@@ -16,7 +25,11 @@ from tenacity import (
     wait_random_exponential,
 )
 
-from khoj.processor.conversation.utils import
+from khoj.processor.conversation.utils import (
+    JsonSupport,
+    ResponseWithThought,
+    commit_conversation_trace,
+)
 from khoj.utils.helpers import (
     get_chat_usage_metrics,
     get_openai_async_client,
@@ -59,6 +72,7 @@ def completion_with_backoff(
         client = get_openai_client(openai_api_key, api_base_url)
         openai_clients[client_key] = client
 
+    stream_processor = default_stream_processor
     formatted_messages = [{"role": message.role, "content": message.content} for message in messages]
 
     # Tune reasoning models arguments
@@ -69,6 +83,24 @@ def completion_with_backoff(
     elif is_twitter_reasoning_model(model_name, api_base_url):
         reasoning_effort = "high" if deepthought else "low"
         model_kwargs["reasoning_effort"] = reasoning_effort
+    elif model_name.startswith("deepseek-reasoner"):
+        # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
+        # The first message should always be a user message (except system message).
+        updated_messages: List[dict] = []
+        for i, message in enumerate(formatted_messages):
+            if i > 0 and message["role"] == formatted_messages[i - 1]["role"]:
+                updated_messages[-1]["content"] += " " + message["content"]
+            elif i == 1 and formatted_messages[i - 1]["role"] == "system" and message["role"] == "assistant":
+                updated_messages[-1]["content"] += " " + message["content"]
+            else:
+                updated_messages.append(message)
+        formatted_messages = updated_messages
+    elif is_qwen_reasoning_model(model_name, api_base_url):
+        stream_processor = partial(in_stream_thought_processor, thought_tag="think")
+        # Reasoning is enabled by default. Disable when deepthought is False.
+        # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
+        if not deepthought and len(formatted_messages) > 0:
+            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
 
     model_kwargs["stream_options"] = {"include_usage": True}
     if os.getenv("KHOJ_LLM_SEED"):
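The deepseek-reasoner branch above merges back-to-back messages from the same role because that API rejects two successive messages with identical roles. A minimal sketch of just that merge step on plain message dicts follows; the helper name and sample messages are illustrative, and the special case the diff applies when an assistant message directly follows the system message is omitted here.

from typing import Dict, List

def merge_consecutive_roles(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Collapse back-to-back messages from the same role into one message."""
    merged: List[Dict[str, str]] = []
    for i, message in enumerate(messages):
        if i > 0 and message["role"] == messages[i - 1]["role"]:
            # Same role as the previous message: fold the content into it
            merged[-1]["content"] += " " + message["content"]
        else:
            merged.append(dict(message))
    return merged

messages = [
    {"role": "system", "content": "You are Khoj."},
    {"role": "user", "content": "Summarize my notes."},
    {"role": "user", "content": "Keep it short."},
    {"role": "assistant", "content": "Sure."},
]
print(merge_consecutive_roles(messages))
# The two user messages collapse into a single user turn.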
@@ -82,12 +114,11 @@ def completion_with_backoff(
        timeout=20,
        **model_kwargs,
    ) as chat:
-        for chunk in chat:
-            if chunk.type == "
-                logger.error(f"Openai api response error: {chunk.error}", exc_info=True)
-                continue
-            elif chunk.type == "content.delta":
+        for chunk in stream_processor(chat):
+            if chunk.type == "content.delta":
                 aggregated_response += chunk.delta
+            elif chunk.type == "thought.delta":
+                pass
 
    # Calculate cost of chat
    input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
@@ -124,14 +155,14 @@ def completion_with_backoff(
 )
 async def chat_completion_with_backoff(
     messages,
-    model_name,
+    model_name: str,
     temperature,
     openai_api_key=None,
     api_base_url=None,
     deepthought=False,
     model_kwargs: dict = {},
     tracer: dict = {},
-) -> AsyncGenerator[
+) -> AsyncGenerator[ResponseWithThought, None]:
     try:
         client_key = f"{openai_api_key}--{api_base_url}"
         client = openai_async_clients.get(client_key)
@@ -139,6 +170,7 @@ async def chat_completion_with_backoff(
             client = get_openai_async_client(openai_api_key, api_base_url)
             openai_async_clients[client_key] = client
 
+        stream_processor = adefault_stream_processor
         formatted_messages = [{"role": message.role, "content": message.content} for message in messages]
 
         # Configure thinking for openai reasoning models
@@ -161,9 +193,11 @@ async def chat_completion_with_backoff(
                 "content"
             ] = f"{first_system_message_content}\nFormatting re-enabled"
         elif is_twitter_reasoning_model(model_name, api_base_url):
+            stream_processor = adeepseek_stream_processor
             reasoning_effort = "high" if deepthought else "low"
             model_kwargs["reasoning_effort"] = reasoning_effort
         elif model_name.startswith("deepseek-reasoner"):
+            stream_processor = adeepseek_stream_processor
             # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
             # The first message should always be a user message (except system message).
             updated_messages: List[dict] = []
@@ -174,8 +208,13 @@ async def chat_completion_with_backoff(
                     updated_messages[-1]["content"] += " " + message["content"]
                 else:
                     updated_messages.append(message)
-
             formatted_messages = updated_messages
+        elif is_qwen_reasoning_model(model_name, api_base_url):
+            stream_processor = partial(ain_stream_thought_processor, thought_tag="think")
+            # Reasoning is enabled by default. Disable when deepthought is False.
+            # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
+            if not deepthought and len(formatted_messages) > 0:
+                formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
 
         stream = True
         model_kwargs["stream_options"] = {"include_usage": True}
@@ -193,24 +232,25 @@ async def chat_completion_with_backoff(
            timeout=20,
            **model_kwargs,
        )
-        async for chunk in chat_stream:
+        async for chunk in stream_processor(chat_stream):
            # Log the time taken to start response
            if final_chunk is None:
                logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
            # Keep track of the last chunk for usage data
            final_chunk = chunk
-            #
+            # Skip empty chunks
            if len(chunk.choices) == 0:
                continue
-
-
-
-
-
-
-
-
-
+            # Handle streamed response chunk
+            response_chunk: ResponseWithThought = None
+            response_delta = chunk.choices[0].delta
+            if response_delta.content:
+                response_chunk = ResponseWithThought(response=response_delta.content)
+                aggregated_response += response_chunk.response
+            elif response_delta.thought:
+                response_chunk = ResponseWithThought(thought=response_delta.thought)
+            if response_chunk:
+                yield response_chunk
 
        # Log the time taken to stream the entire response
        logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
@@ -264,3 +304,274 @@ def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
         and api_base_url is not None
         and api_base_url.startswith("https://api.x.ai/v1")
     )
+
+
+def is_qwen_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
+    """
+    Check if the model is a Qwen reasoning model
+    """
+    return "qwen3" in model_name.lower() and api_base_url is not None
+
+
+class ThoughtDeltaEvent(ContentDeltaEvent):
+    """
+    Chat completion chunk with thoughts, reasoning support.
+    """
+
+    type: Literal["thought.delta"]
+    """The thought or reasoning generated by the model."""
+
+
+ChatCompletionStreamWithThoughtEvent = Union[ChatCompletionStreamEvent, ThoughtDeltaEvent]
+
+
+class ChoiceDeltaWithThoughts(ChoiceDelta):
+    """
+    Chat completion chunk with thoughts, reasoning support.
+    """
+
+    thought: Optional[str] = None
+    """The thought or reasoning generated by the model."""
+
+
+class ChoiceWithThoughts(Choice):
+    delta: ChoiceDeltaWithThoughts
+
+
+class ChatCompletionWithThoughtsChunk(ChatCompletionChunk):
+    choices: List[ChoiceWithThoughts]  # Override the choices type
+
+
+def default_stream_processor(
+    chat_stream: ChatCompletionStream,
+) -> Generator[ChatCompletionStreamWithThoughtEvent, None, None]:
+    """
+    Async generator to cast and return chunks from the standard openai chat completions stream.
+    """
+    for chunk in chat_stream:
+        yield chunk
+
+
+async def adefault_stream_processor(
+    chat_stream: openai.AsyncStream[ChatCompletionChunk],
+) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
+    """
+    Async generator to cast and return chunks from the standard openai chat completions stream.
+    """
+    async for chunk in chat_stream:
+        yield ChatCompletionWithThoughtsChunk.model_validate(chunk.model_dump())
+
+
+async def adeepseek_stream_processor(
+    chat_stream: openai.AsyncStream[ChatCompletionChunk],
+) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
+    """
+    Async generator to cast and return chunks from the deepseek chat completions stream.
+    """
+    async for chunk in chat_stream:
+        tchunk = ChatCompletionWithThoughtsChunk.model_validate(chunk.model_dump())
+        if (
+            len(tchunk.choices) > 0
+            and hasattr(tchunk.choices[0].delta, "reasoning_content")
+            and tchunk.choices[0].delta.reasoning_content
+        ):
+            tchunk.choices[0].delta.thought = chunk.choices[0].delta.reasoning_content
+        yield tchunk
+
+
+def in_stream_thought_processor(
+    chat_stream: openai.Stream[ChatCompletionChunk], thought_tag="think"
+) -> Generator[ChatCompletionStreamWithThoughtEvent, None, None]:
+    """
+    Generator for chat completion with thought chunks.
+    Assumes <thought_tag>...</thought_tag> can only appear once at the start.
+    Handles partial tags across streamed chunks.
+    """
+    start_tag = f"<{thought_tag}>"
+    end_tag = f"</{thought_tag}>"
+    buf: str = ""
+    # Modes and transitions: detect_start > thought (optional) > message
+    mode = "detect_start"
+
+    for chunk in default_stream_processor(chat_stream):
+        if mode == "message" or chunk.type != "content.delta":
+            # Message mode is terminal, so just yield chunks, no processing
+            yield chunk
+            continue
+
+        buf += chunk.delta
+
+        if mode == "detect_start":
+            # Try to determine if we start with thought tag
+            if buf.startswith(start_tag):
+                # Found start tag, switch mode
+                buf = buf[len(start_tag) :]  # Remove start tag
+                mode = "thought"
+                # Fall through to process the rest of the buffer in 'thought' mode *within this iteration*
+            elif len(buf) >= len(start_tag):
+                # Buffer is long enough, definitely doesn't start with tag
+                chunk.delta = buf
+                yield chunk
+                mode = "message"
+                buf = ""
+                continue
+            elif start_tag.startswith(buf):
+                # Buffer is a prefix of the start tag, need more data
+                continue
+            else:
+                # Buffer doesn't match start tag prefix and is shorter than tag
+                chunk.delta = buf
+                yield chunk
+                mode = "message"
+                buf = ""
+                continue
+
+        if mode == "thought":
+            # Look for the end tag
+            idx = buf.find(end_tag)
+            if idx != -1:
+                # Found end tag. Yield thought content before it.
+                if idx > 0 and buf[:idx].strip():
+                    chunk.type = "thought.delta"
+                    chunk.delta = buf[:idx]
+                    yield chunk
+                # Process content *after* the tag as message
+                buf = buf[idx + len(end_tag) :]
+                if buf:
+                    chunk.delta = buf
+                    yield chunk
+                mode = "message"
+                buf = ""
+                continue
+            else:
+                # End tag not found yet. Yield thought content, holding back potential partial end tag.
+                send_upto = len(buf)
+                # Check if buffer ends with a prefix of end_tag
+                for i in range(len(end_tag) - 1, 0, -1):
+                    if buf.endswith(end_tag[:i]):
+                        send_upto = len(buf) - i  # Don't send the partial tag yet
+                        break
+                if send_upto > 0 and buf[:send_upto].strip():
+                    chunk.type = "thought.delta"
+                    chunk.delta = buf[:send_upto]
+                    yield chunk
+                    buf = buf[send_upto:]  # Keep only the partial tag (or empty)
+                # Need more data to find the complete end tag
+                continue
+
+    # End of stream handling
+    if buf:
+        if mode == "thought":  # Stream ended before </think> was found
+            chunk.type = "thought.delta"
+            chunk.delta = buf
+            yield chunk
+        elif mode == "detect_start":  # Stream ended before start tag could be confirmed/denied
+            # If it wasn't a partial start tag, treat as message
+            if not start_tag.startswith(buf):
+                chunk.delta = buf
+                yield chunk
+            # else: discard partial <think>
+        # If mode == "message", buffer should be empty due to logic above, but yield just in case
+        elif mode == "message":
+            chunk.delta = buf
+            yield chunk
+
+
+async def ain_stream_thought_processor(
+    chat_stream: openai.AsyncStream[ChatCompletionChunk], thought_tag="think"
+) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
+    """
+    Async generator for chat completion with thought chunks.
+    Assumes <thought_tag>...</thought_tag> can only appear once at the start.
+    Handles partial tags across streamed chunks.
+    """
+    start_tag = f"<{thought_tag}>"
+    end_tag = f"</{thought_tag}>"
+    buf: str = ""
+    # Modes and transitions: detect_start > thought (optional) > message
+    mode = "detect_start"
+
+    async for chunk in adefault_stream_processor(chat_stream):
+        if len(chunk.choices) == 0:
+            continue
+        if mode == "message":
+            # Message mode is terminal, so just yield chunks, no processing
+            yield chunk
+            continue
+
+        buf += chunk.choices[0].delta.content
+
+        if mode == "detect_start":
+            # Try to determine if we start with thought tag
+            if buf.startswith(start_tag):
+                # Found start tag, switch mode
+                buf = buf[len(start_tag) :]  # Remove start tag
+                mode = "thought"
+                # Fall through to process the rest of the buffer in 'thought' mode *within this iteration*
+            elif len(buf) >= len(start_tag):
+                # Buffer is long enough, definitely doesn't start with tag
+                chunk.choices[0].delta.content = buf
+                yield chunk
+                mode = "message"
+                buf = ""
+                continue
+            elif start_tag.startswith(buf):
+                # Buffer is a prefix of the start tag, need more data
+                continue
+            else:
+                # Buffer doesn't match start tag prefix and is shorter than tag
+                chunk.choices[0].delta.content = buf
+                yield chunk
+                mode = "message"
+                buf = ""
+                continue
+
+        if mode == "thought":
+            # Look for the end tag
+            idx = buf.find(end_tag)
+            if idx != -1:
+                # Found end tag. Yield thought content before it.
+                if idx > 0 and buf[:idx].strip():
+                    chunk.choices[0].delta.thought = buf[:idx]
+                    chunk.choices[0].delta.content = ""
+                    yield chunk
+                # Process content *after* the tag as message
+                buf = buf[idx + len(end_tag) :]
+                if buf:
+                    chunk.choices[0].delta.content = buf
+                    yield chunk
+                mode = "message"
+                buf = ""
+                continue
+            else:
+                # End tag not found yet. Yield thought content, holding back potential partial end tag.
+                send_upto = len(buf)
+                # Check if buffer ends with a prefix of end_tag
+                for i in range(len(end_tag) - 1, 0, -1):
+                    if buf.endswith(end_tag[:i]):
+                        send_upto = len(buf) - i  # Don't send the partial tag yet
+                        break
+                if send_upto > 0 and buf[:send_upto].strip():
+                    chunk.choices[0].delta.thought = buf[:send_upto]
+                    chunk.choices[0].delta.content = ""
+                    yield chunk
+                    buf = buf[send_upto:]  # Keep only the partial tag (or empty)
+                # Need more data to find the complete end tag
+                continue
+
+    # End of stream handling
+    if buf:
+        if mode == "thought":  # Stream ended before </think> was found
+            chunk.choices[0].delta.thought = buf
+            chunk.choices[0].delta.content = ""
+            yield chunk
+        elif mode == "detect_start":  # Stream ended before start tag could be confirmed/denied
+            # If it wasn't a partial start tag, treat as message
+            if not start_tag.startswith(buf):
+                chunk.choices[0].delta.content = buf
+                yield chunk
+            # else: discard partial <think>
+        # If mode == "message", buffer should be empty due to logic above, but yield just in case
+        elif mode == "message":
+            chunk.choices[0].delta.content = buf
+            yield chunk
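The in-stream thought processors added above strip a leading <think>...</think> block out of streamed deltas, holding back any suffix that might be a partial tag until the next chunk arrives. A standalone sketch of the same buffering technique over plain text deltas, assuming the single leading tag convention; the function and variable names here are illustrative and not part of the package.

from typing import Generator, Iterable, Tuple

def split_thought_stream(
    deltas: Iterable[str], tag: str = "think"
) -> Generator[Tuple[str, str], None, None]:
    """Yield ("thought", text) or ("message", text) pairs from streamed text deltas.

    Assumes the <tag>...</tag> block can only appear once, at the very start,
    and that tags may arrive split across deltas.
    """
    start_tag, end_tag = f"<{tag}>", f"</{tag}>"
    buf, mode = "", "detect_start"

    for delta in deltas:
        if mode == "message":
            yield ("message", delta)
            continue
        buf += delta

        if mode == "detect_start":
            if buf.startswith(start_tag):
                buf, mode = buf[len(start_tag):], "thought"
            elif start_tag.startswith(buf):
                continue  # Could still be a partial start tag; wait for more data
            else:
                yield ("message", buf)
                buf, mode = "", "message"
                continue

        if mode == "thought":
            idx = buf.find(end_tag)
            if idx != -1:
                if buf[:idx]:
                    yield ("thought", buf[:idx])
                rest = buf[idx + len(end_tag):]
                if rest:
                    yield ("message", rest)
                buf, mode = "", "message"
            else:
                # Hold back any suffix that could be a partial end tag
                keep = 0
                for i in range(len(end_tag) - 1, 0, -1):
                    if buf.endswith(end_tag[:i]):
                        keep = i
                        break
                if buf[: len(buf) - keep]:
                    yield ("thought", buf[: len(buf) - keep])
                buf = buf[len(buf) - keep:] if keep else ""

    # Flush whatever remains when the stream ends
    if buf:
        if mode == "thought":
            yield ("thought", buf)
        elif not (mode == "detect_start" and start_tag.startswith(buf)):
            yield ("message", buf)  # Discard only an unterminated partial start tag

# Example: the start tag arrives split across chunks, then the visible answer follows
chunks = ["<thi", "nk>plan the answer</th", "ink>Hello", " world"]
print(list(split_thought_stream(chunks)))
# [('thought', 'plan the answer'), ('message', 'Hello'), ('message', ' world')]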
khoj/processor/conversation/utils.py
CHANGED
@@ -191,6 +191,7 @@ class ChatEvent(Enum):
     REFERENCES = "references"
     GENERATED_ASSETS = "generated_assets"
     STATUS = "status"
+    THOUGHT = "thought"
     METADATA = "metadata"
     USAGE = "usage"
     END_RESPONSE = "end_response"
@@ -873,3 +874,9 @@ class JsonSupport(int, Enum):
     NONE = 0
     OBJECT = 1
     SCHEMA = 2
+
+
+class ResponseWithThought:
+    def __init__(self, response: str = None, thought: str = None):
+        self.response = response
+        self.thought = thought
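With `chat_completion_with_backoff` now yielding `ResponseWithThought` objects, a caller can keep the model's reasoning separate from its visible answer while consuming one async stream. A rough consumer sketch, with a stand-in generator in place of the real OpenAI-backed one; the stand-in and its sample chunks are made up.

import asyncio

class ResponseWithThought:
    # Mirrors the class added in khoj/processor/conversation/utils.py
    def __init__(self, response: str = None, thought: str = None):
        self.response = response
        self.thought = thought

async def fake_llm_stream():
    # Stand-in for chat_completion_with_backoff(...): thoughts first, then the answer
    for chunk in [
        ResponseWithThought(thought="Consider what the user asked... "),
        ResponseWithThought(response="Here is "),
        ResponseWithThought(response="the answer."),
    ]:
        yield chunk

async def main():
    thoughts, answer = [], []
    async for chunk in fake_llm_stream():
        if chunk.thought:
            thoughts.append(chunk.thought)   # e.g. forwarded as ChatEvent.THOUGHT
        elif chunk.response:
            answer.append(chunk.response)    # e.g. forwarded as ChatEvent.MESSAGE
    print("thought:", "".join(thoughts))
    print("answer:", "".join(answer))

asyncio.run(main())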
khoj/routers/api_chat.py
CHANGED
@@ -25,7 +25,11 @@ from khoj.database.adapters import (
|
|
25
25
|
from khoj.database.models import Agent, KhojUser
|
26
26
|
from khoj.processor.conversation import prompts
|
27
27
|
from khoj.processor.conversation.prompts import help_message, no_entries_found
|
28
|
-
from khoj.processor.conversation.utils import
|
28
|
+
from khoj.processor.conversation.utils import (
|
29
|
+
ResponseWithThought,
|
30
|
+
defilter_query,
|
31
|
+
save_to_conversation_log,
|
32
|
+
)
|
29
33
|
from khoj.processor.image.generate import text_to_image
|
30
34
|
from khoj.processor.speech.text_to_speech import generate_text_to_speech
|
31
35
|
from khoj.processor.tools.online_search import (
|
@@ -679,14 +683,13 @@ async def chat(
         start_time = time.perf_counter()
         ttft = None
         chat_metadata: dict = {}
-        connection_alive = True
         user: KhojUser = request.user.object
         is_subscribed = has_required_scope(request, ["premium"])
-        event_delimiter = "␃🔚␗"
         q = unquote(q)
         train_of_thought = []
         nonlocal conversation_id
         nonlocal raw_query_files
+        cancellation_event = asyncio.Event()
 
         tracer: dict = {
             "mid": turn_id,
@@ -713,11 +716,33 @@ async def chat(
         for file in raw_query_files:
             query_files[file.name] = file.content
 
+        # Create a task to monitor for disconnections
+        disconnect_monitor_task = None
+
+        async def monitor_disconnection():
+            try:
+                msg = await request.receive()
+                if msg["type"] == "http.disconnect":
+                    logger.debug(f"User {user} disconnected from {common.client} client.")
+                    cancellation_event.set()
+            except Exception as e:
+                logger.error(f"Error in disconnect monitor: {e}")
+
+        # Cancel the disconnect monitor task if it is still running
+        async def cancel_disconnect_monitor():
+            if disconnect_monitor_task and not disconnect_monitor_task.done():
+                logger.debug(f"Cancelling disconnect monitor task for user {user}")
+                disconnect_monitor_task.cancel()
+                try:
+                    await disconnect_monitor_task
+                except asyncio.CancelledError:
+                    pass
+
         async def send_event(event_type: ChatEvent, data: str | dict):
-            nonlocal
-
-
-            logger.
+            nonlocal ttft, train_of_thought
+            event_delimiter = "␃🔚␗"
+            if cancellation_event.is_set():
+                logger.debug(f"User {user} disconnected from {common.client} client. Setting cancellation event.")
                 return
             try:
                 if event_type == ChatEvent.END_LLM_RESPONSE:
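The disconnect monitor above awaits `request.receive()` once and sets an `asyncio.Event` when the ASGI server reports `http.disconnect`, so the streaming generator can stop work for clients that have gone away. A reduced sketch of the same pattern in a bare FastAPI/Starlette streaming endpoint; the route, payload, and helper names are illustrative, not taken from the package.

import asyncio
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.get("/stream")
async def stream(request: Request):
    cancellation_event = asyncio.Event()

    async def monitor_disconnection():
        # For a body-less GET, the next ASGI receive() message is the disconnect notice
        msg = await request.receive()
        if msg["type"] == "http.disconnect":
            cancellation_event.set()

    async def event_generator():
        monitor = asyncio.create_task(monitor_disconnection())
        try:
            for i in range(100):
                if cancellation_event.is_set():
                    break  # Client went away; stop doing work
                yield f"chunk {i}\n"
                await asyncio.sleep(0.1)
        finally:
            monitor.cancel()  # Analogous to cancel_disconnect_monitor() above

    return StreamingResponse(event_generator(), media_type="text/plain")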
@@ -726,23 +751,41 @@ async def chat(
                     ttft = time.perf_counter() - start_time
                 elif event_type == ChatEvent.STATUS:
                     train_of_thought.append({"type": event_type.value, "data": data})
+                elif event_type == ChatEvent.THOUGHT:
+                    # Append the data to the last thought as thoughts are streamed
+                    if (
+                        len(train_of_thought) > 0
+                        and train_of_thought[-1]["type"] == ChatEvent.THOUGHT.value
+                        and type(train_of_thought[-1]["data"]) == type(data) == str
+                    ):
+                        train_of_thought[-1]["data"] += data
+                    else:
+                        train_of_thought.append({"type": event_type.value, "data": data})
 
                 if event_type == ChatEvent.MESSAGE:
                     yield data
                 elif event_type == ChatEvent.REFERENCES or ChatEvent.METADATA or stream:
                     yield json.dumps({"type": event_type.value, "data": data}, ensure_ascii=False)
             except asyncio.CancelledError as e:
-
-
-                return
+                if cancellation_event.is_set():
+                    logger.debug(f"Request cancelled. User {user} disconnected from {common.client} client: {e}.")
             except Exception as e:
-
-
-
+                if not cancellation_event.is_set():
+                    logger.error(
+                        f"Failed to stream chat API response to {user} on {common.client}: {e}.",
+                        exc_info=True,
+                    )
             finally:
-
+                if not cancellation_event.is_set():
+                    yield event_delimiter
+                # Cancel the disconnect monitor task if it is still running
+                if cancellation_event.is_set() or event_type == ChatEvent.END_RESPONSE:
+                    await cancel_disconnect_monitor()
 
         async def send_llm_response(response: str, usage: dict = None):
+            # Check if the client is still connected
+            if cancellation_event.is_set():
+                return
             # Send Chat Response
             async for result in send_event(ChatEvent.START_LLM_RESPONSE, ""):
                 yield result
@@ -783,6 +826,9 @@ async def chat(
             metadata=chat_metadata,
         )
 
+        # Start the disconnect monitor in the background
+        disconnect_monitor_task = asyncio.create_task(monitor_disconnection())
+
         if is_query_empty(q):
             async for result in send_llm_response("Please ask your query to get started.", tracer.get("usage")):
                 yield result
@@ -900,6 +946,7 @@ async def chat(
                 file_filters=conversation.file_filters if conversation else [],
                 query_files=attached_file_context,
                 tracer=tracer,
+                cancellation_event=cancellation_event,
             ):
                 if isinstance(research_result, InformationCollectionIteration):
                     if research_result.summarizedResult:
@@ -1274,6 +1321,13 @@ async def chat(
             async for result in send_event(ChatEvent.STATUS, error_message):
                 yield result
 
+        # Check if the user has disconnected
+        if cancellation_event.is_set():
+            logger.debug(f"User {user} disconnected from {common.client} client. Stopping LLM response.")
+            # Cancel the disconnect monitor task if it is still running
+            await cancel_disconnect_monitor()
+            return
+
         ## Generate Text Output
         async for result in send_event(ChatEvent.STATUS, f"**Generating a well-informed response**"):
             yield result
@@ -1306,27 +1360,32 @@ async def chat(
            tracer,
        )
 
-        # Send Response
-        async for result in send_event(ChatEvent.START_LLM_RESPONSE, ""):
-            yield result
-
-        continue_stream = True
        async for item in llm_response:
            # Should not happen with async generator, end is signaled by loop exit. Skip.
            if item is None:
                continue
-            if
-
+            if cancellation_event.is_set():
+                break
+            message = item.response if isinstance(item, ResponseWithThought) else item
+            if isinstance(item, ResponseWithThought) and item.thought:
+                async for result in send_event(ChatEvent.THOUGHT, item.thought):
+                    yield result
                continue
+
+            # Start sending response
+            async for result in send_event(ChatEvent.START_LLM_RESPONSE, ""):
+                yield result
+
            try:
-                async for result in send_event(ChatEvent.MESSAGE,
+                async for result in send_event(ChatEvent.MESSAGE, message):
                    yield result
            except Exception as e:
-
-
+                if not cancellation_event.is_set():
+                    logger.warning(f"Error during streaming. Stopping send: {e}")
+                break
 
        # Signal end of LLM response after the loop finishes
-        if
+        if not cancellation_event.is_set():
            async for result in send_event(ChatEvent.END_LLM_RESPONSE, ""):
                yield result
            # Send Usage Metadata once llm interactions are complete
@@ -1337,6 +1396,9 @@ async def chat(
                yield result
        logger.debug("Finished streaming response")
 
+        # Cancel the disconnect monitor task if it is still running
+        await cancel_disconnect_monitor()
+
    ## Stream Text Response
    if stream:
        return StreamingResponse(event_generator(q, images=raw_images), media_type="text/plain")
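On the wire, each event `send_event` emits is followed by the `␃🔚␗` delimiter: `MESSAGE` chunks are raw text, while other event types are serialized as JSON objects with `type` and `data` fields. A hedged client-side sketch that splits a buffered response on that delimiter and heuristically tells the two shapes apart; the sample payload is invented, and a raw text chunk that happens to be valid JSON would be misclassified by this heuristic.

import json

EVENT_DELIMITER = "␃🔚␗"

def parse_chat_stream(raw: str):
    """Split a buffered chat response into (event_type, data) pairs."""
    events = []
    for part in raw.split(EVENT_DELIMITER):
        if not part:
            continue
        try:
            obj = json.loads(part)
            if isinstance(obj, dict) and "type" in obj:
                events.append((obj["type"], obj.get("data")))
                continue
        except json.JSONDecodeError:
            pass
        events.append(("message", part))  # Raw text chunk from ChatEvent.MESSAGE
    return events

# Illustrative payload: a status event, a thought event, then streamed answer text
sample = (
    json.dumps({"type": "status", "data": "Searching notes"}) + EVENT_DELIMITER
    + json.dumps({"type": "thought", "data": "The user wants a summary."}) + EVENT_DELIMITER
    + "Here is the summary..." + EVENT_DELIMITER
)
for event_type, data in parse_chat_stream(sample):
    print(event_type, "->", data)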