khoj 2.0.0b10__py3-none-any.whl → 2.0.0b11.dev15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/configure.py +74 -15
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-0006674668eb5a4d.js → page-9a4610474cd59a71.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-4c465cde2d14cb52.js → page-f7bb9d777b7745d4.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-4408125f66c165cf.js → page-8e1c4f2af3c9429e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-85b9b416898738f7.js → page-2b3056cba8aa96ce.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-883b7d8d2e3abe3e.js → page-4885df3cd175c957.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-95e994ddac31473f.js → page-8be3b35178abf2ec.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-c062269e6906ef22.js → page-4a4b0c0f4749c2b2.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-c375c47fee5a4dda.js → webpack-2d7431816511b8a5.js} +1 -1
- khoj/interface/compiled/_next/static/css/{a0c2fd63bb396f04.css → 23b26df423cd8a9c.css} +1 -1
- khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
- khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/error.html +149 -0
- khoj/processor/conversation/google/utils.py +71 -5
- khoj/processor/conversation/openai/utils.py +54 -39
- khoj/processor/conversation/utils.py +1 -0
- khoj/processor/operator/__init__.py +1 -1
- khoj/routers/api_agents.py +1 -1
- khoj/routers/api_chat.py +95 -20
- khoj/routers/helpers.py +4 -4
- khoj/routers/research.py +1 -1
- khoj/routers/web_client.py +5 -0
- {khoj-2.0.0b10.dist-info → khoj-2.0.0b11.dev15.dist-info}/METADATA +1 -1
- {khoj-2.0.0b10.dist-info → khoj-2.0.0b11.dev15.dist-info}/RECORD +55 -54
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-d5ae861e1ade9d08.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
- khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
- khoj/interface/compiled/_next/static/css/fbacbdfd5e7f3f0e.css +0 -1
- /khoj/interface/compiled/_next/static/chunks/{1327-3b1a41af530fa8ee.js → 1327-1a9107b9a2a04a98.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-fbfe167c84ad60c5.js → 1915-5c6508f6ebb62a30.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-e78b6902ad6f75ec.js → 2117-080746c8e170c81a.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2939-4d4084c5b888b960.js → 2939-4af3fd24b8ffc9ad.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-d6cf93724d57e34b.js → 4447-cd95608f8e93e711.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-4b7790573b08c50d.js → 8667-50b03a89e82e0ba7.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9139-ce1ae935dac9c871.js → 9139-8ac4d9feb10f8869.js} +0 -0
- /khoj/interface/compiled/_next/static/{Ieo_9KsHXi-opl1-yfWnK → nqIeU27JxQkTS-5OXP3OU}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{Ieo_9KsHXi-opl1-yfWnK → nqIeU27JxQkTS-5OXP3OU}/_ssgManifest.js +0 -0
- {khoj-2.0.0b10.dist-info → khoj-2.0.0b11.dev15.dist-info}/WHEEL +0 -0
- {khoj-2.0.0b10.dist-info → khoj-2.0.0b11.dev15.dist-info}/entry_points.txt +0 -0
- {khoj-2.0.0b10.dist-info → khoj-2.0.0b11.dev15.dist-info}/licenses/LICENSE +0 -0

khoj/processor/conversation/google/utils.py CHANGED

@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import random
+import re
 from copy import deepcopy
 from time import perf_counter
 from typing import Any, AsyncGenerator, AsyncIterator, Dict, List
@@ -13,6 +14,7 @@ from google.genai import types as gtypes
 from langchain_core.messages.chat import ChatMessage
 from pydantic import BaseModel
 from tenacity import (
+    RetryCallState,
     before_sleep_log,
     retry,
     retry_if_exception,
@@ -73,7 +75,7 @@ SAFETY_SETTINGS = [
 def _is_retryable_error(exception: BaseException) -> bool:
     """Check if the exception is a retryable error"""
     # server errors
-    if isinstance(exception, gerrors.APIError):
+    if isinstance(exception, (gerrors.APIError, gerrors.ClientError)):
         return exception.code in [429, 502, 503, 504]
     # client errors
     if (
@@ -88,9 +90,48 @@ def _is_retryable_error(exception: BaseException) -> bool:
     return False
 
 
+def _extract_retry_delay(exception: BaseException) -> float:
+    """Extract retry delay from Gemini error response, return in seconds"""
+    if (
+        isinstance(exception, (gerrors.ClientError, gerrors.APIError))
+        and hasattr(exception, "details")
+        and isinstance(exception.details, dict)
+    ):
+        # Look for retryDelay key, value pair. E.g "retryDelay": "54s"
+        if delay_str := exception.details.get("retryDelay"):
+            delay_seconds_match = re.search(r"(\d+)s", delay_str)
+            if delay_seconds_match:
+                delay_seconds = float(delay_seconds_match.group(1))
+                return delay_seconds
+    return None
+
+
+def _wait_with_gemini_delay(min_wait=4, max_wait=120, multiplier=1, fallback_wait=None):
+    """Custom wait strategy that respects Gemini's retryDelay if present"""
+
+    def wait_func(retry_state: RetryCallState) -> float:
+        # Use backoff time if last exception suggests a retry delay
+        if retry_state.outcome and retry_state.outcome.failed:
+            exception = retry_state.outcome.exception()
+            gemini_delay = _extract_retry_delay(exception)
+            if gemini_delay:
+                # Use the Gemini-suggested delay, but cap it at max_wait
+                suggested_delay = min(gemini_delay, max_wait)
+                logger.info(f"Using Gemini suggested retry delay: {suggested_delay} seconds")
+                return suggested_delay
+        # Else use fallback backoff if provided
+        if fallback_wait:
+            return fallback_wait(retry_state)
+        # Else use exponential backoff with provided parameters
+        else:
+            return wait_exponential(multiplier=multiplier, min=min_wait, max=max_wait)(retry_state)
+
+    return wait_func
+
+
 @retry(
     retry=retry_if_exception(_is_retryable_error),
-    wait=wait_random_exponential(min=1, max=10),
+    wait=_wait_with_gemini_delay(min_wait=1, max_wait=10, fallback_wait=wait_random_exponential(min=1, max=10)),
     stop=stop_after_attempt(2),
     before_sleep=before_sleep_log(logger, logging.DEBUG),
     reraise=True,
@@ -169,7 +210,14 @@ def gemini_completion_with_backoff(
         )
     except gerrors.ClientError as e:
         response = None
-
+        # Handle 429 rate limit errors directly
+        if e.code == 429:
+            response_text = f"My brain is exhausted. Can you please try again in a bit?"
+            # Log the full error details for debugging
+            logger.error(f"Gemini ClientError: {e.code} {e.status}. Details: {e.details}")
+        # Handle other errors
+        else:
+            response_text, _ = handle_gemini_response(e.args)
         # Respond with reason for stopping
         logger.warning(
             f"LLM Response Prevented for {model_name}: {response_text}.\n"
@@ -206,7 +254,7 @@ def gemini_completion_with_backoff(
 
 @retry(
     retry=retry_if_exception(_is_retryable_error),
-    wait=
+    wait=_wait_with_gemini_delay(multiplier=1, min_wait=4, max_wait=10),
     stop=stop_after_attempt(3),
     before_sleep=before_sleep_log(logger, logging.WARNING),
     reraise=False,
@@ -310,6 +358,13 @@ def handle_gemini_response(
     candidates: list[gtypes.Candidate], prompt_feedback: gtypes.GenerateContentResponsePromptFeedback = None
 ):
     """Check if Gemini response was blocked and return an explanatory error message."""
+
+    # Ensure we have a proper list of candidates
+    if not isinstance(candidates, list):
+        message = f"\nUnexpected response format. Try again."
+        stopped = True
+        return message, stopped
+
     # Check if the response was blocked due to safety concerns with the prompt
     if len(candidates) == 0 and prompt_feedback:
         message = f"\nI'd prefer to not respond to that due to **{prompt_feedback.block_reason.name}** issues with your query."
@@ -428,7 +483,18 @@ def format_messages_for_gemini(
     if len(messages) == 1:
         messages[0].role = "user"
 
-
+    # Ensure messages are properly formatted for Content creation
+    valid_messages = []
+    for message in messages:
+        try:
+            # Try create Content object to validate the structure before adding to valid messages
+            gtypes.Content(role=message.role, parts=message.content)
+            valid_messages.append(message)
+        except Exception as e:
+            logger.warning(f"Dropping message with invalid content structure: {e}. Message: {message}")
+            continue
+
+    formatted_messages = [gtypes.Content(role=message.role, parts=message.content) for message in valid_messages]
     return formatted_messages, system_prompt
 
 
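
The Gemini changes above replace tenacity's stock wait strategy with a custom wait callable that honors the `retryDelay` hint in Gemini's 429 error payload before falling back to exponential backoff. The self-contained sketch below illustrates the same pattern outside khoj; `FakeRateLimitError`, `extract_retry_delay`, `wait_with_server_delay`, and `call_flaky_api` are hypothetical names used only for illustration, not part of the khoj or google-genai APIs.

```python
# Sketch: a tenacity wait callable that prefers a server-suggested retry delay,
# capped at max_wait, and otherwise falls back to exponential backoff.
import re
from tenacity import RetryCallState, retry, retry_if_exception_type, stop_after_attempt, wait_exponential


class FakeRateLimitError(Exception):
    """Stand-in for an API error carrying a structured 'retryDelay' hint."""

    def __init__(self, details: dict):
        super().__init__(details)
        self.details = details


def extract_retry_delay(exc: BaseException) -> float | None:
    """Pull a delay in seconds out of an error payload like {"retryDelay": "2s"}."""
    details = getattr(exc, "details", None)
    if isinstance(details, dict) and (delay := details.get("retryDelay")):
        if match := re.search(r"(\d+)s", delay):
            return float(match.group(1))
    return None


def wait_with_server_delay(min_wait: float = 1, max_wait: float = 10):
    """Return a tenacity wait callable: honor the server hint if present, else back off."""
    fallback = wait_exponential(multiplier=1, min=min_wait, max=max_wait)

    def wait_func(retry_state: RetryCallState) -> float:
        if retry_state.outcome and retry_state.outcome.failed:
            delay = extract_retry_delay(retry_state.outcome.exception())
            if delay:
                return min(delay, max_wait)
        return fallback(retry_state)

    return wait_func


@retry(
    retry=retry_if_exception_type(FakeRateLimitError),
    wait=wait_with_server_delay(min_wait=1, max_wait=10),
    stop=stop_after_attempt(3),
    reraise=True,
)
def call_flaky_api(attempts: list) -> str:
    # Fail the first two attempts with a rate limit error suggesting a 2s retry delay.
    attempts.append(1)
    if len(attempts) < 3:
        raise FakeRateLimitError({"retryDelay": "2s"})
    return "ok"


if __name__ == "__main__":
    print(call_flaky_api([]))  # sleeps ~2s between attempts, then prints "ok"
```

Running the sketch shows the decorated call sleeping for the server-suggested delay between attempts instead of the default exponential schedule.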

khoj/processor/conversation/openai/utils.py CHANGED

@@ -100,6 +100,7 @@ def completion_with_backoff(
         reasoning_effort = "high" if deepthought else "low"
         model_kwargs["reasoning_effort"] = reasoning_effort
     elif model_name.startswith("deepseek-reasoner"):
+        stream_processor = in_stream_thought_processor
         # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
         # The first message should always be a user message (except system message).
         updated_messages: List[dict] = []
@@ -111,8 +112,8 @@ def completion_with_backoff(
             else:
                 updated_messages.append(message)
         formatted_messages = updated_messages
-    elif
-        stream_processor =
+    elif is_qwen_style_reasoning_model(model_name, api_base_url):
+        stream_processor = in_stream_thought_processor
         # Reasoning is enabled by default. Disable when deepthought is False.
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
         if not deepthought:
@@ -144,6 +145,14 @@ def completion_with_backoff(
             elif chunk.type == "tool_calls.function.arguments.done":
                 tool_calls += [ToolCall(name=chunk.name, args=json.loads(chunk.arguments), id=None)]
         if tool_calls:
+            # If there are tool calls, aggregate thoughts and responses into thoughts
+            if thoughts and aggregated_response:
+                # wrap each line of thought in italics
+                thoughts = "\n".join([f"*{line.strip()}*" for line in thoughts.splitlines() if line.strip()])
+                thoughts = f"{thoughts}\n\n{aggregated_response}"
+            else:
+                thoughts = thoughts or aggregated_response
+            # Json dump tool calls into aggregated response
             tool_calls = [
                 ToolCall(name=chunk.name, args=chunk.args, id=tool_id) for chunk, tool_id in zip(tool_calls, tool_ids)
             ]
@@ -158,6 +167,25 @@ def completion_with_backoff(
             **model_kwargs,
         )
         aggregated_response = chunk.choices[0].message.content
+        if hasattr(chunk.choices[0].message, "reasoning_content"):
+            thoughts = chunk.choices[0].message.reasoning_content
+        else:
+            thoughts = chunk.choices[0].message.model_extra.get("reasoning_content", "")
+        raw_tool_calls = chunk.choices[0].message.tool_calls
+        if raw_tool_calls:
+            tool_calls = [
+                ToolCall(name=tool.function.name, args=tool.function.parsed_arguments, id=tool.id)
+                for tool in raw_tool_calls
+            ]
+            # If there are tool calls, aggregate thoughts and responses into thoughts
+            if thoughts and aggregated_response:
+                # wrap each line of thought in italics
+                thoughts = "\n".join([f"*{line.strip()}*" for line in thoughts.splitlines() if line.strip()])
+                thoughts = f"{thoughts}\n\n{aggregated_response}"
+            else:
+                thoughts = thoughts or aggregated_response
+            # Json dump tool calls into aggregated response
+            aggregated_response = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
 
     # Calculate cost of chat
     input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
@@ -216,7 +244,7 @@ async def chat_completion_with_backoff(
         openai_async_clients[client_key] = client
 
     stream = not is_non_streaming_model(model_name, api_base_url)
-    stream_processor =
+    stream_processor = astream_thought_processor
    if stream:
         model_kwargs["stream_options"] = {"include_usage": True}
     else:
@@ -244,13 +272,13 @@ async def chat_completion_with_backoff(
                 "content"
             ] = f"{first_system_message_content}\nFormatting re-enabled"
     elif is_twitter_reasoning_model(model_name, api_base_url):
-        stream_processor = adeepseek_stream_processor
         reasoning_effort = "high" if deepthought else "low"
         model_kwargs["reasoning_effort"] = reasoning_effort
     elif model_name.startswith("deepseek-reasoner") or "deepseek-r1" in model_name:
-        # Official Deepseek reasoner model
-        #
-
+        # Official Deepseek reasoner model and some inference APIs like vLLM return structured thinking output.
+        # Others like DeepInfra return it in response stream.
+        # Using the instream thought processor handles both cases, structured thoughts and in response thoughts.
+        stream_processor = ain_stream_thought_processor
         # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
         # The first message should always be a user message (except system message).
         updated_messages: List[dict] = []
@@ -266,8 +294,8 @@ async def chat_completion_with_backoff(
             else:
                 updated_messages.append(message)
         formatted_messages = updated_messages
-    elif
-        stream_processor =
+    elif is_qwen_style_reasoning_model(model_name, api_base_url):
+        stream_processor = ain_stream_thought_processor
         # Reasoning is enabled by default. Disable when deepthought is False.
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
         if not deepthought:
@@ -492,11 +520,12 @@ def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> boo
     )
 
 
-def
+def is_qwen_style_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
-    Check if the model is a Qwen reasoning model
+    Check if the model is a Qwen style reasoning model
     """
-
+    qwen_style_reason_model = ["qwen3", "smollm3"]
+    return any(prefix in model_name.lower() for prefix in qwen_style_reason_model) and api_base_url is not None
 
 
 def is_local_api(api_base_url: str) -> bool:
@@ -543,39 +572,17 @@ def default_stream_processor(
     chat_stream: ChatCompletionStream,
 ) -> Generator[ChatCompletionStreamWithThoughtEvent, None, None]:
     """
-
+    Generator of chunks from the standard openai chat completions stream.
     """
     for chunk in chat_stream:
         yield chunk
 
 
-async def
+async def astream_thought_processor(
     chat_stream: openai.AsyncStream[ChatCompletionChunk],
 ) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
     """
-    Async generator
-    """
-    async for chunk in chat_stream:
-        try:
-            # Validate the chunk has the required fields before processing
-            chunk_data = chunk.model_dump()
-
-            # Skip chunks that don't have the required object field or have invalid values
-            if not chunk_data.get("object") or chunk_data.get("object") != "chat.completion.chunk":
-                logger.warning(f"Skipping invalid chunk with object field: {chunk_data.get('object', 'missing')}")
-                continue
-
-            yield ChatCompletionWithThoughtsChunk.model_validate(chunk_data)
-        except Exception as e:
-            logger.warning(f"Error processing chunk: {e}. Skipping malformed chunk.")
-            continue
-
-
-async def adeepseek_stream_processor(
-    chat_stream: openai.AsyncStream[ChatCompletionChunk],
-) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
-    """
-    Async generator to cast and return chunks from the deepseek chat completions stream.
+    Async generator of chunks from standard openai chat completions stream with thoughts/reasoning.
     """
     async for chunk in chat_stream:
         try:
@@ -588,12 +595,19 @@ async def adeepseek_stream_processor(
                 continue
 
             tchunk = ChatCompletionWithThoughtsChunk.model_validate(chunk_data)
+
+            # Handlle deepseek style response with thoughts. Used by AI APIs like vLLM, sgLang, DeepSeek, LiteLLM.
             if (
                 len(tchunk.choices) > 0
                 and hasattr(tchunk.choices[0].delta, "reasoning_content")
                 and tchunk.choices[0].delta.reasoning_content
            ):
                 tchunk.choices[0].delta.thought = chunk.choices[0].delta.reasoning_content
+
+            # Handlle llama.cpp server style response with thoughts.
+            elif len(tchunk.choices) > 0 and tchunk.choices[0].delta.model_extra.get("reasoning_content"):
+                tchunk.choices[0].delta.thought = tchunk.choices[0].delta.model_extra.get("reasoning_content")
+
             yield tchunk
         except Exception as e:
             logger.warning(f"Error processing chunk: {e}. Skipping malformed chunk.")
@@ -702,7 +716,7 @@ async def ain_stream_thought_processor(
     chat_stream: openai.AsyncStream[ChatCompletionChunk], thought_tag="think"
 ) -> AsyncGenerator[ChatCompletionWithThoughtsChunk, None]:
     """
-    Async generator for chat completion with thought chunks.
+    Async generator for chat completion with structured and inline thought chunks.
     Assumes <thought_tag>...</thought_tag> can only appear once at the start.
     Handles partial tags across streamed chunks.
     """
@@ -712,7 +726,7 @@ async def ain_stream_thought_processor(
     # Modes and transitions: detect_start > thought (optional) > message
     mode = "detect_start"
 
-    async for chunk in
+    async for chunk in astream_thought_processor(chat_stream):
         if len(chunk.choices) == 0:
             continue
         if mode == "message":
@@ -829,6 +843,7 @@ def to_openai_tools(tools: List[ToolDefinition]) -> List[Dict] | None:
                 "name": tool.name,
                 "description": tool.description,
                 "parameters": clean_response_schema(tool.schema),
+                "strict": True,
             },
         }
         for tool in tools
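
The OpenAI utils changes route reasoning models through `ain_stream_thought_processor`, which strips a single `<think>...</think>` block from the start of the streamed text even when the tags are split across chunks. The sketch below shows that tag-splitting idea on plain text chunks, independent of the OpenAI client types; `split_thoughts` is an illustrative stand-in, not the khoj implementation.

```python
# Sketch: split a leading <think>...</think> block out of a stream of text chunks,
# handling tags that arrive partially across chunk boundaries.
from typing import Iterable, Iterator, Tuple


def split_thoughts(chunks: Iterable[str], tag: str = "think") -> Iterator[Tuple[str, str]]:
    """Yield ("thought", text) and ("message", text) pieces from a text stream.

    Assumes at most one <tag>...</tag> block, only at the very start of the stream.
    """
    start_tag, end_tag = f"<{tag}>", f"</{tag}>"
    buffer = ""
    mode = "detect_start"  # detect_start -> thought (optional) -> message

    for chunk in chunks:
        if mode == "message":
            yield ("message", chunk)
            continue

        buffer += chunk
        if mode == "detect_start":
            if len(buffer) < len(start_tag) and start_tag.startswith(buffer):
                continue  # could still become the start tag; wait for more text
            if buffer.startswith(start_tag):
                mode = "thought"
                buffer = buffer[len(start_tag):]
            else:
                mode = "message"
                yield ("message", buffer)
                buffer = ""
                continue

        if mode == "thought":
            if end_tag in buffer:
                thought, rest = buffer.split(end_tag, 1)
                if thought:
                    yield ("thought", thought)
                mode = "message"
                if rest:
                    yield ("message", rest)
                buffer = ""
            else:
                # Hold back a possible partial closing tag at the end of the buffer.
                safe = len(buffer) - (len(end_tag) - 1)
                if safe > 0:
                    yield ("thought", buffer[:safe])
                    buffer = buffer[safe:]

    if buffer:
        # Stream ended mid-thought or with an unmatched partial tag; flush what is left.
        yield (("thought" if mode == "thought" else "message"), buffer)


if __name__ == "__main__":
    for kind, text in split_thoughts(["<thi", "nk>plan the answer</thi", "nk>Here is the answer."]):
        print(kind, repr(text))
```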

khoj/processor/operator/__init__.py CHANGED

@@ -44,7 +44,7 @@ async def operate_environment(
     query_files: str = None,  # TODO: Handle query files
     cancellation_event: Optional[asyncio.Event] = None,
     interrupt_queue: Optional[asyncio.Queue] = None,
-    abort_message: Optional[str] =
+    abort_message: Optional[str] = ChatEvent.END_EVENT.value,
     tracer: dict = {},
 ):
     response, user_input_message = None, None

khoj/routers/api_agents.py CHANGED

khoj/routers/api_chat.py CHANGED
@@ -4,6 +4,7 @@ import json
 import logging
 import time
 import uuid
+from dataclasses import dataclass
 from datetime import datetime
 from functools import partial
 from typing import Any, Dict, List, Optional
@@ -703,7 +704,6 @@ async def event_generator(
     train_of_thought = []
     cancellation_event = asyncio.Event()
     child_interrupt_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
-    event_delimiter = "␃🔚␗"
 
     tracer: dict = {
         "mid": turn_id,
@@ -790,7 +790,7 @@ async def event_generator(
 
         # Check if any interrupt query is received
         if interrupt_query := get_message_from_queue(parent_interrupt_queue):
-            if interrupt_query ==
+            if interrupt_query == ChatEvent.END_EVENT.value:
                 cancellation_event.set()
                 logger.debug(f"Chat cancelled by user {user} via interrupt queue.")
             else:
@@ -871,7 +871,7 @@ async def event_generator(
             )
         finally:
             if not cancellation_event.is_set():
-                yield
+                yield ChatEvent.END_EVENT.value
             # Cancel the disconnect monitor task if it is still running
             if cancellation_event.is_set() or event_type == ChatEvent.END_RESPONSE:
                 await cancel_disconnect_monitor()
@@ -1043,7 +1043,7 @@ async def event_generator(
                 tracer=tracer,
                 cancellation_event=cancellation_event,
                 interrupt_queue=child_interrupt_queue,
-                abort_message=
+                abort_message=ChatEvent.END_EVENT.value,
             ):
                 if isinstance(research_result, ResearchIteration):
                     if research_result.summarizedResult:
@@ -1397,6 +1397,7 @@ async def event_generator(
         )
 
         full_response = ""
+        message_start = True
         async for item in llm_response:
             # Should not happen with async generator. Skip.
             if item is None or not isinstance(item, ResponseWithThought):
@@ -1410,10 +1411,11 @@ async def event_generator(
                 async for result in send_event(ChatEvent.THOUGHT, item.thought):
                     yield result
                 continue
-
             # Start sending response
-
-
+            elif message_start:
+                message_start = False
+                async for result in send_event(ChatEvent.START_LLM_RESPONSE, ""):
+                    yield result
 
             try:
                 async for result in send_event(ChatEvent.MESSAGE, message):
@@ -1423,6 +1425,13 @@ async def event_generator(
                 logger.warning(f"Error during streaming. Stopping send: {e}")
                 break
 
+            # Check if the user has disconnected
+            if cancellation_event.is_set():
+                logger.debug(f"Stopping LLM response to user {user} on {common.client} client.")
+                # Cancel the disconnect monitor task if it is still running
+                await cancel_disconnect_monitor()
+                return
+
         # Save conversation once finish streaming
         asyncio.create_task(
             save_to_conversation_log(
@@ -1448,16 +1457,16 @@ async def event_generator(
         )
 
         # Signal end of LLM response after the loop finishes
-
-
-
-
-
-
-
-
-
-
+        async for result in send_event(ChatEvent.END_LLM_RESPONSE, ""):
+            yield result
+
+        # Send Usage Metadata once llm interactions are complete
+        if tracer.get("usage"):
+            async for event in send_event(ChatEvent.USAGE, tracer.get("usage")):
+                yield event
+        async for result in send_event(ChatEvent.END_RESPONSE, ""):
+            yield result
+        logger.debug("Finished streaming response")
 
         # Cancel the disconnect monitor task if it is still running
         await cancel_disconnect_monitor()
@@ -1509,8 +1518,7 @@ async def chat_ws(
             if data.get("type") == "interrupt":
                 if current_task and not current_task.done():
                     # Send interrupt signal to the ongoing task
-
-                    await interrupt_queue.put(data.get("query") or abort_message)
+                    await interrupt_queue.put(data.get("query") or ChatEvent.END_EVENT.value)
                     logger.info(
                         f"Interrupt signal sent to ongoing task for user {websocket.scope['user'].object.id} with query: {data.get('query')}"
                     )
@@ -1572,6 +1580,37 @@ async def process_chat_request(
     interrupt_queue: asyncio.Queue,
 ):
     """Process a single chat request with interrupt support"""
+
+    # Server-side message buffering for better streaming performance
+    @dataclass
+    class MessageBuffer:
+        """Buffer for managing streamed chat messages with timing control."""
+
+        content: str = ""
+        timeout: Optional[asyncio.Task] = None
+        last_flush: float = 0.0
+
+        def __post_init__(self):
+            """Initialize last_flush with current time if not provided."""
+            if self.last_flush == 0.0:
+                self.last_flush = time.perf_counter()
+
+    message_buffer = MessageBuffer()
+    BUFFER_FLUSH_INTERVAL = 0.1  # 100ms buffer interval
+    BUFFER_MAX_SIZE = 512  # Flush if buffer reaches this size
+
+    async def flush_message_buffer():
+        """Flush the accumulated message buffer to the client"""
+        nonlocal message_buffer
+        if message_buffer.content:
+            buffered_content = message_buffer.content
+            message_buffer.content = ""
+            message_buffer.last_flush = time.perf_counter()
+            if message_buffer.timeout:
+                message_buffer.timeout.cancel()
+                message_buffer.timeout = None
+            yield buffered_content
+
     try:
         # Since we are using websockets, we can ignore the stream parameter and always stream
         response_iterator = event_generator(
@@ -1583,7 +1622,43 @@ async def process_chat_request(
             interrupt_queue,
         )
         async for event in response_iterator:
-
+            if event.startswith("{") and event.endswith("}"):
+                evt_json = json.loads(event)
+                if evt_json["type"] == ChatEvent.END_LLM_RESPONSE.value:
+                    # Flush remaining buffer content on end llm response event
+                    chunks = "".join([chunk async for chunk in flush_message_buffer()])
+                    await websocket.send_text(chunks)
+                    await websocket.send_text(ChatEvent.END_EVENT.value)
+                await websocket.send_text(event)
+                await websocket.send_text(ChatEvent.END_EVENT.value)
+            elif event != ChatEvent.END_EVENT.value:
+                # Buffer MESSAGE events for better streaming performance
+                message_buffer.content += str(event)
+
+                # Flush if buffer is too large or enough time has passed
+                current_time = time.perf_counter()
+                should_flush_time = (current_time - message_buffer.last_flush) >= BUFFER_FLUSH_INTERVAL
+                should_flush_size = len(message_buffer.content) >= BUFFER_MAX_SIZE
+
+                if should_flush_size or should_flush_time:
+                    chunks = "".join([chunk async for chunk in flush_message_buffer()])
+                    await websocket.send_text(chunks)
+                    await websocket.send_text(ChatEvent.END_EVENT.value)
+                else:
+                    # Cancel any previous timeout tasks to reset the flush timer
+                    if message_buffer.timeout:
+                        message_buffer.timeout.cancel()
+
+                    async def delayed_flush():
+                        """Flush message buffer if no new messages arrive within debounce interval."""
+                        await asyncio.sleep(BUFFER_FLUSH_INTERVAL)
+                        # Check if there's still content to flush
+                        chunks = "".join([chunk async for chunk in flush_message_buffer()])
+                        await websocket.send_text(chunks)
+                        await websocket.send_text(ChatEvent.END_EVENT.value)
+
+                    # Flush buffer if no new messages arrive within debounce interval
+                    message_buffer.timeout = asyncio.create_task(delayed_flush())
     except asyncio.CancelledError:
         logger.debug(f"Chat request cancelled for user {websocket.scope['user'].object.id}")
         raise
khoj/routers/helpers.py CHANGED

@@ -2099,7 +2099,8 @@ class WebSocketConnectionManager:
             user=user, slug__startswith=self.connection_slug_prefix
         ).acount()
 
-
+        # Restrict max active connections per user in production
+        return active_connections < max_connections or state.anonymous_mode or in_debug_mode()
 
     async def register_connection(self, user: KhojUser, connection_id: str) -> None:
         """Register a new WebSocket connection."""
@@ -2616,7 +2617,6 @@ class MessageProcessor:
 
 async def read_chat_stream(response_iterator: AsyncGenerator[str, None]) -> Dict[str, Any]:
     processor = MessageProcessor()
-    event_delimiter = "␃🔚␗"
     buffer = ""
 
     async for chunk in response_iterator:
@@ -2624,9 +2624,9 @@ async def read_chat_stream(response_iterator: AsyncGenerator[str, None]) -> Dict
         buffer += chunk
 
         # Once the buffer contains a complete event
-        while
+        while ChatEvent.END_EVENT.value in buffer:
             # Extract the event from the buffer
-            event, buffer = buffer.split(
+            event, buffer = buffer.split(ChatEvent.END_EVENT.value, 1)
             # Process the event
             if event:
                 processor.process_message_chunk(event)
khoj/routers/research.py CHANGED

@@ -224,7 +224,7 @@ async def research(
     query_files: str = None,
     cancellation_event: Optional[asyncio.Event] = None,
     interrupt_queue: Optional[asyncio.Queue] = None,
-    abort_message: str =
+    abort_message: str = ChatEvent.END_EVENT.value,
 ):
     max_document_searches = 7
     max_online_searches = 3
khoj/routers/web_client.py CHANGED

@@ -139,3 +139,8 @@ def automations_config_page(
 @web_client.get("/.well-known/assetlinks.json", response_class=FileResponse)
 def assetlinks(request: Request):
     return FileResponse(constants.assetlinks_file_path)
+
+
+@web_client.get("/server/error", response_class=HTMLResponse)
+def server_error_page(request: Request):
+    return templates.TemplateResponse("error.html", context={"request": request})