khoj 1.41.1.dev37__py3-none-any.whl → 1.41.1.dev39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. khoj/interface/compiled/404/index.html +1 -1
  2. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  3. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
  4. khoj/interface/compiled/agents/index.html +1 -1
  5. khoj/interface/compiled/agents/index.txt +1 -1
  6. khoj/interface/compiled/automations/index.html +1 -1
  7. khoj/interface/compiled/automations/index.txt +1 -1
  8. khoj/interface/compiled/chat/index.html +1 -1
  9. khoj/interface/compiled/chat/index.txt +1 -1
  10. khoj/interface/compiled/index.html +1 -1
  11. khoj/interface/compiled/index.txt +1 -1
  12. khoj/interface/compiled/search/index.html +1 -1
  13. khoj/interface/compiled/search/index.txt +1 -1
  14. khoj/interface/compiled/settings/index.html +1 -1
  15. khoj/interface/compiled/settings/index.txt +1 -1
  16. khoj/interface/compiled/share/chat/index.html +1 -1
  17. khoj/interface/compiled/share/chat/index.txt +1 -1
  18. khoj/processor/conversation/anthropic/utils.py +79 -72
  19. khoj/processor/conversation/google/utils.py +86 -77
  20. khoj/processor/conversation/openai/utils.py +156 -120
  21. khoj/utils/helpers.py +26 -0
  22. {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/METADATA +1 -1
  23. {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/RECORD +28 -28
  24. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  25. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-6fb51c5c80f8ec67.js +0 -1
  26. /khoj/interface/compiled/_next/static/{v77HARG2K4O2DRocPvOst → lAg9p8vAsLV6wpRoMm4qC}/_buildManifest.js +0 -0
  27. /khoj/interface/compiled/_next/static/{v77HARG2K4O2DRocPvOst → lAg9p8vAsLV6wpRoMm4qC}/_ssgManifest.js +0 -0
  28. {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/WHEEL +0 -0
  29. {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/entry_points.txt +0 -0
  30. {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/google/utils.py CHANGED
@@ -73,6 +73,9 @@ def _is_retryable_error(exception: BaseException) -> bool:
     # client errors
     if isinstance(exception, httpx.TimeoutException) or isinstance(exception, httpx.NetworkError):
         return True
+    # validation errors
+    if isinstance(exception, ValueError):
+        return True
     return False
 
 
@@ -84,8 +87,8 @@ def _is_retryable_error(exception: BaseException) -> bool:
     reraise=True,
 )
 def gemini_completion_with_backoff(
-    messages,
-    system_prompt,
+    messages: list[ChatMessage],
+    system_prompt: str,
     model_name: str,
     temperature=1.0,
     api_key=None,
@@ -144,6 +147,11 @@ def gemini_completion_with_backoff(
         model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
     )
 
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(response_text):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
     # Save conversation trace
     tracer["chat_model"] = model_name
     tracer["temperature"] = temperature
@@ -157,89 +165,90 @@ def gemini_completion_with_backoff(
     retry=retry_if_exception(_is_retryable_error),
     wait=wait_exponential(multiplier=1, min=4, max=10),
     stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    reraise=True,
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+    reraise=False,
 )
 async def gemini_chat_completion_with_backoff(
-    messages,
-    model_name,
-    temperature,
-    api_key,
-    api_base_url,
-    system_prompt,
+    messages: list[ChatMessage],
+    model_name: str,
+    temperature: float,
+    api_key: str,
+    api_base_url: str,
+    system_prompt: str,
     model_kwargs=None,
     deepthought=False,
     tracer: dict = {},
 ) -> AsyncGenerator[str, None]:
-    try:
-        client = gemini_clients.get(api_key)
-        if not client:
-            client = get_gemini_client(api_key, api_base_url)
-            gemini_clients[api_key] = client
-
-        formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
-
-        thinking_config = None
-        if deepthought and model_name.startswith("gemini-2-5"):
-            thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
-
-        seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
-        config = gtypes.GenerateContentConfig(
-            system_instruction=system_prompt,
-            temperature=temperature,
-            thinking_config=thinking_config,
-            max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
-            stop_sequences=["Notes:\n["],
-            safety_settings=SAFETY_SETTINGS,
-            seed=seed,
-            http_options=gtypes.HttpOptions(async_client_args={"timeout": httpx.Timeout(30.0, read=60.0)}),
-        )
+    client = gemini_clients.get(api_key)
+    if not client:
+        client = get_gemini_client(api_key, api_base_url)
+        gemini_clients[api_key] = client
 
-        aggregated_response = ""
-        final_chunk = None
-        response_started = False
-        start_time = perf_counter()
-        chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
-            model=model_name, config=config, contents=formatted_messages
-        )
-        async for chunk in chat_stream:
-            # Log the time taken to start response
-            if not response_started:
-                response_started = True
-                logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
-            # Keep track of the last chunk for usage data
-            final_chunk = chunk
-            # Handle streamed response chunk
-            message, stopped = handle_gemini_response(chunk.candidates, chunk.prompt_feedback)
-            message = message or chunk.text
-            aggregated_response += message
-            yield message
-            if stopped:
-                raise ValueError(message)
-
-        # Log the time taken to stream the entire response
-        logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
-
-        # Calculate cost of chat
-        input_tokens = final_chunk.usage_metadata.prompt_token_count or 0 if final_chunk else 0
-        output_tokens = final_chunk.usage_metadata.candidates_token_count or 0 if final_chunk else 0
-        thought_tokens = final_chunk.usage_metadata.thoughts_token_count or 0 if final_chunk else 0
-        tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
-        )
+    formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
 
-        # Save conversation trace
-        tracer["chat_model"] = model_name
-        tracer["temperature"] = temperature
-        if is_promptrace_enabled():
-            commit_conversation_trace(messages, aggregated_response, tracer)
-    except ValueError as e:
-        logger.warning(
-            f"LLM Response Prevented for {model_name}: {e.args[0]}.\n"
-            + f"Last Message by {messages[-1].role}: {messages[-1].content}"
-        )
-    except Exception as e:
-        logger.error(f"Error in gemini_chat_completion_with_backoff stream: {e}", exc_info=True)
+    thinking_config = None
+    if deepthought and model_name.startswith("gemini-2-5"):
+        thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
+
+    seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
+    config = gtypes.GenerateContentConfig(
+        system_instruction=system_prompt,
+        temperature=temperature,
+        thinking_config=thinking_config,
+        max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
+        stop_sequences=["Notes:\n["],
+        safety_settings=SAFETY_SETTINGS,
+        seed=seed,
+        http_options=gtypes.HttpOptions(async_client_args={"timeout": httpx.Timeout(30.0, read=60.0)}),
+    )
+
+    aggregated_response = ""
+    final_chunk = None
+    response_started = False
+    start_time = perf_counter()
+    chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
+        model=model_name, config=config, contents=formatted_messages
+    )
+    async for chunk in chat_stream:
+        # Log the time taken to start response
+        if not response_started:
+            response_started = True
+            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+        # Keep track of the last chunk for usage data
+        final_chunk = chunk
+        # Handle streamed response chunk
+        stop_message, stopped = handle_gemini_response(chunk.candidates, chunk.prompt_feedback)
+        message = stop_message or chunk.text
+        aggregated_response += message
+        yield message
+        if stopped:
+            logger.warning(
+                f"LLM Response Prevented for {model_name}: {stop_message}.\n"
+                + f"Last Message by {messages[-1].role}: {messages[-1].content}"
+            )
+            break
+
+    # Calculate cost of chat
+    input_tokens = final_chunk.usage_metadata.prompt_token_count or 0 if final_chunk else 0
+    output_tokens = final_chunk.usage_metadata.candidates_token_count or 0 if final_chunk else 0
+    thought_tokens = final_chunk.usage_metadata.thoughts_token_count or 0 if final_chunk else 0
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
+    )
+
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
+    # Log the time taken to stream the entire response
+    logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
+
+    # Save conversation trace
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        commit_conversation_trace(messages, aggregated_response, tracer)
 
 
 def handle_gemini_response(
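
One behavioral note on the decorator change above: in tenacity, reraise=True re-raises the original exception once attempts are exhausted, while reraise=False raises a RetryError wrapping the final attempt. A small illustrative sketch of the difference callers see (toy function, not the Khoj code path):

    from tenacity import RetryError, retry, retry_if_exception_type, stop_after_attempt

    @retry(retry=retry_if_exception_type(ValueError), stop=stop_after_attempt(2), reraise=False)
    def always_empty() -> str:
        raise ValueError("Empty or no response over API. Retry if needed.")

    try:
        always_empty()
    except RetryError as err:
        # With reraise=False the caller sees RetryError; the original exception is still
        # available on the final attempt. With reraise=True, ValueError would surface directly.
        print(type(err.last_attempt.exception()).__name__)  # ValueError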
khoj/processor/conversation/openai/utils.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import os
+from copy import deepcopy
 from functools import partial
 from time import perf_counter
 from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union
@@ -7,6 +8,7 @@ from urllib.parse import urlparse
 
 import httpx
 import openai
+from langchain_core.messages.chat import ChatMessage
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
     ChatCompletionStreamEvent,
@@ -32,9 +34,11 @@ from khoj.processor.conversation.utils import (
     commit_conversation_trace,
 )
 from khoj.utils.helpers import (
+    convert_image_data_uri,
     get_chat_usage_metrics,
     get_openai_async_client,
     get_openai_client,
+    is_none_or_empty,
     is_promptrace_enabled,
 )
 
@@ -51,6 +55,7 @@ openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
         | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
         | retry_if_exception_type(openai._exceptions.APIStatusError)
+        | retry_if_exception_type(ValueError)
     ),
     wait=wait_random_exponential(min=1, max=10),
     stop=stop_after_attempt(3),
@@ -58,7 +63,7 @@ openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
     reraise=True,
 )
 def completion_with_backoff(
-    messages,
+    messages: List[ChatMessage],
     model_name: str,
     temperature=0.8,
     openai_api_key=None,
@@ -74,7 +79,7 @@ def completion_with_backoff(
         openai_clients[client_key] = client
 
     stream_processor = default_stream_processor
-    formatted_messages = [{"role": message.role, "content": message.content} for message in messages]
+    formatted_messages = format_message_for_api(messages, api_base_url)
 
     # Tune reasoning models arguments
     if is_openai_reasoning_model(model_name, api_base_url):
@@ -133,6 +138,11 @@ def completion_with_backoff(
         model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
     )
 
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
     # Save conversation trace
     tracer["chat_model"] = model_name
     tracer["temperature"] = temperature
@@ -149,14 +159,15 @@ def completion_with_backoff(
         | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
         | retry_if_exception_type(openai._exceptions.APIStatusError)
+        | retry_if_exception_type(ValueError)
     ),
     wait=wait_exponential(multiplier=1, min=4, max=10),
     stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-    reraise=True,
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+    reraise=False,
 )
 async def chat_completion_with_backoff(
-    messages,
+    messages: list[ChatMessage],
     model_name: str,
     temperature,
     openai_api_key=None,
@@ -165,120 +176,122 @@ async def chat_completion_with_backoff(
     model_kwargs: dict = {},
     tracer: dict = {},
 ) -> AsyncGenerator[ResponseWithThought, None]:
-    try:
-        client_key = f"{openai_api_key}--{api_base_url}"
-        client = openai_async_clients.get(client_key)
-        if not client:
-            client = get_openai_async_client(openai_api_key, api_base_url)
-            openai_async_clients[client_key] = client
-
-        stream_processor = adefault_stream_processor
-        formatted_messages = [{"role": message.role, "content": message.content} for message in messages]
-
-        # Configure thinking for openai reasoning models
-        if is_openai_reasoning_model(model_name, api_base_url):
-            temperature = 1
-            reasoning_effort = "medium" if deepthought else "low"
-            model_kwargs["reasoning_effort"] = reasoning_effort
-            model_kwargs.pop("stop", None)  # Remove unsupported stop param for reasoning models
-
-            # Get the first system message and add the string `Formatting re-enabled` to it.
-            # See https://platform.openai.com/docs/guides/reasoning-best-practices
-            if len(formatted_messages) > 0:
-                system_messages = [
-                    (i, message) for i, message in enumerate(formatted_messages) if message["role"] == "system"
-                ]
-                if len(system_messages) > 0:
-                    first_system_message_index, first_system_message = system_messages[0]
-                    first_system_message_content = first_system_message["content"]
-                    formatted_messages[first_system_message_index][
-                        "content"
-                    ] = f"{first_system_message_content}\nFormatting re-enabled"
-        elif is_twitter_reasoning_model(model_name, api_base_url):
-            stream_processor = adeepseek_stream_processor
-            reasoning_effort = "high" if deepthought else "low"
-            model_kwargs["reasoning_effort"] = reasoning_effort
-        elif model_name.startswith("deepseek-reasoner"):
-            stream_processor = adeepseek_stream_processor
-            # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
-            # The first message should always be a user message (except system message).
-            updated_messages: List[dict] = []
-            for i, message in enumerate(formatted_messages):
-                if i > 0 and message["role"] == formatted_messages[i - 1]["role"]:
-                    updated_messages[-1]["content"] += " " + message["content"]
-                elif i == 1 and formatted_messages[i - 1]["role"] == "system" and message["role"] == "assistant":
-                    updated_messages[-1]["content"] += " " + message["content"]
-                else:
-                    updated_messages.append(message)
-            formatted_messages = updated_messages
-        elif is_qwen_reasoning_model(model_name, api_base_url):
-            stream_processor = partial(ain_stream_thought_processor, thought_tag="think")
-            # Reasoning is enabled by default. Disable when deepthought is False.
-            # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
-            if not deepthought and len(formatted_messages) > 0:
-                formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
-
-        stream = True
-        read_timeout = 300 if is_local_api(api_base_url) else 60
-        model_kwargs["stream_options"] = {"include_usage": True}
-        if os.getenv("KHOJ_LLM_SEED"):
-            model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
-
-        aggregated_response = ""
-        final_chunk = None
-        response_started = False
-        start_time = perf_counter()
-        chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
-            messages=formatted_messages,  # type: ignore
-            model=model_name,
-            stream=stream,
-            temperature=temperature,
-            timeout=httpx.Timeout(30, read=read_timeout),
-            **model_kwargs,
-        )
-        async for chunk in stream_processor(chat_stream):
-            # Log the time taken to start response
-            if not response_started:
-                response_started = True
-                logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
-            # Keep track of the last chunk for usage data
-            final_chunk = chunk
-            # Skip empty chunks
-            if len(chunk.choices) == 0:
-                continue
-            # Handle streamed response chunk
-            response_chunk: ResponseWithThought = None
-            response_delta = chunk.choices[0].delta
-            if response_delta.content:
-                response_chunk = ResponseWithThought(response=response_delta.content)
-                aggregated_response += response_chunk.response
-            elif response_delta.thought:
-                response_chunk = ResponseWithThought(thought=response_delta.thought)
-            if response_chunk:
-                yield response_chunk
-
-        # Log the time taken to stream the entire response
-        logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
-
-        # Calculate cost of chat after stream finishes
-        input_tokens, output_tokens, cost = 0, 0, 0
-        if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
-            input_tokens = final_chunk.usage.prompt_tokens
-            output_tokens = final_chunk.usage.completion_tokens
-            # Estimated costs returned by DeepInfra API
-            if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
-                cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
-
-        # Save conversation trace
-        tracer["chat_model"] = model_name
-        tracer["temperature"] = temperature
-        tracer["usage"] = get_chat_usage_metrics(
-            model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
-        )
-        if is_promptrace_enabled():
-            commit_conversation_trace(messages, aggregated_response, tracer)
-    except Exception as e:
-        logger.error(f"Error in chat_completion_with_backoff stream: {e}", exc_info=True)
+    client_key = f"{openai_api_key}--{api_base_url}"
+    client = openai_async_clients.get(client_key)
+    if not client:
+        client = get_openai_async_client(openai_api_key, api_base_url)
+        openai_async_clients[client_key] = client
+
+    stream_processor = adefault_stream_processor
+    formatted_messages = format_message_for_api(messages, api_base_url)
+
+    # Configure thinking for openai reasoning models
+    if is_openai_reasoning_model(model_name, api_base_url):
+        temperature = 1
+        reasoning_effort = "medium" if deepthought else "low"
+        model_kwargs["reasoning_effort"] = reasoning_effort
+        model_kwargs.pop("stop", None)  # Remove unsupported stop param for reasoning models
+
+        # Get the first system message and add the string `Formatting re-enabled` to it.
+        # See https://platform.openai.com/docs/guides/reasoning-best-practices
+        if len(formatted_messages) > 0:
+            system_messages = [
+                (i, message) for i, message in enumerate(formatted_messages) if message["role"] == "system"
+            ]
+            if len(system_messages) > 0:
+                first_system_message_index, first_system_message = system_messages[0]
+                first_system_message_content = first_system_message["content"]
+                formatted_messages[first_system_message_index][
+                    "content"
+                ] = f"{first_system_message_content}\nFormatting re-enabled"
+    elif is_twitter_reasoning_model(model_name, api_base_url):
+        stream_processor = adeepseek_stream_processor
+        reasoning_effort = "high" if deepthought else "low"
+        model_kwargs["reasoning_effort"] = reasoning_effort
+    elif model_name.startswith("deepseek-reasoner"):
+        stream_processor = adeepseek_stream_processor
+        # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
+        # The first message should always be a user message (except system message).
+        updated_messages: List[dict] = []
+        for i, message in enumerate(formatted_messages):
+            if i > 0 and message["role"] == formatted_messages[i - 1]["role"]:
+                updated_messages[-1]["content"] += " " + message["content"]
+            elif i == 1 and formatted_messages[i - 1]["role"] == "system" and message["role"] == "assistant":
+                updated_messages[-1]["content"] += " " + message["content"]
+            else:
+                updated_messages.append(message)
+        formatted_messages = updated_messages
+    elif is_qwen_reasoning_model(model_name, api_base_url):
+        stream_processor = partial(ain_stream_thought_processor, thought_tag="think")
+        # Reasoning is enabled by default. Disable when deepthought is False.
+        # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
+        if not deepthought and len(formatted_messages) > 0:
+            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
+
+    stream = True
+    read_timeout = 300 if is_local_api(api_base_url) else 60
+    model_kwargs["stream_options"] = {"include_usage": True}
+    if os.getenv("KHOJ_LLM_SEED"):
+        model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
+
+    aggregated_response = ""
+    final_chunk = None
+    response_started = False
+    start_time = perf_counter()
+    chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
+        messages=formatted_messages,  # type: ignore
+        model=model_name,
+        stream=stream,
+        temperature=temperature,
+        timeout=httpx.Timeout(30, read=read_timeout),
+        **model_kwargs,
+    )
+    async for chunk in stream_processor(chat_stream):
+        # Log the time taken to start response
+        if not response_started:
+            response_started = True
+            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+        # Keep track of the last chunk for usage data
+        final_chunk = chunk
+        # Skip empty chunks
+        if len(chunk.choices) == 0:
+            continue
+        # Handle streamed response chunk
+        response_chunk: ResponseWithThought = None
+        response_delta = chunk.choices[0].delta
+        if response_delta.content:
+            response_chunk = ResponseWithThought(response=response_delta.content)
+            aggregated_response += response_chunk.response
+        elif response_delta.thought:
+            response_chunk = ResponseWithThought(thought=response_delta.thought)
+        if response_chunk:
+            yield response_chunk
+
+    # Calculate cost of chat after stream finishes
+    input_tokens, output_tokens, cost = 0, 0, 0
+    if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
+        input_tokens = final_chunk.usage.prompt_tokens
+        output_tokens = final_chunk.usage.completion_tokens
+        # Estimated costs returned by DeepInfra API
+        if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
+            cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+    )
+
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
+    # Log the time taken to stream the entire response
+    logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
+
+    # Save conversation trace
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        commit_conversation_trace(messages, aggregated_response, tracer)
 
 
 def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:
@@ -293,11 +306,34 @@ def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:
     return JsonSupport.SCHEMA
 
 
+def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
+    """
+    Format messages to send to chat model served over OpenAI (compatible) API.
+    """
+    formatted_messages = []
+    for message in deepcopy(messages):
+        # Convert images to PNG format if message to be sent to non OpenAI API
+        if isinstance(message.content, list) and not is_openai_api(api_base_url):
+            for part in message.content:
+                if part.get("type") == "image_url":
+                    part["image_url"]["url"] = convert_image_data_uri(part["image_url"]["url"], target_format="png")
+        formatted_messages.append({"role": message.role, "content": message.content})
+
+    return formatted_messages
+
+
+def is_openai_api(api_base_url: str = None) -> bool:
+    """
+    Check if the model is served over the official OpenAI API
+    """
+    return api_base_url is None or api_base_url.startswith("https://api.openai.com/v1")
+
+
 def is_openai_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
     Check if the model is an OpenAI reasoning model
     """
-    return model_name.startswith("o") and (api_base_url is None or api_base_url.startswith("https://api.openai.com/v1"))
+    return model_name.startswith("o") and is_openai_api(api_base_url)
 
 
 def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
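
The new is_openai_api helper centralizes the base-URL check that is_openai_reasoning_model previously inlined, and format_message_for_api uses it to decide whether image parts should be re-encoded as PNG data URIs for non-OpenAI endpoints (operating on a deepcopy so the caller's messages are not mutated). A small usage sketch of the URL check, assuming khoj is importable:

    from khoj.processor.conversation.openai.utils import is_openai_api

    print(is_openai_api(None))                         # True: no base URL defaults to the official OpenAI API
    print(is_openai_api("https://api.openai.com/v1"))  # True
    print(is_openai_api("http://localhost:11434/v1"))  # False: e.g. a self-hosted OpenAI-compatible server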
khoj/utils/helpers.py CHANGED
@@ -555,6 +555,32 @@ def convert_image_to_webp(image_bytes):
     return webp_image_bytes
 
 
+def convert_image_data_uri(image_data_uri: str, target_format: str = "png") -> str:
+    """
+    Convert image (in data URI) to target format.
+
+    Target format can be png, jpg, webp etc.
+    Returns the converted image as a data URI.
+    """
+    base64_data = image_data_uri.split(",", 1)[1]
+    image_type = image_data_uri.split(";")[0].split(":")[1].split("/")[1]
+    if image_type.lower() == target_format.lower():
+        return image_data_uri
+
+    image_bytes = base64.b64decode(base64_data)
+    image_io = io.BytesIO(image_bytes)
+    with Image.open(image_io) as original_image:
+        output_image_io = io.BytesIO()
+        original_image.save(output_image_io, target_format.upper())
+
+    # Encode the image back to base64
+    output_image_bytes = output_image_io.getvalue()
+    output_image_io.close()
+    output_base64_data = base64.b64encode(output_image_bytes).decode("utf-8")
+    output_data_uri = f"data:image/{target_format};base64,{output_base64_data}"
+    return output_data_uri
+
+
 def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]:
     """
     Truncate large output files and drop image file data from code results.
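
A usage sketch for the new convert_image_data_uri helper, assuming khoj and Pillow are installed; the tiny generated JPEG is only for illustration:

    import base64
    import io

    from PIL import Image

    from khoj.utils.helpers import convert_image_data_uri

    # Build a small JPEG data URI to convert.
    buffer = io.BytesIO()
    Image.new("RGB", (4, 4), (255, 0, 0)).save(buffer, "JPEG")
    jpeg_data_uri = "data:image/jpeg;base64," + base64.b64encode(buffer.getvalue()).decode("utf-8")

    png_data_uri = convert_image_data_uri(jpeg_data_uri, target_format="png")
    print(png_data_uri[:22])  # data:image/png;base64,

    # Converting to the format the URI already uses returns the input unchanged.
    assert convert_image_data_uri(png_data_uri, target_format="png") == png_data_uri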
{khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: khoj
-Version: 1.41.1.dev37
+Version: 1.41.1.dev39
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev