khoj 1.41.1.dev37__py3-none-any.whl → 1.41.1.dev39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +1 -1
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +1 -1
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +1 -1
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +1 -1
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +1 -1
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +1 -1
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +1 -1
- khoj/processor/conversation/anthropic/utils.py +79 -72
- khoj/processor/conversation/google/utils.py +86 -77
- khoj/processor/conversation/openai/utils.py +156 -120
- khoj/utils/helpers.py +26 -0
- {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/METADATA +1 -1
- {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/RECORD +28 -28
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-6fb51c5c80f8ec67.js +0 -1
- /khoj/interface/compiled/_next/static/{v77HARG2K4O2DRocPvOst → lAg9p8vAsLV6wpRoMm4qC}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{v77HARG2K4O2DRocPvOst → lAg9p8vAsLV6wpRoMm4qC}/_ssgManifest.js +0 -0
- {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev37.dist-info → khoj-1.41.1.dev39.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/google/utils.py
CHANGED
@@ -73,6 +73,9 @@ def _is_retryable_error(exception: BaseException) -> bool:
     # client errors
     if isinstance(exception, httpx.TimeoutException) or isinstance(exception, httpx.NetworkError):
         return True
+    # validation errors
+    if isinstance(exception, ValueError):
+        return True
     return False


@@ -84,8 +87,8 @@ def _is_retryable_error(exception: BaseException) -> bool:
     reraise=True,
 )
 def gemini_completion_with_backoff(
-    messages,
-    system_prompt,
+    messages: list[ChatMessage],
+    system_prompt: str,
     model_name: str,
     temperature=1.0,
     api_key=None,

@@ -144,6 +147,11 @@ def gemini_completion_with_backoff(
         model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
     )

+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(response_text):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
     # Save conversation trace
     tracer["chat_model"] = model_name
     tracer["temperature"] = temperature

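The recurring pattern in these hunks is a retry-on-empty-response guard: the completion helpers now raise a ValueError when the model returns nothing, and the tenacity retry predicate treats ValueError as retryable. A minimal, self-contained sketch of that interplay (toy function and names, not the khoj code itself):

    import logging

    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception,
        stop_after_attempt,
        wait_exponential,
    )

    logger = logging.getLogger(__name__)
    attempts = {"count": 0}


    def _is_retryable_error(exception: BaseException) -> bool:
        # Validation errors (e.g. an empty model response) are retryable.
        return isinstance(exception, ValueError)


    @retry(
        retry=retry_if_exception(_is_retryable_error),
        wait=wait_exponential(multiplier=1, min=0.1, max=1),
        stop=stop_after_attempt(3),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )
    def completion() -> str:
        # Stand-in for an LLM call that returns nothing on its first attempt.
        attempts["count"] += 1
        response_text = "" if attempts["count"] < 2 else "ok"
        if not response_text:
            raise ValueError("Empty or no response by model over API. Retry if needed.")
        return response_text


    print(completion())  # retries once, then prints "ok"
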
@@ -157,89 +165,90 @@ def gemini_completion_with_backoff(
     retry=retry_if_exception(_is_retryable_error),
     wait=wait_exponential(multiplier=1, min=4, max=10),
     stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.
-    reraise=
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+    reraise=False,
 )
 async def gemini_chat_completion_with_backoff(
-    messages,
-    model_name,
-    temperature,
-    api_key,
-    api_base_url,
-    system_prompt,
+    messages: list[ChatMessage],
+    model_name: str,
+    temperature: float,
+    api_key: str,
+    api_base_url: str,
+    system_prompt: str,
     model_kwargs=None,
     deepthought=False,
     tracer: dict = {},
 ) -> AsyncGenerator[str, None]:
-    [removed lines not captured in this diff view]
-        gemini_clients[api_key] = client
-
-    formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
-
-    thinking_config = None
-    if deepthought and model_name.startswith("gemini-2-5"):
-        thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
-
-    seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
-    config = gtypes.GenerateContentConfig(
-        system_instruction=system_prompt,
-        temperature=temperature,
-        thinking_config=thinking_config,
-        max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
-        stop_sequences=["Notes:\n["],
-        safety_settings=SAFETY_SETTINGS,
-        seed=seed,
-        http_options=gtypes.HttpOptions(async_client_args={"timeout": httpx.Timeout(30.0, read=60.0)}),
-    )
+    client = gemini_clients.get(api_key)
+    if not client:
+        client = get_gemini_client(api_key, api_base_url)
+        gemini_clients[api_key] = client

-
-    final_chunk = None
-    response_started = False
-    start_time = perf_counter()
-    chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
-        model=model_name, config=config, contents=formatted_messages
-    )
-    async for chunk in chat_stream:
-        # Log the time taken to start response
-        if not response_started:
-            response_started = True
-            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
-        # Keep track of the last chunk for usage data
-        final_chunk = chunk
-        # Handle streamed response chunk
-        message, stopped = handle_gemini_response(chunk.candidates, chunk.prompt_feedback)
-        message = message or chunk.text
-        aggregated_response += message
-        yield message
-        if stopped:
-            raise ValueError(message)
-
-    # Log the time taken to stream the entire response
-    logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
-
-    # Calculate cost of chat
-    input_tokens = final_chunk.usage_metadata.prompt_token_count or 0 if final_chunk else 0
-    output_tokens = final_chunk.usage_metadata.candidates_token_count or 0 if final_chunk else 0
-    thought_tokens = final_chunk.usage_metadata.thoughts_token_count or 0 if final_chunk else 0
-    tracer["usage"] = get_chat_usage_metrics(
-        model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
-    )
+    formatted_messages, system_prompt = format_messages_for_gemini(messages, system_prompt)

-    [removed lines not captured in this diff view]
+    thinking_config = None
+    if deepthought and model_name.startswith("gemini-2-5"):
+        thinking_config = gtypes.ThinkingConfig(thinking_budget=MAX_REASONING_TOKENS_GEMINI)
+
+    seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
+    config = gtypes.GenerateContentConfig(
+        system_instruction=system_prompt,
+        temperature=temperature,
+        thinking_config=thinking_config,
+        max_output_tokens=MAX_OUTPUT_TOKENS_GEMINI,
+        stop_sequences=["Notes:\n["],
+        safety_settings=SAFETY_SETTINGS,
+        seed=seed,
+        http_options=gtypes.HttpOptions(async_client_args={"timeout": httpx.Timeout(30.0, read=60.0)}),
+    )
+
+    aggregated_response = ""
+    final_chunk = None
+    response_started = False
+    start_time = perf_counter()
+    chat_stream: AsyncIterator[gtypes.GenerateContentResponse] = await client.aio.models.generate_content_stream(
+        model=model_name, config=config, contents=formatted_messages
+    )
+    async for chunk in chat_stream:
+        # Log the time taken to start response
+        if not response_started:
+            response_started = True
+            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+        # Keep track of the last chunk for usage data
+        final_chunk = chunk
+        # Handle streamed response chunk
+        stop_message, stopped = handle_gemini_response(chunk.candidates, chunk.prompt_feedback)
+        message = stop_message or chunk.text
+        aggregated_response += message
+        yield message
+        if stopped:
+            logger.warning(
+                f"LLM Response Prevented for {model_name}: {stop_message}.\n"
+                + f"Last Message by {messages[-1].role}: {messages[-1].content}"
+            )
+            break
+
+    # Calculate cost of chat
+    input_tokens = final_chunk.usage_metadata.prompt_token_count or 0 if final_chunk else 0
+    output_tokens = final_chunk.usage_metadata.candidates_token_count or 0 if final_chunk else 0
+    thought_tokens = final_chunk.usage_metadata.thoughts_token_count or 0 if final_chunk else 0
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, thought_tokens=thought_tokens, usage=tracer.get("usage")
+    )
+
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
+    # Log the time taken to stream the entire response
+    logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
+
+    # Save conversation trace
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        commit_conversation_trace(messages, aggregated_response, tracer)


 def handle_gemini_response(

khoj/processor/conversation/openai/utils.py
CHANGED
@@ -1,5 +1,6 @@
 import logging
 import os
+from copy import deepcopy
 from functools import partial
 from time import perf_counter
 from typing import AsyncGenerator, Dict, Generator, List, Literal, Optional, Union

@@ -7,6 +8,7 @@ from urllib.parse import urlparse

 import httpx
 import openai
+from langchain_core.messages.chat import ChatMessage
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
     ChatCompletionStreamEvent,

@@ -32,9 +34,11 @@ from khoj.processor.conversation.utils import (
     commit_conversation_trace,
 )
 from khoj.utils.helpers import (
+    convert_image_data_uri,
     get_chat_usage_metrics,
     get_openai_async_client,
     get_openai_client,
+    is_none_or_empty,
     is_promptrace_enabled,
 )

@@ -51,6 +55,7 @@ openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
         | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
         | retry_if_exception_type(openai._exceptions.APIStatusError)
+        | retry_if_exception_type(ValueError)
     ),
     wait=wait_random_exponential(min=1, max=10),
     stop=stop_after_attempt(3),

@@ -58,7 +63,7 @@ openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
     reraise=True,
 )
 def completion_with_backoff(
-    messages,
+    messages: List[ChatMessage],
     model_name: str,
     temperature=0.8,
     openai_api_key=None,

@@ -74,7 +79,7 @@ def completion_with_backoff(
         openai_clients[client_key] = client

     stream_processor = default_stream_processor
-    formatted_messages =
+    formatted_messages = format_message_for_api(messages, api_base_url)

     # Tune reasoning models arguments
     if is_openai_reasoning_model(model_name, api_base_url):

@@ -133,6 +138,11 @@ def completion_with_backoff(
         model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
     )

+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
     # Save conversation trace
     tracer["chat_model"] = model_name
     tracer["temperature"] = temperature

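The hunks above make completion_with_backoff retry on the new empty-response ValueError: the exception type is added to its retry conditions and the function raises it when the aggregated response comes back empty. tenacity retry predicates compose with the | operator, as in this small illustrative sketch (toy exceptions stand in for the OpenAI error types):

    from tenacity import retry, retry_if_exception_type, stop_after_attempt

    calls = {"n": 0}


    @retry(
        retry=(retry_if_exception_type(TimeoutError) | retry_if_exception_type(ValueError)),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    def flaky_completion() -> str:
        # First attempt fails like an API error, second like an empty response, third succeeds.
        calls["n"] += 1
        if calls["n"] == 1:
            raise TimeoutError("simulated connection error")
        if calls["n"] == 2:
            raise ValueError("simulated empty response")
        return "ok"


    print(flaky_completion(), calls["n"])  # prints: ok 3
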
@@ -149,14 +159,15 @@ def completion_with_backoff(
         | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
         | retry_if_exception_type(openai._exceptions.APIStatusError)
+        | retry_if_exception_type(ValueError)
     ),
     wait=wait_exponential(multiplier=1, min=4, max=10),
     stop=stop_after_attempt(3),
-    before_sleep=before_sleep_log(logger, logging.
-    reraise=
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+    reraise=False,
 )
 async def chat_completion_with_backoff(
-    messages,
+    messages: list[ChatMessage],
     model_name: str,
     temperature,
     openai_api_key=None,

@@ -165,120 +176,122 @@ async def chat_completion_with_backoff(
     model_kwargs: dict = {},
     tracer: dict = {},
 ) -> AsyncGenerator[ResponseWithThought, None]:
-    [removed lines not captured in this diff view]
-    if final_chunk and
-    [removed lines not captured in this diff view]
+    client_key = f"{openai_api_key}--{api_base_url}"
+    client = openai_async_clients.get(client_key)
+    if not client:
+        client = get_openai_async_client(openai_api_key, api_base_url)
+        openai_async_clients[client_key] = client
+
+    stream_processor = adefault_stream_processor
+    formatted_messages = format_message_for_api(messages, api_base_url)
+
+    # Configure thinking for openai reasoning models
+    if is_openai_reasoning_model(model_name, api_base_url):
+        temperature = 1
+        reasoning_effort = "medium" if deepthought else "low"
+        model_kwargs["reasoning_effort"] = reasoning_effort
+        model_kwargs.pop("stop", None)  # Remove unsupported stop param for reasoning models
+
+        # Get the first system message and add the string `Formatting re-enabled` to it.
+        # See https://platform.openai.com/docs/guides/reasoning-best-practices
+        if len(formatted_messages) > 0:
+            system_messages = [
+                (i, message) for i, message in enumerate(formatted_messages) if message["role"] == "system"
+            ]
+            if len(system_messages) > 0:
+                first_system_message_index, first_system_message = system_messages[0]
+                first_system_message_content = first_system_message["content"]
+                formatted_messages[first_system_message_index][
+                    "content"
+                ] = f"{first_system_message_content}\nFormatting re-enabled"
+    elif is_twitter_reasoning_model(model_name, api_base_url):
+        stream_processor = adeepseek_stream_processor
+        reasoning_effort = "high" if deepthought else "low"
+        model_kwargs["reasoning_effort"] = reasoning_effort
+    elif model_name.startswith("deepseek-reasoner"):
+        stream_processor = adeepseek_stream_processor
+        # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
+        # The first message should always be a user message (except system message).
+        updated_messages: List[dict] = []
+        for i, message in enumerate(formatted_messages):
+            if i > 0 and message["role"] == formatted_messages[i - 1]["role"]:
+                updated_messages[-1]["content"] += " " + message["content"]
+            elif i == 1 and formatted_messages[i - 1]["role"] == "system" and message["role"] == "assistant":
+                updated_messages[-1]["content"] += " " + message["content"]
+            else:
+                updated_messages.append(message)
+        formatted_messages = updated_messages
+    elif is_qwen_reasoning_model(model_name, api_base_url):
+        stream_processor = partial(ain_stream_thought_processor, thought_tag="think")
+        # Reasoning is enabled by default. Disable when deepthought is False.
+        # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
+        if not deepthought and len(formatted_messages) > 0:
+            formatted_messages[-1]["content"] = formatted_messages[-1]["content"] + " /no_think"
+
+    stream = True
+    read_timeout = 300 if is_local_api(api_base_url) else 60
+    model_kwargs["stream_options"] = {"include_usage": True}
+    if os.getenv("KHOJ_LLM_SEED"):
+        model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
+
+    aggregated_response = ""
+    final_chunk = None
+    response_started = False
+    start_time = perf_counter()
+    chat_stream: openai.AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
+        messages=formatted_messages,  # type: ignore
+        model=model_name,
+        stream=stream,
+        temperature=temperature,
+        timeout=httpx.Timeout(30, read=read_timeout),
+        **model_kwargs,
+    )
+    async for chunk in stream_processor(chat_stream):
+        # Log the time taken to start response
+        if not response_started:
+            response_started = True
+            logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
+        # Keep track of the last chunk for usage data
+        final_chunk = chunk
+        # Skip empty chunks
+        if len(chunk.choices) == 0:
+            continue
+        # Handle streamed response chunk
+        response_chunk: ResponseWithThought = None
+        response_delta = chunk.choices[0].delta
+        if response_delta.content:
+            response_chunk = ResponseWithThought(response=response_delta.content)
+            aggregated_response += response_chunk.response
+        elif response_delta.thought:
+            response_chunk = ResponseWithThought(thought=response_delta.thought)
+        if response_chunk:
+            yield response_chunk
+
+    # Calculate cost of chat after stream finishes
+    input_tokens, output_tokens, cost = 0, 0, 0
+    if final_chunk and hasattr(final_chunk, "usage") and final_chunk.usage:
+        input_tokens = final_chunk.usage.prompt_tokens
+        output_tokens = final_chunk.usage.completion_tokens
+        # Estimated costs returned by DeepInfra API
+        if final_chunk.usage.model_extra and "estimated_cost" in final_chunk.usage.model_extra:
+            cost = final_chunk.usage.model_extra.get("estimated_cost", 0)
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+    )
+
+    # Validate the response. If empty, raise an error to retry.
+    if is_none_or_empty(aggregated_response):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over API. Retry if needed.")
+
+    # Log the time taken to stream the entire response
+    logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
+
+    # Save conversation trace
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        commit_conversation_trace(messages, aggregated_response, tracer)


 def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:

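Note that the streaming helpers switch their retry decorators to reraise=False while the non-streaming ones keep reraise=True. In tenacity, reraise=False means that once all attempts are exhausted the caller receives a tenacity.RetryError wrapping the last exception rather than the exception itself. A short sketch of that behaviour (illustrative function only):

    from tenacity import RetryError, retry, retry_if_exception_type, stop_after_attempt


    @retry(retry=retry_if_exception_type(ValueError), stop=stop_after_attempt(2), reraise=False)
    def always_empty() -> str:
        # Stand-in for a model call that never returns content.
        raise ValueError("Empty or no response by model over API. Retry if needed.")


    try:
        always_empty()
    except RetryError as error:
        # With reraise=False the original ValueError is wrapped, not re-raised.
        print(type(error.last_attempt.exception()).__name__)  # prints: ValueError
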
@@ -293,11 +306,34 @@ def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:
     return JsonSupport.SCHEMA


+def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
+    """
+    Format messages to send to chat model served over OpenAI (compatible) API.
+    """
+    formatted_messages = []
+    for message in deepcopy(messages):
+        # Convert images to PNG format if message to be sent to non OpenAI API
+        if isinstance(message.content, list) and not is_openai_api(api_base_url):
+            for part in message.content:
+                if part.get("type") == "image_url":
+                    part["image_url"]["url"] = convert_image_data_uri(part["image_url"]["url"], target_format="png")
+        formatted_messages.append({"role": message.role, "content": message.content})
+
+    return formatted_messages
+
+
+def is_openai_api(api_base_url: str = None) -> bool:
+    """
+    Check if the model is served over the official OpenAI API
+    """
+    return api_base_url is None or api_base_url.startswith("https://api.openai.com/v1")
+
+
 def is_openai_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
     Check if the model is an OpenAI reasoning model
     """
-    return model_name.startswith("o") and (api_base_url
+    return model_name.startswith("o") and is_openai_api(api_base_url)


 def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> bool:

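The new format_message_for_api helper re-encodes image parts to PNG only when the request is not going to the official OpenAI endpoint, delegating the conversion to the convert_image_data_uri helper added to khoj/utils/helpers.py below. A rough standalone sketch of that conversion path, assuming Pillow is installed (all names are local to this example):

    import base64
    import io

    from PIL import Image


    def to_png_data_uri(image_data_uri: str) -> str:
        # Decode the incoming data URI, re-encode the image as PNG, return a new data URI.
        header, base64_data = image_data_uri.split(",", 1)
        source_format = header.split(";")[0].split("/")[1]
        if source_format.lower() == "png":
            return image_data_uri
        image = Image.open(io.BytesIO(base64.b64decode(base64_data)))
        output = io.BytesIO()
        image.save(output, "PNG")
        return "data:image/png;base64," + base64.b64encode(output.getvalue()).decode("utf-8")


    # Build a tiny JPEG image, wrap it in a data URI, and convert it to PNG.
    buffer = io.BytesIO()
    Image.new("RGB", (1, 1), "red").save(buffer, "JPEG")
    jpeg_uri = "data:image/jpeg;base64," + base64.b64encode(buffer.getvalue()).decode("utf-8")
    print(to_png_data_uri(jpeg_uri)[:22])  # prints: data:image/png;base64,
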
khoj/utils/helpers.py
CHANGED
@@ -555,6 +555,32 @@ def convert_image_to_webp(image_bytes):
     return webp_image_bytes


+def convert_image_data_uri(image_data_uri: str, target_format: str = "png") -> str:
+    """
+    Convert image (in data URI) to target format.
+
+    Target format can be png, jpg, webp etc.
+    Returns the converted image as a data URI.
+    """
+    base64_data = image_data_uri.split(",", 1)[1]
+    image_type = image_data_uri.split(";")[0].split(":")[1].split("/")[1]
+    if image_type.lower() == target_format.lower():
+        return image_data_uri
+
+    image_bytes = base64.b64decode(base64_data)
+    image_io = io.BytesIO(image_bytes)
+    with Image.open(image_io) as original_image:
+        output_image_io = io.BytesIO()
+        original_image.save(output_image_io, target_format.upper())
+
+        # Encode the image back to base64
+        output_image_bytes = output_image_io.getvalue()
+        output_image_io.close()
+        output_base64_data = base64.b64encode(output_image_bytes).decode("utf-8")
+        output_data_uri = f"data:image/{target_format};base64,{output_base64_data}"
+        return output_data_uri
+
+
 def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000) -> dict[str, Any]:
     """
     Truncate large output files and drop image file data from code results.