khoj 2.0.0b13.dev19__py3-none-any.whl → 2.0.0b14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/admin.py +2 -2
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-fe87dd989d71d0eb.js → 2327-438aaec1657c5ada.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{3260-43d3019b92c315bb.js → 3260-82d2521fab032ff1.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/9808-c0742b05e1ef29ba.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-0114c87d7ccf6d9b.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/layout-8639ff99d6c2fec6.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-198b26df6e09bbb0.js → page-1047097af99d31c7.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-2ff3e18a6feae92a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/page-ac7ed0a1aff1b145.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/layout-78dd7cdd97510485.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-8addeb8079c3215b.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-e0dcb1762f8c8f88.js → page-819c6536c15e3d31.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-d60b0c57a6c38d0f.js → webpack-5393aad3d824e0cb.js} +1 -1
- khoj/interface/compiled/_next/static/css/5c7a72bad47e50b3.css +25 -0
- khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +1 -0
- khoj/interface/compiled/_next/static/css/ecea704005ba630c.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +5 -4
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/google/gemini_chat.py +1 -1
- khoj/processor/conversation/google/utils.py +62 -19
- khoj/processor/conversation/openai/gpt.py +65 -28
- khoj/processor/conversation/openai/utils.py +401 -28
- khoj/processor/conversation/prompts.py +48 -30
- khoj/processor/conversation/utils.py +5 -1
- khoj/processor/tools/run_code.py +15 -22
- khoj/routers/api_chat.py +8 -3
- khoj/routers/api_content.py +1 -1
- khoj/routers/helpers.py +62 -42
- khoj/routers/research.py +7 -5
- khoj/utils/constants.py +9 -1
- khoj/utils/helpers.py +55 -15
- {khoj-2.0.0b13.dev19.dist-info → khoj-2.0.0b14.dist-info}/METADATA +1 -1
- {khoj-2.0.0b13.dev19.dist-info → khoj-2.0.0b14.dist-info}/RECORD +58 -58
- khoj/interface/compiled/_next/static/chunks/7127-97b83757db125ba6.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/layout-63603d2cb33279f7.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-9a75d7369f2a7cd2.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
- khoj/interface/compiled/_next/static/css/23b26df423cd8a9c.css +0 -1
- khoj/interface/compiled/_next/static/css/2945c4a857922f3b.css +0 -1
- khoj/interface/compiled/_next/static/css/3090706713c12a32.css +0 -25
- /khoj/interface/compiled/_next/static/{N-GdBSXoYe-DuObnbXVRO → Qn_2XyeVWxjaIRks7rzM-}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{N-GdBSXoYe-DuObnbXVRO → Qn_2XyeVWxjaIRks7rzM-}/_ssgManifest.js +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1327-511bb0a862efce80.js → 1327-e254819a9172cfa7.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-fbfe167c84ad60c5.js → 1915-5c6508f6ebb62a30.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-e78b6902ad6f75ec.js → 2117-080746c8e170c81a.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2939-4d4084c5b888b960.js → 2939-4af3fd24b8ffc9ad.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-d6cf93724d57e34b.js → 4447-cd95608f8e93e711.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-4b7790573b08c50d.js → 8667-50b03a89e82e0ba7.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9139-ce1ae935dac9c871.js → 9139-8ac4d9feb10f8869.js} +0 -0
- {khoj-2.0.0b13.dev19.dist-info → khoj-2.0.0b14.dist-info}/WHEEL +0 -0
- {khoj-2.0.0b13.dev19.dist-info → khoj-2.0.0b14.dist-info}/entry_points.txt +0 -0
- {khoj-2.0.0b13.dev19.dist-info → khoj-2.0.0b14.dist-info}/licenses/LICENSE +0 -0
The hunks below are from khoj/processor/conversation/openai/utils.py (+401 -28):

@@ -21,6 +21,8 @@ from openai.types.chat.chat_completion_chunk import (
     Choice,
     ChoiceDelta,
 )
+from openai.types.responses import Response as OpenAIResponse
+from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
 from pydantic import BaseModel
 from tenacity import (
     before_sleep_log,
@@ -53,13 +55,31 @@ openai_clients: Dict[str, openai.OpenAI] = {}
 openai_async_clients: Dict[str, openai.AsyncOpenAI] = {}
 
 
+def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str:
+    """Extract plain text from a message content suitable for Responses API instructions."""
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        texts: List[str] = []
+        for part in content:
+            if isinstance(part, dict) and part.get("type") == "input_text" and part.get("text"):
+                texts.append(str(part.get("text")))
+        return "\n\n".join(texts)
+    if isinstance(content, dict):
+        # If a single part dict was passed
+        if content.get("type") == "input_text" and content.get("text"):
+            return str(content.get("text"))
+    # Fallback to string conversion
+    return str(content)
+
+
 @retry(
     retry=(
         retry_if_exception_type(openai._exceptions.APITimeoutError)
-        | retry_if_exception_type(openai._exceptions.APIError)
-        | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
-        | retry_if_exception_type(openai._exceptions.
+        | retry_if_exception_type(openai._exceptions.InternalServerError)
         | retry_if_exception_type(ValueError)
     ),
     wait=wait_random_exponential(min=1, max=10),
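The new `_extract_text_for_instructions` helper keeps only `input_text` parts when flattening multi-part content, so images and other parts are dropped when a system message is folded into Responses API instructions. A small illustration of that behavior, assuming khoj 2.0.0b14 is installed so the helper is importable (the sample content values are made up):

```python
# Illustrative only: assumes khoj 2.0.0b14 is installed so the new helper is importable.
from khoj.processor.conversation.openai.utils import _extract_text_for_instructions

# Plain strings pass through unchanged.
assert _extract_text_for_instructions("You are Khoj.") == "You are Khoj."

# Multi-part content keeps only the "input_text" parts, joined by blank lines.
parts = [
    {"type": "input_text", "text": "You are Khoj."},
    {"type": "input_image", "image_url": "https://example.com/a.png"},
    {"type": "input_text", "text": "Answer concisely."},
]
assert _extract_text_for_instructions(parts) == "You are Khoj.\n\nAnswer concisely."

# None becomes an empty string; other values fall back to str() conversion.
assert _extract_text_for_instructions(None) == ""
```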
@@ -106,7 +126,7 @@ def completion_with_backoff(
     if model_name.startswith("grok-4"):
         # Grok-4 models do not support reasoning_effort parameter
         model_kwargs.pop("reasoning_effort", None)
-    elif model_name.startswith("deepseek-reasoner"):
+    elif model_name.startswith("deepseek-reasoner") or model_name.startswith("deepseek-chat"):
         stream_processor = in_stream_thought_processor
         # Two successive messages cannot be from the same role. Should merge any back-to-back messages from the same role.
         # The first message should always be a user message (except system message).
@@ -125,6 +145,8 @@ def completion_with_backoff(
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
         if not deepthought:
             add_qwen_no_think_tag(formatted_messages)
+    elif is_groq_api(api_base_url):
+        model_kwargs["service_tier"] = "auto"
 
     read_timeout = 300 if is_local_api(api_base_url) else 60
     if os.getenv("KHOJ_LLM_SEED"):
@@ -150,8 +172,16 @@ def completion_with_backoff(
                 chunk.type == "chunk"
                 and chunk.chunk.choices
                 and hasattr(chunk.chunk.choices[0].delta, "reasoning_content")
+                and chunk.chunk.choices[0].delta.reasoning_content
             ):
                 thoughts += chunk.chunk.choices[0].delta.reasoning_content
+            elif (
+                chunk.type == "chunk"
+                and chunk.chunk.choices
+                and hasattr(chunk.chunk.choices[0].delta, "reasoning")
+                and chunk.chunk.choices[0].delta.reasoning
+            ):
+                thoughts += chunk.chunk.choices[0].delta.reasoning
             elif chunk.type == "chunk" and chunk.chunk.choices and chunk.chunk.choices[0].delta.tool_calls:
                 tool_ids += [tool_call.id for tool_call in chunk.chunk.choices[0].delta.tool_calls]
             elif chunk.type == "tool_calls.function.arguments.done":
@@ -174,7 +204,6 @@ def completion_with_backoff(
         chunk = client.beta.chat.completions.parse(
             messages=formatted_messages,  # type: ignore
             model=model_name,
-            temperature=temperature,
             timeout=httpx.Timeout(30, read=read_timeout),
             **model_kwargs,
         )
@@ -199,6 +228,10 @@ def completion_with_backoff(
         # Json dump tool calls into aggregated response
         aggregated_response = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
 
+    # Align chunk definition with non-streaming mode for post stream completion usage
+    if hasattr(chunk, "chunk"):
+        chunk = chunk.chunk
+
     # Calculate cost of chat
     input_tokens = chunk.usage.prompt_tokens if hasattr(chunk, "usage") and chunk.usage else 0
     output_tokens = chunk.usage.completion_tokens if hasattr(chunk, "usage") and chunk.usage else 0
@@ -227,10 +260,8 @@ def completion_with_backoff(
 @retry(
     retry=(
         retry_if_exception_type(openai._exceptions.APITimeoutError)
-        | retry_if_exception_type(openai._exceptions.APIError)
-        | retry_if_exception_type(openai._exceptions.APIConnectionError)
         | retry_if_exception_type(openai._exceptions.RateLimitError)
-        | retry_if_exception_type(openai._exceptions.
+        | retry_if_exception_type(openai._exceptions.InternalServerError)
         | retry_if_exception_type(ValueError)
     ),
     wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -291,8 +322,12 @@ async def chat_completion_with_backoff(
         # Grok-4 models do not support reasoning_effort parameter
         if not model_name.startswith("grok-4"):
             model_kwargs["reasoning_effort"] = reasoning_effort
-    elif
-
+    elif (
+        model_name.startswith("deepseek-chat")
+        or model_name.startswith("deepseek-reasoner")
+        or "deepseek-r1" in model_name
+    ):
+        # Official Deepseek models and some inference APIs like vLLM return structured thinking output.
         # Others like DeepInfra return it in response stream.
         # Using the instream thought processor handles both cases, structured thoughts and in response thoughts.
         stream_processor = ain_stream_thought_processor
@@ -317,6 +352,8 @@ async def chat_completion_with_backoff(
         # See https://qwenlm.github.io/blog/qwen3/#advanced-usages
         if not deepthought:
             add_qwen_no_think_tag(formatted_messages)
+    elif is_groq_api(api_base_url):
+        model_kwargs["service_tier"] = "auto"
 
     read_timeout = 300 if is_local_api(api_base_url) else 60
     if os.getenv("KHOJ_LLM_SEED"):
@@ -390,6 +427,283 @@ async def chat_completion_with_backoff(
         commit_conversation_trace(messages, aggregated_response, tracer)
 
 
+@retry(
+    retry=(
+        retry_if_exception_type(openai._exceptions.APITimeoutError)
+        | retry_if_exception_type(openai._exceptions.RateLimitError)
+        | retry_if_exception_type(openai._exceptions.InternalServerError)
+        | retry_if_exception_type(ValueError)
+    ),
+    wait=wait_random_exponential(min=1, max=10),
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+    reraise=True,
+)
+def responses_completion_with_backoff(
+    messages: List[ChatMessage],
+    model_name: str,
+    temperature=0.6,
+    openai_api_key=None,
+    api_base_url=None,
+    deepthought: bool = False,
+    model_kwargs: dict = {},
+    tracer: dict = {},
+) -> ResponseWithThought:
+    """
+    Synchronous helper using the OpenAI Responses API in streaming mode under the hood.
+    Aggregates streamed deltas and returns a ResponseWithThought.
+    """
+    client_key = f"{openai_api_key}--{api_base_url}"
+    client = openai_clients.get(client_key)
+    if not client:
+        client = get_openai_client(openai_api_key, api_base_url)
+        openai_clients[client_key] = client
+
+    formatted_messages = format_message_for_api(messages, api_base_url)
+    # Move the first system message to Responses API instructions
+    instructions: Optional[str] = None
+    if formatted_messages and formatted_messages[0].get("role") == "system":
+        instructions = _extract_text_for_instructions(formatted_messages[0].get("content")) or None
+        formatted_messages = formatted_messages[1:]
+
+    model_kwargs = deepcopy(model_kwargs)
+    model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
+    # Configure thinking for openai reasoning models
+    if is_openai_reasoning_model(model_name, api_base_url):
+        temperature = 1
+        reasoning_effort = "medium" if deepthought else "low"
+        model_kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"}
+        model_kwargs["include"] = ["reasoning.encrypted_content"]
+        # Remove unsupported params for reasoning models
+        model_kwargs.pop("top_p", None)
+        model_kwargs.pop("stop", None)
+
+    read_timeout = 300 if is_local_api(api_base_url) else 60
+
+    # Stream and aggregate
+    model_response: OpenAIResponse = client.responses.create(
+        input=formatted_messages,
+        instructions=instructions,
+        model=model_name,
+        temperature=temperature,
+        timeout=httpx.Timeout(30, read=read_timeout),  # type: ignore
+        store=False,
+        **model_kwargs,
+    )
+    if not model_response or not isinstance(model_response, OpenAIResponse) or not model_response.output:
+        raise ValueError(f"Empty response returned by {model_name}.")
+
+    raw_content = [item.model_dump() for item in model_response.output]
+    aggregated_text = model_response.output_text
+    thoughts = ""
+    tool_calls: List[ToolCall] = []
+    for item in model_response.output:
+        if isinstance(item, ResponseFunctionToolCall):
+            tool_calls.append(ToolCall(name=item.name, args=json.loads(item.arguments), id=item.call_id))
+        elif isinstance(item, ResponseReasoningItem):
+            thoughts = "\n\n".join([summary.text for summary in item.summary])
+
+    if tool_calls:
+        if thoughts and aggregated_text:
+            # If there are tool calls, aggregate thoughts and responses into thoughts
+            thoughts = "\n".join([f"*{line.strip()}*" for line in thoughts.splitlines() if line.strip()])
+            thoughts = f"{thoughts}\n\n{aggregated_text}"
+        else:
+            thoughts = thoughts or aggregated_text
+        # Json dump tool calls into aggregated response
+        aggregated_text = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
+
+    # Usage/cost tracking
+    input_tokens = model_response.usage.input_tokens if model_response and model_response.usage else 0
+    output_tokens = model_response.usage.output_tokens if model_response and model_response.usage else 0
+    cost = 0
+    cache_read_tokens = 0
+    if model_response and model_response.usage and model_response.usage.input_tokens_details:
+        cache_read_tokens = model_response.usage.input_tokens_details.cached_tokens
+        input_tokens -= cache_read_tokens
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, cache_read_tokens, usage=tracer.get("usage"), cost=cost
+    )
+
+    # Validate final aggregated text (either message or tool-calls JSON)
+    if is_none_or_empty(aggregated_text):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over Responses API. Retry if needed.")
+
+    # Trace
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        commit_conversation_trace(messages, aggregated_text, tracer)
+
+    return ResponseWithThought(text=aggregated_text, thought=thoughts, raw_content=raw_content)
+
+
+@retry(
+    retry=(
+        retry_if_exception_type(openai._exceptions.APITimeoutError)
+        | retry_if_exception_type(openai._exceptions.RateLimitError)
+        | retry_if_exception_type(openai._exceptions.InternalServerError)
+        | retry_if_exception_type(ValueError)
+    ),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    stop=stop_after_attempt(3),
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+    reraise=False,
+)
+async def responses_chat_completion_with_backoff(
+    messages: list[ChatMessage],
+    model_name: str,
+    temperature,
+    openai_api_key=None,
+    api_base_url=None,
+    deepthought=False,  # Unused; parity with legacy signature
+    tracer: dict = {},
+) -> AsyncGenerator[ResponseWithThought, None]:
+    """
+    Async streaming helper using the OpenAI Responses API.
+    Yields ResponseWithThought chunks as text/think deltas arrive.
+    """
+    client_key = f"{openai_api_key}--{api_base_url}"
+    client = openai_async_clients.get(client_key)
+    if not client:
+        client = get_openai_async_client(openai_api_key, api_base_url)
+        openai_async_clients[client_key] = client
+
+    formatted_messages = format_message_for_api(messages, api_base_url)
+    # Move the first system message to Responses API instructions
+    instructions: Optional[str] = None
+    if formatted_messages and formatted_messages[0].get("role") == "system":
+        instructions = _extract_text_for_instructions(formatted_messages[0].get("content")) or None
+        formatted_messages = formatted_messages[1:]
+
+    model_kwargs: dict = {}
+    model_kwargs["top_p"] = model_kwargs.get("top_p", 0.95)
+    # Configure thinking for openai reasoning models
+    if is_openai_reasoning_model(model_name, api_base_url):
+        temperature = 1
+        reasoning_effort = "medium" if deepthought else "low"
+        model_kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"}
+        # Remove unsupported params for reasoning models
+        model_kwargs.pop("top_p", None)
+        model_kwargs.pop("stop", None)
+
+    read_timeout = 300 if is_local_api(api_base_url) else 60
+
+    aggregated_text = ""
+    last_final: Optional[OpenAIResponse] = None
+    # Tool call assembly buffers
+    tool_calls_args: Dict[str, str] = {}
+    tool_calls_name: Dict[str, str] = {}
+    tool_call_order: List[str] = []
+
+    async with client.responses.stream(
+        input=formatted_messages,
+        instructions=instructions,
+        model=model_name,
+        temperature=temperature,
+        timeout=httpx.Timeout(30, read=read_timeout),
+        **model_kwargs,
+    ) as stream:  # type: ignore
+        async for event in stream:  # type: ignore
+            et = getattr(event, "type", "")
+            if et == "response.output_text.delta":
+                delta = getattr(event, "delta", "") or getattr(event, "output_text", "")
+                if delta:
+                    aggregated_text += delta
+                    yield ResponseWithThought(text=delta)
+            elif et == "response.reasoning.delta":
+                delta = getattr(event, "delta", "")
+                if delta:
+                    yield ResponseWithThought(thought=delta)
+            elif et == "response.tool_call.created":
+                item = getattr(event, "item", None)
+                tool_id = (
+                    getattr(event, "id", None)
+                    or getattr(event, "tool_call_id", None)
+                    or (getattr(item, "id", None) if item is not None else None)
+                )
+                name = (
+                    getattr(event, "name", None)
+                    or (getattr(item, "name", None) if item is not None else None)
+                    or getattr(event, "tool_name", None)
+                )
+                if tool_id:
+                    if tool_id not in tool_calls_args:
+                        tool_calls_args[tool_id] = ""
+                        tool_call_order.append(tool_id)
+                    if name:
+                        tool_calls_name[tool_id] = name
+            elif et == "response.tool_call.delta":
+                tool_id = getattr(event, "id", None) or getattr(event, "tool_call_id", None)
+                delta = getattr(event, "delta", None)
+                if hasattr(delta, "arguments"):
+                    arg_delta = getattr(delta, "arguments", "")
+                else:
+                    arg_delta = delta if isinstance(delta, str) else getattr(event, "arguments", "")
+                if tool_id and arg_delta:
+                    tool_calls_args[tool_id] = tool_calls_args.get(tool_id, "") + arg_delta
+                    if tool_id not in tool_call_order:
+                        tool_call_order.append(tool_id)
+            elif et == "response.tool_call.completed":
+                item = getattr(event, "item", None)
+                tool_id = (
+                    getattr(event, "id", None)
+                    or getattr(event, "tool_call_id", None)
+                    or (getattr(item, "id", None) if item is not None else None)
+                )
+                args_final = None
+                if item is not None:
+                    args_final = getattr(item, "arguments", None) or getattr(item, "args", None)
+                if tool_id and args_final:
+                    tool_calls_args[tool_id] = args_final if isinstance(args_final, str) else json.dumps(args_final)
+                    if tool_id not in tool_call_order:
+                        tool_call_order.append(tool_id)
+            # ignore other events for now
+        last_final = await stream.get_final_response()
+
+    # Usage/cost tracking after stream ends
+    input_tokens = last_final.usage.input_tokens if last_final and last_final.usage else 0
+    output_tokens = last_final.usage.output_tokens if last_final and last_final.usage else 0
+    cost = 0
+    tracer["usage"] = get_chat_usage_metrics(
+        model_name, input_tokens, output_tokens, usage=tracer.get("usage"), cost=cost
+    )
+
+    # If there are tool calls, package them into aggregated text for tracing parity
+    if tool_call_order:
+        packaged_tool_calls: List[ToolCall] = []
+        for tool_id in tool_call_order:
+            name = tool_calls_name.get(tool_id) or ""
+            args_str = tool_calls_args.get(tool_id, "")
+            try:
+                args = json.loads(args_str) if isinstance(args_str, str) else args_str
+            except Exception:
+                logger.warning(f"Failed to parse tool call arguments for {tool_id}: {args_str}")
+                args = {}
+            packaged_tool_calls.append(ToolCall(name=name, args=args, id=tool_id))
+        # Move any text into trace thought
+        tracer_text = aggregated_text
+        aggregated_text = json.dumps([tc.__dict__ for tc in packaged_tool_calls])
+        # Save for trace below
+        if tracer_text:
+            tracer.setdefault("_responses_stream_text", tracer_text)
+
+    if is_none_or_empty(aggregated_text):
+        logger.warning(f"No response by {model_name}\nLast Message by {messages[-1].role}: {messages[-1].content}.")
+        raise ValueError(f"Empty or no response by {model_name} over Responses API. Retry if needed.")
+
+    tracer["chat_model"] = model_name
+    tracer["temperature"] = temperature
+    if is_promptrace_enabled():
+        # If tool-calls were present, include any streamed text in the trace thought
+        trace_payload = aggregated_text
+        if tracer.get("_responses_stream_text"):
+            thoughts = tracer.pop("_responses_stream_text")
+            trace_payload = thoughts
+        commit_conversation_trace(messages, trace_payload, tracer)
+
+
 def get_structured_output_support(model_name: str, api_base_url: str = None) -> StructuredOutputSupport:
     if model_name.startswith("deepseek-reasoner"):
         return StructuredOutputSupport.NONE
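Both new Responses API helpers peel a leading system message off the formatted message list and send its text as the `instructions` argument, passing only the remaining turns as `input`. A standalone sketch of that split, simplified to plain-string content rather than khoj's part lists (the function name and sample messages here are illustrative, not khoj APIs):

```python
from typing import Dict, List, Optional, Tuple


def split_instructions(formatted_messages: List[Dict]) -> Tuple[Optional[str], List[Dict]]:
    """Move a leading system message into Responses API `instructions` (simplified sketch)."""
    if formatted_messages and formatted_messages[0].get("role") == "system":
        instructions = formatted_messages[0].get("content") or None
        return instructions, formatted_messages[1:]
    return None, formatted_messages


instructions, input_messages = split_instructions(
    [
        {"role": "system", "content": "You are Khoj."},
        {"role": "user", "content": "Hello!"},
    ]
)
# instructions == "You are Khoj."; input_messages holds only the user turn and is
# what the helpers above pass as `input=` to client.responses.create()/stream().
```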
@@ -412,6 +726,12 @@ def format_message_for_api(raw_messages: List[ChatMessage], api_base_url: str) -
         # Handle tool call and tool result message types
         message_type = message.additional_kwargs.get("message_type")
         if message_type == "tool_call":
+            if is_openai_api(api_base_url):
+                for part in message.content:
+                    if "status" in part:
+                        part.pop("status")  # Drop unsupported tool call status field
+                formatted_messages.extend(message.content)
+                continue
             # Convert tool_call to OpenAI function call format
             content = []
             for part in message.content:
@@ -450,14 +770,23 @@ def format_message_for_api(raw_messages: List[ChatMessage], api_base_url: str) -
                 if not tool_call_id:
                     logger.warning(f"Dropping tool result without valid tool_call_id: {part.get('name')}")
                     continue
-                formatted_messages.append(
-                    {
-                        "role": "tool",
-                        "tool_call_id": tool_call_id,
-                        "name": part.get("name"),
-                        "content": part.get("content") or "No output",
-                    }
-                )
+                if is_openai_api(api_base_url):
+                    formatted_messages.append(
+                        {
+                            "type": "function_call_output",
+                            "call_id": tool_call_id,
+                            "output": part.get("content") or "No output",
+                        }
+                    )
+                else:
+                    formatted_messages.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": tool_call_id,
+                            "name": part.get("name"),
+                            "content": part.get("content") or "No output",
+                        }
+                    )
             continue
         if isinstance(message.content, list) and not is_openai_api(api_base_url):
             assistant_texts = []
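With this change a tool result is serialized in one of two shapes: a flat `function_call_output` item when talking to the OpenAI Responses API, and the usual `role: "tool"` message for other Chat Completions style backends. For an illustrative call id, tool name, and output (all made up), the two payloads look roughly like:

```python
# OpenAI Responses API backend (is_openai_api(api_base_url) is True):
{
    "type": "function_call_output",
    "call_id": "call_123",
    "output": "3 matching notes found",
}

# Any other Chat Completions style backend:
{
    "role": "tool",
    "tool_call_id": "call_123",
    "name": "search_notes",
    "content": "3 matching notes found",
}
```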
@@ -489,6 +818,12 @@ def format_message_for_api(raw_messages: List[ChatMessage], api_base_url: str) -
                     message.content.remove(part)
                 elif part["type"] == "image_url" and not part.get("image_url"):
                     message.content.remove(part)
+                # OpenAI models use the Responses API which uses slightly different content types
+                if part["type"] == "text":
+                    part["type"] = "output_text" if message.role == "assistant" else "input_text"
+                if part["type"] == "image_url":
+                    part["type"] = "output_image" if message.role == "assistant" else "input_image"
+                    part["image_url"] = part["image_url"]["url"]
             # If no valid content parts left, remove the message
             if is_none_or_empty(message.content):
                 messages.remove(message)
@@ -513,7 +848,11 @@ def is_openai_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
     Check if the model is an OpenAI reasoning model
     """
-    return
+    return (
+        is_openai_api(api_base_url)
+        and (model_name.lower().startswith("o") or model_name.lower().startswith("gpt-5"))
+        or model_name.lower().startswith("gpt-oss")
+    )
 
 
 def is_non_streaming_model(model_name: str, api_base_url: str = None) -> bool:
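Because `and` binds tighter than `or` in the rebuilt return expression above, `gpt-oss*` model names count as reasoning models on any backend, while `o*` and `gpt-5*` names only count when served over the official OpenAI API. A standalone restatement of that predicate (not khoj's function; the OpenAI-API check is passed in as a boolean to keep the sketch self-contained):

```python
def matches_openai_reasoning(model_name: str, on_openai_api: bool) -> bool:
    # Same truth table as the diff above: `and` binds tighter than `or`, so the
    # gpt-oss clause applies regardless of which API serves the model.
    name = model_name.lower()
    return (on_openai_api and (name.startswith("o") or name.startswith("gpt-5"))) or name.startswith("gpt-oss")


assert matches_openai_reasoning("o3-mini", on_openai_api=True)
assert matches_openai_reasoning("gpt-5-mini", on_openai_api=True)
assert matches_openai_reasoning("gpt-oss-20b", on_openai_api=False)   # matches on any backend
assert not matches_openai_reasoning("o3-mini", on_openai_api=False)   # o*/gpt-5* need the OpenAI API
assert not matches_openai_reasoning("gpt-4.1", on_openai_api=True)
```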
@@ -536,6 +875,13 @@ def is_twitter_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     )
 
 
+def is_groq_api(api_base_url: str = None) -> bool:
+    """
+    Check if the model is served over the Groq API
+    """
+    return api_base_url is not None and api_base_url.startswith("https://api.groq.com")
+
+
 def is_qwen_style_reasoning_model(model_name: str, api_base_url: str = None) -> bool:
     """
     Check if the model is a Qwen style reasoning model
@@ -609,6 +955,9 @@ async def astream_thought_processor(
         if not chunk_data.get("object") or chunk_data.get("object") != "chat.completion.chunk":
             logger.warning(f"Skipping invalid chunk with object field: {chunk_data.get('object', 'missing')}")
             continue
+        # Handle unsupported service tiers like "on_demand" by Groq
+        if chunk.service_tier and chunk.service_tier == "on_demand":
+            chunk_data["service_tier"] = "auto"
 
         tchunk = ChatCompletionWithThoughtsChunk.model_validate(chunk_data)
 
@@ -620,6 +969,14 @@ async def astream_thought_processor(
         ):
             tchunk.choices[0].delta.thought = chunk.choices[0].delta.reasoning_content
 
+        # Handlle openai reasoning style response with thoughts. Used by gpt-oss.
+        if (
+            len(tchunk.choices) > 0
+            and hasattr(tchunk.choices[0].delta, "reasoning")
+            and tchunk.choices[0].delta.reasoning
+        ):
+            tchunk.choices[0].delta.thought = chunk.choices[0].delta.reasoning
+
         # Handlle llama.cpp server style response with thoughts.
         elif len(tchunk.choices) > 0 and tchunk.choices[0].delta.model_extra.get("reasoning_content"):
             tchunk.choices[0].delta.thought = tchunk.choices[0].delta.model_extra.get("reasoning_content")
@@ -750,6 +1107,10 @@ async def ain_stream_thought_processor(
             yield chunk
             continue
 
+        if chunk.choices[0].delta.content is None:
+            # If delta content is None, we can't process it, just yield the chunk
+            continue
+
         buf += chunk.choices[0].delta.content
 
         if mode == "detect_start":
@@ -850,20 +1211,32 @@ def add_qwen_no_think_tag(formatted_messages: List[dict]) -> None:
             break
 
 
-def to_openai_tools(tools: List[ToolDefinition]) -> List[Dict] | None:
+def to_openai_tools(tools: List[ToolDefinition], use_responses_api: bool) -> List[Dict] | None:
     "Transform tool definitions from standard format to OpenAI format."
-    openai_tools = [
-        {
-            "type": "function",
-            "function": {
+    if use_responses_api:
+        openai_tools = [
+            {
+                "type": "function",
                 "name": tool.name,
                 "description": tool.description,
                 "parameters": clean_response_schema(tool.schema),
                 "strict": True,
-            },
-        }
-        for tool in tools
-    ]
+            }
+            for tool in tools
+        ]
+    else:
+        openai_tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": tool.name,
+                    "description": tool.description,
+                    "parameters": clean_response_schema(tool.schema),
+                    "strict": True,
+                },
+            }
+            for tool in tools
+        ]
 
     return openai_tools or None
 
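Depending on `use_responses_api`, the rewritten `to_openai_tools` emits either the flat function schema the Responses API expects or the nested `function` object used by Chat Completions style endpoints. For a single hypothetical `search_notes` tool (name, description, and parameter schema are made up; real schemas are built from `ToolDefinition` and pass through `clean_response_schema`), the two outputs would look roughly like:

```python
# Hypothetical tool schema for illustration only.
search_notes_schema = {
    "type": "object",
    "properties": {"query": {"type": "string"}},
    "required": ["query"],
}

# use_responses_api=True -> flat function object for the Responses API
responses_api_tools = [
    {
        "type": "function",
        "name": "search_notes",
        "description": "Search the user's indexed notes",
        "parameters": search_notes_schema,
        "strict": True,
    }
]

# use_responses_api=False -> nested "function" object for Chat Completions style endpoints
chat_completions_tools = [
    {
        "type": "function",
        "function": {
            "name": "search_notes",
            "description": "Search the user's indexed notes",
            "parameters": search_notes_schema,
            "strict": True,
        },
    }
]
```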