khoj 1.42.8.dev6__py3-none-any.whl → 1.42.9.dev17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +20 -0
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-9a4610474cd59a71.js → page-5db6ad18da10d353.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-f7bb9d777b7745d4.js → page-6271e2e31c7571d1.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad68326d2f849cec.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-ef738950ea1babc3.js → page-76fc915800aa90f4.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-2b3056cba8aa96ce.js → page-a19a597629e87fb8.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-484d34239ed0f2b1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-4885df3cd175c957.js → page-fa366ac14b228688.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-8be3b35178abf2ec.js → page-8f9a85f96088c18b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-4a4b0c0f4749c2b2.js → page-ed7787cf4938b8e3.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-15412ee214acd999.js → webpack-92ce8aaf95718ec4.js} +1 -1
- khoj/interface/compiled/_next/static/css/{e6da1287d41f5409.css → 02f60900b0d89ec7.css} +1 -1
- khoj/interface/compiled/_next/static/css/{821d0d60b0b6871d.css → 93eeacc43e261162.css} +1 -1
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +11 -2
- khoj/processor/conversation/anthropic/utils.py +90 -103
- khoj/processor/conversation/google/gemini_chat.py +4 -1
- khoj/processor/conversation/google/utils.py +80 -18
- khoj/processor/conversation/offline/chat_model.py +3 -3
- khoj/processor/conversation/openai/gpt.py +13 -38
- khoj/processor/conversation/openai/utils.py +113 -12
- khoj/processor/conversation/prompts.py +17 -35
- khoj/processor/conversation/utils.py +128 -57
- khoj/processor/operator/grounding_agent.py +1 -1
- khoj/processor/operator/operator_agent_binary.py +4 -3
- khoj/processor/tools/online_search.py +18 -0
- khoj/processor/tools/run_code.py +1 -1
- khoj/routers/api_chat.py +1 -1
- khoj/routers/api_subscription.py +22 -0
- khoj/routers/helpers.py +293 -26
- khoj/routers/research.py +169 -155
- khoj/utils/helpers.py +284 -8
- {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev17.dist-info}/METADATA +1 -1
- {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev17.dist-info}/RECORD +54 -54
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
- /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → rRy7eX2lAtmXdtQuJoVrw}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → rRy7eX2lAtmXdtQuJoVrw}/_ssgManifest.js +0 -0
- {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev17.dist-info}/WHEEL +0 -0
- {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev17.dist-info}/entry_points.txt +0 -0
- {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev17.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/openai/utils.py

@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 from copy import deepcopy
@@ -9,6 +10,7 @@ from urllib.parse import urlparse
 import httpx
 import openai
 from langchain_core.messages.chat import ChatMessage
+from openai.lib._pydantic import _ensure_strict_json_schema
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
     ChatCompletionStreamEvent,
@@ -20,6 +22,7 @@ from openai.types.chat.chat_completion_chunk import (
     Choice,
     ChoiceDelta,
 )
+from pydantic import BaseModel
 from tenacity import (
     before_sleep_log,
     retry,
@@ -30,11 +33,13 @@ from tenacity import (
 )
 
 from khoj.processor.conversation.utils import (
-    JsonSupport,
     ResponseWithThought,
+    StructuredOutputSupport,
+    ToolCall,
     commit_conversation_trace,
 )
 from khoj.utils.helpers import (
+    ToolDefinition,
     convert_image_data_uri,
     get_chat_usage_metrics,
     get_openai_async_client,
@@ -72,7 +77,7 @@ def completion_with_backoff(
     deepthought: bool = False,
     model_kwargs: dict = {},
     tracer: dict = {},
-) ->
+) -> ResponseWithThought:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_clients.get(client_key)
     if not client:
@@ -117,6 +122,9 @@ def completion_with_backoff(
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
 
+    tool_ids = []
+    tool_calls: list[ToolCall] = []
+    thoughts = ""
     aggregated_response = ""
     if stream:
         with client.beta.chat.completions.stream(
@@ -130,7 +138,16 @@ def completion_with_backoff(
             if chunk.type == "content.delta":
                 aggregated_response += chunk.delta
             elif chunk.type == "thought.delta":
-
+                thoughts += chunk.delta
+            elif chunk.type == "chunk" and chunk.chunk.choices and chunk.chunk.choices[0].delta.tool_calls:
+                tool_ids += [tool_call.id for tool_call in chunk.chunk.choices[0].delta.tool_calls]
+            elif chunk.type == "tool_calls.function.arguments.done":
+                tool_calls += [ToolCall(name=chunk.name, args=json.loads(chunk.arguments), id=None)]
+        if tool_calls:
+            tool_calls = [
+                ToolCall(name=chunk.name, args=chunk.args, id=tool_id) for chunk, tool_id in zip(tool_calls, tool_ids)
+            ]
+            aggregated_response = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
     else:
         # Non-streaming chat completion
         chunk = client.beta.chat.completions.parse(
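With this change, when the model responds with tool calls, `completion_with_backoff` serializes them into a JSON list of `{"name", "args", "id"}` objects and returns it in `ResponseWithThought.text`. A minimal sketch of how a caller might decode that payload back into `ToolCall` objects; the `parse_tool_calls` helper below is hypothetical, not part of the package:

```python
import json

from khoj.processor.conversation.utils import ToolCall


def parse_tool_calls(serialized: str) -> list[ToolCall]:
    # serialized is a JSON list of {"name": ..., "args": ..., "id": ...} dicts
    return [ToolCall(**raw) for raw in json.loads(serialized)]
```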
@@ -164,7 +181,7 @@ def completion_with_backoff(
     if is_promptrace_enabled():
         commit_conversation_trace(messages, aggregated_response, tracer)
 
-    return aggregated_response
+    return ResponseWithThought(text=aggregated_response, thought=thoughts)
 
 
 @retry(
@@ -190,6 +207,7 @@ async def chat_completion_with_backoff(
     deepthought=False,
     model_kwargs: dict = {},
     tracer: dict = {},
+    tools=None,
 ) -> AsyncGenerator[ResponseWithThought, None]:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_async_clients.get(client_key)
@@ -258,6 +276,8 @@ async def chat_completion_with_backoff(
     read_timeout = 300 if is_local_api(api_base_url) else 60
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
+    if tools:
+        model_kwargs["tools"] = tools
 
     aggregated_response = ""
     final_chunk = None
@@ -277,7 +297,7 @@ async def chat_completion_with_backoff(
             raise ValueError("No response by model.")
         aggregated_response = response.choices[0].message.content
         final_chunk = response
-        yield ResponseWithThought(
+        yield ResponseWithThought(text=aggregated_response)
     else:
         async for chunk in stream_processor(response):
             # Log the time taken to start response
@@ -293,8 +313,8 @@ async def chat_completion_with_backoff(
             response_chunk: ResponseWithThought = None
             response_delta = chunk.choices[0].delta
             if response_delta.content:
-                response_chunk = ResponseWithThought(
-                aggregated_response += response_chunk.
+                response_chunk = ResponseWithThought(text=response_delta.content)
+                aggregated_response += response_chunk.text
             elif response_delta.thought:
                 response_chunk = ResponseWithThought(thought=response_delta.thought)
             if response_chunk:
@@ -327,16 +347,16 @@ async def chat_completion_with_backoff(
         commit_conversation_trace(messages, aggregated_response, tracer)
 
 
-def
+def get_structured_output_support(model_name: str, api_base_url: str = None) -> StructuredOutputSupport:
     if model_name.startswith("deepseek-reasoner"):
-        return
+        return StructuredOutputSupport.NONE
     if api_base_url:
         host = urlparse(api_base_url).hostname
         if host and host.endswith(".ai.azure.com"):
-            return
+            return StructuredOutputSupport.OBJECT
        if host == "api.deepinfra.com":
-            return
-    return
+            return StructuredOutputSupport.OBJECT
+    return StructuredOutputSupport.TOOL
 
 
 def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
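The new `get_structured_output_support` helper replaces the earlier JSON-support check. A minimal usage sketch, assuming the caller branches on the returned enum to decide how to request structured output; the model name and branch actions are illustrative:

```python
from khoj.processor.conversation.openai.utils import get_structured_output_support
from khoj.processor.conversation.utils import StructuredOutputSupport

support = get_structured_output_support("gpt-4o-mini", api_base_url=None)
if support == StructuredOutputSupport.TOOL:
    print("use native tool calls")            # default for OpenAI-compatible APIs
elif support == StructuredOutputSupport.OBJECT:
    print("use json_object response format")  # e.g. Azure AI or DeepInfra hosts
else:
    print("fall back to prompted JSON")       # e.g. deepseek-reasoner
```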
@@ -345,6 +365,43 @@ def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
     """
     formatted_messages = []
     for message in deepcopy(messages):
+        # Handle tool call and tool result message types
+        message_type = message.additional_kwargs.get("message_type")
+        if message_type == "tool_call":
+            # Convert tool_call to OpenAI function call format
+            content = []
+            for part in message.content:
+                content.append(
+                    {
+                        "type": "function",
+                        "id": part.get("id"),
+                        "function": {
+                            "name": part.get("name"),
+                            "arguments": json.dumps(part.get("input", part.get("args", {}))),
+                        },
+                    }
+                )
+            formatted_messages.append(
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "tool_calls": content,
+                }
+            )
+            continue
+        if message_type == "tool_result":
+            # Convert tool_result to OpenAI tool result format
+            # Each part is a result for a tool call
+            for part in message.content:
+                formatted_messages.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": part.get("id") or part.get("tool_use_id"),
+                        "name": part.get("name"),
+                        "content": part.get("content"),
+                    }
+                )
+            continue
         if isinstance(message.content, list) and not is_openai_api(api_base_url):
             assistant_texts = []
             has_images = False
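For reference, a sketch of the message shapes this new branch expects: a langchain `ChatMessage` tagged with `message_type` in `additional_kwargs`, whose content parts carry the tool call or tool result fields read above. The roles and concrete values are illustrative assumptions, not taken from the diff:

```python
from langchain_core.messages.chat import ChatMessage

tool_call_message = ChatMessage(
    role="assistant",
    content=[{"name": "search_web", "args": {"query": "khoj ai"}, "id": "call_1"}],
    additional_kwargs={"message_type": "tool_call"},
)
tool_result_message = ChatMessage(
    role="user",
    content=[{"type": "tool_result", "id": "call_1", "name": "search_web", "content": "..."}],
    additional_kwargs={"message_type": "tool_result"},
)
```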
@@ -708,3 +765,47 @@ def add_qwen_no_think_tag(formatted_messages: List[dict]) -> None:
             if isinstance(content_part, dict) and content_part.get("type") == "text":
                 content_part["text"] += " /no_think"
                 break
+
+
+def to_openai_tools(tools: List[ToolDefinition]) -> List[Dict] | None:
+    "Transform tool definitions from standard format to OpenAI format."
+    openai_tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": tool.name,
+                "description": tool.description,
+                "parameters": clean_response_schema(tool.schema),
+            },
+        }
+        for tool in tools
+    ]
+
+    return openai_tools or None
+
+
+def clean_response_schema(schema: BaseModel | dict) -> dict:
+    """
+    Format response schema to be compatible with OpenAI API.
+
+    Clean the response schema by removing unsupported fields.
+    """
+    # Normalize schema to OpenAI compatible JSON schema format
+    schema_json = schema if isinstance(schema, dict) else schema.model_json_schema()
+    schema_json = _ensure_strict_json_schema(schema_json, path=(), root=schema_json)
+
+    # Recursively drop unsupported fields from schema passed to OpenAI API
+    # See https://platform.openai.com/docs/guides/structured-outputs#supported-schemas
+    fields_to_exclude = ["minItems", "maxItems"]
+    if isinstance(schema_json, dict) and isinstance(schema_json.get("properties"), dict):
+        for _, prop_value in schema_json["properties"].items():
+            if isinstance(prop_value, dict):
+                # Remove specified fields from direct properties
+                for field in fields_to_exclude:
+                    prop_value.pop(field, None)
+                # Recursively remove specified fields from child properties
+                if "items" in prop_value and isinstance(prop_value["items"], dict):
+                    clean_response_schema(prop_value["items"])
+
+    # Return cleaned schema
+    return schema_json
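A minimal sketch of the new `to_openai_tools` helper, assuming a `ToolDefinition` that carries `name`, `description`, and `schema` fields as used above; its exact constructor signature is not shown in this diff:

```python
from khoj.processor.conversation.openai.utils import to_openai_tools
from khoj.utils.helpers import ToolDefinition

# Hypothetical tool definition with a plain JSON schema for its parameters
search_tool = ToolDefinition(
    name="search_web",
    description="Search the internet for up to date information.",
    schema={
        "type": "object",
        "properties": {"query": {"type": "string"}},
        "required": ["query"],
    },
)
openai_tools = to_openai_tools([search_tool])
# -> [{"type": "function", "function": {"name": "search_web", "description": ..., "parameters": {...}}}]
```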
khoj/processor/conversation/prompts.py

@@ -667,33 +667,37 @@ Here's some additional context about you:
 
 plan_function_execution = PromptTemplate.from_template(
     """
-You are Khoj, a smart, creative and
-Create a multi-step plan and intelligently iterate on the plan
+You are Khoj, a smart, creative and meticulous researcher. Use the provided tool AIs to accomplish the task assigned to you.
+Create a multi-step plan and intelligently iterate on the plan to complete the task.
 {personality_context}
 
 # Instructions
--
+- Provide highly diverse, detailed requests to the tool AIs, one tool AI at a time, to gather information, perform actions etc. Their response will be shown to you in the next iteration.
 - Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
 - Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
 - Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
 - Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
-- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to
-- Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
+- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to accomplish the task assigned to you. Only stop when you have completed the task.
 
 # Examples
-Assuming you can search the user's
+Assuming you can search the user's files and the internet.
 - When the user asks for the population of their hometown
-  1. Try look up their hometown in their notes. Ask the
-  2.
-  3.
+  1. Try look up their hometown in their notes. Ask the semantic search AI to search for their birth certificate, childhood memories, school, resume etc.
+  2. Use the other document retrieval tools to build on the semantic search results, fill in the gaps, add more details or confirm your hypothesis.
+  3. If not found in their notes, try infer their hometown from their online social media profiles. Ask the online search AI to look for {username}'s biography, school, resume on linkedin, facebook, website etc.
+  4. Only then try find the latest population of their hometown by reading official websites with the help of the online search and web page reading AI.
 - When the user asks for their computer's specs
-  1. Try find their computer model in their
+  1. Try find their computer model in their documents.
   2. Now find webpages with their computer model's spec online.
   3. Ask the webpage tool AI to extract the required information from the relevant webpages.
 - When the user asks what clothes to carry for their upcoming trip
-  1.
+  1. Use the semantic search tool to find the itinerary of their upcoming trip in their documents.
   2. Next find the weather forecast at the destination online.
-  3. Then find if
+  3. Then combine the semantic search, regex search, view file and list files tools to find if all the clothes they own in their files.
+- When the user asks you to summarize their expenses in a particular month
+  1. Combine the semantic search and regex search tool AI to find all transactions in the user's documents for that month.
+  2. Use the view file tool to read the line ranges in the matched files
+  3. Finally summarize the expenses
 
 # Background Context
 - Current Date: {day_of_week}, {current_date}
@@ -701,31 +705,9 @@ Assuming you can search the user's notes and the internet.
 - User Name: {username}
 
 # Available Tool AIs
-You decide which of the tool AIs listed below would you use to
+You decide which of the tool AIs listed below would you use to accomplish the user assigned task. You **only** have access to the following tool AIs:
 
 {tools}
-
-Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
-Response format:
-{{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
-""".strip()
-)
-
-plan_function_execution_next_tool = PromptTemplate.from_template(
-    """
-Given the results of your previous iterations, which tool AI will you use next to answer the target query?
-
-# Target Query:
-{query}
-""".strip()
-)
-
-previous_iteration = PromptTemplate.from_template(
-    """
-# Iteration {index}:
-- tool: {tool}
-- query: {query}
-- result: {result}
 """.strip()
 )
 
khoj/processor/conversation/utils.py

@@ -10,7 +10,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
 
 import PIL.Image
 import pyjson5
@@ -137,60 +137,83 @@ class OperatorRun:
         }
 
 
+class ToolCall:
+    def __init__(self, name: str, args: dict, id: str):
+        self.name = name
+        self.args = args
+        self.id = id
+
+
 class ResearchIteration:
     def __init__(
         self,
-
-        query: str,
+        query: ToolCall | dict | str,
         context: list = None,
         onlineContext: dict = None,
         codeContext: dict = None,
         operatorContext: dict | OperatorRun = None,
         summarizedResult: str = None,
         warning: str = None,
+        raw_response: list = None,
     ):
-        self.
-        self.query = query
+        self.query = ToolCall(**query) if isinstance(query, dict) else query
         self.context = context
         self.onlineContext = onlineContext
         self.codeContext = codeContext
         self.operatorContext = OperatorRun(**operatorContext) if isinstance(operatorContext, dict) else operatorContext
         self.summarizedResult = summarizedResult
         self.warning = warning
+        self.raw_response = raw_response
 
     def to_dict(self) -> dict:
         data = vars(self).copy()
+        data["query"] = self.query.__dict__ if isinstance(self.query, ToolCall) else self.query
         data["operatorContext"] = self.operatorContext.to_dict() if self.operatorContext else None
         return data
 
 
 def construct_iteration_history(
     previous_iterations: List[ResearchIteration],
-    previous_iteration_prompt: str,
     query: str = None,
+    query_images: List[str] = None,
+    query_files: str = None,
 ) -> list[ChatMessageModel]:
     iteration_history: list[ChatMessageModel] = []
-
-
-
-            tool=iteration.tool,
-            query=iteration.query,
-            result=iteration.summarizedResult,
-            index=idx + 1,
-        )
-
-        previous_iteration_messages.append({"type": "text", "text": iteration_data})
+    query_message_content = construct_structured_message(query, query_images, attached_file_context=query_files)
+    if query_message_content:
+        iteration_history.append(ChatMessageModel(by="you", message=query_message_content))
 
-
-    if query:
-        iteration_history.append(
-
+    for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            iteration_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+        iteration_history += [
             ChatMessageModel(
                 by="khoj",
-
-
-            )
-
+                message=iteration.raw_response or [iteration.query.__dict__],
+                intent=Intent(type="tool_call", query=query),
+            ),
+            ChatMessageModel(
+                by="you",
+                intent=Intent(type="tool_result"),
+                message=[
+                    {
+                        "type": "tool_result",
+                        "id": iteration.query.id,
+                        "name": iteration.query.name,
+                        "content": iteration.summarizedResult,
+                    }
+                ],
+            ),
+        ]
+
     return iteration_history
 
 
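A short sketch of the new `ResearchIteration` behaviour: a serialized tool call dict passed as `query` is rehydrated into a `ToolCall`, and `to_dict()` serializes it back. The concrete values are illustrative:

```python
from khoj.processor.conversation.utils import ResearchIteration, ToolCall

iteration = ResearchIteration(
    query={"name": "search_web", "args": {"query": "weather in Paris"}, "id": "call_1"},
    summarizedResult="It is 18C and cloudy in Paris.",
)
assert isinstance(iteration.query, ToolCall)
print(iteration.to_dict()["query"])
# {'name': 'search_web', 'args': {'query': 'weather in Paris'}, 'id': 'call_1'}
```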
@@ -302,33 +325,44 @@ def construct_tool_chat_history(
         ConversationCommand.Notes: (
             lambda iteration: [c["query"] for c in iteration.context] if iteration.context else []
         ),
-        ConversationCommand.
+        ConversationCommand.SearchWeb: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.
+        ConversationCommand.ReadWebpage: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.
+        ConversationCommand.RunCode: (
             lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
         ),
     }
     for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            chat_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+
         # If a tool is provided use the inferred query extractor for that tool if available
         # If no tool is provided, use inferred query extractor for the tool used in the iteration
         # Fallback to base extractor if the tool does not have an inferred query extractor
         inferred_query_extractor = extract_inferred_query_map.get(
-            tool or ConversationCommand(iteration.
+            tool or ConversationCommand(iteration.query.name), base_extractor
         )
         chat_history += [
             ChatMessageModel(
                 by="you",
-                message=iteration.query,
+                message=yaml.dump(iteration.query.args, default_flow_style=False),
             ),
             ChatMessageModel(
                 by="khoj",
                 intent=Intent(
                     type="remember",
-                    query=iteration.query,
+                    query=yaml.dump(iteration.query.args, default_flow_style=False),
                     inferred_queries=inferred_query_extractor(iteration),
                     memory_type="notes",
                 ),
@@ -481,28 +515,32 @@ Khoj: "{chat_response}"
 
 def construct_structured_message(
     message: list[dict] | str,
-    images: list[str],
-    model_type: str,
-    vision_enabled: bool,
+    images: list[str] = None,
+    model_type: str = None,
+    vision_enabled: bool = True,
     attached_file_context: str = None,
 ):
     """
-    Format messages into appropriate multimedia format for supported chat model types
+    Format messages into appropriate multimedia format for supported chat model types.
+
+    Assume vision is enabled and chat model provider supports messages in chatml format, unless specified otherwise.
     """
-    if model_type in [
+    if not model_type or model_type in [
         ChatModel.ModelType.OPENAI,
         ChatModel.ModelType.GOOGLE,
         ChatModel.ModelType.ANTHROPIC,
     ]:
-        constructed_messages: List[dict[str, Any]] =
-
-
-
+        constructed_messages: List[dict[str, Any]] = []
+        if not is_none_or_empty(message):
+            constructed_messages += [{"type": "text", "text": message}] if isinstance(message, str) else message
+        # Drop image message passed by caller if chat model does not have vision enabled
+        if not vision_enabled:
+            constructed_messages = [m for m in constructed_messages if m.get("type") != "image_url"]
         if not is_none_or_empty(attached_file_context):
-            constructed_messages
+            constructed_messages += [{"type": "text", "text": attached_file_context}]
         if vision_enabled and images:
             for image in images:
-                constructed_messages
+                constructed_messages += [{"type": "image_url", "image_url": {"url": image}}]
         return constructed_messages
 
     message = message if isinstance(message, str) else "\n\n".join(m["text"] for m in message)
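With the new defaults, `construct_structured_message` can be called with just a message and optional images. A minimal sketch of the default, vision-enabled path; the sample inputs are illustrative:

```python
from khoj.processor.conversation.utils import construct_structured_message

parts = construct_structured_message(
    "What is the weather like today?",
    images=["data:image/webp;base64,..."],
)
# -> [{"type": "text", "text": "What is the weather like today?"},
#     {"type": "image_url", "image_url": {"url": "data:image/webp;base64,..."}}]
```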
@@ -638,7 +676,11 @@ def generate_chatml_messages_with_context(
             chat_message, chat.images if role == "user" else [], model_type, vision_enabled
         )
 
-        reconstructed_message = ChatMessage(
+        reconstructed_message = ChatMessage(
+            content=message_content,
+            role=role,
+            additional_kwargs={"message_type": chat.intent.type if chat.intent else None},
+        )
         chatml_messages.insert(0, reconstructed_message)
 
         if len(chatml_messages) >= 3 * lookback_turns:
@@ -737,10 +779,21 @@ def count_tokens(
     message_content_parts: list[str] = []
     # Collate message content into single string to ease token counting
     for part in message_content:
-        if isinstance(part, dict) and part.get("type") == "
-            message_content_parts.append(part["text"])
-        elif isinstance(part, dict) and part.get("type") == "image_url":
+        if isinstance(part, dict) and part.get("type") == "image_url":
             image_count += 1
+        elif isinstance(part, dict) and part.get("type") == "text":
+            message_content_parts.append(part["text"])
+        elif isinstance(part, dict) and hasattr(part, "model_dump"):
+            message_content_parts.append(json.dumps(part.model_dump()))
+        elif isinstance(part, dict) and hasattr(part, "__dict__"):
+            message_content_parts.append(json.dumps(part.__dict__))
+        elif isinstance(part, dict):
+            # If part is a dict but not a recognized type, convert to JSON string
+            try:
+                message_content_parts.append(json.dumps(part))
+            except (TypeError, ValueError) as e:
+                logger.warning(f"Failed to serialize part {part} to JSON: {e}. Skipping.")
+                image_count += 1  # Treat as an image/binary if serialization fails
         elif isinstance(part, str):
             message_content_parts.append(part)
         else:
@@ -753,6 +806,15 @@ def count_tokens(
         return len(encoder.encode(json.dumps(message_content)))
 
 
+def count_total_tokens(messages: list[ChatMessage], encoder, system_message: Optional[ChatMessage]) -> Tuple[int, int]:
+    """Count total tokens in messages including system message"""
+    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
+    message_tokens = sum([count_tokens(message.content, encoder) for message in messages])
+    # Reserves 4 tokens to demarcate each message (e.g <|im_start|>user, <|im_end|>, <|endoftext|> etc.)
+    total_tokens = message_tokens + system_message_tokens + 4 * len(messages)
+    return total_tokens, system_message_tokens
+
+
 def truncate_messages(
     messages: list[ChatMessage],
     max_prompt_size: int,
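A small sketch of the extracted `count_total_tokens` helper, assuming a tiktoken encoder as used elsewhere in khoj for token counting; the messages shown are illustrative:

```python
import tiktoken
from langchain_core.messages.chat import ChatMessage

from khoj.processor.conversation.utils import count_total_tokens

encoder = tiktoken.get_encoding("cl100k_base")
system_message = ChatMessage(role="system", content="You are Khoj, a helpful assistant.")
messages = [ChatMessage(role="user", content="Hello!")]

total_tokens, system_tokens = count_total_tokens(messages, encoder, system_message)
print(total_tokens, system_tokens)
```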
@@ -771,23 +833,30 @@ def truncate_messages(
             break
 
     # Drop older messages until under max supported prompt size by model
-
-    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
-    tokens = sum([count_tokens(message.content, encoder) for message in messages])
-    total_tokens = tokens + system_message_tokens + 4 * len(messages)
+    total_tokens, system_message_tokens = count_total_tokens(messages, encoder, system_message)
 
     while total_tokens > max_prompt_size and (len(messages) > 1 or len(messages[0].content) > 1):
-
+        # If the last message has more than one content part, pop the oldest content part.
+        # For tool calls, the whole message should dropped, assistant's tool call content being truncated annoys AI APIs.
+        if len(messages[-1].content) > 1 and messages[-1].additional_kwargs.get("message_type") != "tool_call":
             # The oldest content part is earlier in content list. So pop from the front.
             messages[-1].content.pop(0)
+        # Otherwise, pop the last message if it has only one content part or is a tool call.
         else:
             # The oldest message is the last one. So pop from the back.
-            messages.pop()
-
-
+            dropped_message = messages.pop()
+            # Drop tool result pair of tool call, if tool call message has been removed
+            if (
+                dropped_message.additional_kwargs.get("message_type") == "tool_call"
+                and messages
+                and messages[-1].additional_kwargs.get("message_type") == "tool_result"
+            ):
+                messages.pop()
+
+        total_tokens, _ = count_total_tokens(messages, encoder, system_message)
 
     # Truncate current message if still over max supported prompt size by model
-    total_tokens =
+    total_tokens, _ = count_total_tokens(messages, encoder, system_message)
     if total_tokens > max_prompt_size:
         # At this point, a single message with a single content part of type dict should remain
         assert (
@@ -1149,13 +1218,15 @@ def messages_to_print(messages: list[ChatMessage], max_length: int = 70) -> str:
     return "\n".join([f"{json.dumps(safe_serialize(message.content))[:max_length]}..." for message in messages])
 
 
-class
+class StructuredOutputSupport(int, Enum):
     NONE = 0
     OBJECT = 1
     SCHEMA = 2
+    TOOL = 3
 
 
 class ResponseWithThought:
-    def __init__(self,
-        self.
+    def __init__(self, text: str = None, thought: str = None, raw_content: list = None):
+        self.text = text
         self.thought = thought
+        self.raw_content = raw_content
khoj/processor/operator/grounding_agent.py

@@ -73,7 +73,7 @@ class GroundingAgent:
         grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
         screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
         grounding_messages_content = construct_structured_message(
-            grounding_user_prompt, screenshots, self.model.
+            grounding_user_prompt, screenshots, self.model.model_type, vision_enabled=True
         )
         return [{"role": "user", "content": grounding_messages_content}]
 