khoj 1.42.8.dev6__py3-none-any.whl → 1.42.9.dev16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/app/agents/layout-2e626327abfbe612.js +1 -0
  4. khoj/interface/compiled/_next/static/chunks/app/agents/{page-9a4610474cd59a71.js → page-0006674668eb5a4d.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/app/automations/{page-f7bb9d777b7745d4.js → page-4c465cde2d14cb52.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/chat/layout-d6acbba22ccac0ff.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/{page-ef738950ea1babc3.js → page-9967631715682f3c.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/{page-2b3056cba8aa96ce.js → page-6e91caf9bc0c8aba.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/search/layout-94c76c3a41db42a2.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/search/{page-4885df3cd175c957.js → page-883b7d8d2e3abe3e.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/settings/{page-8be3b35178abf2ec.js → page-95e994ddac31473f.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-95998f0bdc22bb13.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-4a4b0c0f4749c2b2.js → page-8c8c175f7f212b03.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/{webpack-15412ee214acd999.js → webpack-4bf3eab7681a1206.js} +1 -1
  15. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  16. khoj/interface/compiled/_next/static/css/440ae0f0f650dc35.css +1 -0
  17. khoj/interface/compiled/_next/static/css/bd2071cad2ecf293.css +1 -0
  18. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +1 -0
  19. khoj/interface/compiled/agents/index.html +2 -2
  20. khoj/interface/compiled/agents/index.txt +2 -2
  21. khoj/interface/compiled/automations/index.html +2 -2
  22. khoj/interface/compiled/automations/index.txt +3 -3
  23. khoj/interface/compiled/chat/index.html +2 -2
  24. khoj/interface/compiled/chat/index.txt +2 -2
  25. khoj/interface/compiled/index.html +2 -2
  26. khoj/interface/compiled/index.txt +2 -2
  27. khoj/interface/compiled/search/index.html +2 -2
  28. khoj/interface/compiled/search/index.txt +2 -2
  29. khoj/interface/compiled/settings/index.html +2 -2
  30. khoj/interface/compiled/settings/index.txt +4 -4
  31. khoj/interface/compiled/share/chat/index.html +2 -2
  32. khoj/interface/compiled/share/chat/index.txt +2 -2
  33. khoj/processor/conversation/anthropic/anthropic_chat.py +11 -2
  34. khoj/processor/conversation/anthropic/utils.py +90 -103
  35. khoj/processor/conversation/google/gemini_chat.py +4 -1
  36. khoj/processor/conversation/google/utils.py +80 -18
  37. khoj/processor/conversation/offline/chat_model.py +3 -3
  38. khoj/processor/conversation/openai/gpt.py +13 -38
  39. khoj/processor/conversation/openai/utils.py +113 -12
  40. khoj/processor/conversation/prompts.py +17 -35
  41. khoj/processor/conversation/utils.py +128 -57
  42. khoj/processor/operator/grounding_agent.py +1 -1
  43. khoj/processor/operator/operator_agent_binary.py +4 -3
  44. khoj/processor/tools/online_search.py +18 -0
  45. khoj/processor/tools/run_code.py +1 -1
  46. khoj/routers/api_chat.py +1 -1
  47. khoj/routers/helpers.py +293 -26
  48. khoj/routers/research.py +169 -155
  49. khoj/utils/helpers.py +284 -8
  50. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/METADATA +1 -1
  51. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/RECORD +62 -62
  52. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
  53. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
  54. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  55. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  56. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +0 -1
  57. khoj/interface/compiled/_next/static/css/76c658ee459140a9.css +0 -1
  58. khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +0 -1
  59. khoj/interface/compiled/_next/static/css/e6da1287d41f5409.css +0 -1
  60. /khoj/interface/compiled/_next/static/chunks/{1327-1a9107b9a2a04a98.js → 1327-3b1a41af530fa8ee.js} +0 -0
  61. /khoj/interface/compiled/_next/static/chunks/{1915-5c6508f6ebb62a30.js → 1915-fbfe167c84ad60c5.js} +0 -0
  62. /khoj/interface/compiled/_next/static/chunks/{2117-080746c8e170c81a.js → 2117-e78b6902ad6f75ec.js} +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{2939-4af3fd24b8ffc9ad.js → 2939-4d4084c5b888b960.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{4447-cd95608f8e93e711.js → 4447-d6cf93724d57e34b.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{8667-50b03a89e82e0ba7.js → 8667-4b7790573b08c50d.js} +0 -0
  66. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_buildManifest.js +0 -0
  67. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_ssgManifest.js +0 -0
  68. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/WHEEL +0 -0
  69. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/entry_points.txt +0 -0
  70. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/openai/utils.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 from copy import deepcopy
@@ -9,6 +10,7 @@ from urllib.parse import urlparse
 import httpx
 import openai
 from langchain_core.messages.chat import ChatMessage
+from openai.lib._pydantic import _ensure_strict_json_schema
 from openai.lib.streaming.chat import (
     ChatCompletionStream,
     ChatCompletionStreamEvent,
@@ -20,6 +22,7 @@ from openai.types.chat.chat_completion_chunk import (
     Choice,
     ChoiceDelta,
 )
+from pydantic import BaseModel
 from tenacity import (
     before_sleep_log,
     retry,
@@ -30,11 +33,13 @@ from tenacity import (
 )

 from khoj.processor.conversation.utils import (
-    JsonSupport,
     ResponseWithThought,
+    StructuredOutputSupport,
+    ToolCall,
     commit_conversation_trace,
 )
 from khoj.utils.helpers import (
+    ToolDefinition,
     convert_image_data_uri,
     get_chat_usage_metrics,
     get_openai_async_client,
@@ -72,7 +77,7 @@ def completion_with_backoff(
     deepthought: bool = False,
     model_kwargs: dict = {},
     tracer: dict = {},
-) -> str:
+) -> ResponseWithThought:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_clients.get(client_key)
     if not client:
@@ -117,6 +122,9 @@ def completion_with_backoff(
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

+    tool_ids = []
+    tool_calls: list[ToolCall] = []
+    thoughts = ""
     aggregated_response = ""
     if stream:
         with client.beta.chat.completions.stream(
@@ -130,7 +138,16 @@ def completion_with_backoff(
                 if chunk.type == "content.delta":
                     aggregated_response += chunk.delta
                 elif chunk.type == "thought.delta":
-                    pass
+                    thoughts += chunk.delta
+                elif chunk.type == "chunk" and chunk.chunk.choices and chunk.chunk.choices[0].delta.tool_calls:
+                    tool_ids += [tool_call.id for tool_call in chunk.chunk.choices[0].delta.tool_calls]
+                elif chunk.type == "tool_calls.function.arguments.done":
+                    tool_calls += [ToolCall(name=chunk.name, args=json.loads(chunk.arguments), id=None)]
+        if tool_calls:
+            tool_calls = [
+                ToolCall(name=chunk.name, args=chunk.args, id=tool_id) for chunk, tool_id in zip(tool_calls, tool_ids)
+            ]
+            aggregated_response = json.dumps([tool_call.__dict__ for tool_call in tool_calls])
     else:
         # Non-streaming chat completion
         chunk = client.beta.chat.completions.parse(
@@ -164,7 +181,7 @@ def completion_with_backoff(
     if is_promptrace_enabled():
         commit_conversation_trace(messages, aggregated_response, tracer)

-    return aggregated_response
+    return ResponseWithThought(text=aggregated_response, thought=thoughts)


 @retry(
@@ -190,6 +207,7 @@ async def chat_completion_with_backoff(
     deepthought=False,
     model_kwargs: dict = {},
     tracer: dict = {},
+    tools=None,
 ) -> AsyncGenerator[ResponseWithThought, None]:
     client_key = f"{openai_api_key}--{api_base_url}"
     client = openai_async_clients.get(client_key)
@@ -258,6 +276,8 @@ async def chat_completion_with_backoff(
     read_timeout = 300 if is_local_api(api_base_url) else 60
     if os.getenv("KHOJ_LLM_SEED"):
         model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))
+    if tools:
+        model_kwargs["tools"] = tools

     aggregated_response = ""
     final_chunk = None
@@ -277,7 +297,7 @@ async def chat_completion_with_backoff(
             raise ValueError("No response by model.")
         aggregated_response = response.choices[0].message.content
         final_chunk = response
-        yield ResponseWithThought(response=aggregated_response)
+        yield ResponseWithThought(text=aggregated_response)
     else:
         async for chunk in stream_processor(response):
             # Log the time taken to start response
@@ -293,8 +313,8 @@ async def chat_completion_with_backoff(
             response_chunk: ResponseWithThought = None
             response_delta = chunk.choices[0].delta
             if response_delta.content:
-                response_chunk = ResponseWithThought(response=response_delta.content)
-                aggregated_response += response_chunk.response
+                response_chunk = ResponseWithThought(text=response_delta.content)
+                aggregated_response += response_chunk.text
             elif response_delta.thought:
                 response_chunk = ResponseWithThought(thought=response_delta.thought)
             if response_chunk:
@@ -327,16 +347,16 @@ async def chat_completion_with_backoff(
         commit_conversation_trace(messages, aggregated_response, tracer)


-def get_openai_api_json_support(model_name: str, api_base_url: str = None) -> JsonSupport:
+def get_structured_output_support(model_name: str, api_base_url: str = None) -> StructuredOutputSupport:
     if model_name.startswith("deepseek-reasoner"):
-        return JsonSupport.NONE
+        return StructuredOutputSupport.NONE
     if api_base_url:
         host = urlparse(api_base_url).hostname
         if host and host.endswith(".ai.azure.com"):
-            return JsonSupport.OBJECT
+            return StructuredOutputSupport.OBJECT
         if host == "api.deepinfra.com":
-            return JsonSupport.OBJECT
-    return JsonSupport.SCHEMA
+            return StructuredOutputSupport.OBJECT
+    return StructuredOutputSupport.TOOL


 def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
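Per the renamed helper above, plain OpenAI-compatible endpoints now default to native tool calling rather than JSON schema mode. A quick sketch of the mapping, using the function exactly as diffed:

# Sketch of the new defaults per get_structured_output_support above.
from khoj.processor.conversation.openai.utils import get_structured_output_support
from khoj.processor.conversation.utils import StructuredOutputSupport

assert get_structured_output_support("gpt-4o") == StructuredOutputSupport.TOOL
assert get_structured_output_support("llama-3", "https://api.deepinfra.com/v1/openai") == StructuredOutputSupport.OBJECT
assert get_structured_output_support("deepseek-reasoner") == StructuredOutputSupport.NONE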
@@ -345,6 +365,43 @@ def format_message_for_api(messages: List[ChatMessage], api_base_url: str) -> List[dict]:
     """
     formatted_messages = []
     for message in deepcopy(messages):
+        # Handle tool call and tool result message types
+        message_type = message.additional_kwargs.get("message_type")
+        if message_type == "tool_call":
+            # Convert tool_call to OpenAI function call format
+            content = []
+            for part in message.content:
+                content.append(
+                    {
+                        "type": "function",
+                        "id": part.get("id"),
+                        "function": {
+                            "name": part.get("name"),
+                            "arguments": json.dumps(part.get("input", part.get("args", {}))),
+                        },
+                    }
+                )
+            formatted_messages.append(
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "tool_calls": content,
+                }
+            )
+            continue
+        if message_type == "tool_result":
+            # Convert tool_result to OpenAI tool result format
+            # Each part is a result for a tool call
+            for part in message.content:
+                formatted_messages.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": part.get("id") or part.get("tool_use_id"),
+                        "name": part.get("name"),
+                        "content": part.get("content"),
+                    }
+                )
+            continue
         if isinstance(message.content, list) and not is_openai_api(api_base_url):
             assistant_texts = []
             has_images = False
@@ -708,3 +765,47 @@ def add_qwen_no_think_tag(formatted_messages: List[dict]) -> None:
             if isinstance(content_part, dict) and content_part.get("type") == "text":
                 content_part["text"] += " /no_think"
                 break
+
+
+def to_openai_tools(tools: List[ToolDefinition]) -> List[Dict] | None:
+    "Transform tool definitions from standard format to OpenAI format."
+    openai_tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": tool.name,
+                "description": tool.description,
+                "parameters": clean_response_schema(tool.schema),
+            },
+        }
+        for tool in tools
+    ]
+
+    return openai_tools or None
+
+
+def clean_response_schema(schema: BaseModel | dict) -> dict:
+    """
+    Format response schema to be compatible with OpenAI API.
+
+    Clean the response schema by removing unsupported fields.
+    """
+    # Normalize schema to OpenAI compatible JSON schema format
+    schema_json = schema if isinstance(schema, dict) else schema.model_json_schema()
+    schema_json = _ensure_strict_json_schema(schema_json, path=(), root=schema_json)
+
+    # Recursively drop unsupported fields from schema passed to OpenAI API
+    # See https://platform.openai.com/docs/guides/structured-outputs#supported-schemas
+    fields_to_exclude = ["minItems", "maxItems"]
+    if isinstance(schema_json, dict) and isinstance(schema_json.get("properties"), dict):
+        for _, prop_value in schema_json["properties"].items():
+            if isinstance(prop_value, dict):
+                # Remove specified fields from direct properties
+                for field in fields_to_exclude:
+                    prop_value.pop(field, None)
+                # Recursively remove specified fields from child properties
+                if "items" in prop_value and isinstance(prop_value["items"], dict):
+                    clean_response_schema(prop_value["items"])

+    # Return cleaned schema
+    return schema_json
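Taken together, these two helpers turn a pydantic model into the strict JSON schema payload OpenAI function calling expects. A minimal usage sketch; ToolDefinition's exact constructor in khoj.utils.helpers is assumed to take the name, description, and schema fields that to_openai_tools reads:

from pydantic import BaseModel

from khoj.processor.conversation.openai.utils import to_openai_tools
from khoj.utils.helpers import ToolDefinition


class SearchArgs(BaseModel):
    query: str
    max_results: int = 5


# Assumed constructor signature, mirroring the fields the helper reads.
tools = [ToolDefinition(name="semantic_search", description="Search the user's documents.", schema=SearchArgs)]
payload = to_openai_tools(tools)
# payload[0]["function"]["parameters"] holds the strict JSON schema, with fields
# OpenAI structured outputs reject (e.g. minItems/maxItems) stripped by clean_response_schema.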
khoj/processor/conversation/prompts.py
@@ -667,33 +667,37 @@ Here's some additional context about you:

 plan_function_execution = PromptTemplate.from_template(
     """
-You are Khoj, a smart, creative and methodical researcher. Use the provided tool AIs to investigate information to answer query.
-Create a multi-step plan and intelligently iterate on the plan based on the retrieved information to find the requested information.
+You are Khoj, a smart, creative and meticulous researcher. Use the provided tool AIs to accomplish the task assigned to you.
+Create a multi-step plan and intelligently iterate on the plan to complete the task.
 {personality_context}

 # Instructions
-- Ask highly diverse, detailed queries to the tool AIs, one tool AI at a time, to discover required information or run calculations. Their response will be shown to you in the next iteration.
+- Provide highly diverse, detailed requests to the tool AIs, one tool AI at a time, to gather information, perform actions etc. Their response will be shown to you in the next iteration.
 - Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
 - Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
 - Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
 - Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
-- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to answer the user's question.
-- Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
+- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to accomplish the task assigned to you. Only stop when you have completed the task.

 # Examples
-Assuming you can search the user's notes and the internet.
+Assuming you can search the user's files and the internet.
 - When the user asks for the population of their hometown
-  1. Try look up their hometown in their notes. Ask the note search AI to search for their birth certificate, childhood memories, school, resume etc.
-  2. If not found in their notes, try infer their hometown from their online social media profiles. Ask the online search AI to look for {username}'s biography, school, resume on linkedin, facebook, website etc.
-  3. Only then try find the latest population of their hometown by reading official websites with the help of the online search and web page reading AI.
+  1. Try look up their hometown in their notes. Ask the semantic search AI to search for their birth certificate, childhood memories, school, resume etc.
+  2. Use the other document retrieval tools to build on the semantic search results, fill in the gaps, add more details or confirm your hypothesis.
+  3. If not found in their notes, try infer their hometown from their online social media profiles. Ask the online search AI to look for {username}'s biography, school, resume on linkedin, facebook, website etc.
+  4. Only then try find the latest population of their hometown by reading official websites with the help of the online search and web page reading AI.
 - When the user asks for their computer's specs
-  1. Try find their computer model in their notes.
+  1. Try find their computer model in their documents.
   2. Now find webpages with their computer model's spec online.
   3. Ask the webpage tool AI to extract the required information from the relevant webpages.
 - When the user asks what clothes to carry for their upcoming trip
-  1. Find the itinerary of their upcoming trip in their notes.
+  1. Use the semantic search tool to find the itinerary of their upcoming trip in their documents.
   2. Next find the weather forecast at the destination online.
-  3. Then find if they mentioned what clothes they own in their notes.
+  3. Then combine the semantic search, regex search, view file and list files tools to find if all the clothes they own in their files.
+- When the user asks you to summarize their expenses in a particular month
+  1. Combine the semantic search and regex search tool AI to find all transactions in the user's documents for that month.
+  2. Use the view file tool to read the line ranges in the matched files
+  3. Finally summarize the expenses

 # Background Context
 - Current Date: {day_of_week}, {current_date}
@@ -701,31 +705,9 @@ Assuming you can search the user's notes and the internet.
 - User Name: {username}

 # Available Tool AIs
-You decide which of the tool AIs listed below would you use to answer the user's question. You **only** have access to the following tool AIs:
+You decide which of the tool AIs listed below would you use to accomplish the user assigned task. You **only** have access to the following tool AIs:

 {tools}
-
-Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
-Response format:
-{{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
-""".strip()
-)
-
-plan_function_execution_next_tool = PromptTemplate.from_template(
-    """
-Given the results of your previous iterations, which tool AI will you use next to answer the target query?
-
-# Target Query:
-{query}
-""".strip()
-)
-
-previous_iteration = PromptTemplate.from_template(
-    """
-# Iteration {index}:
-- tool: {tool}
-- query: {query}
-- result: {result}
 """.strip()
 )

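The deleted response-format block and the removed plan_function_execution_next_tool and previous_iteration templates were the prompt-level JSON protocol for picking the next tool. With native tool calling, the same information now arrives as a ToolCall (defined in the conversation utils diff below). An illustrative before/after sketch with made-up values:

import json

from khoj.processor.conversation.utils import ToolCall

# Before: the model's text reply was the protocol and had to be parsed.
old_reply = json.loads('{"scratchpad": "check notes first", "tool": "notes", "query": "hometown"}')

# After: the OpenAI adapter aggregates native tool calls into ToolCall objects.
new_reply = ToolCall(name="notes", args={"query": "hometown"}, id="call_0")
assert old_reply["tool"] == new_reply.name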
khoj/processor/conversation/utils.py
@@ -10,7 +10,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from io import BytesIO
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 import PIL.Image
 import pyjson5
@@ -137,60 +137,83 @@ class OperatorRun:
     }


+class ToolCall:
+    def __init__(self, name: str, args: dict, id: str):
+        self.name = name
+        self.args = args
+        self.id = id
+
+
 class ResearchIteration:
     def __init__(
         self,
-        tool: str,
-        query: str,
+        query: ToolCall | dict | str,
         context: list = None,
         onlineContext: dict = None,
         codeContext: dict = None,
         operatorContext: dict | OperatorRun = None,
         summarizedResult: str = None,
         warning: str = None,
+        raw_response: list = None,
     ):
-        self.tool = tool
-        self.query = query
+        self.query = ToolCall(**query) if isinstance(query, dict) else query
        self.context = context
        self.onlineContext = onlineContext
        self.codeContext = codeContext
        self.operatorContext = OperatorRun(**operatorContext) if isinstance(operatorContext, dict) else operatorContext
        self.summarizedResult = summarizedResult
        self.warning = warning
+        self.raw_response = raw_response

     def to_dict(self) -> dict:
         data = vars(self).copy()
+        data["query"] = self.query.__dict__ if isinstance(self.query, ToolCall) else self.query
         data["operatorContext"] = self.operatorContext.to_dict() if self.operatorContext else None
         return data


 def construct_iteration_history(
     previous_iterations: List[ResearchIteration],
-    previous_iteration_prompt: str,
     query: str = None,
+    query_images: List[str] = None,
+    query_files: str = None,
 ) -> list[ChatMessageModel]:
     iteration_history: list[ChatMessageModel] = []
-    previous_iteration_messages: list[dict] = []
-    for idx, iteration in enumerate(previous_iterations):
-        iteration_data = previous_iteration_prompt.format(
-            tool=iteration.tool,
-            query=iteration.query,
-            result=iteration.summarizedResult,
-            index=idx + 1,
-        )
-
-        previous_iteration_messages.append({"type": "text", "text": iteration_data})
+    query_message_content = construct_structured_message(query, query_images, attached_file_context=query_files)
+    if query_message_content:
+        iteration_history.append(ChatMessageModel(by="you", message=query_message_content))

-    if previous_iteration_messages:
-        if query:
-            iteration_history.append(ChatMessageModel(by="you", message=query))
-        iteration_history.append(
+    for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            iteration_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+        iteration_history += [
             ChatMessageModel(
                 by="khoj",
-                intent=Intent(type="remember", query=query),
-                message=previous_iteration_messages,
-            )
-        )
+                message=iteration.raw_response or [iteration.query.__dict__],
+                intent=Intent(type="tool_call", query=query),
+            ),
+            ChatMessageModel(
+                by="you",
+                intent=Intent(type="tool_result"),
+                message=[
+                    {
+                        "type": "tool_result",
+                        "id": iteration.query.id,
+                        "name": iteration.query.name,
+                        "content": iteration.summarizedResult,
+                    }
+                ],
+            ),
+        ]
+
     return iteration_history

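So a ResearchIteration now keys on a ToolCall rather than separate tool and query strings, and rehydrates dict queries on load. A small round-trip sketch, assuming the classes above are importable; values are made up:

from khoj.processor.conversation.utils import ResearchIteration

call = {"name": "semantic_search", "args": {"query": "trip itinerary"}, "id": "call_1"}
iteration = ResearchIteration(query=call, summarizedResult="Found itinerary in notes/trips.md")
assert iteration.query.name == "semantic_search"  # dict query rehydrates into a ToolCall
assert iteration.to_dict()["query"] == call  # to_dict flattens the ToolCall back to a plain dict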
@@ -302,33 +325,44 @@ def construct_tool_chat_history(
         ConversationCommand.Notes: (
             lambda iteration: [c["query"] for c in iteration.context] if iteration.context else []
         ),
-        ConversationCommand.Online: (
+        ConversationCommand.SearchWeb: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.Webpage: (
+        ConversationCommand.ReadWebpage: (
             lambda iteration: list(iteration.onlineContext.keys()) if iteration.onlineContext else []
         ),
-        ConversationCommand.Code: (
+        ConversationCommand.RunCode: (
             lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
         ),
     }
     for iteration in previous_iterations:
+        if not iteration.query or isinstance(iteration.query, str):
+            chat_history.append(
+                ChatMessageModel(
+                    by="you",
+                    message=iteration.summarizedResult
+                    or iteration.warning
+                    or "Please specify what you want to do next.",
+                )
+            )
+            continue
+
         # If a tool is provided use the inferred query extractor for that tool if available
         # If no tool is provided, use inferred query extractor for the tool used in the iteration
         # Fallback to base extractor if the tool does not have an inferred query extractor
         inferred_query_extractor = extract_inferred_query_map.get(
-            tool or ConversationCommand(iteration.tool), base_extractor
+            tool or ConversationCommand(iteration.query.name), base_extractor
         )
         chat_history += [
             ChatMessageModel(
                 by="you",
-                message=iteration.query,
+                message=yaml.dump(iteration.query.args, default_flow_style=False),
             ),
             ChatMessageModel(
                 by="khoj",
                 intent=Intent(
                     type="remember",
-                    query=iteration.query,
+                    query=yaml.dump(iteration.query.args, default_flow_style=False),
                     inferred_queries=inferred_query_extractor(iteration),
                     memory_type="notes",
                 ),
@@ -481,28 +515,32 @@ Khoj: "{chat_response}"

 def construct_structured_message(
     message: list[dict] | str,
-    images: list[str],
-    model_type: str,
-    vision_enabled: bool,
+    images: list[str] = None,
+    model_type: str = None,
+    vision_enabled: bool = True,
     attached_file_context: str = None,
 ):
     """
-    Format messages into appropriate multimedia format for supported chat model types
+    Format messages into appropriate multimedia format for supported chat model types.
+
+    Assume vision is enabled and chat model provider supports messages in chatml format, unless specified otherwise.
     """
-    if model_type in [
+    if not model_type or model_type in [
         ChatModel.ModelType.OPENAI,
         ChatModel.ModelType.GOOGLE,
         ChatModel.ModelType.ANTHROPIC,
     ]:
-        constructed_messages: List[dict[str, Any]] = (
-            [{"type": "text", "text": message}] if isinstance(message, str) else message
-        )
-
+        constructed_messages: List[dict[str, Any]] = []
+        if not is_none_or_empty(message):
+            constructed_messages += [{"type": "text", "text": message}] if isinstance(message, str) else message
+        # Drop image message passed by caller if chat model does not have vision enabled
+        if not vision_enabled:
+            constructed_messages = [m for m in constructed_messages if m.get("type") != "image_url"]
         if not is_none_or_empty(attached_file_context):
-            constructed_messages.append({"type": "text", "text": attached_file_context})
+            constructed_messages += [{"type": "text", "text": attached_file_context}]
         if vision_enabled and images:
             for image in images:
-                constructed_messages.append({"type": "image_url", "image_url": {"url": image}})
+                constructed_messages += [{"type": "image_url", "image_url": {"url": image}}]
         return constructed_messages

     message = message if isinstance(message, str) else "\n\n".join(m["text"] for m in message)
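With every parameter except the message now optional, callers can build chatml-style content without naming a model type. A usage sketch with illustrative values:

# model_type omitted, so chatml format is assumed; vision defaults to enabled.
from khoj.processor.conversation.utils import construct_structured_message

parts = construct_structured_message(
    "What does my trip itinerary say?",
    images=["data:image/webp;base64,..."],
    attached_file_context="notes/trips.md: Day 1 ...",
)
# -> a text part, then the attached file text part, then an image_url part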
@@ -638,7 +676,11 @@ def generate_chatml_messages_with_context(
             chat_message, chat.images if role == "user" else [], model_type, vision_enabled
         )

-        reconstructed_message = ChatMessage(content=message_content, role=role)
+        reconstructed_message = ChatMessage(
+            content=message_content,
+            role=role,
+            additional_kwargs={"message_type": chat.intent.type if chat.intent else None},
+        )
         chatml_messages.insert(0, reconstructed_message)

         if len(chatml_messages) >= 3 * lookback_turns:
@@ -737,10 +779,21 @@ def count_tokens(
     message_content_parts: list[str] = []
     # Collate message content into single string to ease token counting
     for part in message_content:
-        if isinstance(part, dict) and part.get("type") == "text":
-            message_content_parts.append(part["text"])
-        elif isinstance(part, dict) and part.get("type") == "image_url":
+        if isinstance(part, dict) and part.get("type") == "image_url":
             image_count += 1
+        elif isinstance(part, dict) and part.get("type") == "text":
+            message_content_parts.append(part["text"])
+        elif isinstance(part, dict) and hasattr(part, "model_dump"):
+            message_content_parts.append(json.dumps(part.model_dump()))
+        elif isinstance(part, dict) and hasattr(part, "__dict__"):
+            message_content_parts.append(json.dumps(part.__dict__))
+        elif isinstance(part, dict):
+            # If part is a dict but not a recognized type, convert to JSON string
+            try:
+                message_content_parts.append(json.dumps(part))
+            except (TypeError, ValueError) as e:
+                logger.warning(f"Failed to serialize part {part} to JSON: {e}. Skipping.")
+                image_count += 1  # Treat as an image/binary if serialization fails
         elif isinstance(part, str):
             message_content_parts.append(part)
         else:
@@ -753,6 +806,15 @@ def count_tokens(
     return len(encoder.encode(json.dumps(message_content)))


+def count_total_tokens(messages: list[ChatMessage], encoder, system_message: Optional[ChatMessage]) -> Tuple[int, int]:
+    """Count total tokens in messages including system message"""
+    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
+    message_tokens = sum([count_tokens(message.content, encoder) for message in messages])
+    # Reserves 4 tokens to demarcate each message (e.g <|im_start|>user, <|im_end|>, <|endoftext|> etc.)
+    total_tokens = message_tokens + system_message_tokens + 4 * len(messages)
+    return total_tokens, system_message_tokens
+
+
 def truncate_messages(
     messages: list[ChatMessage],
     max_prompt_size: int,
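The new helper centralizes the arithmetic the truncation loop previously repeated inline. For example, messages totalling 100 tokens plus a 20-token system message across 3 messages count as 100 + 20 + 4 × 3 = 132 tokens:

# Sketch of the reservation math used by count_total_tokens.
message_tokens, system_tokens, n_messages = 100, 20, 3
total_tokens = message_tokens + system_tokens + 4 * n_messages  # 132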
@@ -771,23 +833,30 @@ def truncate_messages(
         break

     # Drop older messages until under max supported prompt size by model
-    # Reserves 4 tokens to demarcate each message (e.g <|im_start|>user, <|im_end|>, <|endoftext|> etc.)
-    system_message_tokens = count_tokens(system_message.content, encoder) if system_message else 0
-    tokens = sum([count_tokens(message.content, encoder) for message in messages])
-    total_tokens = tokens + system_message_tokens + 4 * len(messages)
+    total_tokens, system_message_tokens = count_total_tokens(messages, encoder, system_message)

     while total_tokens > max_prompt_size and (len(messages) > 1 or len(messages[0].content) > 1):
-        if len(messages[-1].content) > 1:
+        # If the last message has more than one content part, pop the oldest content part.
+        # For tool calls, the whole message should dropped, assistant's tool call content being truncated annoys AI APIs.
+        if len(messages[-1].content) > 1 and messages[-1].additional_kwargs.get("message_type") != "tool_call":
             # The oldest content part is earlier in content list. So pop from the front.
             messages[-1].content.pop(0)
+        # Otherwise, pop the last message if it has only one content part or is a tool call.
         else:
             # The oldest message is the last one. So pop from the back.
-            messages.pop()
-            tokens = sum([count_tokens(message.content, encoder) for message in messages])
-            total_tokens = tokens + system_message_tokens + 4 * len(messages)
+            dropped_message = messages.pop()
+            # Drop tool result pair of tool call, if tool call message has been removed
+            if (
+                dropped_message.additional_kwargs.get("message_type") == "tool_call"
+                and messages
+                and messages[-1].additional_kwargs.get("message_type") == "tool_result"
+            ):
+                messages.pop()
+
+        total_tokens, _ = count_total_tokens(messages, encoder, system_message)

     # Truncate current message if still over max supported prompt size by model
-    total_tokens = tokens + system_message_tokens + 4 * len(messages)
+    total_tokens, _ = count_total_tokens(messages, encoder, system_message)
     if total_tokens > max_prompt_size:
         # At this point, a single message with a single content part of type dict should remain
         assert (
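One consequence of the new rule, sketched with hypothetical messages (newest first, as truncate_messages expects): when the oldest message is a tool call it is dropped whole, and its paired tool result goes with it, so the API never sees an orphaned result.

# Hypothetical messages illustrating the tool call/result pairing rule.
from langchain_core.messages.chat import ChatMessage

messages = [
    ChatMessage(role="user", content=[{"type": "text", "text": "latest turn"}]),
    ChatMessage(
        role="user",
        content=[{"type": "tool_result", "id": "call_9", "content": "..."}],
        additional_kwargs={"message_type": "tool_result"},
    ),
    ChatMessage(
        role="assistant",
        content=[{"type": "function", "id": "call_9"}],
        additional_kwargs={"message_type": "tool_call"},
    ),
]
# The oldest message (last) is a tool call: pop it whole instead of truncating
# its content parts, then pop its now-orphaned tool result neighbor too.
dropped = messages.pop()
if (
    dropped.additional_kwargs.get("message_type") == "tool_call"
    and messages
    and messages[-1].additional_kwargs.get("message_type") == "tool_result"
):
    messages.pop()
assert len(messages) == 1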
@@ -1149,13 +1218,15 @@ def messages_to_print(messages: list[ChatMessage], max_length: int = 70) -> str:
     return "\n".join([f"{json.dumps(safe_serialize(message.content))[:max_length]}..." for message in messages])


-class JsonSupport(int, Enum):
+class StructuredOutputSupport(int, Enum):
     NONE = 0
     OBJECT = 1
     SCHEMA = 2
+    TOOL = 3


 class ResponseWithThought:
-    def __init__(self, response: str = None, thought: str = None):
-        self.response = response
+    def __init__(self, text: str = None, thought: str = None, raw_content: list = None):
+        self.text = text
         self.thought = thought
+        self.raw_content = raw_content
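Call sites migrate from the old response= keyword to text=, with raw_content available for provider-native message parts. A minimal sketch with made-up values:

from khoj.processor.conversation.utils import ResponseWithThought

reply = ResponseWithThought(text="Paris", thought="User asked for France's capital.")
assert reply.text == "Paris" and reply.raw_content is None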
khoj/processor/operator/grounding_agent.py
@@ -73,7 +73,7 @@ class GroundingAgent:
         grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
         screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
         grounding_messages_content = construct_structured_message(
-            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+            grounding_user_prompt, screenshots, self.model.model_type, vision_enabled=True
         )
         return [{"role": "user", "content": grounding_messages_content}]