khoj 1.28.4.dev13__py3-none-any.whl → 1.28.4.dev22__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (40)
  1. khoj/interface/compiled/404/index.html +1 -1
  2. khoj/interface/compiled/_next/static/chunks/app/agents/{page-36da67f03a173e52.js → page-f29d6b3efa6f96c6.js} +1 -1
  3. khoj/interface/compiled/_next/static/chunks/app/automations/{page-774ae3e033f938cd.js → page-d3edae545a1b5393.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/app/chat/{page-a369e2bda9897794.js → page-60bfb0b3b81d3d9d.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/app/{page-322c37514a3a613a.js → page-cbc0e7b837bd35fa.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/search/{page-9b64f61caa5bd7f9.js → page-a5c277eff207959e.js} +1 -1
  7. khoj/interface/compiled/_next/static/chunks/app/settings/{page-10b288c103f19468.js → page-210bd54db4841333.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-959d5f097cf38c93.js → page-79cf030b31c7e793.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{webpack-d3a4ebfc304496fb.js → webpack-5018ce30001e8c48.js} +1 -1
  10. khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
  11. khoj/interface/compiled/_next/static/css/63e106a52a0ec4ca.css +1 -0
  12. khoj/interface/compiled/agents/index.html +1 -1
  13. khoj/interface/compiled/agents/index.txt +2 -2
  14. khoj/interface/compiled/automations/index.html +1 -1
  15. khoj/interface/compiled/automations/index.txt +2 -2
  16. khoj/interface/compiled/chat/index.html +1 -1
  17. khoj/interface/compiled/chat/index.txt +2 -2
  18. khoj/interface/compiled/index.html +1 -1
  19. khoj/interface/compiled/index.txt +2 -2
  20. khoj/interface/compiled/search/index.html +1 -1
  21. khoj/interface/compiled/search/index.txt +2 -2
  22. khoj/interface/compiled/settings/index.html +1 -1
  23. khoj/interface/compiled/settings/index.txt +2 -2
  24. khoj/interface/compiled/share/chat/index.html +1 -1
  25. khoj/interface/compiled/share/chat/index.txt +2 -2
  26. khoj/processor/conversation/utils.py +7 -1
  27. khoj/processor/tools/online_search.py +49 -15
  28. khoj/routers/api.py +3 -1
  29. khoj/routers/api_chat.py +16 -9
  30. khoj/routers/helpers.py +45 -17
  31. khoj/routers/research.py +64 -29
  32. {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/METADATA +1 -3
  33. {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/RECORD +38 -38
  34. {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/WHEEL +1 -1
  35. khoj/interface/compiled/_next/static/css/798b0de12852bd20.css +0 -1
  36. khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +0 -1
  37. /khoj/interface/compiled/_next/static/{FPLh9rnKQbUWwU3fdzk6T → hIwmaAtdW0-B6vZcnHMX0}/_buildManifest.js +0 -0
  38. /khoj/interface/compiled/_next/static/{FPLh9rnKQbUWwU3fdzk6T → hIwmaAtdW0-B6vZcnHMX0}/_ssgManifest.js +0 -0
  39. {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/entry_points.txt +0 -0
  40. {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py CHANGED
@@ -4,7 +4,7 @@ import logging
 import os
 import urllib.parse
 from collections import defaultdict
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 import aiohttp
 from bs4 import BeautifulSoup
@@ -66,6 +66,7 @@ async def search_online(
     custom_filters: List[str] = [],
     max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
     query_images: List[str] = None,
+    previous_subqueries: Set = set(),
     agent: Agent = None,
     tracer: dict = {},
 ):
@@ -76,36 +77,45 @@ async def search_online(
         return
 
     # Breakdown the query into subqueries to get the correct answer
-    subqueries = await generate_online_subqueries(
+    new_subqueries = await generate_online_subqueries(
         query, conversation_history, location, user, query_images=query_images, agent=agent, tracer=tracer
     )
-    response_dict = {}
+    subqueries = list(new_subqueries - previous_subqueries)
+    response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {}
 
-    if subqueries:
-        logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
-        if send_status_func:
-            subqueries_str = "\n- " + "\n- ".join(list(subqueries))
-            async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
-                yield {ChatEvent.STATUS: event}
+    if is_none_or_empty(subqueries):
+        logger.info("No new subqueries to search online")
+        yield response_dict
+        return
+
+    logger.info(f"🌐 Searching the Internet for {subqueries}")
+    if send_status_func:
+        subqueries_str = "\n- " + "\n- ".join(subqueries)
+        async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
+            yield {ChatEvent.STATUS: event}
 
-    with timer(f"Internet searches for {list(subqueries)} took", logger):
+    with timer(f"Internet searches for {subqueries} took", logger):
         search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
         search_tasks = [search_func(subquery, location) for subquery in subqueries]
         search_results = await asyncio.gather(*search_tasks)
         response_dict = {subquery: search_result for subquery, search_result in search_results}
 
     # Gather distinct web pages from organic results for subqueries without an instant answer.
-    # Content of web pages is directly available when Jina is used for search.
     webpages: Dict[str, Dict] = {}
     for subquery in response_dict:
         if "answerBox" in response_dict[subquery]:
             continue
-        for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
+        for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
             link = organic.get("link")
-            if link in webpages:
+            if link in webpages and idx < max_webpages_to_read:
                 webpages[link]["queries"].add(subquery)
-            else:
+            # Content of web pages is directly available when Jina is used for search.
+            elif idx < max_webpages_to_read:
                 webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
+            # Only keep webpage content for up to max_webpages_to_read organic results.
+            if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
+                organic["content"] = None
+                response_dict[subquery]["organic"][idx] = organic
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
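
The `previous_subqueries` parameter threaded through `search_online` above lets research mode skip searches it has already run: only the set difference between newly generated and previously run subqueries is searched. A minimal sketch of that dedup step, with hypothetical query strings:

```python
# Hypothetical subqueries; generate_online_subqueries now returns a set (see helpers.py below).
new_subqueries = {"khoj 1.28 release notes", "khoj research mode"}
previous_subqueries = {"khoj 1.28 release notes"}  # already searched in an earlier iteration

# Only not-yet-searched subqueries survive; if none do, search_online yields an empty dict and returns.
subqueries = list(new_subqueries - previous_subqueries)
assert subqueries == ["khoj research mode"]
```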
@@ -115,7 +125,9 @@ async def search_online(
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent, tracer=tracer)
+            read_webpage_and_extract_content(
+                data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer
+            )
             for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
@@ -355,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
         for item in response_json["data"]
     ]
     return query, {"organic": parsed_response}
+
+
+def deduplicate_organic_results(online_results: dict) -> dict:
+    """Deduplicate organic search results based on links across all queries."""
+    # Keep track of seen links to filter out duplicates across queries
+    seen_links = set()
+    deduplicated_results = {}
+
+    # Process each query's results
+    for query, results in online_results.items():
+        # Filter organic results keeping only first occurrence of each link
+        filtered_organic = []
+        for result in results.get("organic", []):
+            link = result.get("link")
+            if link and link not in seen_links:
+                seen_links.add(link)
+                filtered_organic.append(result)
+
+        # Update results with deduplicated organic entries
+        deduplicated_results[query] = {**results, "organic": filtered_organic}
+
+    return deduplicated_results
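
`deduplicate_organic_results` is added in full above, so its behavior can be illustrated directly. With a hypothetical input where two subqueries surface the same link, the duplicate is kept only under the first query that produced it:

```python
online_results = {
    "query a": {"organic": [{"link": "https://example.com/1", "title": "One"}]},
    "query b": {
        "organic": [
            {"link": "https://example.com/1", "title": "One"},  # duplicate across queries
            {"link": "https://example.com/2", "title": "Two"},
        ]
    },
}

deduped = deduplicate_organic_results(online_results)
assert [r["link"] for r in deduped["query a"]["organic"]] == ["https://example.com/1"]
assert [r["link"] for r in deduped["query b"]["organic"]] == ["https://example.com/2"]
```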
khoj/routers/api.py CHANGED
@@ -6,7 +6,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, List, Optional, Set, Union
 
 import cron_descriptor
 import pytz
@@ -349,6 +349,7 @@ async def extract_references_and_questions(
     location_data: LocationData = None,
     send_status_func: Optional[Callable] = None,
     query_images: Optional[List[str]] = None,
+    previous_inferred_queries: Set = set(),
     agent: Agent = None,
     tracer: dict = {},
 ):
@@ -477,6 +478,7 @@ async def extract_references_and_questions(
     )
 
     # Collate search results as context for GPT
+    inferred_queries = list(set(inferred_queries) - previous_inferred_queries)
     with timer("Searching knowledge base took", logger):
         search_results = []
         logger.info(f"🔍 Searching knowledge base with queries: {inferred_queries}")
khoj/routers/api_chat.py CHANGED
@@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import help_message, no_entries_found
 from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
 from khoj.processor.image.generate import text_to_image
 from khoj.processor.speech.text_to_speech import generate_text_to_speech
-from khoj.processor.tools.online_search import read_webpages, search_online
+from khoj.processor.tools.online_search import (
+    deduplicate_organic_results,
+    read_webpages,
+    search_online,
+)
 from khoj.processor.tools.run_code import run_code
 from khoj.routers.api import extract_references_and_questions
 from khoj.routers.email import send_query_feedback
@@ -738,8 +742,13 @@ async def chat(
         conversation_commands.append(mode)
 
         for cmd in conversation_commands:
-            await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd)
-            q = q.replace(f"/{cmd.value}", "").strip()
+            try:
+                await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd)
+                q = q.replace(f"/{cmd.value}", "").strip()
+            except HTTPException as e:
+                async for result in send_llm_response(str(e.detail)):
+                    yield result
+                return
 
         defiltered_query = defilter_query(q)
 
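The new try/except above changes how command rate-limit errors surface: instead of the limiter's `HTTPException` aborting the response stream, its detail message is streamed back to the user as a normal chat reply. A self-contained sketch of the pattern, using a stand-in for Khoj's `send_llm_response` helper:

```python
import asyncio

from fastapi import HTTPException


async def send_llm_response(message: str):
    # Stand-in for Khoj's chat event streamer (illustrative only)
    yield {"type": "message", "data": message}


async def run_command(allowed: bool):
    try:
        if not allowed:  # inside Khoj, the rate limiter raises this itself
            raise HTTPException(status_code=429, detail="You've exceeded your usage limit for today.")
    except HTTPException as e:
        # Surface the error in-chat and end the stream gracefully
        async for event in send_llm_response(str(e.detail)):
            yield event
        return
    yield {"type": "message", "data": "command executed"}


async def main():
    async for event in run_command(allowed=False):
        print(event)


asyncio.run(main())
```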
@@ -773,11 +782,8 @@ async def chat(
                     yield research_result
 
             # researched_results = await extract_relevant_info(q, researched_results, agent)
-            logger.info(f"Researched Results: {researched_results}")
-
-        for cmd in conversation_commands:
-            await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd)
-            q = q.replace(f"/{cmd.value}", "").strip()
+            if state.verbose > 1:
+                logger.debug(f"Researched Results: {researched_results}")
 
         used_slash_summarize = conversation_commands == [ConversationCommand.Summarize]
         file_filters = conversation.file_filters if conversation else []
@@ -1024,12 +1030,13 @@ async def chat(
             )
 
             ## Send Gathered References
+            unique_online_results = deduplicate_organic_results(online_results)
             async for result in send_event(
                 ChatEvent.REFERENCES,
                 {
                     "inferredQueries": inferred_queries,
                     "context": compiled_references,
-                    "onlineContext": online_results,
+                    "onlineContext": unique_online_results,
                     "codeContext": code_results,
                 },
             ):
khoj/routers/helpers.py CHANGED
@@ -20,6 +20,7 @@ from typing import (
     Iterator,
     List,
     Optional,
+    Set,
     Tuple,
     Union,
 )
@@ -494,7 +495,7 @@ async def generate_online_subqueries(
     query_images: List[str] = None,
     agent: Agent = None,
     tracer: dict = {},
-) -> List[str]:
+) -> Set[str]:
     """
     Generate subqueries from the given query
     """
@@ -529,14 +530,14 @@ async def generate_online_subqueries(
     try:
         response = clean_json(response)
         response = json.loads(response)
-        response = [q.strip() for q in response["queries"] if q.strip()]
-        if not isinstance(response, list) or not response or len(response) == 0:
+        response = {q.strip() for q in response["queries"] if q.strip()}
+        if not isinstance(response, set) or not response or len(response) == 0:
             logger.error(f"Invalid response for constructing subqueries: {response}. Returning original query: {q}")
-            return [q]
+            return {q}
         return response
     except Exception as e:
         logger.error(f"Invalid response for constructing subqueries: {response}. Returning original query: {q}")
-        return [q]
+        return {q}
 
 
 async def schedule_query(
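
With `generate_online_subqueries` now returning a `Set[str]`, duplicate subqueries in the model's reply collapse automatically, and the set difference in `search_online` needs no conversion. A rough sketch of the parse-and-fallback behavior, assuming the model reply is a JSON object with a `queries` list:

```python
import json


def parse_subqueries(raw_response: str, original_query: str) -> set:
    """Sketch: return a set of subqueries, falling back to the original query on any failure."""
    try:
        parsed = json.loads(raw_response)
        subqueries = {q.strip() for q in parsed["queries"] if q.strip()}
        return subqueries or {original_query}
    except Exception:
        return {original_query}


assert parse_subqueries('{"queries": ["a", "a", " b "]}', "q") == {"a", "b"}  # duplicates collapse
assert parse_subqueries("not json", "q") == {"q"}  # fallback
```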
@@ -1128,9 +1129,6 @@ def generate_chat_response(
 
     metadata = {}
     agent = AgentAdapters.get_conversation_agent_by_id(conversation.agent.id) if conversation.agent else None
-    query_to_run = q
-    if meta_research:
-        query_to_run = f"AI Research: {meta_research} {q}"
     try:
         partial_completion = partial(
             save_to_conversation_log,
@@ -1148,6 +1146,13 @@ def generate_chat_response(
             train_of_thought=train_of_thought,
         )
 
+        query_to_run = q
+        if meta_research:
+            query_to_run = f"<query>{q}</query>\n<collected_research>\n{meta_research}\n</collected_research>"
+            compiled_references = []
+            online_results = {}
+            code_results = {}
+
         conversation_config = ConversationAdapters.get_valid_conversation_config(user, conversation)
         vision_available = conversation_config.vision_enabled
         if not vision_available and query_images:
@@ -1306,25 +1311,28 @@ class ApiUserRateLimiter:
         # Check if the user has exceeded the rate limit
         if subscribed and count_requests >= self.subscribed_requests:
             logger.info(
-                f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests."
+                f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for subscribed user: {user}."
+            )
+            raise HTTPException(
+                status_code=429,
+                detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?",
             )
-            raise HTTPException(status_code=429, detail="Slow down! Too Many Requests")
         if not subscribed and count_requests >= self.requests:
             if self.requests >= self.subscribed_requests:
                 logger.info(
-                    f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests."
+                    f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for user: {user}."
                 )
                 raise HTTPException(
                     status_code=429,
-                    detail="Slow down! Too Many Requests",
+                    detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?",
                 )
 
             logger.info(
-                f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests."
+                f"Rate limit: {count_requests}/{self.requests} requests not allowed in {self.window} seconds for user: {user}."
            )
             raise HTTPException(
                 status_code=429,
-                detail="I'm glad you're enjoying interacting with me! But you've exceeded your usage limit for today. Come back tomorrow or subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings).",
+                detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can continue our conversation tomorrow?",
             )
 
         # Add the current request to the cache
@@ -1350,6 +1358,7 @@ class ApiImageRateLimiter:
 
         # Check number of images
         if len(body.images) > self.max_images:
+            logger.info(f"Rate limit: {len(body.images)}/{self.max_images} images not allowed per message.")
             raise HTTPException(
                 status_code=429,
                 detail=f"Those are way too many images for me! I can handle up to {self.max_images} images per message.",
@@ -1370,6 +1379,7 @@ class ApiImageRateLimiter:
             total_size_mb += len(image_bytes) / (1024 * 1024)  # Convert bytes to MB
 
         if total_size_mb > self.max_combined_size_mb:
+            logger.info(f"Data limit: {total_size_mb}MB/{self.max_combined_size_mb}MB size not allowed per message.")
             raise HTTPException(
                 status_code=429,
                 detail=f"Those images are way too large for me! I can handle up to {self.max_combined_size_mb}MB of images per message.",
@@ -1405,13 +1415,19 @@ class ConversationCommandRateLimiter:
 
         if subscribed and count_requests >= self.subscribed_rate_limit:
             logger.info(
-                f"Rate limit: {count_requests} requests in 24 hours for user: {user}. Limit is {self.subscribed_rate_limit} requests."
+                f"Rate limit: {count_requests}/{self.subscribed_rate_limit} requests not allowed in 24 hours for subscribed user: {user}."
+            )
+            raise HTTPException(
+                status_code=429,
+                detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. Maybe we can talk about something else for today?",
             )
-            raise HTTPException(status_code=429, detail="Slow down! Too Many Requests")
         if not subscribed and count_requests >= self.trial_rate_limit:
+            logger.info(
+                f"Rate limit: {count_requests}/{self.trial_rate_limit} requests not allowed in 24 hours for user: {user}."
+            )
             raise HTTPException(
                 status_code=429,
-                detail=f"We're glad you're enjoying Khoj! You've exceeded your `/{conversation_command.value}` command usage limit for today. Subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings).",
+                detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can talk about something else for today?",
             )
         await UserRequests.objects.acreate(user=user, slug=command_slug)
         return
@@ -1457,16 +1473,28 @@ class ApiIndexedDataLimiter:
             logger.info(f"Deleted {num_deleted_entries} entries for user: {user}.")
 
         if subscribed and incoming_data_size_mb >= self.subscribed_num_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.subscribed_num_entries_size}MB allowed for subscribed user: {user}."
+            )
             raise HTTPException(status_code=429, detail="Too much data indexed.")
         if not subscribed and incoming_data_size_mb >= self.num_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.num_entries_size}MB allowed for user: {user}."
+            )
             raise HTTPException(
                 status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit."
             )
 
         user_size_data = EntryAdapters.get_size_of_indexed_data_in_mb(user)
         if subscribed and user_size_data + incoming_data_size_mb >= self.subscribed_total_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.subscribed_total_entries_size}MB allowed for subscribed user: {user}."
+            )
             raise HTTPException(status_code=429, detail="Too much data indexed.")
         if not subscribed and user_size_data + incoming_data_size_mb >= self.total_entries_size_limit:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.subscribed_total_entries_size}MB allowed for non subscribed user: {user}."
            )
             raise HTTPException(
                 status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit."
             )
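
The new log lines in `ApiIndexedDataLimiter` make its two checks explicit: the incoming batch alone against the per-upload cap, then incoming plus already-indexed data against the total cap. A hypothetical walk-through of the arithmetic (limits and sizes are made up):

```python
# Hypothetical limits and sizes, in MB
num_entries_size, total_entries_size_limit = 10, 50
incoming_data_size_mb, user_size_data = 8, 45

# Check 1: the incoming batch alone stays under the per-upload cap
assert incoming_data_size_mb < num_entries_size

# Check 2: incoming + existing data would exceed the total cap, so a 429 is raised
assert user_size_data + incoming_data_size_mb >= total_entries_size_limit
```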
khoj/routers/research.py CHANGED
@@ -43,38 +43,35 @@ async def apick_next_tool(
     location: LocationData = None,
     user_name: str = None,
     agent: Agent = None,
-    previous_iterations_history: str = None,
+    previous_iterations: List[InformationCollectionIteration] = [],
     max_iterations: int = 5,
     send_status_func: Optional[Callable] = None,
     tracer: dict = {},
 ):
-    """
-    Given a query, determine which of the available tools the agent should use in order to answer appropriately. One at a time, and it's able to use subsequent iterations to refine the answer.
-    """
+    """Given a query, determine which of the available tools the agent should use in order to answer appropriately."""
 
+    # Construct tool options for the agent to choose from
     tool_options = dict()
     tool_options_str = ""
-
     agent_tools = agent.input_tools if agent else []
-
     for tool, description in function_calling_description_for_llm.items():
         tool_options[tool.value] = description
         if len(agent_tools) == 0 or tool.value in agent_tools:
             tool_options_str += f'- "{tool.value}": "{description}"\n'
 
+    # Construct chat history with user and iteration history with researcher agent for context
     chat_history = construct_chat_history(conversation_history, agent_name=agent.name if agent else "Khoj")
+    previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration)
 
     if query_images:
         query = f"[placeholder for user attached images]\n{query}"
 
+    today = datetime.today()
+    location_data = f"{location}" if location else "Unknown"
     personality_context = (
         prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
     )
 
-    # Extract Past User Message and Inferred Questions from Conversation Log
-    today = datetime.today()
-    location_data = f"{location}" if location else "Unknown"
-
     function_planning_prompt = prompts.plan_function_execution.format(
         tools=tool_options_str,
         chat_history=chat_history,
@@ -87,15 +84,24 @@ async def apick_next_tool(
         max_iterations=max_iterations,
     )
 
-    with timer("Chat actor: Infer information sources to refer", logger):
-        response = await send_message_to_model_wrapper(
-            query=query,
-            context=function_planning_prompt,
-            response_type="json_object",
-            user=user,
-            query_images=query_images,
-            tracer=tracer,
+    try:
+        with timer("Chat actor: Infer information sources to refer", logger):
+            response = await send_message_to_model_wrapper(
+                query=query,
+                context=function_planning_prompt,
+                response_type="json_object",
+                user=user,
+                query_images=query_images,
+                tracer=tracer,
+            )
+    except Exception as e:
+        logger.error(f"Failed to infer information sources to refer: {e}", exc_info=True)
+        yield InformationCollectionIteration(
+            tool=None,
+            query=None,
+            warning="Failed to infer information sources to refer. Skipping iteration. Try again.",
         )
+        return
 
     try:
         response = clean_json(response)
@@ -103,8 +109,15 @@ async def apick_next_tool(
         selected_tool = response.get("tool", None)
         generated_query = response.get("query", None)
         scratchpad = response.get("scratchpad", None)
+        warning = None
         logger.info(f"Response for determining relevant tools: {response}")
-        if send_status_func:
+
+        # Detect selection of previously used query, tool combination.
+        previous_tool_query_combinations = {(i.tool, i.query) for i in previous_iterations}
+        if (selected_tool, generated_query) in previous_tool_query_combinations:
+            warning = f"Repeated tool, query combination detected. Skipping iteration. Try something different."
+        # Only send client status updates if we'll execute this iteration
+        elif send_status_func:
             determined_tool_message = "**Determined Tool**: "
             determined_tool_message += f"{selected_tool}({generated_query})." if selected_tool else "respond."
             determined_tool_message += f"\nReason: {scratchpad}" if scratchpad else ""
@@ -114,13 +127,14 @@ async def apick_next_tool(
         yield InformationCollectionIteration(
             tool=selected_tool,
             query=generated_query,
+            warning=warning,
         )
-
     except Exception as e:
         logger.error(f"Invalid response for determining relevant tools: {response}. {e}", exc_info=True)
         yield InformationCollectionIteration(
             tool=None,
             query=None,
+            warning=f"Invalid response for determining relevant tools: {response}. Skipping iteration. Fix error: {e}",
         )
 
 
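The repeat detection above keys on exact `(tool, query)` pairs from earlier iterations. An illustrative sketch, with a stand-in dataclass in place of `InformationCollectionIteration`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Iteration:  # stand-in for InformationCollectionIteration
    tool: Optional[str]
    query: Optional[str]


previous_iterations = [Iteration("online", "khoj changelog"), Iteration("notes", "release plan")]
selected_tool, generated_query = "online", "khoj changelog"

previous_tool_query_combinations = {(i.tool, i.query) for i in previous_iterations}
warning = None
if (selected_tool, generated_query) in previous_tool_query_combinations:
    warning = "Repeated tool, query combination detected. Skipping iteration. Try something different."

assert warning is not None  # this iteration would be skipped rather than re-run
```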
@@ -147,7 +161,6 @@ async def execute_information_collection(
         document_results: List[Dict[str, str]] = []
         summarize_files: str = ""
         this_iteration = InformationCollectionIteration(tool=None, query=query)
-        previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration)
 
         async for result in apick_next_tool(
             query,
@@ -157,7 +170,7 @@ async def execute_information_collection(
             location,
             user_name,
             agent,
-            previous_iterations_history,
+            previous_iterations,
             MAX_ITERATIONS,
             send_status_func,
             tracer=tracer,
@@ -167,9 +180,16 @@ async def execute_information_collection(
             elif isinstance(result, InformationCollectionIteration):
                 this_iteration = result
 
-        if this_iteration.tool == ConversationCommand.Notes:
+        # Skip running iteration if warning present in iteration
+        if this_iteration.warning:
+            logger.warning(f"Research mode: {this_iteration.warning}.")
+
+        elif this_iteration.tool == ConversationCommand.Notes:
             this_iteration.context = []
             document_results = []
+            previous_inferred_queries = {
+                c["query"] for iteration in previous_iterations if iteration.context for c in iteration.context
+            }
             async for result in extract_references_and_questions(
                 request,
                 construct_tool_chat_history(previous_iterations, ConversationCommand.Notes),
@@ -181,6 +201,7 @@ async def execute_information_collection(
                 location,
                 send_status_func,
                 query_images,
+                previous_inferred_queries=previous_inferred_queries,
                 agent=agent,
                 tracer=tracer,
             ):
@@ -204,6 +225,12 @@ async def execute_information_collection(
                 logger.error(f"Error extracting document references: {e}", exc_info=True)
 
         elif this_iteration.tool == ConversationCommand.Online:
+            previous_subqueries = {
+                subquery
+                for iteration in previous_iterations
+                if iteration.onlineContext
+                for subquery in iteration.onlineContext.keys()
+            }
             async for result in search_online(
                 this_iteration.query,
                 construct_tool_chat_history(previous_iterations, ConversationCommand.Online),
@@ -213,11 +240,16 @@ async def execute_information_collection(
                 [],
                 max_webpages_to_read=0,
                 query_images=query_images,
+                previous_subqueries=previous_subqueries,
                 agent=agent,
                 tracer=tracer,
             ):
                 if isinstance(result, dict) and ChatEvent.STATUS in result:
                     yield result[ChatEvent.STATUS]
+                elif is_none_or_empty(result):
+                    this_iteration.warning = (
+                        "Detected previously run online search queries. Skipping iteration. Try something different."
+                    )
                 else:
                     online_results: Dict[str, Dict] = result  # type: ignore
                     this_iteration.onlineContext = online_results
@@ -302,16 +334,19 @@ async def execute_information_collection(
 
         current_iteration += 1
 
-        if document_results or online_results or code_results or summarize_files:
-            results_data = f"**Results**:\n"
+        if document_results or online_results or code_results or summarize_files or this_iteration.warning:
+            results_data = f"\n<iteration>{current_iteration}\n<tool>{this_iteration.tool}</tool>\n<query>{this_iteration.query}</query>\n<results>"
             if document_results:
-                results_data += f"**Document References**:\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n"
+                results_data += f"\n<document_references>\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</document_references>"
             if online_results:
-                results_data += f"**Online Results**:\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n"
+                results_data += f"\n<online_results>\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</online_results>"
             if code_results:
-                results_data += f"**Code Results**:\n{yaml.dump(code_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n"
+                results_data += f"\n<code_results>\n{yaml.dump(code_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</code_results>"
             if summarize_files:
-                results_data += f"**Summarized Files**:\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n"
+                results_data += f"\n<summarized_files>\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</summarized_files>"
+            if this_iteration.warning:
+                results_data += f"\n<warning>\n{this_iteration.warning}\n</warning>"
+            results_data += "\n</results>\n</iteration>"
 
             # intermediate_result = await extract_relevant_info(this_iteration.query, results_data, agent)
             this_iteration.summarizedResult = results_data
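
The iteration summary switches from markdown headings to XML-style tags, presumably so the planner can delimit each iteration's tool, query, and results unambiguously. Roughly, the string appended per iteration now looks like this (contents illustrative):

```python
results_data = (
    "\n<iteration>1"
    "\n<tool>online</tool>"
    "\n<query>khoj changelog</query>"
    "\n<results>"
    "\n<online_results>\n# ...yaml.dump of the gathered results...\n</online_results>"
    "\n</results>\n</iteration>"
)
```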
{khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/METADATA CHANGED
@@ -1,13 +1,11 @@
 Metadata-Version: 2.3
 Name: khoj
-Version: 1.28.4.dev13
+Version: 1.28.4.dev22
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
 Project-URL: Code, https://github.com/khoj-ai/khoj
 Author: Debanjum Singh Solanky, Saba Imran
-License-Expression: AGPL-3.0-or-later
-License-File: LICENSE
 Keywords: AI,NLP,images,markdown,org-mode,pdf,productivity,search,semantic-search
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Information Technology