khoj 1.28.4.dev13__py3-none-any.whl → 1.28.4.dev22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-36da67f03a173e52.js → page-f29d6b3efa6f96c6.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-774ae3e033f938cd.js → page-d3edae545a1b5393.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-a369e2bda9897794.js → page-60bfb0b3b81d3d9d.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-322c37514a3a613a.js → page-cbc0e7b837bd35fa.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-9b64f61caa5bd7f9.js → page-a5c277eff207959e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-10b288c103f19468.js → page-210bd54db4841333.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-959d5f097cf38c93.js → page-79cf030b31c7e793.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-d3a4ebfc304496fb.js → webpack-5018ce30001e8c48.js} +1 -1
- khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
- khoj/interface/compiled/_next/static/css/63e106a52a0ec4ca.css +1 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/utils.py +7 -1
- khoj/processor/tools/online_search.py +49 -15
- khoj/routers/api.py +3 -1
- khoj/routers/api_chat.py +16 -9
- khoj/routers/helpers.py +45 -17
- khoj/routers/research.py +64 -29
- {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/METADATA +1 -3
- {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/RECORD +38 -38
- {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/WHEEL +1 -1
- khoj/interface/compiled/_next/static/css/798b0de12852bd20.css +0 -1
- khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +0 -1
- /khoj/interface/compiled/_next/static/{FPLh9rnKQbUWwU3fdzk6T → hIwmaAtdW0-B6vZcnHMX0}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{FPLh9rnKQbUWwU3fdzk6T → hIwmaAtdW0-B6vZcnHMX0}/_ssgManifest.js +0 -0
- {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/entry_points.txt +0 -0
- {khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 import os
 import urllib.parse
 from collections import defaultdict
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 import aiohttp
 from bs4 import BeautifulSoup
@@ -66,6 +66,7 @@ async def search_online(
     custom_filters: List[str] = [],
     max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
     query_images: List[str] = None,
+    previous_subqueries: Set = set(),
     agent: Agent = None,
     tracer: dict = {},
 ):
@@ -76,36 +77,45 @@ async def search_online(
         return
 
     # Breakdown the query into subqueries to get the correct answer
-
+    new_subqueries = await generate_online_subqueries(
         query, conversation_history, location, user, query_images=query_images, agent=agent, tracer=tracer
     )
-
+    subqueries = list(new_subqueries - previous_subqueries)
+    response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {}
 
-    if subqueries:
-        logger.info(
-
-
-
-
+    if is_none_or_empty(subqueries):
+        logger.info("No new subqueries to search online")
+        yield response_dict
+        return
+
+    logger.info(f"🌐 Searching the Internet for {subqueries}")
+    if send_status_func:
+        subqueries_str = "\n- " + "\n- ".join(subqueries)
+        async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
+            yield {ChatEvent.STATUS: event}
 
-    with timer(f"Internet searches for {
+    with timer(f"Internet searches for {subqueries} took", logger):
         search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
         search_tasks = [search_func(subquery, location) for subquery in subqueries]
         search_results = await asyncio.gather(*search_tasks)
         response_dict = {subquery: search_result for subquery, search_result in search_results}
 
     # Gather distinct web pages from organic results for subqueries without an instant answer.
-    # Content of web pages is directly available when Jina is used for search.
     webpages: Dict[str, Dict] = {}
     for subquery in response_dict:
         if "answerBox" in response_dict[subquery]:
             continue
-        for organic in response_dict[subquery].get("organic", [])
+        for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
             link = organic.get("link")
-            if link in webpages:
+            if link in webpages and idx < max_webpages_to_read:
                 webpages[link]["queries"].add(subquery)
-
+            # Content of web pages is directly available when Jina is used for search.
+            elif idx < max_webpages_to_read:
                 webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
+            # Only keep webpage content for up to max_webpages_to_read organic results.
+            if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
+                organic["content"] = None
+                response_dict[subquery]["organic"][idx] = organic
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
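The first functional change here is that search_online now takes a previous_subqueries set and only runs subqueries it has not already searched in an earlier research iteration. A minimal sketch of that set arithmetic, with hypothetical query strings:

```python
# Hypothetical subqueries, to illustrate the dedup in the hunk above
previous_subqueries = {"khoj release notes", "khoj pricing"}
new_subqueries = {"khoj pricing", "khoj self-hosting guide"}

# Only keep subqueries not already searched in a previous iteration
subqueries = list(new_subqueries - previous_subqueries)
print(subqueries)  # ['khoj self-hosting guide']
```

The second change caps how many organic results keep their scraped content: past max_webpages_to_read, the content field is nulled out so only result metadata is retained.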
@@ -115,7 +125,9 @@ async def search_online(
         async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
             yield {ChatEvent.STATUS: event}
     tasks = [
-        read_webpage_and_extract_content(
+        read_webpage_and_extract_content(
+            data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer
+        )
         for link, data in webpages.items()
     ]
     results = await asyncio.gather(*tasks)
@@ -355,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
         for item in response_json["data"]
     ]
     return query, {"organic": parsed_response}
+
+
+def deduplicate_organic_results(online_results: dict) -> dict:
+    """Deduplicate organic search results based on links across all queries."""
+    # Keep track of seen links to filter out duplicates across queries
+    seen_links = set()
+    deduplicated_results = {}
+
+    # Process each query's results
+    for query, results in online_results.items():
+        # Filter organic results keeping only first occurrence of each link
+        filtered_organic = []
+        for result in results.get("organic", []):
+            link = result.get("link")
+            if link and link not in seen_links:
+                seen_links.add(link)
+                filtered_organic.append(result)
+
+        # Update results with deduplicated organic entries
+        deduplicated_results[query] = {**results, "organic": filtered_organic}
+
+    return deduplicated_results
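The new deduplicate_organic_results helper is pure Python over plain dicts, so its behavior is easy to check in isolation. A usage sketch with made-up search results; the function body is copied (lightly condensed) from the hunk above:

```python
def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results based on links across all queries."""
    seen_links = set()
    deduplicated_results = {}
    for query, results in online_results.items():
        # Keep only the first occurrence of each link across all queries
        filtered_organic = []
        for result in results.get("organic", []):
            link = result.get("link")
            if link and link not in seen_links:
                seen_links.add(link)
                filtered_organic.append(result)
        deduplicated_results[query] = {**results, "organic": filtered_organic}
    return deduplicated_results


# Hypothetical results where both subqueries surfaced the same page
online_results = {
    "khoj docs": {"organic": [{"link": "https://docs.khoj.dev"}, {"link": "https://khoj.dev"}]},
    "khoj home": {"organic": [{"link": "https://khoj.dev"}]},
}
deduped = deduplicate_organic_results(online_results)
print(deduped["khoj home"]["organic"])  # [] since https://khoj.dev was already seen under "khoj docs"
```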
khoj/routers/api.py
CHANGED
@@ -6,7 +6,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, List, Optional, Set, Union
 
 import cron_descriptor
 import pytz
@@ -349,6 +349,7 @@ async def extract_references_and_questions
     location_data: LocationData = None,
     send_status_func: Optional[Callable] = None,
     query_images: Optional[List[str]] = None,
+    previous_inferred_queries: Set = set(),
     agent: Agent = None,
     tracer: dict = {},
 ):
@@ -477,6 +478,7 @@ async def extract_references_and_questions
     )
 
     # Collate search results as context for GPT
+    inferred_queries = list(set(inferred_queries) - previous_inferred_queries)
    with timer("Searching knowledge base took", logger):
        search_results = []
        logger.info(f"🔍 Searching knowledge base with queries: {inferred_queries}")
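The functional change in this hunk mirrors the subquery dedup in online_search.py: queries inferred during earlier research iterations are dropped before searching the knowledge base again. A toy illustration with hypothetical queries:

```python
previous_inferred_queries = {"What is Khoj?", "How does Khoj index notes?"}
inferred_queries = ["How does Khoj index notes?", "Which file formats does Khoj support?"]

# Drop queries a previous iteration already ran against the knowledge base
inferred_queries = list(set(inferred_queries) - previous_inferred_queries)
print(inferred_queries)  # ['Which file formats does Khoj support?']
```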
khoj/routers/api_chat.py
CHANGED
@@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import help_message, no_entries_found
 from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
 from khoj.processor.image.generate import text_to_image
 from khoj.processor.speech.text_to_speech import generate_text_to_speech
-from khoj.processor.tools.online_search import
+from khoj.processor.tools.online_search import (
+    deduplicate_organic_results,
+    read_webpages,
+    search_online,
+)
 from khoj.processor.tools.run_code import run_code
 from khoj.routers.api import extract_references_and_questions
 from khoj.routers.email import send_query_feedback
@@ -738,8 +742,13 @@ async def chat
             conversation_commands.append(mode)
 
         for cmd in conversation_commands:
-
-
+            try:
+                await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd)
+                q = q.replace(f"/{cmd.value}", "").strip()
+            except HTTPException as e:
+                async for result in send_llm_response(str(e.detail)):
+                    yield result
+                return
 
         defiltered_query = defilter_query(q)
 
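The rate limit check for slash commands now runs inside a try/except, so a 429 from the limiter is streamed back to the client as a normal chat message instead of surfacing as an unhandled error. A self-contained sketch of that control flow; the stub limiter and response stream below are illustrative stand-ins, not khoj's actual helpers:

```python
import asyncio

from fastapi import HTTPException


async def check_rate_limit(cmd: str) -> None:
    # Stub limiter that always rejects, to exercise the error path
    raise HTTPException(status_code=429, detail="Slow down! Too Many Requests")


async def send_llm_response(message: str):
    # Stub response stream that yields a single chunk
    yield message


async def chat(commands: list[str]):
    for cmd in commands:
        try:
            await check_rate_limit(cmd)
        except HTTPException as e:
            # Surface the rate limit message as a chat response, then stop
            async for result in send_llm_response(str(e.detail)):
                yield result
            return


async def main():
    async for chunk in chat(["online"]):
        print(chunk)  # Slow down! Too Many Requests


asyncio.run(main())
```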
@@ -773,11 +782,8 @@ async def chat
                 yield research_result
 
             # researched_results = await extract_relevant_info(q, researched_results, agent)
-
-
-        for cmd in conversation_commands:
-            await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd)
-            q = q.replace(f"/{cmd.value}", "").strip()
+            if state.verbose > 1:
+                logger.debug(f"Researched Results: {researched_results}")
 
         used_slash_summarize = conversation_commands == [ConversationCommand.Summarize]
         file_filters = conversation.file_filters if conversation else []
@@ -1024,12 +1030,13 @@ async def chat
             )
 
             ## Send Gathered References
+            unique_online_results = deduplicate_organic_results(online_results)
             async for result in send_event(
                 ChatEvent.REFERENCES,
                 {
                     "inferredQueries": inferred_queries,
                     "context": compiled_references,
-                    "onlineContext":
+                    "onlineContext": unique_online_results,
                     "codeContext": code_results,
                 },
             ):
khoj/routers/helpers.py
CHANGED
@@ -20,6 +20,7 @@ from typing import (
     Iterator,
     List,
     Optional,
+    Set,
     Tuple,
     Union,
 )
@@ -494,7 +495,7 @@ async def generate_online_subqueries(
     query_images: List[str] = None,
     agent: Agent = None,
     tracer: dict = {},
-) ->
+) -> Set[str]:
     """
     Generate subqueries from the given query
     """
@@ -529,14 +530,14 @@ async def generate_online_subqueries(
     try:
         response = clean_json(response)
         response = json.loads(response)
-        response =
-        if not isinstance(response,
+        response = {q.strip() for q in response["queries"] if q.strip()}
+        if not isinstance(response, set) or not response or len(response) == 0:
             logger.error(f"Invalid response for constructing subqueries: {response}. Returning original query: {q}")
-            return
+            return {q}
         return response
     except Exception as e:
         logger.error(f"Invalid response for constructing subqueries: {response}. Returning original query: {q}")
-        return
+        return {q}
 
 
 async def schedule_query(
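generate_online_subqueries now returns a set of cleaned query strings, falling back to the original query wrapped in a set whenever parsing fails. A sketch of just the parsing step, assuming a well-formed LLM response in the expected shape:

```python
import json

# Hypothetical LLM response following the expected {"queries": [...]} schema
raw_response = '{"queries": ["khoj docs ", "", "khoj github"]}'

response = json.loads(raw_response)
# Strip whitespace and drop empty strings, as in the hunk above
subqueries = {q.strip() for q in response["queries"] if q.strip()}
print(subqueries)  # {'khoj docs', 'khoj github'} (set order may vary)
```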
@@ -1128,9 +1129,6 @@ def generate_chat_response(
 
     metadata = {}
     agent = AgentAdapters.get_conversation_agent_by_id(conversation.agent.id) if conversation.agent else None
-    query_to_run = q
-    if meta_research:
-        query_to_run = f"AI Research: {meta_research} {q}"
     try:
         partial_completion = partial(
             save_to_conversation_log,
@@ -1148,6 +1146,13 @@ def generate_chat_response(
             train_of_thought=train_of_thought,
         )
 
+        query_to_run = q
+        if meta_research:
+            query_to_run = f"<query>{q}</query>\n<collected_research>\n{meta_research}\n</collected_research>"
+            compiled_references = []
+            online_results = {}
+            code_results = {}
+
         conversation_config = ConversationAdapters.get_valid_conversation_config(user, conversation)
         vision_available = conversation_config.vision_enabled
         if not vision_available and query_images:
@@ -1306,25 +1311,28 @@ class ApiUserRateLimiter:
         # Check if the user has exceeded the rate limit
         if subscribed and count_requests >= self.subscribed_requests:
             logger.info(
-                f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}.
+                f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for subscribed user: {user}."
+            )
+            raise HTTPException(
+                status_code=429,
+                detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?",
             )
-            raise HTTPException(status_code=429, detail="Slow down! Too Many Requests")
         if not subscribed and count_requests >= self.requests:
             if self.requests >= self.subscribed_requests:
                 logger.info(
-                    f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}.
+                    f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for user: {user}."
                 )
                 raise HTTPException(
                     status_code=429,
-                    detail="
+                    detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?",
                 )
 
             logger.info(
-                f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}.
+                f"Rate limit: {count_requests}/{self.requests} requests not allowed in {self.window} seconds for user: {user}."
             )
             raise HTTPException(
                 status_code=429,
-                detail="I'm glad you're enjoying interacting with me!
+                detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can continue our conversation tomorrow?",
             )
 
         # Add the current request to the cache
@@ -1350,6 +1358,7 @@ class ApiImageRateLimiter:
 
         # Check number of images
         if len(body.images) > self.max_images:
+            logger.info(f"Rate limit: {len(body.images)}/{self.max_images} images not allowed per message.")
             raise HTTPException(
                 status_code=429,
                 detail=f"Those are way too many images for me! I can handle up to {self.max_images} images per message.",
@@ -1370,6 +1379,7 @@ class ApiImageRateLimiter:
             total_size_mb += len(image_bytes) / (1024 * 1024)  # Convert bytes to MB
 
         if total_size_mb > self.max_combined_size_mb:
+            logger.info(f"Data limit: {total_size_mb}MB/{self.max_combined_size_mb}MB size not allowed per message.")
             raise HTTPException(
                 status_code=429,
                 detail=f"Those images are way too large for me! I can handle up to {self.max_combined_size_mb}MB of images per message.",
@@ -1405,13 +1415,19 @@ class ConversationCommandRateLimiter:
 
         if subscribed and count_requests >= self.subscribed_rate_limit:
             logger.info(
-                f"Rate limit: {count_requests} requests in 24 hours for user: {user}.
+                f"Rate limit: {count_requests}/{self.subscribed_rate_limit} requests not allowed in 24 hours for subscribed user: {user}."
+            )
+            raise HTTPException(
+                status_code=429,
+                detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. Maybe we can talk about something else for today?",
             )
-            raise HTTPException(status_code=429, detail="Slow down! Too Many Requests")
         if not subscribed and count_requests >= self.trial_rate_limit:
+            logger.info(
+                f"Rate limit: {count_requests}/{self.trial_rate_limit} requests not allowed in 24 hours for user: {user}."
+            )
             raise HTTPException(
                 status_code=429,
-                detail=f"
+                detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can talk about something else for today?",
             )
         await UserRequests.objects.acreate(user=user, slug=command_slug)
         return
@@ -1457,16 +1473,28 @@ class ApiIndexedDataLimiter:
         logger.info(f"Deleted {num_deleted_entries} entries for user: {user}.")
 
         if subscribed and incoming_data_size_mb >= self.subscribed_num_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.subscribed_num_entries_size}MB allowed for subscribed user: {user}."
+            )
             raise HTTPException(status_code=429, detail="Too much data indexed.")
         if not subscribed and incoming_data_size_mb >= self.num_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.num_entries_size}MB allowed for user: {user}."
+            )
             raise HTTPException(
                 status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit."
             )
 
         user_size_data = EntryAdapters.get_size_of_indexed_data_in_mb(user)
         if subscribed and user_size_data + incoming_data_size_mb >= self.subscribed_total_entries_size:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.subscribed_total_entries_size}MB allowed for subscribed user: {user}."
+            )
             raise HTTPException(status_code=429, detail="Too much data indexed.")
         if not subscribed and user_size_data + incoming_data_size_mb >= self.total_entries_size_limit:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.subscribed_total_entries_size}MB allowed for non subscribed user: {user}."
+            )
             raise HTTPException(
                 status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit."
             )
khoj/routers/research.py
CHANGED
@@ -43,38 +43,35 @@ async def apick_next_tool(
     location: LocationData = None,
     user_name: str = None,
     agent: Agent = None,
-
+    previous_iterations: List[InformationCollectionIteration] = [],
     max_iterations: int = 5,
     send_status_func: Optional[Callable] = None,
     tracer: dict = {},
 ):
-    """
-    Given a query, determine which of the available tools the agent should use in order to answer appropriately. One at a time, and it's able to use subsequent iterations to refine the answer.
-    """
+    """Given a query, determine which of the available tools the agent should use in order to answer appropriately."""
 
+    # Construct tool options for the agent to choose from
     tool_options = dict()
     tool_options_str = ""
-
     agent_tools = agent.input_tools if agent else []
-
     for tool, description in function_calling_description_for_llm.items():
         tool_options[tool.value] = description
         if len(agent_tools) == 0 or tool.value in agent_tools:
             tool_options_str += f'- "{tool.value}": "{description}"\n'
 
+    # Construct chat history with user and iteration history with researcher agent for context
     chat_history = construct_chat_history(conversation_history, agent_name=agent.name if agent else "Khoj")
+    previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration)
 
     if query_images:
         query = f"[placeholder for user attached images]\n{query}"
 
+    today = datetime.today()
+    location_data = f"{location}" if location else "Unknown"
     personality_context = (
         prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
     )
 
-    # Extract Past User Message and Inferred Questions from Conversation Log
-    today = datetime.today()
-    location_data = f"{location}" if location else "Unknown"
-
     function_planning_prompt = prompts.plan_function_execution.format(
         tools=tool_options_str,
         chat_history=chat_history,
@@ -87,15 +84,24 @@ async def apick_next_tool(
         max_iterations=max_iterations,
     )
 
-
-
-
-
-
-
-
-
+    try:
+        with timer("Chat actor: Infer information sources to refer", logger):
+            response = await send_message_to_model_wrapper(
+                query=query,
+                context=function_planning_prompt,
+                response_type="json_object",
+                user=user,
+                query_images=query_images,
+                tracer=tracer,
+            )
+    except Exception as e:
+        logger.error(f"Failed to infer information sources to refer: {e}", exc_info=True)
+        yield InformationCollectionIteration(
+            tool=None,
+            query=None,
+            warning="Failed to infer information sources to refer. Skipping iteration. Try again.",
         )
+        return
 
     try:
         response = clean_json(response)
@@ -103,8 +109,15 @@
         selected_tool = response.get("tool", None)
         generated_query = response.get("query", None)
         scratchpad = response.get("scratchpad", None)
+        warning = None
         logger.info(f"Response for determining relevant tools: {response}")
-
+
+        # Detect selection of previously used query, tool combination.
+        previous_tool_query_combinations = {(i.tool, i.query) for i in previous_iterations}
+        if (selected_tool, generated_query) in previous_tool_query_combinations:
+            warning = f"Repeated tool, query combination detected. Skipping iteration. Try something different."
+        # Only send client status updates if we'll execute this iteration
+        elif send_status_func:
             determined_tool_message = "**Determined Tool**: "
             determined_tool_message += f"{selected_tool}({generated_query})." if selected_tool else "respond."
             determined_tool_message += f"\nReason: {scratchpad}" if scratchpad else ""
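The repeat guard compares the newly selected (tool, query) tuple against every prior iteration before executing it. A minimal sketch with a simplified stand-in for khoj's InformationCollectionIteration class:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Iteration:
    # Simplified stand-in for khoj's InformationCollectionIteration
    tool: Optional[str]
    query: Optional[str]


previous_iterations = [Iteration("online", "khoj release notes"), Iteration("notes", "project plan")]
previous_tool_query_combinations = {(i.tool, i.query) for i in previous_iterations}

selected_tool, generated_query = "online", "khoj release notes"
if (selected_tool, generated_query) in previous_tool_query_combinations:
    print("Repeated tool, query combination detected. Skipping iteration.")
```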
@@ -114,13 +127,14 @@ async def apick_next_tool(
         yield InformationCollectionIteration(
             tool=selected_tool,
             query=generated_query,
+            warning=warning,
         )
-
     except Exception as e:
         logger.error(f"Invalid response for determining relevant tools: {response}. {e}", exc_info=True)
         yield InformationCollectionIteration(
             tool=None,
             query=None,
+            warning=f"Invalid response for determining relevant tools: {response}. Skipping iteration. Fix error: {e}",
         )
 
 
@@ -147,7 +161,6 @@ async def execute_information_collection(
         document_results: List[Dict[str, str]] = []
         summarize_files: str = ""
         this_iteration = InformationCollectionIteration(tool=None, query=query)
-        previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration)
 
         async for result in apick_next_tool(
             query,
@@ -157,7 +170,7 @@
             location,
             user_name,
             agent,
-
+            previous_iterations,
             MAX_ITERATIONS,
             send_status_func,
             tracer=tracer,
@@ -167,9 +180,16 @@
             elif isinstance(result, InformationCollectionIteration):
                 this_iteration = result
 
-        if
+        # Skip running iteration if warning present in iteration
+        if this_iteration.warning:
+            logger.warning(f"Research mode: {this_iteration.warning}.")
+
+        elif this_iteration.tool == ConversationCommand.Notes:
             this_iteration.context = []
             document_results = []
+            previous_inferred_queries = {
+                c["query"] for iteration in previous_iterations if iteration.context for c in iteration.context
+            }
             async for result in extract_references_and_questions(
                 request,
                 construct_tool_chat_history(previous_iterations, ConversationCommand.Notes),
@@ -181,6 +201,7 @@
                 location,
                 send_status_func,
                 query_images,
+                previous_inferred_queries=previous_inferred_queries,
                 agent=agent,
                 tracer=tracer,
             ):
@@ -204,6 +225,12 @@
                 logger.error(f"Error extracting document references: {e}", exc_info=True)
 
         elif this_iteration.tool == ConversationCommand.Online:
+            previous_subqueries = {
+                subquery
+                for iteration in previous_iterations
+                if iteration.onlineContext
+                for subquery in iteration.onlineContext.keys()
+            }
             async for result in search_online(
                 this_iteration.query,
                 construct_tool_chat_history(previous_iterations, ConversationCommand.Online),
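previous_subqueries is built with a nested set comprehension that flattens the keys of every non-empty onlineContext from prior iterations. The same shape on plain dicts, with hypothetical iteration data:

```python
# Hypothetical prior iterations; onlineContext maps each subquery to its results
iterations = [
    {"onlineContext": {"khoj docs": {}, "khoj github": {}}},
    {"onlineContext": None},
    {"onlineContext": {"khoj docs": {}}},
]

previous_subqueries = {
    subquery
    for iteration in iterations
    if iteration["onlineContext"]
    for subquery in iteration["onlineContext"].keys()
}
print(previous_subqueries)  # {'khoj docs', 'khoj github'} (set order may vary)
```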
@@ -213,11 +240,16 @@
                 [],
                 max_webpages_to_read=0,
                 query_images=query_images,
+                previous_subqueries=previous_subqueries,
                 agent=agent,
                 tracer=tracer,
             ):
                 if isinstance(result, dict) and ChatEvent.STATUS in result:
                     yield result[ChatEvent.STATUS]
+                elif is_none_or_empty(result):
+                    this_iteration.warning = (
+                        "Detected previously run online search queries. Skipping iteration. Try something different."
+                    )
                 else:
                     online_results: Dict[str, Dict] = result  # type: ignore
                     this_iteration.onlineContext = online_results
@@ -302,16 +334,19 @@
 
         current_iteration += 1
 
-        if document_results or online_results or code_results or summarize_files:
-            results_data = f"
+        if document_results or online_results or code_results or summarize_files or this_iteration.warning:
+            results_data = f"\n<iteration>{current_iteration}\n<tool>{this_iteration.tool}</tool>\n<query>{this_iteration.query}</query>\n<results>"
             if document_results:
-                results_data += f"
+                results_data += f"\n<document_references>\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</document_references>"
             if online_results:
-                results_data += f"
+                results_data += f"\n<online_results>\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</online_results>"
             if code_results:
-                results_data += f"
+                results_data += f"\n<code_results>\n{yaml.dump(code_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</code_results>"
             if summarize_files:
-                results_data += f"
+                results_data += f"\n<summarized_files>\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</summarized_files>"
+            if this_iteration.warning:
+                results_data += f"\n<warning>\n{this_iteration.warning}\n</warning>"
+            results_data += "\n</results>\n</iteration>"
 
             # intermediate_result = await extract_relevant_info(this_iteration.query, results_data, agent)
             this_iteration.summarizedResult = results_data
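Each iteration's findings are now rendered into an XML-tagged block with YAML payloads before being stored as summarizedResult. A sketch of that rendering with PyYAML (imported as yaml in research.py) and made-up online results:

```python
import yaml

online_results = {"khoj docs": {"organic": [{"title": "Khoj Docs", "link": "https://docs.khoj.dev"}]}}

results_data = "\n<iteration>1\n<tool>online</tool>\n<query>khoj docs</query>\n<results>"
results_data += (
    "\n<online_results>\n"
    f"{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}"
    "\n</online_results>"
)
results_data += "\n</results>\n</iteration>"
print(results_data)
```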
{khoj-1.28.4.dev13.dist-info → khoj-1.28.4.dev22.dist-info}/METADATA
CHANGED
@@ -1,13 +1,11 @@
 Metadata-Version: 2.3
 Name: khoj
-Version: 1.28.4.dev13
+Version: 1.28.4.dev22
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
 Project-URL: Code, https://github.com/khoj-ai/khoj
 Author: Debanjum Singh Solanky, Saba Imran
-License-Expression: AGPL-3.0-or-later
-License-File: LICENSE
 Keywords: AI,NLP,images,markdown,org-mode,pdf,productivity,search,semantic-search
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Information Technology