khoj 1.24.2.dev3__py3-none-any.whl → 1.25.1.dev34__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- khoj/configure.py +13 -4
- khoj/database/adapters/__init__.py +289 -52
- khoj/database/admin.py +20 -1
- khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
- khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
- khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/models/__init__.py +136 -18
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/2697-a38d01981ad3bdf8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/4086-2c74808ba38a5a0f.js +1 -0
- khoj/interface/compiled/_next/static/chunks/477-ec86e93db10571c1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/51-e8f5bdb69b5ea421.js +1 -0
- khoj/interface/compiled/_next/static/chunks/7762-79f2205740622b5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9178-899fe9a6b754ecfe.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9417-29502e39c3e7d60c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9479-7eed36fc954ef804.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-fa282831808ee536.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/page-5480731341f34450.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-702057ccbcf27881.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/factchecker/page-e7b34316ec6f44de.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/page-10a5aad6e04f3cf8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/page-d56541c746fded7d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-e044a999468a7c5d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-fbbd66a4d4633438.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{webpack-d4781cada9b58e75.js → webpack-c0cd5a6afb1f0798.js} +1 -1
- khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +1 -0
- khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
- khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
- khoj/interface/compiled/_next/static/css/b9a6bf04305d98d7.css +25 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/factchecker/index.html +1 -1
- khoj/interface/compiled/factchecker/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +3 -3
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/assets/icons/agents.svg +1 -0
- khoj/interface/web/assets/icons/automation.svg +1 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/login.html +11 -22
- khoj/processor/content/notion/notion_to_entries.py +2 -1
- khoj/processor/conversation/anthropic/anthropic_chat.py +2 -0
- khoj/processor/conversation/google/gemini_chat.py +6 -19
- khoj/processor/conversation/google/utils.py +33 -15
- khoj/processor/conversation/offline/chat_model.py +3 -1
- khoj/processor/conversation/openai/gpt.py +2 -0
- khoj/processor/conversation/prompts.py +67 -5
- khoj/processor/conversation/utils.py +3 -7
- khoj/processor/embeddings.py +6 -3
- khoj/processor/image/generate.py +4 -3
- khoj/processor/tools/online_search.py +139 -44
- khoj/routers/api.py +35 -6
- khoj/routers/api_agents.py +235 -4
- khoj/routers/api_chat.py +102 -530
- khoj/routers/api_content.py +14 -0
- khoj/routers/api_model.py +1 -1
- khoj/routers/auth.py +9 -1
- khoj/routers/helpers.py +181 -68
- khoj/routers/subscription.py +18 -4
- khoj/search_type/text_search.py +11 -3
- khoj/utils/helpers.py +64 -8
- khoj/utils/initialization.py +0 -3
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/METADATA +19 -21
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/RECORD +87 -81
- khoj/interface/compiled/_next/static/chunks/1603-3e2e1528e3b6ea1d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/2697-a29cb9191a9e339c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/6648-ee109f4ea33a74e2.js +0 -1
- khoj/interface/compiled/_next/static/chunks/7071-b4711cecca6619a8.js +0 -1
- khoj/interface/compiled/_next/static/chunks/743-1a64254447cda71f.js +0 -1
- khoj/interface/compiled/_next/static/chunks/8423-62ac6c832be2461b.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9162-0be016519a18568b.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9178-7e815211edcb3657.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9417-5d14ac74aaab2c66.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9984-e410179c6fac7cf1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-d302911777a3e027.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/page-0a5de8c254c29a1c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-d96bf6a84bb05290.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/page-32e61af29e6b431d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/page-96cab08c985716f4.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/page-b3193d46c65571c5.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-0db9b708366606ec.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-f06ac16cfe5b5a16.js +0 -1
- khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
- khoj/interface/compiled/_next/static/css/24f141a6e37cd204.css +0 -25
- khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
- khoj/interface/compiled/_next/static/css/f768dddada62459d.css +0 -1
- /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_ssgManifest.js +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/WHEEL +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/entry_points.txt +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py
CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-
-
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -53,20 +60,20 @@ async def search_online(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     custom_filters: List[str] = [],
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     query += " ".join(custom_filters)
     if not is_internet_connected():
-        logger.
+        logger.warning("Cannot search online as not connected to internet")
         yield {}
         return
 
     # Breakdown the query into subqueries to get the correct answer
     subqueries = await generate_online_subqueries(
-        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url
+        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url, agent=agent
     )
     response_dict = {}
 
@@ -83,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-    # Gather distinct web
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-
-
-
-
-
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
-
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(
-            for link,
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
 
         # Collect extracted info from the retrieved web pages
-        for
+        for subqueries, url, webpage_extract in results:
             if webpage_extract is not None:
-                response_dict[
+                response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     yield response_dict
 
@@ -140,9 +150,9 @@ async def read_webpages(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     "Infer web pages to read from the query and extract relevant information from them"
     logger.info(f"Inferring web pages to read")
@@ -156,29 +166,66 @@ async def read_webpages(
     webpage_links_str = "\n- " + "\n- ".join(list(urls))
     async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
         yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url,
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)
 
     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query":
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-
-) -> Tuple[str, Union[None, str]
-
-
-
-
-
-
-
-
-
-
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
     return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if
-        headers["Authorization"] = f"Bearer {
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {", ".join(queries)}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
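The new read_webpage_and_extract_content() above falls back through the enabled web scrapers in order and stops at the first one that yields usable content. A minimal, self-contained sketch of that fallback pattern follows; read_with_fallback, fetch_with_direct, and fetch_with_jina are hypothetical stand-ins for illustration only, not khoj APIs.

```python
# Sketch: try each scraper in order, keep the first non-empty result.
# All names here are illustrative; they do not exist in khoj.
import asyncio
from typing import Awaitable, Callable, Optional

async def fetch_with_jina(url: str) -> Optional[str]:
    # Placeholder: a real scraper would call a reader API and return markdown.
    raise RuntimeError("reader API unavailable in this sketch")

async def fetch_with_direct(url: str) -> Optional[str]:
    # Placeholder: a real scraper would issue an HTTP GET and convert to markdown.
    return f"<content of {url} via direct fetch>"

Scraper = tuple[str, Callable[[str], Awaitable[Optional[str]]]]

async def read_with_fallback(url: str, scrapers: list[Scraper]) -> Optional[str]:
    content = None
    for name, scraper in scrapers:
        try:
            content = await scraper(url)
            if content:  # first scraper that yields content wins
                break
        except Exception as e:
            print(f"{name} failed for {url}: {e}")
            if name == scrapers[-1][0]:
                print(f"All scrapers failed for {url}")
    return content

if __name__ == "__main__":
    scrapers: list[Scraper] = [("jina", fetch_with_jina), ("direct", fetch_with_direct)]
    print(asyncio.run(read_with_fallback("https://example.com", scrapers)))
```

In the diff itself the scraper list comes from ConversationAdapters.aget_enabled_webscrapers(), internal URLs are restricted to the direct scraper, and each successful read is followed by an LLM extraction step before the loop breaks.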
khoj/routers/api.py
CHANGED
@@ -27,7 +27,13 @@ from khoj.database.adapters import (
     get_user_photo,
     get_user_search_model_or_default,
 )
-from khoj.database.models import
+from khoj.database.models import (
+    Agent,
+    ChatModelOptions,
+    KhojUser,
+    SpeechToTextModelOptions,
+)
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.anthropic.anthropic_chat import (
     extract_questions_anthropic,
 )
@@ -106,6 +112,7 @@ async def execute_search(
     r: Optional[bool] = False,
     max_distance: Optional[Union[float, None]] = None,
     dedupe: Optional[bool] = True,
+    agent: Optional[Agent] = None,
 ):
     start_time = time.time()
 
@@ -157,6 +164,7 @@ async def execute_search(
                 t,
                 question_embedding=encoded_asymmetric_query,
                 max_distance=max_distance,
+                agent=agent,
             )
         ]
 
@@ -333,6 +341,7 @@ async def extract_references_and_questions(
     location_data: LocationData = None,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: Optional[str] = None,
+    agent: Agent = None,
 ):
     user = request.user.object if request.user.is_authenticated else None
 
@@ -340,17 +349,30 @@ async def extract_references_and_questions(
     compiled_references: List[Any] = []
     inferred_queries: List[str] = []
 
+    agent_has_entries = False
+
+    if agent:
+        agent_has_entries = await sync_to_async(EntryAdapters.agent_has_entries)(agent=agent)
+
     if (
         not ConversationCommand.Notes in conversation_commands
         and not ConversationCommand.Default in conversation_commands
+        and not agent_has_entries
     ):
         yield compiled_references, inferred_queries, q
         return
 
+    # If Notes or Default is not in the conversation command, then the search should be restricted to the agent's knowledge base
+    should_limit_to_agent_knowledge = (
+        ConversationCommand.Notes not in conversation_commands
+        and ConversationCommand.Default not in conversation_commands
+    )
+
     if not await sync_to_async(EntryAdapters.user_has_entries)(user=user):
-
-
-
+        if not agent_has_entries:
+            logger.debug("No documents in knowledge base. Use a Khoj client to sync and chat with your docs.")
+            yield compiled_references, inferred_queries, q
+            return
 
     # Extract filter terms from user message
     defiltered_query = q
@@ -368,10 +390,12 @@ async def extract_references_and_questions(
     using_offline_chat = False
     logger.debug(f"Filters in query: {filters_in_query}")
 
+    personality_context = prompts.personality_context.format(personality=agent.personality) if agent else ""
+
     # Infer search queries from user message
     with timer("Extracting search queries took", logger):
         # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
-        conversation_config = await ConversationAdapters.aget_default_conversation_config()
+        conversation_config = await ConversationAdapters.aget_default_conversation_config(user)
         vision_enabled = conversation_config.vision_enabled
 
         if conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE:
@@ -392,6 +416,7 @@ async def extract_references_and_questions(
                 location_data=location_data,
                 user=user,
                 max_prompt_size=conversation_config.max_prompt_size,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.OPENAI:
             openai_chat_config = conversation_config.openai_config
@@ -408,6 +433,7 @@ async def extract_references_and_questions(
                 user=user,
                 uploaded_image_url=uploaded_image_url,
                 vision_enabled=vision_enabled,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.ANTHROPIC:
             api_key = conversation_config.openai_config.api_key
@@ -419,6 +445,7 @@ async def extract_references_and_questions(
                 conversation_log=meta_log,
                 location_data=location_data,
                 user=user,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.GOOGLE:
             api_key = conversation_config.openai_config.api_key
@@ -431,6 +458,7 @@ async def extract_references_and_questions(
                 location_data=location_data,
                 max_tokens=conversation_config.max_prompt_size,
                 user=user,
+                personality_context=personality_context,
             )
 
     # Collate search results as context for GPT
@@ -445,13 +473,14 @@ async def extract_references_and_questions(
         n_items = min(n, 3) if using_offline_chat else n
         search_results.extend(
             await execute_search(
-                user,
+                user if not should_limit_to_agent_knowledge else None,
                 f"{query} {filters_in_query}",
                 n=n_items,
                 t=SearchType.All,
                 r=True,
                 max_distance=d,
                 dedupe=False,
+                agent=agent,
            )
        )
        search_results = text_search.deduplicated_search_responses(search_results)
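The api.py changes above make extract_references_and_questions() search the agent's knowledge base as well as the user's, and drop the user scope entirely when neither the Notes nor the Default command is in play. A small sketch of that scoping decision follows; the ConversationCommand enum here is a simplified stand-in, and SearchScope and resolve_scope are illustrative names that do not exist in khoj.

```python
# Sketch: decide whose entries are searched, mirroring should_limit_to_agent_knowledge.
# All names below are illustrative stand-ins, not khoj code.
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class ConversationCommand(Enum):
    Notes = "notes"
    Default = "default"
    Online = "online"

@dataclass
class SearchScope:
    user: Optional[str]   # owner of personal entries, or None to exclude them
    agent: Optional[str]  # agent whose knowledge base is also searched

def resolve_scope(commands: set[ConversationCommand], user: str, agent: Optional[str]) -> SearchScope:
    # Restrict to the agent's knowledge base when neither /notes nor the default command is requested.
    limit_to_agent = (
        ConversationCommand.Notes not in commands
        and ConversationCommand.Default not in commands
    )
    return SearchScope(user=None if limit_to_agent else user, agent=agent)

if __name__ == "__main__":
    # Explicit /online command: personal notes are excluded, agent entries stay searchable.
    print(resolve_scope({ConversationCommand.Online}, user="alice", agent="researcher"))
    # Default chat: both the user's and the agent's entries are in scope.
    print(resolve_scope({ConversationCommand.Default}, user="alice", agent="researcher"))
```

This matches the call in the last hunk, where execute_search() receives `user if not should_limit_to_agent_knowledge else None` alongside `agent=agent`.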