khoj 1.24.2.dev3__py3-none-any.whl → 1.25.1.dev34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. khoj/configure.py +13 -4
  2. khoj/database/adapters/__init__.py +289 -52
  3. khoj/database/admin.py +20 -1
  4. khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
  5. khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
  6. khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
  7. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  8. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  9. khoj/database/models/__init__.py +136 -18
  10. khoj/interface/compiled/404/index.html +1 -1
  11. khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/2697-a38d01981ad3bdf8.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/4086-2c74808ba38a5a0f.js +1 -0
  15. khoj/interface/compiled/_next/static/chunks/477-ec86e93db10571c1.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/51-e8f5bdb69b5ea421.js +1 -0
  17. khoj/interface/compiled/_next/static/chunks/7762-79f2205740622b5c.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/9178-899fe9a6b754ecfe.js +1 -0
  19. khoj/interface/compiled/_next/static/chunks/9417-29502e39c3e7d60c.js +1 -0
  20. khoj/interface/compiled/_next/static/chunks/9479-7eed36fc954ef804.js +1 -0
  21. khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
  22. khoj/interface/compiled/_next/static/chunks/app/agents/page-fa282831808ee536.js +1 -0
  23. khoj/interface/compiled/_next/static/chunks/app/automations/page-5480731341f34450.js +1 -0
  24. khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
  25. khoj/interface/compiled/_next/static/chunks/app/chat/page-702057ccbcf27881.js +1 -0
  26. khoj/interface/compiled/_next/static/chunks/app/factchecker/page-e7b34316ec6f44de.js +1 -0
  27. khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
  28. khoj/interface/compiled/_next/static/chunks/app/page-10a5aad6e04f3cf8.js +1 -0
  29. khoj/interface/compiled/_next/static/chunks/app/search/page-d56541c746fded7d.js +1 -0
  30. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
  31. khoj/interface/compiled/_next/static/chunks/app/settings/page-e044a999468a7c5d.js +1 -0
  32. khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
  33. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-fbbd66a4d4633438.js +1 -0
  34. khoj/interface/compiled/_next/static/chunks/{webpack-d4781cada9b58e75.js → webpack-c0cd5a6afb1f0798.js} +1 -1
  35. khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +1 -0
  36. khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
  37. khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
  38. khoj/interface/compiled/_next/static/css/b9a6bf04305d98d7.css +25 -0
  39. khoj/interface/compiled/agents/index.html +1 -1
  40. khoj/interface/compiled/agents/index.txt +2 -2
  41. khoj/interface/compiled/automations/index.html +1 -1
  42. khoj/interface/compiled/automations/index.txt +2 -2
  43. khoj/interface/compiled/chat/index.html +1 -1
  44. khoj/interface/compiled/chat/index.txt +2 -2
  45. khoj/interface/compiled/factchecker/index.html +1 -1
  46. khoj/interface/compiled/factchecker/index.txt +2 -2
  47. khoj/interface/compiled/index.html +1 -1
  48. khoj/interface/compiled/index.txt +2 -2
  49. khoj/interface/compiled/search/index.html +1 -1
  50. khoj/interface/compiled/search/index.txt +2 -2
  51. khoj/interface/compiled/settings/index.html +1 -1
  52. khoj/interface/compiled/settings/index.txt +3 -3
  53. khoj/interface/compiled/share/chat/index.html +1 -1
  54. khoj/interface/compiled/share/chat/index.txt +2 -2
  55. khoj/interface/web/assets/icons/agents.svg +1 -0
  56. khoj/interface/web/assets/icons/automation.svg +1 -0
  57. khoj/interface/web/assets/icons/chat.svg +24 -0
  58. khoj/interface/web/login.html +11 -22
  59. khoj/processor/content/notion/notion_to_entries.py +2 -1
  60. khoj/processor/conversation/anthropic/anthropic_chat.py +2 -0
  61. khoj/processor/conversation/google/gemini_chat.py +6 -19
  62. khoj/processor/conversation/google/utils.py +33 -15
  63. khoj/processor/conversation/offline/chat_model.py +3 -1
  64. khoj/processor/conversation/openai/gpt.py +2 -0
  65. khoj/processor/conversation/prompts.py +67 -5
  66. khoj/processor/conversation/utils.py +3 -7
  67. khoj/processor/embeddings.py +6 -3
  68. khoj/processor/image/generate.py +4 -3
  69. khoj/processor/tools/online_search.py +139 -44
  70. khoj/routers/api.py +35 -6
  71. khoj/routers/api_agents.py +235 -4
  72. khoj/routers/api_chat.py +102 -530
  73. khoj/routers/api_content.py +14 -0
  74. khoj/routers/api_model.py +1 -1
  75. khoj/routers/auth.py +9 -1
  76. khoj/routers/helpers.py +181 -68
  77. khoj/routers/subscription.py +18 -4
  78. khoj/search_type/text_search.py +11 -3
  79. khoj/utils/helpers.py +64 -8
  80. khoj/utils/initialization.py +0 -3
  81. {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/METADATA +19 -21
  82. {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/RECORD +87 -81
  83. khoj/interface/compiled/_next/static/chunks/1603-3e2e1528e3b6ea1d.js +0 -1
  84. khoj/interface/compiled/_next/static/chunks/2697-a29cb9191a9e339c.js +0 -1
  85. khoj/interface/compiled/_next/static/chunks/6648-ee109f4ea33a74e2.js +0 -1
  86. khoj/interface/compiled/_next/static/chunks/7071-b4711cecca6619a8.js +0 -1
  87. khoj/interface/compiled/_next/static/chunks/743-1a64254447cda71f.js +0 -1
  88. khoj/interface/compiled/_next/static/chunks/8423-62ac6c832be2461b.js +0 -1
  89. khoj/interface/compiled/_next/static/chunks/9162-0be016519a18568b.js +0 -1
  90. khoj/interface/compiled/_next/static/chunks/9178-7e815211edcb3657.js +0 -1
  91. khoj/interface/compiled/_next/static/chunks/9417-5d14ac74aaab2c66.js +0 -1
  92. khoj/interface/compiled/_next/static/chunks/9984-e410179c6fac7cf1.js +0 -1
  93. khoj/interface/compiled/_next/static/chunks/app/agents/page-d302911777a3e027.js +0 -1
  94. khoj/interface/compiled/_next/static/chunks/app/automations/page-0a5de8c254c29a1c.js +0 -1
  95. khoj/interface/compiled/_next/static/chunks/app/chat/page-d96bf6a84bb05290.js +0 -1
  96. khoj/interface/compiled/_next/static/chunks/app/factchecker/page-32e61af29e6b431d.js +0 -1
  97. khoj/interface/compiled/_next/static/chunks/app/page-96cab08c985716f4.js +0 -1
  98. khoj/interface/compiled/_next/static/chunks/app/search/page-b3193d46c65571c5.js +0 -1
  99. khoj/interface/compiled/_next/static/chunks/app/settings/page-0db9b708366606ec.js +0 -1
  100. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-f06ac16cfe5b5a16.js +0 -1
  101. khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
  102. khoj/interface/compiled/_next/static/css/24f141a6e37cd204.css +0 -25
  103. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  104. khoj/interface/compiled/_next/static/css/f768dddada62459d.css +0 -1
  105. /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_buildManifest.js +0 -0
  106. /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_ssgManifest.js +0 -0
  107. {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/WHEEL +0 -0
  108. {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/entry_points.txt +0 -0
  109. {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.models import KhojUser
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -53,20 +60,20 @@ async def search_online(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     custom_filters: List[str] = [],
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     query += " ".join(custom_filters)
     if not is_internet_connected():
-        logger.warn("Cannot search online as not connected to internet")
+        logger.warning("Cannot search online as not connected to internet")
         yield {}
         return
 
     # Breakdown the query into subqueries to get the correct answer
     subqueries = await generate_online_subqueries(
-        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url
+        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url, agent=agent
     )
     response_dict = {}
@@ -83,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-    # Gather distinct web page data from organic results of each subquery without an instant answer.
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-        (organic.get("link"), subquery, organic.get("content"))
-        for subquery in response_dict
-        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
-        if "answerBox" not in response_dict[subquery]
-    }
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
-        webpage_links = set([link for link, _, _ in webpages])
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(subquery, link, content, subscribed=subscribed)
-            for link, subquery, content in webpages
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
 
     # Collect extracted info from the retrieved web pages
-    for subquery, webpage_extract, url in results:
+    for subqueries, url, webpage_extract in results:
         if webpage_extract is not None:
-            response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
+            response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     yield response_dict
 
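The hunk above replaces the old set of (link, subquery, content) tuples with a URL-keyed dict, so a page surfaced by several subqueries is fetched once and every interested subquery is tracked on it. A standalone sketch of that accumulation pattern on fake serper-style results (the response_dict shape is inferred from this hunk; MAX_WEBPAGES_TO_READ is set to 1 here purely for illustration):

from typing import Dict

MAX_WEBPAGES_TO_READ = 1

# Fake serper-style responses: two subqueries whose top result is the same URL.
response_dict = {
    "khoj release notes": {"organic": [{"link": "https://khoj.dev/blog", "content": None}]},
    "khoj 1.25 changes": {"organic": [{"link": "https://khoj.dev/blog", "content": None}]},
}

webpages: Dict[str, Dict] = {}
for subquery in response_dict:
    if "answerBox" in response_dict[subquery]:
        continue  # instant answers need no page reads
    for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
        link = organic.get("link")
        if link in webpages:
            webpages[link]["queries"].add(subquery)  # merge: one fetch serves both subqueries
        else:
            webpages[link] = {"queries": {subquery}, "content": organic.get("content")}

assert len(webpages) == 1
assert webpages["https://khoj.dev/blog"]["queries"] == {"khoj release notes", "khoj 1.25 changes"}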
@@ -140,9 +150,9 @@ async def read_webpages(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     "Infer web pages to read from the query and extract relevant information from them"
     logger.info(f"Inferring web pages to read")
@@ -156,29 +166,66 @@ async def read_webpages(
     webpage_links_str = "\n- " + "\n- ".join(list(urls))
     async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
         yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url, subscribed=subscribed) for url in urls]
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)
 
     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-    subquery: str, url: str, content: str = None, subscribed: bool = False
-) -> Tuple[str, Union[None, str], str]:
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page at '{url}' took", logger):
-                content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
-        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-            extracted_info = await extract_relevant_info(subquery, content, subscribed=subscribed)
-        return subquery, extracted_info, url
-    except Exception as e:
-        logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None, url
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
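This hunk is the core behavioral change of the release: instead of picking one scraper via environment variables, Khoj now walks the list of enabled scrapers (restricted to direct fetching for internal URLs) and stops at the first one that yields usable content. A self-contained sketch of that fallback pattern, with made-up reader stubs standing in for the real scraper calls:

import asyncio
import logging
from typing import Awaitable, Callable, Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def flaky_reader(url: str) -> str:
    raise RuntimeError("upstream scraper down")

async def direct_reader(url: str) -> str:
    return f"markdown content of {url}"

Reader = Callable[[str], Awaitable[str]]

async def read_with_fallback(url: str, scrapers: list[tuple[str, Reader]]) -> Optional[str]:
    # Try each enabled scraper in priority order; first non-empty result wins.
    for name, reader in scrapers:
        try:
            content = await reader(url)
            if content:
                return content
        except Exception as e:
            logger.warning(f"Failed to read '{url}' with {name}: {e}")
            if name == scrapers[-1][0]:
                logger.error(f"All web scrapers failed for '{url}'")
    return None

content = asyncio.run(read_with_fallback("https://example.org", [("firecrawl", flaky_reader), ("direct", direct_reader)]))
print(content)  # falls back to the direct reader after firecrawl fails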
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
     return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+    {prompts.system_prompt_extract_relevant_information}
+
+    {personality_context}
+    User Query: {", ".join(queries)}
+
+    Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+    """.strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
khoj/routers/api.py CHANGED
@@ -27,7 +27,13 @@ from khoj.database.adapters import (
     get_user_photo,
     get_user_search_model_or_default,
 )
-from khoj.database.models import ChatModelOptions, KhojUser, SpeechToTextModelOptions
+from khoj.database.models import (
+    Agent,
+    ChatModelOptions,
+    KhojUser,
+    SpeechToTextModelOptions,
+)
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.anthropic.anthropic_chat import (
     extract_questions_anthropic,
 )
@@ -106,6 +112,7 @@ async def execute_search(
     r: Optional[bool] = False,
     max_distance: Optional[Union[float, None]] = None,
     dedupe: Optional[bool] = True,
+    agent: Optional[Agent] = None,
 ):
     start_time = time.time()
 
@@ -157,6 +164,7 @@
                     t,
                     question_embedding=encoded_asymmetric_query,
                     max_distance=max_distance,
+                    agent=agent,
                 )
             ]
 
@@ -333,6 +341,7 @@ async def extract_references_and_questions(
     location_data: LocationData = None,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: Optional[str] = None,
+    agent: Agent = None,
 ):
     user = request.user.object if request.user.is_authenticated else None
 
@@ -340,17 +349,30 @@ async def extract_references_and_questions(
     compiled_references: List[Any] = []
     inferred_queries: List[str] = []
 
+    agent_has_entries = False
+
+    if agent:
+        agent_has_entries = await sync_to_async(EntryAdapters.agent_has_entries)(agent=agent)
+
     if (
         not ConversationCommand.Notes in conversation_commands
         and not ConversationCommand.Default in conversation_commands
+        and not agent_has_entries
     ):
         yield compiled_references, inferred_queries, q
         return
 
+    # If Notes or Default is not in the conversation command, then the search should be restricted to the agent's knowledge base
+    should_limit_to_agent_knowledge = (
+        ConversationCommand.Notes not in conversation_commands
+        and ConversationCommand.Default not in conversation_commands
+    )
+
     if not await sync_to_async(EntryAdapters.user_has_entries)(user=user):
-        logger.debug("No documents in knowledge base. Use a Khoj client to sync and chat with your docs.")
-        yield compiled_references, inferred_queries, q
-        return
+        if not agent_has_entries:
+            logger.debug("No documents in knowledge base. Use a Khoj client to sync and chat with your docs.")
+            yield compiled_references, inferred_queries, q
+            return
 
     # Extract filter terms from user message
     defiltered_query = q
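The reworked guards above mean an agent with its own documents keeps document search alive even when the user has synced nothing, and the no-documents early return now fires only when both knowledge bases are empty. A compact sketch of the combined decision (plain booleans stand in for the two adapter queries):

def should_skip_document_search(user_has_entries: bool, agent_has_entries: bool, notes_or_default_requested: bool) -> bool:
    # Mirrors the two guards above: bail out early only when document search
    # was not requested AND the agent has nothing, or when neither the user
    # nor the agent has any entries to search over.
    if not notes_or_default_requested and not agent_has_entries:
        return True
    if not user_has_entries and not agent_has_entries:
        return True
    return False

assert should_skip_document_search(False, True, False) is False  # agent docs keep search alive
assert should_skip_document_search(False, False, True) is True   # nothing anywhere to search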
@@ -368,10 +390,12 @@ async def extract_references_and_questions(
     using_offline_chat = False
     logger.debug(f"Filters in query: {filters_in_query}")
 
+    personality_context = prompts.personality_context.format(personality=agent.personality) if agent else ""
+
     # Infer search queries from user message
     with timer("Extracting search queries took", logger):
         # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
-        conversation_config = await ConversationAdapters.aget_default_conversation_config()
+        conversation_config = await ConversationAdapters.aget_default_conversation_config(user)
         vision_enabled = conversation_config.vision_enabled
 
         if conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE:
@@ -392,6 +416,7 @@
                 location_data=location_data,
                 user=user,
                 max_prompt_size=conversation_config.max_prompt_size,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.OPENAI:
             openai_chat_config = conversation_config.openai_config
@@ -408,6 +433,7 @@
                 user=user,
                 uploaded_image_url=uploaded_image_url,
                 vision_enabled=vision_enabled,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.ANTHROPIC:
             api_key = conversation_config.openai_config.api_key
@@ -419,6 +445,7 @@
                 conversation_log=meta_log,
                 location_data=location_data,
                 user=user,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.GOOGLE:
             api_key = conversation_config.openai_config.api_key
@@ -431,6 +458,7 @@
                 location_data=location_data,
                 max_tokens=conversation_config.max_prompt_size,
                 user=user,
+                personality_context=personality_context,
             )
 
     # Collate search results as context for GPT
@@ -445,13 +473,14 @@
             n_items = min(n, 3) if using_offline_chat else n
             search_results.extend(
                 await execute_search(
-                    user,
+                    user if not should_limit_to_agent_knowledge else None,
                     f"{query} {filters_in_query}",
                     n=n_items,
                     t=SearchType.All,
                     r=True,
                     max_distance=d,
                     dedupe=False,
+                    agent=agent,
                 )
             )
         search_results = text_search.deduplicated_search_responses(search_results)
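Putting the api.py changes together: when the search is agent-scoped, the caller drops the user and passes the agent to execute_search, so retrieval runs only over the agent's knowledge base. A small self-contained sketch of that scope resolution (the command enum here abbreviates khoj's ConversationCommand; user and agent are plain strings for illustration):

from enum import Enum
from typing import NamedTuple, Optional

class Cmd(Enum):  # stand-in for khoj's ConversationCommand
    Notes = "notes"
    Default = "default"
    Online = "online"

class SearchScope(NamedTuple):
    user: Optional[str]
    agent: Optional[str]

def resolve_search_scope(user: str, agent: Optional[str], commands: set[Cmd]) -> SearchScope:
    # Mirrors the hunks above: search only the agent's knowledge base when the
    # user asked for neither their notes nor default mode.
    limit_to_agent = Cmd.Notes not in commands and Cmd.Default not in commands
    return SearchScope(user=None if limit_to_agent else user, agent=agent)

assert resolve_search_scope("sara", "research-agent", {Cmd.Online}) == SearchScope(None, "research-agent")
assert resolve_search_scope("sara", "research-agent", {Cmd.Default}) == SearchScope("sara", "research-agent")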