khoj 1.25.1.dev12__py3-none-any.whl → 1.26.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. khoj/database/adapters/__init__.py +72 -2
  2. khoj/database/admin.py +16 -0
  3. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  4. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  5. khoj/database/models/__init__.py +78 -2
  6. khoj/interface/compiled/404/index.html +1 -1
  7. khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/{9417-1d158bf46d3a0dc9.js → 9417-1ad504db22331388.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{9479-563e4d61f91d5a7c.js → 9479-adede27bb126b5d0.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/agents/{page-f8d03847a0fa2539.js → page-e9eee31dbdb4658c.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/automations/{page-5480731341f34450.js → page-2edc21f30819def4.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/chat/{page-702057ccbcf27881.js → page-4309c98e6dc497dd.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/factchecker/{page-e7b34316ec6f44de.js → page-f2c83e3a87a28657.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/app/{page-421d13f70c505dd9.js → page-ab9beb5a26e396f7.js} +1 -1
  18. khoj/interface/compiled/_next/static/chunks/app/search/{page-d56541c746fded7d.js → page-b807caebd7f278c7.js} +1 -1
  19. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
  20. khoj/interface/compiled/_next/static/chunks/app/settings/{page-e044a999468a7c5d.js → page-2932356ad11c2f7b.js} +1 -1
  21. khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
  22. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-fbbd66a4d4633438.js → page-a736a0826570af2b.js} +1 -1
  23. khoj/interface/compiled/_next/static/chunks/{webpack-dff708c71e9234cb.js → webpack-ba79408024891b00.js} +1 -1
  24. khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
  25. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +1 -0
  26. khoj/interface/compiled/_next/static/css/f768dddada62459d.css +1 -0
  27. khoj/interface/compiled/agents/index.html +1 -1
  28. khoj/interface/compiled/agents/index.txt +2 -2
  29. khoj/interface/compiled/automations/index.html +1 -1
  30. khoj/interface/compiled/automations/index.txt +2 -2
  31. khoj/interface/compiled/chat/index.html +1 -1
  32. khoj/interface/compiled/chat/index.txt +2 -2
  33. khoj/interface/compiled/factchecker/index.html +1 -1
  34. khoj/interface/compiled/factchecker/index.txt +2 -2
  35. khoj/interface/compiled/index.html +1 -1
  36. khoj/interface/compiled/index.txt +2 -2
  37. khoj/interface/compiled/search/index.html +1 -1
  38. khoj/interface/compiled/search/index.txt +2 -2
  39. khoj/interface/compiled/settings/index.html +1 -1
  40. khoj/interface/compiled/settings/index.txt +3 -3
  41. khoj/interface/compiled/share/chat/index.html +1 -1
  42. khoj/interface/compiled/share/chat/index.txt +2 -2
  43. khoj/interface/web/assets/icons/agents.svg +1 -0
  44. khoj/interface/web/assets/icons/automation.svg +1 -0
  45. khoj/interface/web/assets/icons/chat.svg +24 -0
  46. khoj/interface/web/login.html +11 -22
  47. khoj/processor/content/images/image_to_entries.py +2 -0
  48. khoj/processor/conversation/google/utils.py +4 -0
  49. khoj/processor/conversation/prompts.py +1 -1
  50. khoj/processor/embeddings.py +1 -0
  51. khoj/processor/tools/online_search.py +135 -40
  52. khoj/routers/api_chat.py +41 -31
  53. khoj/routers/helpers.py +13 -11
  54. khoj/search_type/text_search.py +7 -2
  55. khoj/utils/helpers.py +50 -5
  56. {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/METADATA +4 -4
  57. {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/RECORD +62 -59
  58. khoj/interface/compiled/_next/static/chunks/1603-67a89278e2c5dbe6.js +0 -1
  59. khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
  60. khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +0 -1
  61. khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +0 -1
  62. /khoj/interface/compiled/_next/static/{CGyts-FEbV6owmPboHtLL → 0KX2AuxAEK1Jhb97imej7}/_buildManifest.js +0 -0
  63. /khoj/interface/compiled/_next/static/{CGyts-FEbV6owmPboHtLL → 0KX2AuxAEK1Jhb97imej7}/_ssgManifest.js +0 -0
  64. {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/WHEEL +0 -0
  65. {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/entry_points.txt +0 -0
  66. {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/licenses/LICENSE +0 -0
@@ -10,14 +10,22 @@ import aiohttp
10
10
  from bs4 import BeautifulSoup
11
11
  from markdownify import markdownify
12
12
 
13
- from khoj.database.models import Agent, KhojUser
13
+ from khoj.database.adapters import ConversationAdapters
14
+ from khoj.database.models import Agent, KhojUser, WebScraper
15
+ from khoj.processor.conversation import prompts
14
16
  from khoj.routers.helpers import (
15
17
  ChatEvent,
16
18
  extract_relevant_info,
17
19
  generate_online_subqueries,
18
20
  infer_webpage_urls,
19
21
  )
20
- from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
22
+ from khoj.utils.helpers import (
23
+ is_env_var_true,
24
+ is_internal_url,
25
+ is_internet_connected,
26
+ is_none_or_empty,
27
+ timer,
28
+ )
21
29
  from khoj.utils.rawconfig import LocationData
22
30
 
23
31
  logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
25
33
  SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
26
34
  SERPER_DEV_URL = "https://google.serper.dev/search"
27
35
 
28
- JINA_READER_API_URL = "https://r.jina.ai/"
29
36
  JINA_SEARCH_API_URL = "https://s.jina.ai/"
30
37
  JINA_API_KEY = os.getenv("JINA_API_KEY")
31
38
 
32
- OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
33
- OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
39
+ FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
40
+
34
41
  OLOSTEP_QUERY_PARAMS = {
35
42
  "timeout": 35, # seconds
36
43
  "waitBeforeScraping": 1, # seconds
@@ -83,33 +90,36 @@ async def search_online(
83
90
  search_results = await asyncio.gather(*search_tasks)
84
91
  response_dict = {subquery: search_result for subquery, search_result in search_results}
85
92
 
86
- # Gather distinct web page data from organic results of each subquery without an instant answer.
93
+ # Gather distinct web pages from organic results for subqueries without an instant answer.
87
94
  # Content of web pages is directly available when Jina is used for search.
88
- webpages = {
89
- (organic.get("link"), subquery, organic.get("content"))
90
- for subquery in response_dict
91
- for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
92
- if "answerBox" not in response_dict[subquery]
93
- }
95
+ webpages: Dict[str, Dict] = {}
96
+ for subquery in response_dict:
97
+ if "answerBox" in response_dict[subquery]:
98
+ continue
99
+ for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
100
+ link = organic.get("link")
101
+ if link in webpages:
102
+ webpages[link]["queries"].add(subquery)
103
+ else:
104
+ webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
94
105
 
95
106
  # Read, extract relevant info from the retrieved web pages
96
107
  if webpages:
97
- webpage_links = set([link for link, _, _ in webpages])
98
- logger.info(f"Reading web pages at: {list(webpage_links)}")
108
+ logger.info(f"Reading web pages at: {webpages.keys()}")
99
109
  if send_status_func:
100
- webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
110
+ webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
101
111
  async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
102
112
  yield {ChatEvent.STATUS: event}
103
113
  tasks = [
104
- read_webpage_and_extract_content(subquery, link, content, user=user, agent=agent)
105
- for link, subquery, content in webpages
114
+ read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
115
+ for link, data in webpages.items()
106
116
  ]
107
117
  results = await asyncio.gather(*tasks)
108
118
 
109
119
  # Collect extracted info from the retrieved web pages
110
- for subquery, webpage_extract, url in results:
120
+ for subqueries, url, webpage_extract in results:
111
121
  if webpage_extract is not None:
112
- response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
122
+ response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
113
123
 
114
124
  yield response_dict
115
125
 
@@ -156,29 +166,66 @@ async def read_webpages(
156
166
  webpage_links_str = "\n- " + "\n- ".join(list(urls))
157
167
  async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
158
168
  yield {ChatEvent.STATUS: event}
159
- tasks = [read_webpage_and_extract_content(query, url, user=user, agent=agent) for url in urls]
169
+ tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
160
170
  results = await asyncio.gather(*tasks)
161
171
 
162
172
  response: Dict[str, Dict] = defaultdict(dict)
163
173
  response[query]["webpages"] = [
164
- {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
174
+ {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
165
175
  ]
166
176
  yield response
167
177
 
168
178
 
179
async def read_webpage(
    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
) -> Tuple[str | None, str | None]:
    """
    Read the web page at `url` using the scraper selected by `scraper_type`.

    Returns a `(content, extracted_info)` pair in which exactly one slot is filled:
    - Firecrawl with FIRECRAWL_USE_LLM_EXTRACT enabled fills only `extracted_info`,
      using the result of `query_webpage_with_firecrawl` for `subqueries`.
    - Every other path fills only `content` with what the chosen reader returned.

    Any `scraper_type` other than Firecrawl, Olostep or Jina (including None)
    falls back to `read_webpage_at_url`, which takes no API key or API URL.
    """
    kinds = WebScraper.WebScraperType

    if scraper_type == kinds.FIRECRAWL:
        # Firecrawl either answers the queries itself or just returns the page
        if FIRECRAWL_USE_LLM_EXTRACT:
            extract = await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
            return None, extract
        page = await read_webpage_with_firecrawl(url, api_key, api_url)
        return page, None

    if scraper_type == kinds.OLOSTEP:
        page = await read_webpage_with_olostep(url, api_key, api_url)
        return page, None

    if scraper_type == kinds.JINA:
        page = await read_webpage_with_jina(url, api_key, api_url)
        return page, None

    page = await read_webpage_at_url(url)
    return page, None
192
+
193
+
169
194
  async def read_webpage_and_extract_content(
170
- subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
171
- ) -> Tuple[str, Union[None, str], str]:
172
- try:
173
- if is_none_or_empty(content):
174
- with timer(f"Reading web page at '{url}' took", logger):
175
- content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
176
- with timer(f"Extracting relevant information from web page at '{url}' took", logger):
177
- extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
178
- return subquery, extracted_info, url
179
- except Exception as e:
180
- logger.error(f"Failed to read web page at '{url}' with {e}")
181
- return subquery, None, url
195
+ subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
196
+ ) -> Tuple[set[str], str, Union[None, str]]:
197
+ # Select the web scrapers to use for reading the web page
198
+ web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
199
+ # Only use the direct web scraper for internal URLs
200
+ if is_internal_url(url):
201
+ web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
202
+
203
+ # Fallback through enabled web scrapers until we successfully read the web page
204
+ extracted_info = None
205
+ for scraper in web_scrapers:
206
+ try:
207
+ # Read the web page
208
+ if is_none_or_empty(content):
209
+ with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
210
+ content, extracted_info = await read_webpage(
211
+ url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
212
+ )
213
+
214
+ # Extract relevant information from the web page
215
+ if is_none_or_empty(extracted_info):
216
+ with timer(f"Extracting relevant information from web page at '{url}' took", logger):
217
+ extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
218
+
219
+ # If we successfully extracted information, break the loop
220
+ if not is_none_or_empty(extracted_info):
221
+ break
222
+ except Exception as e:
223
+ logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
224
+ # If this is the last web scraper in the list, log an error
225
+ if scraper.name == web_scrapers[-1].name:
226
+ logger.error(f"All web scrapers failed for '{url}'")
227
+
228
+ return subqueries, url, extracted_info
182
229
 
183
230
 
184
231
  async def read_webpage_at_url(web_url: str) -> str:
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
195
242
  return markdownify(body)
196
243
 
197
244
 
198
- async def read_webpage_with_olostep(web_url: str) -> str:
199
- headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
245
+ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
246
+ headers = {"Authorization": f"Bearer {api_key}"}
200
247
  web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
201
248
  web_scraping_params["url"] = web_url
202
249
 
203
250
  async with aiohttp.ClientSession() as session:
204
- async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
251
+ async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
205
252
  response.raise_for_status()
206
253
  response_json = await response.json()
207
254
  return response_json["markdown_content"]
208
255
 
209
256
 
210
- async def read_webpage_with_jina(web_url: str) -> str:
211
- jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
257
+ async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
258
+ jina_reader_api_url = f"{api_url}/{web_url}"
212
259
  headers = {"Accept": "application/json", "X-Timeout": "30"}
213
- if JINA_API_KEY:
214
- headers["Authorization"] = f"Bearer {JINA_API_KEY}"
260
+ if api_key:
261
+ headers["Authorization"] = f"Bearer {api_key}"
215
262
 
216
263
  async with aiohttp.ClientSession() as session:
217
264
  async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
220
267
  return response_json["data"]["content"]
221
268
 
222
269
 
270
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
    """
    Fetch the web page at `web_url` as markdown through a Firecrawl scrape endpoint.

    Sends a POST to `{api_url}/v1/scrape` with `api_key` as a bearer token and
    returns the `data.markdown` field of the JSON reply. `raise_for_status` is
    called on the response, so an HTTP error status raises instead of returning.
    """
    scrape_endpoint = f"{api_url}/v1/scrape"
    request_headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    # Request only the markdown rendering; "script" and ".ad" are passed as excludeTags
    payload = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}

    async with aiohttp.ClientSession() as http, http.post(
        scrape_endpoint, json=payload, headers=request_headers
    ) as reply:
        reply.raise_for_status()
        body = await reply.json()
        return body["data"]["markdown"]
280
+
281
+
282
async def query_webpage_with_firecrawl(
    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
) -> str:
    """
    Ask a Firecrawl scrape endpoint to extract the parts of `web_url` relevant to `queries`.

    Sends a POST to `{api_url}/v1/scrape` requesting the "extract" format, with a
    system prompt built from the shared extraction prompt, the agent's personality
    (when the agent has one) and the comma-joined queries. Returns the
    `data.extract.relevant_extract` field of the JSON reply. `raise_for_status`
    is called on the response, so an HTTP error status raises instead of returning.
    """
    # Only include the personality section when the agent actually defines one
    if agent and agent.personality:
        personality_context = prompts.personality_context.format(personality=agent.personality)
    else:
        personality_context = ""

    system_prompt = f"""
{prompts.system_prompt_extract_relevant_information}

{personality_context}
User Query: {", ".join(queries)}

Collate only relevant information from the website to answer the target query and in the provided JSON schema.
""".strip()

    # JSON schema the extraction must follow: a single required string field
    extract_schema = {
        "type": "object",
        "properties": {
            "relevant_extract": {"type": "string"},
        },
        "required": [
            "relevant_extract",
        ],
    }
    payload = {
        "url": web_url,
        "formats": ["extract"],
        "extract": {"systemPrompt": system_prompt, "schema": extract_schema},
    }
    scrape_endpoint = f"{api_url}/v1/scrape"
    request_headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

    async with aiohttp.ClientSession() as http, http.post(
        scrape_endpoint, json=payload, headers=request_headers
    ) as reply:
        reply.raise_for_status()
        body = await reply.json()
        return body["data"]["extract"]["relevant_extract"]
316
+
317
+
223
318
  async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
224
319
  encoded_query = urllib.parse.quote(query)
225
320
  jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
khoj/routers/api_chat.py CHANGED
@@ -3,7 +3,6 @@ import base64
3
3
  import json
4
4
  import logging
5
5
  import time
6
- import warnings
7
6
  from datetime import datetime
8
7
  from functools import partial
9
8
  from typing import Dict, Optional
@@ -11,9 +10,8 @@ from urllib.parse import unquote
11
10
 
12
11
  from asgiref.sync import sync_to_async
13
12
  from fastapi import APIRouter, Depends, HTTPException, Request
14
- from fastapi.requests import Request
15
13
  from fastapi.responses import Response, StreamingResponse
16
- from starlette.authentication import has_required_scope, requires
14
+ from starlette.authentication import requires
17
15
 
18
16
  from khoj.app.settings import ALLOWED_HOSTS
19
17
  from khoj.database.adapters import (
@@ -574,7 +572,6 @@ async def chat(
574
572
  chat_metadata: dict = {}
575
573
  connection_alive = True
576
574
  user: KhojUser = request.user.object
577
- subscribed: bool = has_required_scope(request, ["premium"])
578
575
  event_delimiter = "␃🔚␗"
579
576
  q = unquote(q)
580
577
  nonlocal conversation_id
@@ -641,7 +638,7 @@ async def chat(
641
638
  request=request,
642
639
  telemetry_type="api",
643
640
  api="chat",
644
- client=request.user.client_app,
641
+ client=common.client,
645
642
  user_agent=request.headers.get("user-agent"),
646
643
  host=request.headers.get("host"),
647
644
  metadata=chat_metadata,
@@ -839,26 +836,34 @@ async def chat(
839
836
 
840
837
  # Gather Context
841
838
  ## Extract Document References
842
- compiled_references, inferred_queries, defiltered_query = [], [], None
843
- async for result in extract_references_and_questions(
844
- request,
845
- meta_log,
846
- q,
847
- (n or 7),
848
- d,
849
- conversation_id,
850
- conversation_commands,
851
- location,
852
- partial(send_event, ChatEvent.STATUS),
853
- uploaded_image_url=uploaded_image_url,
854
- agent=agent,
855
- ):
856
- if isinstance(result, dict) and ChatEvent.STATUS in result:
857
- yield result[ChatEvent.STATUS]
858
- else:
859
- compiled_references.extend(result[0])
860
- inferred_queries.extend(result[1])
861
- defiltered_query = result[2]
839
+ compiled_references, inferred_queries, defiltered_query = [], [], q
840
+ try:
841
+ async for result in extract_references_and_questions(
842
+ request,
843
+ meta_log,
844
+ q,
845
+ (n or 7),
846
+ d,
847
+ conversation_id,
848
+ conversation_commands,
849
+ location,
850
+ partial(send_event, ChatEvent.STATUS),
851
+ uploaded_image_url=uploaded_image_url,
852
+ agent=agent,
853
+ ):
854
+ if isinstance(result, dict) and ChatEvent.STATUS in result:
855
+ yield result[ChatEvent.STATUS]
856
+ else:
857
+ compiled_references.extend(result[0])
858
+ inferred_queries.extend(result[1])
859
+ defiltered_query = result[2]
860
+ except Exception as e:
861
+ error_message = f"Error searching knowledge base: {e}. Attempting to respond without document references."
862
+ logger.warning(error_message)
863
+ async for result in send_event(
864
+ ChatEvent.STATUS, "Document search failed. I'll try respond without document references"
865
+ ):
866
+ yield result
862
867
 
863
868
  if not is_none_or_empty(compiled_references):
864
869
  headings = "\n- " + "\n- ".join(set([c.get("compiled", c).split("\n")[0] for c in compiled_references]))
@@ -894,12 +899,13 @@ async def chat(
894
899
  yield result[ChatEvent.STATUS]
895
900
  else:
896
901
  online_results = result
897
- except ValueError as e:
902
+ except Exception as e:
898
903
  error_message = f"Error searching online: {e}. Attempting to respond without online results"
899
904
  logger.warning(error_message)
900
- async for result in send_llm_response(error_message):
905
+ async for result in send_event(
906
+ ChatEvent.STATUS, "Online search failed. I'll try respond without online references"
907
+ ):
901
908
  yield result
902
- return
903
909
 
904
910
  ## Gather Webpage References
905
911
  if ConversationCommand.Webpage in conversation_commands:
@@ -928,11 +934,15 @@ async def chat(
928
934
  webpages.append(webpage["link"])
929
935
  async for result in send_event(ChatEvent.STATUS, f"**Read web pages**: {webpages}"):
930
936
  yield result
931
- except ValueError as e:
937
+ except Exception as e:
932
938
  logger.warning(
933
- f"Error directly reading webpages: {e}. Attempting to respond without online results",
939
+ f"Error reading webpages: {e}. Attempting to respond without webpage results",
934
940
  exc_info=True,
935
941
  )
942
+ async for result in send_event(
943
+ ChatEvent.STATUS, "Webpage read failed. I'll try respond without webpage references"
944
+ ):
945
+ yield result
936
946
 
937
947
  ## Send Gathered References
938
948
  async for result in send_event(
@@ -949,7 +959,7 @@ async def chat(
949
959
  ## Generate Image Output
950
960
  if ConversationCommand.Image in conversation_commands:
951
961
  async for result in text_to_image(
952
- q,
962
+ defiltered_query,
953
963
  user,
954
964
  meta_log,
955
965
  location_data=location,
khoj/routers/helpers.py CHANGED
@@ -353,13 +353,13 @@ async def aget_relevant_information_sources(
353
353
  final_response = [ConversationCommand.Default]
354
354
  else:
355
355
  final_response = [ConversationCommand.General]
356
- return final_response
357
- except Exception as e:
356
+ except Exception:
358
357
  logger.error(f"Invalid response for determining relevant tools: {response}")
359
358
  if len(agent_tools) == 0:
360
359
  final_response = [ConversationCommand.Default]
361
360
  else:
362
361
  final_response = agent_tools
362
+ return final_response
363
363
 
364
364
 
365
365
  async def aget_relevant_output_modes(
@@ -551,12 +551,14 @@ async def schedule_query(
551
551
  raise AssertionError(f"Invalid response for scheduling query: {raw_response}")
552
552
 
553
553
 
554
- async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agent: Agent = None) -> Union[str, None]:
554
+ async def extract_relevant_info(
555
+ qs: set[str], corpus: str, user: KhojUser = None, agent: Agent = None
556
+ ) -> Union[str, None]:
555
557
  """
556
558
  Extract relevant information for a given query from the target corpus
557
559
  """
558
560
 
559
- if is_none_or_empty(corpus) or is_none_or_empty(q):
561
+ if is_none_or_empty(corpus) or is_none_or_empty(qs):
560
562
  return None
561
563
 
562
564
  personality_context = (
@@ -564,17 +566,16 @@ async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agen
564
566
  )
565
567
 
566
568
  extract_relevant_information = prompts.extract_relevant_information.format(
567
- query=q,
569
+ query=", ".join(qs),
568
570
  corpus=corpus.strip(),
569
571
  personality_context=personality_context,
570
572
  )
571
573
 
572
- with timer("Chat actor: Extract relevant information from data", logger):
573
- response = await send_message_to_model_wrapper(
574
- extract_relevant_information,
575
- prompts.system_prompt_extract_relevant_information,
576
- user=user,
577
- )
574
+ response = await send_message_to_model_wrapper(
575
+ extract_relevant_information,
576
+ prompts.system_prompt_extract_relevant_information,
577
+ user=user,
578
+ )
578
579
  return response.strip()
579
580
 
580
581
 
@@ -880,6 +881,7 @@ def send_message_to_model_wrapper_sync(
880
881
  messages=truncated_messages,
881
882
  api_key=api_key,
882
883
  model=chat_model,
884
+ response_type=response_type,
883
885
  )
884
886
  else:
885
887
  raise HTTPException(status_code=500, detail="Invalid conversation config")
@@ -3,6 +3,7 @@ import math
3
3
  from pathlib import Path
4
4
  from typing import List, Optional, Tuple, Type, Union
5
5
 
6
+ import requests
6
7
  import torch
7
8
  from asgiref.sync import sync_to_async
8
9
  from sentence_transformers import util
@@ -231,8 +232,12 @@ def setup(
231
232
 
232
233
  def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]:
233
234
  """Score all retrieved entries using the cross-encoder"""
234
- with timer("Cross-Encoder Predict Time", logger, state.device):
235
- cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits)
235
+ try:
236
+ with timer("Cross-Encoder Predict Time", logger, state.device):
237
+ cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits)
238
+ except requests.exceptions.HTTPError as e:
239
+ logger.error(f"Failed to rerank documents using the inference endpoint. Error: {e}.", exc_info=True)
240
+ cross_scores = [0.0] * len(hits)
236
241
 
237
242
  # Convert cross-encoder scores to distances and pass in hits for reranking
238
243
  for idx in range(len(cross_scores)):
khoj/utils/helpers.py CHANGED
@@ -2,10 +2,12 @@ from __future__ import annotations # to avoid quoting type hints
2
2
 
3
3
  import datetime
4
4
  import io
5
+ import ipaddress
5
6
  import logging
6
7
  import os
7
8
  import platform
8
9
  import random
10
+ import urllib.parse
9
11
  import uuid
10
12
  from collections import OrderedDict
11
13
  from enum import Enum
@@ -125,6 +127,8 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
125
127
  return "image", encoding
126
128
  elif file_type in ["image/png"]:
127
129
  return "image", encoding
130
+ elif file_type in ["image/webp"]:
131
+ return "image", encoding
128
132
  elif content_group in ["code", "text"]:
129
133
  return "plaintext", encoding
130
134
  else:
@@ -164,9 +168,9 @@ def get_class_by_name(name: str) -> object:
164
168
  class timer:
165
169
  """Context manager to log time taken for a block of code to run"""
166
170
 
167
- def __init__(self, message: str, logger: logging.Logger, device: torch.device = None):
171
+ def __init__(self, message: str, logger: logging.Logger, device: torch.device = None, log_level=logging.DEBUG):
168
172
  self.message = message
169
- self.logger = logger
173
+ self.logger = logger.debug if log_level == logging.DEBUG else logger.info
170
174
  self.device = device
171
175
 
172
176
  def __enter__(self):
@@ -176,9 +180,9 @@ class timer:
176
180
  def __exit__(self, *_):
177
181
  elapsed = perf_counter() - self.start
178
182
  if self.device is None:
179
- self.logger.debug(f"{self.message}: {elapsed:.3f} seconds")
183
+ self.logger(f"{self.message}: {elapsed:.3f} seconds")
180
184
  else:
181
- self.logger.debug(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}")
185
+ self.logger(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}")
182
186
 
183
187
 
184
188
  class LRU(OrderedDict):
@@ -347,12 +351,13 @@ tool_descriptions_for_llm = {
347
351
 
348
352
  mode_descriptions_for_llm = {
349
353
  ConversationCommand.Image: "Use this if the user is requesting you to generate a picture based on their description.",
350
- ConversationCommand.Automation: "Use this if the user is requesting a response at a scheduled date or time.",
354
+ ConversationCommand.Automation: "Use this if you are confident the user is requesting a response at a scheduled date, time and frequency",
351
355
  ConversationCommand.Text: "Use this if the other response modes don't seem to fit the query.",
352
356
  }
353
357
 
354
358
  mode_descriptions_for_agent = {
355
359
  ConversationCommand.Image: "Agent can generate image in response.",
360
+ ConversationCommand.Automation: "Agent can schedule a task to run at a scheduled date, time and frequency in response.",
356
361
  ConversationCommand.Text: "Agent can generate text in response.",
357
362
  }
358
363
 
@@ -435,6 +440,46 @@ def is_internet_connected():
435
440
  return False
436
441
 
437
442
 
443
def is_internal_url(url: str) -> bool:
    """
    Check if a URL is likely to be internal/non-public.

    A URL is treated as internal when its host is one of:
    - localhost, 127.0.0.1 or ::1
    - an IP address that the `ipaddress` module reports as private
    - a name ending in a commonly used internal suffix (e.g. `.local`, `.lan`)
    - a single-label name containing no dot (e.g. `intranet`)

    A trailing root dot on the host (e.g. `localhost.` or `printer.local.`) is
    ignored, so the fully qualified spelling of a name is classified the same
    way as its usual spelling.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL is likely internal, False otherwise. URLs that
        cannot be parsed, or that carry no host, are reported as not internal.
    """
    try:
        hostname = urllib.parse.urlparse(url).hostname
        if not hostname:
            # No host component (e.g. empty string, relative path, scheme-less text)
            return False
        # "localhost." and "localhost" name the same host; drop the root dot
        hostname = hostname.rstrip(".")
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string URL: keep the original contract of never raising
        return False

    if not hostname:
        return False

    # Check for localhost
    if hostname in ("localhost", "127.0.0.1", "::1"):
        return True

    # Check for IP addresses in private ranges
    try:
        return ipaddress.ip_address(hostname).is_private
    except ValueError:
        pass  # Not an IP address, continue with the name based checks

    # Check for common internal TLDs
    internal_tlds = (".local", ".internal", ".private", ".corp", ".home", ".lan")
    if hostname.endswith(internal_tlds):
        return True

    # Hostnames without any dot are single-label names, which are treated as internal
    return "." not in hostname
481
+
482
+
438
483
  def convert_image_to_webp(image_bytes):
439
484
  """Convert image bytes to webp format for faster loading"""
440
485
  image_io = io.BytesIO(image_bytes)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: khoj
3
- Version: 1.25.1.dev12
3
+ Version: 1.26.1.dev3
4
4
  Summary: Your Second Brain
5
5
  Project-URL: Homepage, https://khoj.dev
6
6
  Project-URL: Documentation, https://docs.khoj.dev
@@ -32,7 +32,7 @@ Requires-Dist: dateparser>=1.1.1
32
32
  Requires-Dist: defusedxml==0.7.1
33
33
  Requires-Dist: django-apscheduler==0.6.2
34
34
  Requires-Dist: django-phonenumber-field==7.3.0
35
- Requires-Dist: django==5.0.8
35
+ Requires-Dist: django==5.0.9
36
36
  Requires-Dist: docx2txt==0.8
37
37
  Requires-Dist: einops==0.8.0
38
38
  Requires-Dist: fastapi>=0.110.0
@@ -138,8 +138,8 @@ Description-Content-Type: text/markdown
138
138
  - Chat with any local or online LLM (e.g llama3, qwen, gemma, mistral, gpt, claude, gemini).
139
139
  - Get answers from the internet and your docs (including image, pdf, markdown, org-mode, word, notion files).
140
140
  - Access it from your Browser, Obsidian, Emacs, Desktop, Phone or Whatsapp.
141
- - Build agents with custom knowledge bases and tools.
142
- - Create automations to get personal newsletters and smart notifications.
141
+ - Create agents with custom knowledge, persona, chat model and tools to take on any role.
142
+ - Automate away repetitive research. Get personal newsletters and smart notifications delivered to your inbox.
143
143
  - Find relevant docs quickly and easily using our advanced semantic search.
144
144
  - Generate images, talk out loud, play your messages.
145
145
  - Khoj is open-source, self-hostable. Always.