khoj 1.24.2.dev16__py3-none-any.whl → 1.25.1.dev34__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (78)
  1. khoj/configure.py +13 -4
  2. khoj/database/adapters/__init__.py +163 -49
  3. khoj/database/admin.py +18 -1
  4. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  5. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  6. khoj/database/models/__init__.py +78 -2
  7. khoj/interface/compiled/404/index.html +1 -1
  8. khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/7762-79f2205740622b5c.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/agents/page-fa282831808ee536.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/app/automations/{page-1688dead2f21270d.js → page-5480731341f34450.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/chat/{page-91abcb71846922b7.js → page-702057ccbcf27881.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/factchecker/{page-7ab093711c27041c.js → page-e7b34316ec6f44de.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/app/{page-fada198096eab47f.js → page-10a5aad6e04f3cf8.js} +1 -1
  18. khoj/interface/compiled/_next/static/chunks/app/search/{page-a7e036689b6507ff.js → page-d56541c746fded7d.js} +1 -1
  19. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
  20. khoj/interface/compiled/_next/static/chunks/app/settings/{page-fa11cafaec7ab39f.js → page-e044a999468a7c5d.js} +1 -1
  21. khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
  22. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-c5d2b9076e5390b2.js → page-fbbd66a4d4633438.js} +1 -1
  23. khoj/interface/compiled/_next/static/chunks/{webpack-f52083d548d804fa.js → webpack-c0cd5a6afb1f0798.js} +1 -1
  24. khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +1 -0
  25. khoj/interface/compiled/_next/static/css/3e1f1fdd70775091.css +1 -0
  26. khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
  27. khoj/interface/compiled/_next/static/css/b9a6bf04305d98d7.css +25 -0
  28. khoj/interface/compiled/agents/index.html +1 -1
  29. khoj/interface/compiled/agents/index.txt +2 -2
  30. khoj/interface/compiled/automations/index.html +1 -1
  31. khoj/interface/compiled/automations/index.txt +2 -2
  32. khoj/interface/compiled/chat/index.html +1 -1
  33. khoj/interface/compiled/chat/index.txt +2 -2
  34. khoj/interface/compiled/factchecker/index.html +1 -1
  35. khoj/interface/compiled/factchecker/index.txt +2 -2
  36. khoj/interface/compiled/index.html +1 -1
  37. khoj/interface/compiled/index.txt +2 -2
  38. khoj/interface/compiled/search/index.html +1 -1
  39. khoj/interface/compiled/search/index.txt +2 -2
  40. khoj/interface/compiled/settings/index.html +1 -1
  41. khoj/interface/compiled/settings/index.txt +3 -3
  42. khoj/interface/compiled/share/chat/index.html +1 -1
  43. khoj/interface/compiled/share/chat/index.txt +2 -2
  44. khoj/interface/web/assets/icons/agents.svg +1 -0
  45. khoj/interface/web/assets/icons/automation.svg +1 -0
  46. khoj/interface/web/assets/icons/chat.svg +24 -0
  47. khoj/interface/web/login.html +11 -22
  48. khoj/processor/conversation/google/gemini_chat.py +4 -19
  49. khoj/processor/conversation/google/utils.py +33 -15
  50. khoj/processor/conversation/prompts.py +14 -3
  51. khoj/processor/conversation/utils.py +3 -7
  52. khoj/processor/embeddings.py +6 -3
  53. khoj/processor/image/generate.py +1 -2
  54. khoj/processor/tools/online_search.py +135 -42
  55. khoj/routers/api.py +1 -1
  56. khoj/routers/api_agents.py +6 -3
  57. khoj/routers/api_chat.py +63 -520
  58. khoj/routers/api_model.py +1 -1
  59. khoj/routers/auth.py +9 -1
  60. khoj/routers/helpers.py +74 -61
  61. khoj/routers/subscription.py +18 -4
  62. khoj/search_type/text_search.py +7 -2
  63. khoj/utils/helpers.py +56 -13
  64. khoj/utils/initialization.py +0 -3
  65. {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/METADATA +19 -14
  66. {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/RECORD +71 -68
  67. khoj/interface/compiled/_next/static/chunks/1269-2e52d48e7d0e5c61.js +0 -1
  68. khoj/interface/compiled/_next/static/chunks/1603-67a89278e2c5dbe6.js +0 -1
  69. khoj/interface/compiled/_next/static/chunks/app/agents/page-df26b497b7356151.js +0 -1
  70. khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
  71. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  72. khoj/interface/compiled/_next/static/css/50d972a8c787730b.css +0 -25
  73. khoj/interface/compiled/_next/static/css/dfb67a9287720a2b.css +0 -1
  74. /khoj/interface/compiled/_next/static/{MyYNlmGMz32TGV_-febR4 → Jid9q6Qg851ioDaaO_fth}/_buildManifest.js +0 -0
  75. /khoj/interface/compiled/_next/static/{MyYNlmGMz32TGV_-febR4 → Jid9q6Qg851ioDaaO_fth}/_ssgManifest.js +0 -0
  76. {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/WHEEL +0 -0
  77. {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/entry_points.txt +0 -0
  78. {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/prompts.py CHANGED
@@ -45,6 +45,13 @@ Instructions:\n{bio}
 """.strip()
 )
 
+# To make Gemini be more verbose and match language of user's query.
+# Prompt forked from https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
+gemini_verbose_language_personality = """
+All questions should be answered comprehensively with details, unless the user requests a concise response specifically.
+Respond in the same language as the query. Use markdown to format your responses.
+""".strip()
+
 ## General Conversation
 ## --
 general_conversation = PromptTemplate.from_template(
@@ -404,6 +411,10 @@ Tell the user exactly what the document says in response to their query, while a
 extract_relevant_summary = PromptTemplate.from_template(
     """
 {personality_context}
+
+Conversation History:
+{chat_history}
+
 Target Query: {query}
 
 Document Contents:
@@ -415,10 +426,10 @@ Collate only relevant information from the document to answer the target query.
 
 personality_context = PromptTemplate.from_template(
     """
-Here's some additional context about you:
-{personality}
+Here's some additional context about you:
+{personality}
 
-"""
+"""
 )
 
 pick_relevant_output_mode = PromptTemplate.from_template(
khoj/processor/conversation/utils.py CHANGED
@@ -223,7 +223,7 @@ def truncate_messages(
 ) -> list[ChatMessage]:
     """Truncate messages to fit within max prompt size supported by model"""
 
-    default_tokenizer = "hf-internal-testing/llama-tokenizer"
+    default_tokenizer = "gpt-4o"
 
     try:
         if loaded_model:
@@ -240,13 +240,9 @@ def truncate_messages(
         else:
             encoder = download_model(model_name).tokenizer()
     except:
-        if default_tokenizer in state.pretrained_tokenizers:
-            encoder = state.pretrained_tokenizers[default_tokenizer]
-        else:
-            encoder = AutoTokenizer.from_pretrained(default_tokenizer)
-            state.pretrained_tokenizers[default_tokenizer] = encoder
+        encoder = tiktoken.encoding_for_model(default_tokenizer)
         logger.debug(
-            f"Fallback to default chat model tokenizer: {tokenizer_name}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
+            f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for model: {model_name} in Khoj settings to improve context stuffing."
         )
 
     # Extract system message from messages
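The fallback tokenizer above switches from a Hugging Face llama tokenizer to tiktoken's gpt-4o encoding. A minimal sketch of that fallback, assuming a tiktoken release recent enough to know the gpt-4o mapping; the count_tokens helper below is illustrative and not part of the package:

import tiktoken

def count_tokens(text: str, model: str = "gpt-4o") -> int:
    # encoding_for_model("gpt-4o") resolves to the o200k_base encoding in recent tiktoken releases.
    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))

# Example: count_tokens("Hello, Khoj!") returns the token count used when truncating messages.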
khoj/processor/embeddings.py CHANGED
@@ -13,7 +13,7 @@ from tenacity import (
 )
 from torch import nn
 
-from khoj.utils.helpers import get_device, merge_dicts
+from khoj.utils.helpers import get_device, merge_dicts, timer
 from khoj.utils.rawconfig import SearchResponse
 
 logger = logging.getLogger(__name__)
@@ -37,7 +37,8 @@ class EmbeddingsModel:
         self.model_name = model_name
         self.inference_endpoint = embeddings_inference_endpoint
         self.api_key = embeddings_inference_endpoint_api_key
-        self.embeddings_model = SentenceTransformer(self.model_name, **self.model_kwargs)
+        with timer(f"Loaded embedding model {self.model_name}", logger):
+            self.embeddings_model = SentenceTransformer(self.model_name, **self.model_kwargs)
 
     def inference_server_enabled(self) -> bool:
         return self.api_key is not None and self.inference_endpoint is not None
@@ -101,7 +102,8 @@ class CrossEncoderModel:
         self.inference_endpoint = cross_encoder_inference_endpoint
         self.api_key = cross_encoder_inference_endpoint_api_key
         self.model_kwargs = merge_dicts(model_kwargs, {"device": get_device()})
-        self.cross_encoder_model = CrossEncoder(model_name=self.model_name, **self.model_kwargs)
+        with timer(f"Loaded cross-encoder model {self.model_name}", logger):
+            self.cross_encoder_model = CrossEncoder(model_name=self.model_name, **self.model_kwargs)
 
     def inference_server_enabled(self) -> bool:
         return self.api_key is not None and self.inference_endpoint is not None
@@ -112,6 +114,7 @@ class CrossEncoderModel:
             payload = {"inputs": {"query": query, "passages": [hit.additional[key] for hit in hits]}}
             headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
             response = requests.post(target_url, json=payload, headers=headers)
+            response.raise_for_status()
             return response.json()["scores"]
 
         cross_inp = [[query, hit.additional[key]] for hit in hits]
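The timer wrapped around model loading above comes from khoj.utils.helpers and is also called with a log_level keyword elsewhere in this diff. A rough sketch of such a context manager, as a hypothetical re-implementation rather than the packaged helper:

import logging
import time
from contextlib import contextmanager

@contextmanager
def timer(message: str, logger: logging.Logger, log_level: int = logging.DEBUG):
    # Log how long the wrapped block took, e.g. "Loaded embedding model ...: 1.234s".
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.log(log_level, f"{message}: {time.perf_counter() - start:.3f}s")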
khoj/processor/image/generate.py CHANGED
@@ -25,7 +25,6 @@ async def text_to_image(
     location_data: LocationData,
     references: List[Dict[str, Any]],
     online_results: Dict[str, Any],
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: Optional[str] = None,
     agent: Agent = None,
@@ -66,8 +65,8 @@ async def text_to_image(
         note_references=references,
         online_results=online_results,
         model_type=text_to_image_config.model_type,
-        subscribed=subscribed,
         uploaded_image_url=uploaded_image_url,
+        user=user,
         agent=agent,
     )
 
khoj/processor/tools/online_search.py CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.models import Agent, KhojUser
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -53,7 +60,6 @@ async def search_online(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     custom_filters: List[str] = [],
     uploaded_image_url: str = None,
@@ -84,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-    # Gather distinct web page data from organic results of each subquery without an instant answer.
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-        (organic.get("link"), subquery, organic.get("content"))
-        for subquery in response_dict
-        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
-        if "answerBox" not in response_dict[subquery]
-    }
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
-        webpage_links = set([link for link, _, _ in webpages])
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(subquery, link, content, subscribed=subscribed, agent=agent)
-            for link, subquery, content in webpages
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
 
         # Collect extracted info from the retrieved web pages
-        for subquery, webpage_extract, url in results:
+        for subqueries, url, webpage_extract in results:
            if webpage_extract is not None:
-                response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
+                response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     yield response_dict
 
@@ -141,7 +150,6 @@ async def read_webpages(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: str = None,
     agent: Agent = None,
@@ -158,29 +166,66 @@ async def read_webpages(
         webpage_links_str = "\n- " + "\n- ".join(list(urls))
         async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
             yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url, subscribed=subscribed, agent=agent) for url in urls]
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)
 
     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-    subquery: str, url: str, content: str = None, subscribed: bool = False, agent: Agent = None
-) -> Tuple[str, Union[None, str], str]:
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page at '{url}' took", logger):
-                content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
-        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-            extracted_info = await extract_relevant_info(subquery, content, subscribed=subscribed, agent=agent)
-        return subquery, extracted_info, url
-    except Exception as e:
-        logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None, url
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
@@ -197,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
             return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -222,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {", ".join(queries)}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
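A hedged usage sketch for the new Firecrawl reader defined above; the API key is a placeholder and the api_url assumes the Firecrawl cloud endpoint (a self-hosted instance would use its own URL), so nothing here is wired up by the package itself:

import asyncio

async def demo():
    # read_webpage_with_firecrawl is the coroutine added in this diff;
    # the api_key and api_url values are illustrative placeholders.
    markdown = await read_webpage_with_firecrawl(
        web_url="https://example.com/article",
        api_key="fc-...",
        api_url="https://api.firecrawl.dev",
    )
    print(markdown[:200])

asyncio.run(demo())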
khoj/routers/api.py CHANGED
@@ -395,7 +395,7 @@ async def extract_references_and_questions(
         # Infer search queries from user message
         with timer("Extracting search queries took", logger):
             # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
-            conversation_config = await ConversationAdapters.aget_default_conversation_config()
+            conversation_config = await ConversationAdapters.aget_default_conversation_config(user)
             vision_enabled = conversation_config.vision_enabled
 
             if conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE:
khoj/routers/api_agents.py CHANGED
@@ -35,6 +35,7 @@ class ModifyAgentBody(BaseModel):
     files: Optional[List[str]] = []
     input_tools: Optional[List[str]] = []
     output_modes: Optional[List[str]] = []
+    slug: Optional[str] = None
 
 
 @api_agents.get("", response_class=Response)
@@ -161,7 +162,7 @@ async def delete_agent(
 
 
 @api_agents.post("", response_class=Response)
-@requires(["authenticated"])
+@requires(["authenticated", "premium"])
 async def create_agent(
     request: Request,
     common: CommonQueryParams,
@@ -192,6 +193,7 @@ async def create_agent(
         body.files,
         body.input_tools,
         body.output_modes,
+        body.slug,
     )
 
     agents_packet = {
@@ -213,7 +215,7 @@ async def create_agent(
 
 
 @api_agents.patch("", response_class=Response)
-@requires(["authenticated"])
+@requires(["authenticated", "premium"])
 async def update_agent(
     request: Request,
     common: CommonQueryParams,
@@ -233,7 +235,7 @@ async def update_agent(
             status_code=400,
         )
 
-    selected_agent = await AgentAdapters.aget_agent_by_name(body.name, user)
+    selected_agent = await AgentAdapters.aget_agent_by_slug(body.slug, user)
 
     if not selected_agent:
         return Response(
@@ -253,6 +255,7 @@ async def update_agent(
         body.files,
         body.input_tools,
         body.output_modes,
+        body.slug,
     )
 
     agents_packet = {
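The agent create and update routes now also require the premium scope. With Starlette's requires decorator, scopes are granted by the authentication backend; a minimal sketch, assuming a hypothetical backend and subscription flag (khoj's actual auth wiring differs), of how that scope could be attached:

from starlette.authentication import AuthCredentials, AuthenticationBackend, SimpleUser

class SketchAuthBackend(AuthenticationBackend):
    async def authenticate(self, conn):
        user = await resolve_user(conn)  # hypothetical helper that loads the request's user
        if user is None:
            return None
        scopes = ["authenticated"]
        if user.is_subscribed:  # hypothetical subscription flag on the user record
            scopes.append("premium")
        # Requests lacking the "premium" scope are rejected by @requires(["authenticated", "premium"]).
        return AuthCredentials(scopes), SimpleUser(user.username)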