khoj 1.24.2.dev16__py3-none-any.whl → 1.25.1.dev34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/configure.py +13 -4
- khoj/database/adapters/__init__.py +163 -49
- khoj/database/admin.py +18 -1
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/models/__init__.py +78 -2
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/7762-79f2205740622b5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-fa282831808ee536.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-1688dead2f21270d.js → page-5480731341f34450.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-91abcb71846922b7.js → page-702057ccbcf27881.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/{page-7ab093711c27041c.js → page-e7b34316ec6f44de.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-fada198096eab47f.js → page-10a5aad6e04f3cf8.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-a7e036689b6507ff.js → page-d56541c746fded7d.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-fa11cafaec7ab39f.js → page-e044a999468a7c5d.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-c5d2b9076e5390b2.js → page-fbbd66a4d4633438.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-f52083d548d804fa.js → webpack-c0cd5a6afb1f0798.js} +1 -1
- khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +1 -0
- khoj/interface/compiled/_next/static/css/3e1f1fdd70775091.css +1 -0
- khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
- khoj/interface/compiled/_next/static/css/b9a6bf04305d98d7.css +25 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/factchecker/index.html +1 -1
- khoj/interface/compiled/factchecker/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +3 -3
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/assets/icons/agents.svg +1 -0
- khoj/interface/web/assets/icons/automation.svg +1 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/login.html +11 -22
- khoj/processor/conversation/google/gemini_chat.py +4 -19
- khoj/processor/conversation/google/utils.py +33 -15
- khoj/processor/conversation/prompts.py +14 -3
- khoj/processor/conversation/utils.py +3 -7
- khoj/processor/embeddings.py +6 -3
- khoj/processor/image/generate.py +1 -2
- khoj/processor/tools/online_search.py +135 -42
- khoj/routers/api.py +1 -1
- khoj/routers/api_agents.py +6 -3
- khoj/routers/api_chat.py +63 -520
- khoj/routers/api_model.py +1 -1
- khoj/routers/auth.py +9 -1
- khoj/routers/helpers.py +74 -61
- khoj/routers/subscription.py +18 -4
- khoj/search_type/text_search.py +7 -2
- khoj/utils/helpers.py +56 -13
- khoj/utils/initialization.py +0 -3
- {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/METADATA +19 -14
- {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/RECORD +71 -68
- khoj/interface/compiled/_next/static/chunks/1269-2e52d48e7d0e5c61.js +0 -1
- khoj/interface/compiled/_next/static/chunks/1603-67a89278e2c5dbe6.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-df26b497b7356151.js +0 -1
- khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
- khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
- khoj/interface/compiled/_next/static/css/50d972a8c787730b.css +0 -25
- khoj/interface/compiled/_next/static/css/dfb67a9287720a2b.css +0 -1
- /khoj/interface/compiled/_next/static/{MyYNlmGMz32TGV_-febR4 → Jid9q6Qg851ioDaaO_fth}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{MyYNlmGMz32TGV_-febR4 → Jid9q6Qg851ioDaaO_fth}/_ssgManifest.js +0 -0
- {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/WHEEL +0 -0
- {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/entry_points.txt +0 -0
- {khoj-1.24.2.dev16.dist-info → khoj-1.25.1.dev34.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/prompts.py CHANGED
@@ -45,6 +45,13 @@ Instructions:\n{bio}
 """.strip()
 )
 
+# To make Gemini be more verbose and match language of user's query.
+# Prompt forked from https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
+gemini_verbose_language_personality = """
+All questions should be answered comprehensively with details, unless the user requests a concise response specifically.
+Respond in the same language as the query. Use markdown to format your responses.
+""".strip()
+
 ## General Conversation
 ## --
 general_conversation = PromptTemplate.from_template(
@@ -404,6 +411,10 @@ Tell the user exactly what the document says in response to their query, while a
 extract_relevant_summary = PromptTemplate.from_template(
 """
 {personality_context}
+
+Conversation History:
+{chat_history}
+
 Target Query: {query}
 
 Document Contents:
@@ -415,10 +426,10 @@ Collate only relevant information from the document to answer the target query.
 
 personality_context = PromptTemplate.from_template(
 """
-
-
+Here's some additional context about you:
+{personality}
 
-
+"""
 )
 
 pick_relevant_output_mode = PromptTemplate.from_template(
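The prompt constants above are LangChain `PromptTemplate` objects that downstream code renders with `.format(...)` (for example, `prompts.personality_context.format(personality=...)` in the Firecrawl extractor further down). A minimal sketch of that usage, assuming a standard LangChain import path and an invented personality string:

```python
# Minimal sketch of how these prompt constants are consumed; the import path is an
# assumption and the personality text is made up for illustration.
from langchain.prompts import PromptTemplate

personality_context = PromptTemplate.from_template(
    """
Here's some additional context about you:
{personality}

"""
)

# Render the template into a plain string before appending it to a system prompt.
print(personality_context.format(personality="You are a concise research assistant."))
```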
khoj/processor/conversation/utils.py CHANGED
@@ -223,7 +223,7 @@ def truncate_messages(
 ) -> list[ChatMessage]:
 """Truncate messages to fit within max prompt size supported by model"""
 
-default_tokenizer = "
+default_tokenizer = "gpt-4o"
 
 try:
 if loaded_model:
@@ -240,13 +240,9 @@ def truncate_messages(
 else:
 encoder = download_model(model_name).tokenizer()
 except:
-
-encoder = state.pretrained_tokenizers[default_tokenizer]
-else:
-encoder = AutoTokenizer.from_pretrained(default_tokenizer)
-state.pretrained_tokenizers[default_tokenizer] = encoder
+encoder = tiktoken.encoding_for_model(default_tokenizer)
 logger.debug(
-f"Fallback to default chat model tokenizer: {
+f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for model: {model_name} in Khoj settings to improve context stuffing."
 )
 
 # Extract system message from messages
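The tokenizer fallback in `truncate_messages` now uses tiktoken's `gpt-4o` encoding instead of loading a Hugging Face `AutoTokenizer`. A standalone sketch of that fallback path (the message strings are illustrative, and a tiktoken version that knows the `gpt-4o` model is assumed):

```python
# Standalone sketch of the new fallback: estimate prompt size with tiktoken's
# gpt-4o encoding when no model-specific tokenizer could be loaded.
import tiktoken

default_tokenizer = "gpt-4o"
encoder = tiktoken.encoding_for_model(default_tokenizer)

messages = ["You are Khoj, a personal AI.", "Summarize my notes on web scrapers."]
prompt_size = sum(len(encoder.encode(message)) for message in messages)
print(f"Prompt uses ~{prompt_size} tokens")
```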
khoj/processor/embeddings.py CHANGED
@@ -13,7 +13,7 @@ from tenacity import (
 )
 from torch import nn
 
-from khoj.utils.helpers import get_device, merge_dicts
+from khoj.utils.helpers import get_device, merge_dicts, timer
 from khoj.utils.rawconfig import SearchResponse
 
 logger = logging.getLogger(__name__)
@@ -37,7 +37,8 @@ class EmbeddingsModel:
 self.model_name = model_name
 self.inference_endpoint = embeddings_inference_endpoint
 self.api_key = embeddings_inference_endpoint_api_key
-
+with timer(f"Loaded embedding model {self.model_name}", logger):
+self.embeddings_model = SentenceTransformer(self.model_name, **self.model_kwargs)
 
 def inference_server_enabled(self) -> bool:
 return self.api_key is not None and self.inference_endpoint is not None
@@ -101,7 +102,8 @@ class CrossEncoderModel:
 self.inference_endpoint = cross_encoder_inference_endpoint
 self.api_key = cross_encoder_inference_endpoint_api_key
 self.model_kwargs = merge_dicts(model_kwargs, {"device": get_device()})
-
+with timer(f"Loaded cross-encoder model {self.model_name}", logger):
+self.cross_encoder_model = CrossEncoder(model_name=self.model_name, **self.model_kwargs)
 
 def inference_server_enabled(self) -> bool:
 return self.api_key is not None and self.inference_endpoint is not None
@@ -112,6 +114,7 @@ class CrossEncoderModel:
 payload = {"inputs": {"query": query, "passages": [hit.additional[key] for hit in hits]}}
 headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
 response = requests.post(target_url, json=payload, headers=headers)
+response.raise_for_status()
 return response.json()["scores"]
 
 cross_inp = [[query, hit.additional[key]] for hit in hits]
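Model loading in `EmbeddingsModel` and `CrossEncoderModel` is now wrapped in khoj's `timer` context manager so load times show up in the logs. The real helper lives in `khoj.utils.helpers`; a rough, illustrative equivalent of what the calls above rely on might look like this:

```python
# Rough stand-in for khoj.utils.helpers.timer, shown only to illustrate the calls
# above; khoj's actual implementation may differ.
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def timer(message: str, log: logging.Logger, log_level: int = logging.DEBUG):
    start = time.perf_counter()
    try:
        yield
    finally:
        log.log(log_level, f"{message}: {time.perf_counter() - start:.3f} seconds")

# Usage mirroring the diff: time a slow setup step and log its duration.
with timer("Loaded embedding model", logger, log_level=logging.INFO):
    time.sleep(0.1)  # placeholder for SentenceTransformer(...) loading
```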
khoj/processor/image/generate.py CHANGED
@@ -25,7 +25,6 @@ async def text_to_image(
 location_data: LocationData,
 references: List[Dict[str, Any]],
 online_results: Dict[str, Any],
-subscribed: bool = False,
 send_status_func: Optional[Callable] = None,
 uploaded_image_url: Optional[str] = None,
 agent: Agent = None,
@@ -66,8 +65,8 @@
 note_references=references,
 online_results=online_results,
 model_type=text_to_image_config.model_type,
-subscribed=subscribed,
 uploaded_image_url=uploaded_image_url,
+user=user,
 agent=agent,
 )
 
khoj/processor/tools/online_search.py CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
 ChatEvent,
 extract_relevant_info,
 generate_online_subqueries,
 infer_webpage_urls,
 )
-from khoj.utils.helpers import
+from khoj.utils.helpers import (
+is_env_var_true,
+is_internal_url,
+is_internet_connected,
+is_none_or_empty,
+timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-
-
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
 "timeout": 35, # seconds
 "waitBeforeScraping": 1, # seconds
@@ -53,7 +60,6 @@ async def search_online(
 conversation_history: dict,
 location: LocationData,
 user: KhojUser,
-subscribed: bool = False,
 send_status_func: Optional[Callable] = None,
 custom_filters: List[str] = [],
 uploaded_image_url: str = None,
@@ -84,33 +90,36 @@
 search_results = await asyncio.gather(*search_tasks)
 response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-# Gather distinct web
+# Gather distinct web pages from organic results for subqueries without an instant answer.
 # Content of web pages is directly available when Jina is used for search.
-webpages = {
-
-
-
-
-
+webpages: Dict[str, Dict] = {}
+for subquery in response_dict:
+if "answerBox" in response_dict[subquery]:
+continue
+for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+link = organic.get("link")
+if link in webpages:
+webpages[link]["queries"].add(subquery)
+else:
+webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
 # Read, extract relevant info from the retrieved web pages
 if webpages:
-
-logger.info(f"Reading web pages at: {list(webpage_links)}")
+logger.info(f"Reading web pages at: {webpages.keys()}")
 if send_status_func:
-webpage_links_str = "\n- " + "\n- ".join(
+webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
 async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
 yield {ChatEvent.STATUS: event}
 tasks = [
-read_webpage_and_extract_content(
-for link,
+read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+for link, data in webpages.items()
 ]
 results = await asyncio.gather(*tasks)
 
 # Collect extracted info from the retrieved web pages
-for
+for subqueries, url, webpage_extract in results:
 if webpage_extract is not None:
-response_dict[
+response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
 yield response_dict
 
@@ -141,7 +150,6 @@ async def read_webpages(
 conversation_history: dict,
 location: LocationData,
 user: KhojUser,
-subscribed: bool = False,
 send_status_func: Optional[Callable] = None,
 uploaded_image_url: str = None,
 agent: Agent = None,
@@ -158,29 +166,66 @@ async def read_webpages(
 webpage_links_str = "\n- " + "\n- ".join(list(urls))
 async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
 yield {ChatEvent.STATUS: event}
-tasks = [read_webpage_and_extract_content(query, url,
+tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
 results = await asyncio.gather(*tasks)
 
 response: Dict[str, Dict] = defaultdict(dict)
 response[query]["webpages"] = [
-{"query":
+{"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
 ]
 yield response
 
 
+async def read_webpage(
+url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+return await read_webpage_with_firecrawl(url, api_key, api_url), None
+elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+return await read_webpage_with_olostep(url, api_key, api_url), None
+elif scraper_type == WebScraper.WebScraperType.JINA:
+return await read_webpage_with_jina(url, api_key, api_url), None
+else:
+return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-
-) -> Tuple[str, Union[None, str]
-
-
-
-
-
-
-
-
-
-
+subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+# Select the web scrapers to use for reading the web page
+web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+# Only use the direct web scraper for internal URLs
+if is_internal_url(url):
+web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+# Fallback through enabled web scrapers until we successfully read the web page
+extracted_info = None
+for scraper in web_scrapers:
+try:
+# Read the web page
+if is_none_or_empty(content):
+with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+content, extracted_info = await read_webpage(
+url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+)
+
+# Extract relevant information from the web page
+if is_none_or_empty(extracted_info):
+with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+# If we successfully extracted information, break the loop
+if not is_none_or_empty(extracted_info):
+break
+except Exception as e:
+logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+# If this is the last web scraper in the list, log an error
+if scraper.name == web_scrapers[-1].name:
+logger.error(f"All web scrapers failed for '{url}'")
+
+return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
@@ -197,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
 return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-headers = {"Authorization": f"Bearer {
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+headers = {"Authorization": f"Bearer {api_key}"}
 web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
 web_scraping_params["url"] = web_url
 
 async with aiohttp.ClientSession() as session:
-async with session.get(
+async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
 response.raise_for_status()
 response_json = await response.json()
 return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-jina_reader_api_url = f"{
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+jina_reader_api_url = f"{api_url}/{web_url}"
 headers = {"Accept": "application/json", "X-Timeout": "30"}
-if
-headers["Authorization"] = f"Bearer {
+if api_key:
+headers["Authorization"] = f"Bearer {api_key}"
 
 async with aiohttp.ClientSession() as session:
 async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -222,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
 return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+firecrawl_api_url = f"{api_url}/v1/scrape"
+headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+async with aiohttp.ClientSession() as session:
+async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+response.raise_for_status()
+response_json = await response.json()
+return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+firecrawl_api_url = f"{api_url}/v1/scrape"
+headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+schema = {
+"type": "object",
+"properties": {
+"relevant_extract": {"type": "string"},
+},
+"required": [
+"relevant_extract",
+],
+}
+
+personality_context = (
+prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+)
+system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {", ".join(queries)}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+async with aiohttp.ClientSession() as session:
+async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+response.raise_for_status()
+response_json = await response.json()
+return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
 encoded_query = urllib.parse.quote(query)
 jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
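The new `read_webpage_and_extract_content` walks the list of enabled scrapers (direct fetch, Firecrawl, Olostep, Jina) and falls back to the next one when a read or extraction fails. A simplified, self-contained sketch of that fallback pattern (the scraper names and fetchers below are placeholders, not khoj's WebScraper model or reader functions):

```python
# Simplified sketch of the fallback loop added above; scrapers and fetchers here
# are placeholders invented for illustration.
import asyncio
from typing import Awaitable, Callable, Optional

Fetcher = Callable[[str], Awaitable[str]]

async def read_with_fallback(url: str, scrapers: list[tuple[str, Fetcher]]) -> Optional[str]:
    for name, fetch in scrapers:
        try:
            content = await fetch(url)
            if content:  # first scraper that returns usable content wins
                return content
        except Exception as exc:
            print(f"{name} failed for {url}: {exc}")
    print(f"All scrapers failed for {url}")
    return None

async def main() -> None:
    async def flaky(url: str) -> str:
        raise RuntimeError("blocked")

    async def direct(url: str) -> str:
        return f"markdown for {url}"

    print(await read_with_fallback("https://example.com", [("firecrawl", flaky), ("direct", direct)]))

asyncio.run(main())
```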
khoj/routers/api.py CHANGED
@@ -395,7 +395,7 @@ async def extract_references_and_questions(
 # Infer search queries from user message
 with timer("Extracting search queries took", logger):
 # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
-conversation_config = await ConversationAdapters.aget_default_conversation_config()
+conversation_config = await ConversationAdapters.aget_default_conversation_config(user)
 vision_enabled = conversation_config.vision_enabled
 
 if conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE:
khoj/routers/api_agents.py CHANGED
@@ -35,6 +35,7 @@ class ModifyAgentBody(BaseModel):
 files: Optional[List[str]] = []
 input_tools: Optional[List[str]] = []
 output_modes: Optional[List[str]] = []
+slug: Optional[str] = None
 
 
 @api_agents.get("", response_class=Response)
@@ -161,7 +162,7 @@
 
 
 @api_agents.post("", response_class=Response)
-@requires(["authenticated"])
+@requires(["authenticated", "premium"])
 async def create_agent(
 request: Request,
 common: CommonQueryParams,
@@ -192,6 +193,7 @@ async def create_agent(
 body.files,
 body.input_tools,
 body.output_modes,
+body.slug,
 )
 
 agents_packet = {
@@ -213,7 +215,7 @@
 
 
 @api_agents.patch("", response_class=Response)
-@requires(["authenticated"])
+@requires(["authenticated", "premium"])
 async def update_agent(
 request: Request,
 common: CommonQueryParams,
@@ -233,7 +235,7 @@ async def update_agent(
 status_code=400,
 )
 
-selected_agent = await AgentAdapters.
+selected_agent = await AgentAdapters.aget_agent_by_slug(body.slug, user)
 
 if not selected_agent:
 return Response(
@@ -253,6 +255,7 @@ async def update_agent(
 body.files,
 body.input_tools,
 body.output_modes,
+body.slug,
 )
 
 agents_packet = {