khoj 1.25.1.dev14__py3-none-any.whl → 1.25.1.dev33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +72 -2
- khoj/database/admin.py +16 -0
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/models/__init__.py +78 -2
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{webpack-3875a06385370d08.js → webpack-61a553b6ff44f97c.js} +1 -1
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/factchecker/index.html +1 -1
- khoj/interface/compiled/factchecker/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/assets/icons/agents.svg +1 -0
- khoj/interface/web/assets/icons/automation.svg +1 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/login.html +11 -22
- khoj/processor/embeddings.py +1 -0
- khoj/processor/tools/online_search.py +135 -40
- khoj/routers/api_chat.py +38 -27
- khoj/routers/helpers.py +12 -11
- khoj/search_type/text_search.py +7 -2
- khoj/utils/helpers.py +48 -5
- {khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/METADATA +4 -4
- {khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/RECORD +41 -38
- khoj/interface/compiled/_next/static/chunks/1603-67a89278e2c5dbe6.js +0 -1
- /khoj/interface/compiled/_next/static/{y6sFTl0gpqdS79jlpmIvx → NaOqImuryddEYz7a7MQ6j}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{y6sFTl0gpqdS79jlpmIvx → NaOqImuryddEYz7a7MQ6j}/_ssgManifest.js +0 -0
- {khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/WHEEL +0 -0
- {khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/entry_points.txt +0 -0
- {khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/licenses/LICENSE +0 -0
khoj/interface/web/login.html
CHANGED
@@ -46,33 +46,16 @@
         <p>Transform the way you think, create, and remember</p>
         <div class="features">
             <div class="feature">
-                <svg
-                    fill="none">
-                    <path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z" />
-                    <path d="M14 2v6h6" />
-                    <path d="M16 13H8" />
-                    <path d="M16 17H8" />
-                    <path d="M10 9H8" />
-                </svg>
+                <img src="/static/assets/icons/chat.svg" alt="Chat" width="24" height="24">
                 <span>Get answers across your documents and the internet</span>
             </div>
             <div class="feature">
-                <svg
-
-                    <path
-                        d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z" />
-                    <path d="M3.3 7l8.7 5 8.7-5" />
-                </svg>
-                <span>Go deeper in the topics personal to you</span>
+                <img src="/static/assets/icons/agents.svg" alt="Agents" width="24" height="24">
+                <span>Create agents with the knowledge and tools to take on any role</span>
             </div>
             <div class="feature">
-                <svg
-
-                    <path d="M12 2L2 7l10 5 10-5-10-5z" />
-                    <path d="M2 17l10 5 10-5" />
-                    <path d="M2 12l10 5 10-5" />
-                </svg>
-                <span>Use specialized agents</span>
+                <img src="/static/assets/icons/automation.svg" alt="Automations" width="24" height="24">
+                <span>Automate away repetitive research</span>
             </div>
         </div>
     </div>
@@ -160,6 +143,12 @@
             height: 24px;
             stroke: white;
         }
+        .feature img {
+            width: 24px;
+            height: 24px;
+            filter: invert(100%) sepia(0%) saturate(0%) hue-rotate(0deg) brightness(100%) contrast(100%);
+            stroke: white;
+        }

         #login-modal {
             display: grid;
khoj/processor/embeddings.py
CHANGED
@@ -114,6 +114,7 @@ class CrossEncoderModel:
             payload = {"inputs": {"query": query, "passages": [hit.additional[key] for hit in hits]}}
             headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
             response = requests.post(target_url, json=payload, headers=headers)
+            response.raise_for_status()
             return response.json()["scores"]

         cross_inp = [[query, hit.additional[key]] for hit in hits]
khoj/processor/tools/online_search.py
CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify

-from khoj.database.
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData

 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"

-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")

-
-
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -83,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}

-    # Gather distinct web
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-
-
-
-
-
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}

     # Read, extract relevant info from the retrieved web pages
     if webpages:
-
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(
-            for link,
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)

         # Collect extracted info from the retrieved web pages
-        for
+        for subqueries, url, webpage_extract in results:
             if webpage_extract is not None:
-                response_dict[
+                response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}

     yield response_dict

@@ -156,29 +166,66 @@ async def read_webpages(
         webpage_links_str = "\n- " + "\n- ".join(list(urls))
         async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
             yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url, user=user, agent=agent) for url in urls]
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)

     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query":
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response


+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-
-) -> Tuple[str, Union[None, str]
-
-
-
-
-
-
-
-
-
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info


 async def read_webpage_at_url(web_url: str) -> str:
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
     return markdownify(body)


-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url

     async with aiohttp.ClientSession() as session:
-        async with session.get(
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]


-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if
-        headers["Authorization"] = f"Bearer {
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"

     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]


+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+    {prompts.system_prompt_extract_relevant_information}
+
+    {personality_context}
+    User Query: {", ".join(queries)}
+
+    Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+    """.strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
khoj/routers/api_chat.py
CHANGED
@@ -3,7 +3,6 @@ import base64
 import json
 import logging
 import time
-import warnings
 from datetime import datetime
 from functools import partial
 from typing import Dict, Optional
@@ -574,7 +573,6 @@ async def chat(
        chat_metadata: dict = {}
        connection_alive = True
        user: KhojUser = request.user.object
-        subscribed: bool = has_required_scope(request, ["premium"])
        event_delimiter = "␃🔚␗"
        q = unquote(q)
        nonlocal conversation_id
@@ -641,7 +639,7 @@ async def chat(
            request=request,
            telemetry_type="api",
            api="chat",
-            client=
+            client=common.client,
            user_agent=request.headers.get("user-agent"),
            host=request.headers.get("host"),
            metadata=chat_metadata,
@@ -840,25 +838,33 @@ async def chat(
        # Gather Context
        ## Extract Document References
        compiled_references, inferred_queries, defiltered_query = [], [], None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            async for result in extract_references_and_questions(
+                request,
+                meta_log,
+                q,
+                (n or 7),
+                d,
+                conversation_id,
+                conversation_commands,
+                location,
+                partial(send_event, ChatEvent.STATUS),
+                uploaded_image_url=uploaded_image_url,
+                agent=agent,
+            ):
+                if isinstance(result, dict) and ChatEvent.STATUS in result:
+                    yield result[ChatEvent.STATUS]
+                else:
+                    compiled_references.extend(result[0])
+                    inferred_queries.extend(result[1])
+                    defiltered_query = result[2]
+        except Exception as e:
+            error_message = f"Error searching knowledge base: {e}. Attempting to respond without document references."
+            logger.warning(error_message)
+            async for result in send_event(
+                ChatEvent.STATUS, "Document search failed. I'll try respond without document references"
+            ):
+                yield result

        if not is_none_or_empty(compiled_references):
            headings = "\n- " + "\n- ".join(set([c.get("compiled", c).split("\n")[0] for c in compiled_references]))
@@ -894,12 +900,13 @@ async def chat(
                        yield result[ChatEvent.STATUS]
                    else:
                        online_results = result
-            except
+            except Exception as e:
                error_message = f"Error searching online: {e}. Attempting to respond without online results"
                logger.warning(error_message)
-                async for result in
+                async for result in send_event(
+                    ChatEvent.STATUS, "Online search failed. I'll try respond without online references"
+                ):
                    yield result
-                return

        ## Gather Webpage References
        if ConversationCommand.Webpage in conversation_commands:
@@ -928,11 +935,15 @@ async def chat(
                            webpages.append(webpage["link"])
                    async for result in send_event(ChatEvent.STATUS, f"**Read web pages**: {webpages}"):
                        yield result
-            except
+            except Exception as e:
                logger.warning(
-                    f"Error
+                    f"Error reading webpages: {e}. Attempting to respond without webpage results",
                    exc_info=True,
                )
+                async for result in send_event(
+                    ChatEvent.STATUS, "Webpage read failed. I'll try respond without webpage references"
+                ):
+                    yield result

        ## Send Gathered References
        async for result in send_event(
khoj/routers/helpers.py
CHANGED
@@ -353,13 +353,13 @@ async def aget_relevant_information_sources(
            final_response = [ConversationCommand.Default]
        else:
            final_response = [ConversationCommand.General]
-
-    except Exception as e:
+    except Exception:
        logger.error(f"Invalid response for determining relevant tools: {response}")
        if len(agent_tools) == 0:
            final_response = [ConversationCommand.Default]
        else:
            final_response = agent_tools
+    return final_response


 async def aget_relevant_output_modes(
@@ -551,12 +551,14 @@ async def schedule_query(
        raise AssertionError(f"Invalid response for scheduling query: {raw_response}")


-async def extract_relevant_info(
+async def extract_relevant_info(
+    qs: set[str], corpus: str, user: KhojUser = None, agent: Agent = None
+) -> Union[str, None]:
    """
    Extract relevant information for a given query from the target corpus
    """

-    if is_none_or_empty(corpus) or is_none_or_empty(
+    if is_none_or_empty(corpus) or is_none_or_empty(qs):
        return None

    personality_context = (
@@ -564,17 +566,16 @@ async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agen
    )

    extract_relevant_information = prompts.extract_relevant_information.format(
-        query=
+        query=", ".join(qs),
        corpus=corpus.strip(),
        personality_context=personality_context,
    )

-
-
-
-
-
-    )
+    response = await send_message_to_model_wrapper(
+        extract_relevant_information,
+        prompts.system_prompt_extract_relevant_information,
+        user=user,
+    )
    return response.strip()
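Note: extract_relevant_info() now takes a set of queries rather than a single query string and joins them into one extraction prompt. A sketch of the new call shape, assuming a configured chat model is available (the query and corpus below are illustrative only):

import asyncio

from khoj.routers.helpers import extract_relevant_info

async def demo():
    corpus = "Khoj 1.25.1 adds configurable web scrapers with Firecrawl, Olostep and Jina support."
    # qs is a set[str]; the helper joins the queries with ", " into prompts.extract_relevant_information.
    return await extract_relevant_info({"which web scrapers does khoj support?"}, corpus)

print(asyncio.run(demo()))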
khoj/search_type/text_search.py
CHANGED
@@ -3,6 +3,7 @@ import math
 from pathlib import Path
 from typing import List, Optional, Tuple, Type, Union

+import requests
 import torch
 from asgiref.sync import sync_to_async
 from sentence_transformers import util
@@ -231,8 +232,12 @@ def setup(

 def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]:
     """Score all retrieved entries using the cross-encoder"""
-
-
+    try:
+        with timer("Cross-Encoder Predict Time", logger, state.device):
+            cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits)
+    except requests.exceptions.HTTPError as e:
+        logger.error(f"Failed to rerank documents using the inference endpoint. Error: {e}.", exc_info=True)
+        cross_scores = [0.0] * len(hits)

     # Convert cross-encoder scores to distances and pass in hits for reranking
     for idx in range(len(cross_scores)):
khoj/utils/helpers.py
CHANGED
@@ -2,10 +2,12 @@ from __future__ import annotations  # to avoid quoting type hints

 import datetime
 import io
+import ipaddress
 import logging
 import os
 import platform
 import random
+import urllib.parse
 import uuid
 from collections import OrderedDict
 from enum import Enum
@@ -164,9 +166,9 @@ def get_class_by_name(name: str) -> object:
 class timer:
     """Context manager to log time taken for a block of code to run"""

-    def __init__(self, message: str, logger: logging.Logger, device: torch.device = None):
+    def __init__(self, message: str, logger: logging.Logger, device: torch.device = None, log_level=logging.DEBUG):
         self.message = message
-        self.logger = logger
+        self.logger = logger.debug if log_level == logging.DEBUG else logger.info
         self.device = device

     def __enter__(self):
@@ -176,9 +178,9 @@ class timer:
     def __exit__(self, *_):
         elapsed = perf_counter() - self.start
         if self.device is None:
-            self.logger
+            self.logger(f"{self.message}: {elapsed:.3f} seconds")
         else:
-            self.logger
+            self.logger(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}")


 class LRU(OrderedDict):
@@ -347,12 +349,13 @@ tool_descriptions_for_llm = {

 mode_descriptions_for_llm = {
     ConversationCommand.Image: "Use this if the user is requesting you to generate a picture based on their description.",
-    ConversationCommand.Automation: "Use this if the user is requesting a response at a scheduled date
+    ConversationCommand.Automation: "Use this if you are confident the user is requesting a response at a scheduled date, time and frequency",
     ConversationCommand.Text: "Use this if the other response modes don't seem to fit the query.",
 }

 mode_descriptions_for_agent = {
     ConversationCommand.Image: "Agent can generate image in response.",
+    ConversationCommand.Automation: "Agent can schedule a task to run at a scheduled date, time and frequency in response.",
     ConversationCommand.Text: "Agent can generate text in response.",
 }

@@ -435,6 +438,46 @@ def is_internet_connected():
         return False


+def is_internal_url(url: str) -> bool:
+    """
+    Check if a URL is likely to be internal/non-public.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is likely internal, False otherwise.
+    """
+    try:
+        parsed_url = urllib.parse.urlparse(url)
+        hostname = parsed_url.hostname
+
+        # Check for localhost
+        if hostname in ["localhost", "127.0.0.1", "::1"]:
+            return True
+
+        # Check for IP addresses in private ranges
+        try:
+            ip = ipaddress.ip_address(hostname)
+            return ip.is_private
+        except ValueError:
+            pass  # Not an IP address, continue with other checks
+
+        # Check for common internal TLDs
+        internal_tlds = [".local", ".internal", ".private", ".corp", ".home", ".lan"]
+        if any(hostname.endswith(tld) for tld in internal_tlds):
+            return True
+
+        # Check for URLs without a TLD
+        if "." not in hostname:
+            return True
+
+        return False
+    except Exception:
+        # If we can't parse the URL or something else goes wrong, assume it's not internal
+        return False
+
+
 def convert_image_to_webp(image_bytes):
     """Convert image bytes to webp format for faster loading"""
     image_io = io.BytesIO(image_bytes)
{khoj-1.25.1.dev14.dist-info → khoj-1.25.1.dev33.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: khoj
-Version: 1.25.1.dev14
+Version: 1.25.1.dev33
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
@@ -32,7 +32,7 @@ Requires-Dist: dateparser>=1.1.1
 Requires-Dist: defusedxml==0.7.1
 Requires-Dist: django-apscheduler==0.6.2
 Requires-Dist: django-phonenumber-field==7.3.0
-Requires-Dist: django==5.0.
+Requires-Dist: django==5.0.9
 Requires-Dist: docx2txt==0.8
 Requires-Dist: einops==0.8.0
 Requires-Dist: fastapi>=0.110.0
@@ -138,8 +138,8 @@ Description-Content-Type: text/markdown
 - Chat with any local or online LLM (e.g llama3, qwen, gemma, mistral, gpt, claude, gemini).
 - Get answers from the internet and your docs (including image, pdf, markdown, org-mode, word, notion files).
 - Access it from your Browser, Obsidian, Emacs, Desktop, Phone or Whatsapp.
-
-
+- Create agents with custom knowledge, persona, chat model and tools to take on any role.
+- Automate away repetitive research. Get personal newsletters and smart notifications delivered to your inbox.
 - Find relevant docs quickly and easily using our advanced semantic search.
 - Generate images, talk out loud, play your messages.
 - Khoj is open-source, self-hostable. Always.