khoj 1.25.1.dev12__py3-none-any.whl → 1.26.1.dev3__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- khoj/database/adapters/__init__.py +72 -2
- khoj/database/admin.py +16 -0
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/models/__init__.py +78 -2
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{9417-1d158bf46d3a0dc9.js → 9417-1ad504db22331388.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{9479-563e4d61f91d5a7c.js → 9479-adede27bb126b5d0.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-f8d03847a0fa2539.js → page-e9eee31dbdb4658c.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-5480731341f34450.js → page-2edc21f30819def4.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-702057ccbcf27881.js → page-4309c98e6dc497dd.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/{page-e7b34316ec6f44de.js → page-f2c83e3a87a28657.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-421d13f70c505dd9.js → page-ab9beb5a26e396f7.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-d56541c746fded7d.js → page-b807caebd7f278c7.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-e044a999468a7c5d.js → page-2932356ad11c2f7b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-fbbd66a4d4633438.js → page-a736a0826570af2b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-dff708c71e9234cb.js → webpack-ba79408024891b00.js} +1 -1
- khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
- khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +1 -0
- khoj/interface/compiled/_next/static/css/f768dddada62459d.css +1 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/factchecker/index.html +1 -1
- khoj/interface/compiled/factchecker/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +3 -3
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/assets/icons/agents.svg +1 -0
- khoj/interface/web/assets/icons/automation.svg +1 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/login.html +11 -22
- khoj/processor/content/images/image_to_entries.py +2 -0
- khoj/processor/conversation/google/utils.py +4 -0
- khoj/processor/conversation/prompts.py +1 -1
- khoj/processor/embeddings.py +1 -0
- khoj/processor/tools/online_search.py +135 -40
- khoj/routers/api_chat.py +41 -31
- khoj/routers/helpers.py +13 -11
- khoj/search_type/text_search.py +7 -2
- khoj/utils/helpers.py +50 -5
- {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/METADATA +4 -4
- {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/RECORD +62 -59
- khoj/interface/compiled/_next/static/chunks/1603-67a89278e2c5dbe6.js +0 -1
- khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
- khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +0 -1
- khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +0 -1
- /khoj/interface/compiled/_next/static/{CGyts-FEbV6owmPboHtLL → 0KX2AuxAEK1Jhb97imej7}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{CGyts-FEbV6owmPboHtLL → 0KX2AuxAEK1Jhb97imej7}/_ssgManifest.js +0 -0
- {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/WHEEL +0 -0
- {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/entry_points.txt +0 -0
- {khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py
CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.models import Agent, KhojUser
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
-OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
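The new FIRECRAWL_USE_LLM_EXTRACT flag gates Firecrawl's LLM extraction path in read_webpage further below. It is read through is_env_var_true, which this diff imports from khoj/utils/helpers.py but does not show; a minimal sketch of the behavior it presumably has (the exact set of accepted truthy values is an assumption):

import os

def is_env_var_true(name: str, default: str = "false") -> bool:
    # Presumed behavior: treat common truthy spellings as enabled.
    return os.getenv(name, default).strip().lower() in ("true", "1", "yes")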
@@ -83,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-    # Gather distinct web page data from organic results of each subquery without an instant answer.
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-        (organic.get("link"), subquery, organic.get("content"))
-        for subquery in response_dict
-        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
-        if "answerBox" not in response_dict[subquery]
-    }
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
-        webpage_links = {link for link, _, _ in webpages}
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(subquery, link, content, user=user, agent=agent)
-            for link, subquery, content in webpages
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
 
         # Collect extracted info from the retrieved web pages
-        for subquery, webpage_extract, url in results:
+        for subqueries, url, webpage_extract in results:
             if webpage_extract is not None:
-                response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
+                response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     yield response_dict
 
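The rewritten gathering loop above replaces a set of (link, subquery, content) tuples with a dict keyed by link, so a page surfaced by several subqueries is read and extracted only once, with all of its subqueries attached. Illustrative shape of the accumulator (URLs and subqueries are made up):

webpages = {
    "https://example.org/a": {"queries": {"subquery 1", "subquery 2"}, "content": None},
    "https://example.org/b": {"queries": {"subquery 2"}, "content": "page text from Jina search"},
}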
@@ -156,29 +166,66 @@ async def read_webpages(
     webpage_links_str = "\n- " + "\n- ".join(list(urls))
     async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
         yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url, user=user, agent=agent) for url in urls]
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)
 
     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query": q, "link": url, "snippet": extract} for q, extract, url in results if extract is not None
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-    subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None
-) -> Tuple[str, Union[None, str], str]:
-    try:
-        if is_none_or_empty(content):
-            with timer(f"Reading web page at '{url}' took", logger):
-                content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_at_url(url)
-        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
-            extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent)
-        return subquery, extracted_info, url
-    except Exception as e:
-        logger.error(f"Failed to read web page at '{url}' with {e}")
-        return subquery, None, url
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
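read_webpage_and_extract_content now walks the enabled scrapers in priority order and stops at the first one that yields usable content. A runnable toy of the same fallback pattern, with stand-in scraper callables rather than khoj's WebScraper models:

import asyncio
from typing import Awaitable, Callable, Optional

async def read_with_fallback(url: str, scrapers: list[Callable[[str], Awaitable[str]]]) -> Optional[str]:
    for scrape in scrapers:
        try:
            content = await scrape(url)
            if content:
                return content  # success: stop falling through
        except Exception:
            continue  # this scraper failed; try the next one
    return None  # every scraper failed

async def demo():
    async def blocked(url: str) -> str:
        raise RuntimeError("blocked by anti-bot page")

    async def direct(url: str) -> str:
        return f"<markdown of {url}>"

    print(await read_with_fallback("https://example.org", [blocked, direct]))

asyncio.run(demo())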
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
         return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if JINA_API_KEY:
-        headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {", ".join(queries)}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
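For reference, this is the JSON body query_webpage_with_firecrawl posts to <api_url>/v1/scrape, reassembled from the code above (the target URL and prompt text are illustrative):

params = {
    "url": "https://example.org/article",
    "formats": ["extract"],
    "extract": {
        "systemPrompt": "<extraction system prompt, personality context and user queries>",
        "schema": {
            "type": "object",
            "properties": {"relevant_extract": {"type": "string"}},
            "required": ["relevant_extract"],
        },
    },
}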
khoj/routers/api_chat.py
CHANGED
@@ -3,7 +3,6 @@ import base64
 import json
 import logging
 import time
-import warnings
 from datetime import datetime
 from functools import partial
 from typing import Dict, Optional
@@ -11,9 +10,8 @@ from urllib.parse import unquote
 
 from asgiref.sync import sync_to_async
 from fastapi import APIRouter, Depends, HTTPException, Request
-from fastapi.requests import Request
 from fastapi.responses import Response, StreamingResponse
-from starlette.authentication import has_required_scope, requires
+from starlette.authentication import requires
 
 from khoj.app.settings import ALLOWED_HOSTS
 from khoj.database.adapters import (
@@ -574,7 +572,6 @@ async def chat(
         chat_metadata: dict = {}
         connection_alive = True
         user: KhojUser = request.user.object
-        subscribed: bool = has_required_scope(request, ["premium"])
         event_delimiter = "␃🔚␗"
         q = unquote(q)
         nonlocal conversation_id
@@ -641,7 +638,7 @@
             request=request,
             telemetry_type="api",
             api="chat",
-            client=request.user.client_app,
+            client=common.client,
             user_agent=request.headers.get("user-agent"),
             host=request.headers.get("host"),
             metadata=chat_metadata,
@@ -839,26 +836,34 @@
 
         # Gather Context
         ## Extract Document References
-        compiled_references, inferred_queries, defiltered_query = [], [], None
-        async for result in extract_references_and_questions(
-            request,
-            meta_log,
-            q,
-            (n or 7),
-            d,
-            conversation_id,
-            conversation_commands,
-            location,
-            partial(send_event, ChatEvent.STATUS),
-            uploaded_image_url=uploaded_image_url,
-            agent=agent,
-        ):
-            if isinstance(result, dict) and ChatEvent.STATUS in result:
-                yield result[ChatEvent.STATUS]
-            else:
-                compiled_references.extend(result[0])
-                inferred_queries.extend(result[1])
-                defiltered_query = result[2]
+        compiled_references, inferred_queries, defiltered_query = [], [], q
+        try:
+            async for result in extract_references_and_questions(
+                request,
+                meta_log,
+                q,
+                (n or 7),
+                d,
+                conversation_id,
+                conversation_commands,
+                location,
+                partial(send_event, ChatEvent.STATUS),
+                uploaded_image_url=uploaded_image_url,
+                agent=agent,
+            ):
+                if isinstance(result, dict) and ChatEvent.STATUS in result:
+                    yield result[ChatEvent.STATUS]
+                else:
+                    compiled_references.extend(result[0])
+                    inferred_queries.extend(result[1])
+                    defiltered_query = result[2]
+        except Exception as e:
+            error_message = f"Error searching knowledge base: {e}. Attempting to respond without document references."
+            logger.warning(error_message)
+            async for result in send_event(
+                ChatEvent.STATUS, "Document search failed. I'll try respond without document references"
+            ):
+                yield result
 
         if not is_none_or_empty(compiled_references):
             headings = "\n- " + "\n- ".join(set([c.get("compiled", c).split("\n")[0] for c in compiled_references]))
@@ -894,12 +899,13 @@
                     yield result[ChatEvent.STATUS]
                 else:
                     online_results = result
-            except ValueError as e:
+            except Exception as e:
                 error_message = f"Error searching online: {e}. Attempting to respond without online results"
                 logger.warning(error_message)
-                async for result in send_event(ChatEvent.STATUS, error_message):
+                async for result in send_event(
+                    ChatEvent.STATUS, "Online search failed. I'll try respond without online references"
+                ):
                     yield result
-                return
 
         ## Gather Webpage References
         if ConversationCommand.Webpage in conversation_commands:
@@ -928,11 +934,15 @@
                         webpages.append(webpage["link"])
                     async for result in send_event(ChatEvent.STATUS, f"**Read web pages**: {webpages}"):
                         yield result
-            except ValueError as e:
+            except Exception as e:
                 logger.warning(
-                    f"Error directly reading webpages: {e}. Attempting to respond without online results",
+                    f"Error reading webpages: {e}. Attempting to respond without webpage results",
                     exc_info=True,
                 )
+                async for result in send_event(
+                    ChatEvent.STATUS, "Webpage read failed. I'll try respond without webpage references"
+                ):
+                    yield result
 
         ## Send Gathered References
         async for result in send_event(
@@ -949,7 +959,7 @@
         ## Generate Image Output
         if ConversationCommand.Image in conversation_commands:
             async for result in text_to_image(
-                q,
+                defiltered_query,
                 user,
                 meta_log,
                 location_data=location,
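Each context-gathering stage in the chat endpoint (documents, online search, webpage reads) now degrades gracefully: the exception is logged and surfaced as a status event instead of aborting the response, and the dropped early `return` means the pipeline keeps going. A runnable sketch of the shared pattern (the stage and send_status callables are stand-ins, not khoj APIs):

import asyncio
import logging

logger = logging.getLogger(__name__)

async def run_stage(name, stage, send_status):
    try:
        return await stage()
    except Exception as e:
        logger.warning(f"Error in {name}: {e}. Attempting to respond without it")
        await send_status(f"{name} failed. I'll try respond without it")
        return None

async def demo():
    async def failing_search():
        raise ValueError("search backend down")

    async def send_status(message):
        print("STATUS:", message)

    print("result:", await run_stage("Online search", failing_search, send_status))

asyncio.run(demo())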
khoj/routers/helpers.py
CHANGED
@@ -353,13 +353,13 @@ async def aget_relevant_information_sources(
             final_response = [ConversationCommand.Default]
         else:
             final_response = [ConversationCommand.General]
-
-    except Exception as e:
+    except Exception:
         logger.error(f"Invalid response for determining relevant tools: {response}")
         if len(agent_tools) == 0:
             final_response = [ConversationCommand.Default]
         else:
             final_response = agent_tools
+    return final_response
 
 
 async def aget_relevant_output_modes(
@@ -551,12 +551,14 @@ async def schedule_query(
         raise AssertionError(f"Invalid response for scheduling query: {raw_response}")
 
 
-async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agent: Agent = None) -> Union[str, None]:
+async def extract_relevant_info(
+    qs: set[str], corpus: str, user: KhojUser = None, agent: Agent = None
+) -> Union[str, None]:
     """
     Extract relevant information for a given query from the target corpus
     """
 
-    if is_none_or_empty(corpus) or is_none_or_empty(q):
+    if is_none_or_empty(corpus) or is_none_or_empty(qs):
         return None
 
     personality_context = (
@@ -564,17 +566,16 @@ async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agen
     )
 
     extract_relevant_information = prompts.extract_relevant_information.format(
-        query=q,
+        query=", ".join(qs),
         corpus=corpus.strip(),
         personality_context=personality_context,
    )
 
-    with timer("Chat actor: Extract relevant information from data", logger):
-        response = await send_message_to_model_wrapper(
-            extract_relevant_information,
-            prompts.system_prompt_extract_relevant_information,
-            user=user,
-        )
+    response = await send_message_to_model_wrapper(
+        extract_relevant_information,
+        prompts.system_prompt_extract_relevant_information,
+        user=user,
+    )
     return response.strip()
 
 
@@ -880,6 +881,7 @@ def send_message_to_model_wrapper_sync(
             messages=truncated_messages,
             api_key=api_key,
             model=chat_model,
+            response_type=response_type,
         )
     else:
         raise HTTPException(status_code=500, detail="Invalid conversation config")
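extract_relevant_info now takes a set of queries instead of a single string; call sites wrap single queries as {query} (see read_webpages above), and multi-query extractions flatten into one prompt field. A toy of the flattening (the queries are made up; note set iteration order is unspecified):

qs = {"khoj web scraper fallback", "khoj 1.26 changes"}
print(", ".join(qs))  # both queries land in one comma-separated prompt field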
khoj/search_type/text_search.py
CHANGED
@@ -3,6 +3,7 @@ import math
 from pathlib import Path
 from typing import List, Optional, Tuple, Type, Union
 
+import requests
 import torch
 from asgiref.sync import sync_to_async
 from sentence_transformers import util
@@ -231,8 +232,12 @@ def setup(
 
 def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]:
     """Score all retrieved entries using the cross-encoder"""
-    with timer("Cross-Encoder Predict Time", logger, state.device):
-        cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits)
+    try:
+        with timer("Cross-Encoder Predict Time", logger, state.device):
+            cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits)
+    except requests.exceptions.HTTPError as e:
+        logger.error(f"Failed to rerank documents using the inference endpoint. Error: {e}.", exc_info=True)
+        cross_scores = [0.0] * len(hits)
 
     # Convert cross-encoder scores to distances and pass in hits for reranking
     for idx in range(len(cross_scores)):
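When the cross-encoder inference endpoint errors out, every hit now gets a score of 0.0, so reranking becomes a no-op and the search still returns results in bi-encoder order rather than failing. A quick runnable check of that property (the hits are stand-ins for SearchResponse objects):

hits = ["doc-a", "doc-b", "doc-c"]  # already ranked by bi-encoder distance
cross_scores = [0.0] * len(hits)    # what the except branch produces
reranked = [hit for _, hit in sorted(zip(cross_scores, hits), key=lambda pair: -pair[0])]
assert reranked == hits  # Python's stable sort keeps the prior order on ties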
khoj/utils/helpers.py
CHANGED
@@ -2,10 +2,12 @@ from __future__ import annotations  # to avoid quoting type hints
 
 import datetime
 import io
+import ipaddress
 import logging
 import os
 import platform
 import random
+import urllib.parse
 import uuid
 from collections import OrderedDict
 from enum import Enum
@@ -125,6 +127,8 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
         return "image", encoding
     elif file_type in ["image/png"]:
         return "image", encoding
+    elif file_type in ["image/webp"]:
+        return "image", encoding
     elif content_group in ["code", "text"]:
         return "plaintext", encoding
     else:
@@ -164,9 +168,9 @@ def get_class_by_name(name: str) -> object:
 class timer:
     """Context manager to log time taken for a block of code to run"""
 
-    def __init__(self, message: str, logger: logging.Logger, device: torch.device = None):
+    def __init__(self, message: str, logger: logging.Logger, device: torch.device = None, log_level=logging.DEBUG):
         self.message = message
-        self.logger = logger
+        self.logger = logger.debug if log_level == logging.DEBUG else logger.info
         self.device = device
 
     def __enter__(self):
@@ -176,9 +180,9 @@ class timer:
     def __exit__(self, *_):
         elapsed = perf_counter() - self.start
         if self.device is None:
-            self.logger.debug(f"{self.message}: {elapsed:.3f} seconds")
+            self.logger(f"{self.message}: {elapsed:.3f} seconds")
         else:
-            self.logger.debug(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}")
+            self.logger(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}")
 
 
 class LRU(OrderedDict):
@@ -347,12 +351,13 @@ tool_descriptions_for_llm = {
 
 mode_descriptions_for_llm = {
     ConversationCommand.Image: "Use this if the user is requesting you to generate a picture based on their description.",
-    ConversationCommand.Automation: "Use this if the user is requesting a response at a scheduled date or time.",
+    ConversationCommand.Automation: "Use this if you are confident the user is requesting a response at a scheduled date, time and frequency",
     ConversationCommand.Text: "Use this if the other response modes don't seem to fit the query.",
 }
 
 mode_descriptions_for_agent = {
     ConversationCommand.Image: "Agent can generate image in response.",
+    ConversationCommand.Automation: "Agent can schedule a task to run at a scheduled date, time and frequency in response.",
     ConversationCommand.Text: "Agent can generate text in response.",
 }
 
@@ -435,6 +440,46 @@ def is_internet_connected():
         return False
 
 
+def is_internal_url(url: str) -> bool:
+    """
+    Check if a URL is likely to be internal/non-public.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is likely internal, False otherwise.
+    """
+    try:
+        parsed_url = urllib.parse.urlparse(url)
+        hostname = parsed_url.hostname
+
+        # Check for localhost
+        if hostname in ["localhost", "127.0.0.1", "::1"]:
+            return True
+
+        # Check for IP addresses in private ranges
+        try:
+            ip = ipaddress.ip_address(hostname)
+            return ip.is_private
+        except ValueError:
+            pass  # Not an IP address, continue with other checks
+
+        # Check for common internal TLDs
+        internal_tlds = [".local", ".internal", ".private", ".corp", ".home", ".lan"]
+        if any(hostname.endswith(tld) for tld in internal_tlds):
+            return True
+
+        # Check for URLs without a TLD
+        if "." not in hostname:
+            return True
+
+        return False
+    except Exception:
+        # If we can't parse the URL or something else goes wrong, assume it's not internal
+        return False
+
+
 def convert_image_to_webp(image_bytes):
     """Convert image bytes to webp format for faster loading"""
     image_io = io.BytesIO(image_bytes)
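Usage sketch for the new is_internal_url helper defined above (the hostnames are illustrative):

print(is_internal_url("http://localhost:42110/api"))  # True: localhost
print(is_internal_url("http://192.168.1.10/admin"))   # True: private IP range
print(is_internal_url("https://nas.home/share"))      # True: internal TLD
print(is_internal_url("https://intranet/wiki"))       # True: hostname without a TLD
print(is_internal_url("https://khoj.dev/blog"))       # False: public host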
{khoj-1.25.1.dev12.dist-info → khoj-1.26.1.dev3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: khoj
-Version: 1.25.1.dev12
+Version: 1.26.1.dev3
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
@@ -32,7 +32,7 @@ Requires-Dist: dateparser>=1.1.1
 Requires-Dist: defusedxml==0.7.1
 Requires-Dist: django-apscheduler==0.6.2
 Requires-Dist: django-phonenumber-field==7.3.0
-Requires-Dist: django==5.0.8
+Requires-Dist: django==5.0.9
 Requires-Dist: docx2txt==0.8
 Requires-Dist: einops==0.8.0
 Requires-Dist: fastapi>=0.110.0
@@ -138,8 +138,8 @@ Description-Content-Type: text/markdown
 - Chat with any local or online LLM (e.g llama3, qwen, gemma, mistral, gpt, claude, gemini).
 - Get answers from the internet and your docs (including image, pdf, markdown, org-mode, word, notion files).
 - Access it from your Browser, Obsidian, Emacs, Desktop, Phone or Whatsapp.
-- 
-- 
+- Create agents with custom knowledge, persona, chat model and tools to take on any role.
+- Automate away repetitive research. Get personal newsletters and smart notifications delivered to your inbox.
 - Find relevant docs quickly and easily using our advanced semantic search.
 - Generate images, talk out loud, play your messages.
 - Khoj is open-source, self-hostable. Always.