khoj 1.24.2.dev3__py3-none-any.whl → 1.25.1.dev34__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- khoj/configure.py +13 -4
- khoj/database/adapters/__init__.py +289 -52
- khoj/database/admin.py +20 -1
- khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
- khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
- khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/models/__init__.py +136 -18
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-fa3ee48860b9dc5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/2697-a38d01981ad3bdf8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/4086-2c74808ba38a5a0f.js +1 -0
- khoj/interface/compiled/_next/static/chunks/477-ec86e93db10571c1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/51-e8f5bdb69b5ea421.js +1 -0
- khoj/interface/compiled/_next/static/chunks/7762-79f2205740622b5c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9178-899fe9a6b754ecfe.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9417-29502e39c3e7d60c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9479-7eed36fc954ef804.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{layout-e71c8e913cccf792.js → layout-75636ab3a413fa8e.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-fa282831808ee536.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/page-5480731341f34450.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{layout-8102549127db3067.js → layout-96fcf62857bf8f30.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-702057ccbcf27881.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/factchecker/page-e7b34316ec6f44de.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/{layout-f3e40d346da53112.js → layout-d0f0a9067427fb20.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/page-10a5aad6e04f3cf8.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/page-d56541c746fded7d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/settings/{layout-6f9314b0d7a26046.js → layout-a8f33dfe92f997fb.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-e044a999468a7c5d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{layout-39f03f9e32399f0f.js → layout-2df56074e42adaa0.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-fbbd66a4d4633438.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{webpack-d4781cada9b58e75.js → webpack-c0cd5a6afb1f0798.js} +1 -1
- khoj/interface/compiled/_next/static/css/2de69f0be774c768.css +1 -0
- khoj/interface/compiled/_next/static/css/467a524c75e7d7c0.css +1 -0
- khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
- khoj/interface/compiled/_next/static/css/b9a6bf04305d98d7.css +25 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/factchecker/index.html +1 -1
- khoj/interface/compiled/factchecker/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +3 -3
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/interface/web/assets/icons/agents.svg +1 -0
- khoj/interface/web/assets/icons/automation.svg +1 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/login.html +11 -22
- khoj/processor/content/notion/notion_to_entries.py +2 -1
- khoj/processor/conversation/anthropic/anthropic_chat.py +2 -0
- khoj/processor/conversation/google/gemini_chat.py +6 -19
- khoj/processor/conversation/google/utils.py +33 -15
- khoj/processor/conversation/offline/chat_model.py +3 -1
- khoj/processor/conversation/openai/gpt.py +2 -0
- khoj/processor/conversation/prompts.py +67 -5
- khoj/processor/conversation/utils.py +3 -7
- khoj/processor/embeddings.py +6 -3
- khoj/processor/image/generate.py +4 -3
- khoj/processor/tools/online_search.py +139 -44
- khoj/routers/api.py +35 -6
- khoj/routers/api_agents.py +235 -4
- khoj/routers/api_chat.py +102 -530
- khoj/routers/api_content.py +14 -0
- khoj/routers/api_model.py +1 -1
- khoj/routers/auth.py +9 -1
- khoj/routers/helpers.py +181 -68
- khoj/routers/subscription.py +18 -4
- khoj/search_type/text_search.py +11 -3
- khoj/utils/helpers.py +64 -8
- khoj/utils/initialization.py +0 -3
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/METADATA +19 -21
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/RECORD +87 -81
- khoj/interface/compiled/_next/static/chunks/1603-3e2e1528e3b6ea1d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/2697-a29cb9191a9e339c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/6648-ee109f4ea33a74e2.js +0 -1
- khoj/interface/compiled/_next/static/chunks/7071-b4711cecca6619a8.js +0 -1
- khoj/interface/compiled/_next/static/chunks/743-1a64254447cda71f.js +0 -1
- khoj/interface/compiled/_next/static/chunks/8423-62ac6c832be2461b.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9162-0be016519a18568b.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9178-7e815211edcb3657.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9417-5d14ac74aaab2c66.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9984-e410179c6fac7cf1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-d302911777a3e027.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/page-0a5de8c254c29a1c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-d96bf6a84bb05290.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/page-32e61af29e6b431d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/page-96cab08c985716f4.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/page-b3193d46c65571c5.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-0db9b708366606ec.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-f06ac16cfe5b5a16.js +0 -1
- khoj/interface/compiled/_next/static/css/1538cedb321e3a97.css +0 -1
- khoj/interface/compiled/_next/static/css/24f141a6e37cd204.css +0 -25
- khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
- khoj/interface/compiled/_next/static/css/f768dddada62459d.css +0 -1
- /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{_29ceahp81LhuIHo5QgOD → Jid9q6Qg851ioDaaO_fth}/_ssgManifest.js +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/WHEEL +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/entry_points.txt +0 -0
- {khoj-1.24.2.dev3.dist-info → khoj-1.25.1.dev34.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/online_search.py
CHANGED
@@ -10,14 +10,22 @@ import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 
-from khoj.database.
+from khoj.database.adapters import ConversationAdapters
+from khoj.database.models import Agent, KhojUser, WebScraper
+from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -25,12 +33,11 @@ logger = logging.getLogger(__name__)
 
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
-JINA_READER_API_URL = "https://r.jina.ai/"
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
 
-
-
+FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
+
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
     "waitBeforeScraping": 1,  # seconds
@@ -53,20 +60,20 @@ async def search_online(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     custom_filters: List[str] = [],
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     query += " ".join(custom_filters)
     if not is_internet_connected():
-        logger.
+        logger.warning("Cannot search online as not connected to internet")
         yield {}
         return
 
     # Breakdown the query into subqueries to get the correct answer
     subqueries = await generate_online_subqueries(
-        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url
+        query, conversation_history, location, user, uploaded_image_url=uploaded_image_url, agent=agent
     )
     response_dict = {}
 
@@ -83,33 +90,36 @@ async def search_online(
     search_results = await asyncio.gather(*search_tasks)
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
-    # Gather distinct web
+    # Gather distinct web pages from organic results for subqueries without an instant answer.
     # Content of web pages is directly available when Jina is used for search.
-    webpages = {
-
-
-
-
-
+    webpages: Dict[str, Dict] = {}
+    for subquery in response_dict:
+        if "answerBox" in response_dict[subquery]:
+            continue
+        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]:
+            link = organic.get("link")
+            if link in webpages:
+                webpages[link]["queries"].add(subquery)
+            else:
+                webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
 
     # Read, extract relevant info from the retrieved web pages
     if webpages:
-
-        logger.info(f"Reading web pages at: {list(webpage_links)}")
+        logger.info(f"Reading web pages at: {webpages.keys()}")
         if send_status_func:
-            webpage_links_str = "\n- " + "\n- ".join(
+            webpage_links_str = "\n- " + "\n- ".join(webpages.keys())
             async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
                 yield {ChatEvent.STATUS: event}
         tasks = [
-            read_webpage_and_extract_content(
-            for link,
+            read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent)
+            for link, data in webpages.items()
         ]
         results = await asyncio.gather(*tasks)
 
         # Collect extracted info from the retrieved web pages
-        for
+        for subqueries, url, webpage_extract in results:
             if webpage_extract is not None:
-                response_dict[
+                response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract}
 
     yield response_dict
 
@@ -140,9 +150,9 @@ async def read_webpages(
     conversation_history: dict,
     location: LocationData,
     user: KhojUser,
-    subscribed: bool = False,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: str = None,
+    agent: Agent = None,
 ):
     "Infer web pages to read from the query and extract relevant information from them"
     logger.info(f"Inferring web pages to read")
@@ -156,29 +166,66 @@ async def read_webpages(
     webpage_links_str = "\n- " + "\n- ".join(list(urls))
     async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
         yield {ChatEvent.STATUS: event}
-    tasks = [read_webpage_and_extract_content(query, url,
+    tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls]
     results = await asyncio.gather(*tasks)
 
     response: Dict[str, Dict] = defaultdict(dict)
     response[query]["webpages"] = [
-        {"query":
+        {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None
     ]
     yield response
 
 
+async def read_webpage(
+    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
+) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
+        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
+    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+        return await read_webpage_with_firecrawl(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
+        return await read_webpage_with_olostep(url, api_key, api_url), None
+    elif scraper_type == WebScraper.WebScraperType.JINA:
+        return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
+
+
 async def read_webpage_and_extract_content(
-
-) -> Tuple[str, Union[None, str]
-
-
-
-
-
-
-
-
-
-
+    subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None
+) -> Tuple[set[str], str, Union[None, str]]:
+    # Select the web scrapers to use for reading the web page
+    web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
+
+    # Fallback through enabled web scrapers until we successfully read the web page
+    extracted_info = None
+    for scraper in web_scrapers:
+        try:
+            # Read the web page
+            if is_none_or_empty(content):
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
+
+            # Extract relevant information from the web page
+            if is_none_or_empty(extracted_info):
+                with timer(f"Extracting relevant information from web page at '{url}' took", logger):
+                    extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent)
+
+            # If we successfully extracted information, break the loop
+            if not is_none_or_empty(extracted_info):
+                break
+        except Exception as e:
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
+            # If this is the last web scraper in the list, log an error
+            if scraper.name == web_scrapers[-1].name:
+                logger.error(f"All web scrapers failed for '{url}'")
+
+    return subqueries, url, extracted_info
 
 
 async def read_webpage_at_url(web_url: str) -> str:
@@ -195,23 +242,23 @@ async def read_webpage_at_url(web_url: str) -> str:
     return markdownify(body)
 
 
-async def read_webpage_with_olostep(web_url: str) -> str:
-    headers = {"Authorization": f"Bearer {
+async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
+    headers = {"Authorization": f"Bearer {api_key}"}
     web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session:
-        async with session.get(
+        async with session.get(api_url, params=web_scraping_params, headers=headers) as response:
             response.raise_for_status()
             response_json = await response.json()
             return response_json["markdown_content"]
 
 
-async def read_webpage_with_jina(web_url: str) -> str:
-    jina_reader_api_url = f"{
+async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str:
+    jina_reader_api_url = f"{api_url}/{web_url}"
     headers = {"Accept": "application/json", "X-Timeout": "30"}
-    if
-        headers["Authorization"] = f"Bearer {
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
 
     async with aiohttp.ClientSession() as session:
         async with session.get(jina_reader_api_url, headers=headers) as response:
@@ -220,6 +267,54 @@ async def read_webpage_with_jina(web_url: str) -> str:
             return response_json["data"]["content"]
 
 
+async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["markdown"]
+
+
+async def query_webpage_with_firecrawl(
+    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
+) -> str:
+    firecrawl_api_url = f"{api_url}/v1/scrape"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    schema = {
+        "type": "object",
+        "properties": {
+            "relevant_extract": {"type": "string"},
+        },
+        "required": [
+            "relevant_extract",
+        ],
+    }
+
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    system_prompt = f"""
+{prompts.system_prompt_extract_relevant_information}
+
+{personality_context}
+User Query: {", ".join(queries)}
+
+Collate only relevant information from the website to answer the target query and in the provided JSON schema.
+""".strip()
+
+    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(firecrawl_api_url, json=params, headers=headers) as response:
+            response.raise_for_status()
+            response_json = await response.json()
+            return response_json["data"]["extract"]["relevant_extract"]
+
+
 async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]:
     encoded_query = urllib.parse.quote(query)
     jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
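The new read_webpage_and_extract_content() above falls back through the enabled web scrapers in order and stops at the first one that yields usable content. A minimal, self-contained sketch of that fallback pattern follows; read_with_fallback, fetch_with_direct, and fetch_with_jina are hypothetical stand-ins for illustration only, not khoj APIs.

```python
# Sketch: try each scraper in order, keep the first non-empty result.
# All names here are illustrative; they do not exist in khoj.
import asyncio
from typing import Awaitable, Callable, Optional

async def fetch_with_jina(url: str) -> Optional[str]:
    # Placeholder: a real scraper would call a reader API and return markdown.
    raise RuntimeError("reader API unavailable in this sketch")

async def fetch_with_direct(url: str) -> Optional[str]:
    # Placeholder: a real scraper would issue an HTTP GET and convert to markdown.
    return f"<content of {url} via direct fetch>"

Scraper = tuple[str, Callable[[str], Awaitable[Optional[str]]]]

async def read_with_fallback(url: str, scrapers: list[Scraper]) -> Optional[str]:
    content = None
    for name, scraper in scrapers:
        try:
            content = await scraper(url)
            if content:  # first scraper that yields content wins
                break
        except Exception as e:
            print(f"{name} failed for {url}: {e}")
            if name == scrapers[-1][0]:
                print(f"All scrapers failed for {url}")
    return content

if __name__ == "__main__":
    scrapers: list[Scraper] = [("jina", fetch_with_jina), ("direct", fetch_with_direct)]
    print(asyncio.run(read_with_fallback("https://example.com", scrapers)))
```

In the diff itself the scraper list comes from ConversationAdapters.aget_enabled_webscrapers(), internal URLs are restricted to the direct scraper, and each successful read is followed by an LLM extraction step before the loop breaks.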
khoj/routers/api.py
CHANGED
@@ -27,7 +27,13 @@ from khoj.database.adapters import (
     get_user_photo,
     get_user_search_model_or_default,
 )
-from khoj.database.models import
+from khoj.database.models import (
+    Agent,
+    ChatModelOptions,
+    KhojUser,
+    SpeechToTextModelOptions,
+)
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.anthropic.anthropic_chat import (
     extract_questions_anthropic,
 )
@@ -106,6 +112,7 @@ async def execute_search(
     r: Optional[bool] = False,
     max_distance: Optional[Union[float, None]] = None,
     dedupe: Optional[bool] = True,
+    agent: Optional[Agent] = None,
 ):
     start_time = time.time()
 
@@ -157,6 +164,7 @@ async def execute_search(
                 t,
                 question_embedding=encoded_asymmetric_query,
                 max_distance=max_distance,
+                agent=agent,
             )
         ]
 
@@ -333,6 +341,7 @@ async def extract_references_and_questions(
     location_data: LocationData = None,
     send_status_func: Optional[Callable] = None,
     uploaded_image_url: Optional[str] = None,
+    agent: Agent = None,
 ):
     user = request.user.object if request.user.is_authenticated else None
 
@@ -340,17 +349,30 @@ async def extract_references_and_questions(
     compiled_references: List[Any] = []
     inferred_queries: List[str] = []
 
+    agent_has_entries = False
+
+    if agent:
+        agent_has_entries = await sync_to_async(EntryAdapters.agent_has_entries)(agent=agent)
+
     if (
         not ConversationCommand.Notes in conversation_commands
         and not ConversationCommand.Default in conversation_commands
+        and not agent_has_entries
     ):
         yield compiled_references, inferred_queries, q
         return
 
+    # If Notes or Default is not in the conversation command, then the search should be restricted to the agent's knowledge base
+    should_limit_to_agent_knowledge = (
+        ConversationCommand.Notes not in conversation_commands
+        and ConversationCommand.Default not in conversation_commands
+    )
+
     if not await sync_to_async(EntryAdapters.user_has_entries)(user=user):
-
-
-
+        if not agent_has_entries:
+            logger.debug("No documents in knowledge base. Use a Khoj client to sync and chat with your docs.")
+            yield compiled_references, inferred_queries, q
+            return
 
     # Extract filter terms from user message
     defiltered_query = q
@@ -368,10 +390,12 @@ async def extract_references_and_questions(
     using_offline_chat = False
     logger.debug(f"Filters in query: {filters_in_query}")
 
+    personality_context = prompts.personality_context.format(personality=agent.personality) if agent else ""
+
     # Infer search queries from user message
     with timer("Extracting search queries took", logger):
         # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
-        conversation_config = await ConversationAdapters.aget_default_conversation_config()
+        conversation_config = await ConversationAdapters.aget_default_conversation_config(user)
         vision_enabled = conversation_config.vision_enabled
 
         if conversation_config.model_type == ChatModelOptions.ModelType.OFFLINE:
@@ -392,6 +416,7 @@ async def extract_references_and_questions(
                 location_data=location_data,
                 user=user,
                 max_prompt_size=conversation_config.max_prompt_size,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.OPENAI:
             openai_chat_config = conversation_config.openai_config
@@ -408,6 +433,7 @@ async def extract_references_and_questions(
                 user=user,
                 uploaded_image_url=uploaded_image_url,
                 vision_enabled=vision_enabled,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.ANTHROPIC:
             api_key = conversation_config.openai_config.api_key
@@ -419,6 +445,7 @@ async def extract_references_and_questions(
                 conversation_log=meta_log,
                 location_data=location_data,
                 user=user,
+                personality_context=personality_context,
             )
         elif conversation_config.model_type == ChatModelOptions.ModelType.GOOGLE:
             api_key = conversation_config.openai_config.api_key
@@ -431,6 +458,7 @@ async def extract_references_and_questions(
                 location_data=location_data,
                 max_tokens=conversation_config.max_prompt_size,
                 user=user,
+                personality_context=personality_context,
             )
 
     # Collate search results as context for GPT
@@ -445,13 +473,14 @@ async def extract_references_and_questions(
         n_items = min(n, 3) if using_offline_chat else n
         search_results.extend(
             await execute_search(
-                user,
+                user if not should_limit_to_agent_knowledge else None,
                 f"{query} {filters_in_query}",
                 n=n_items,
                 t=SearchType.All,
                 r=True,
                 max_distance=d,
                 dedupe=False,
+                agent=agent,
            )
        )
        search_results = text_search.deduplicated_search_responses(search_results)
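The api.py changes above make extract_references_and_questions() search the agent's knowledge base as well as the user's, and drop the user scope entirely when neither the Notes nor the Default command is in play. A small sketch of that scoping decision follows; the ConversationCommand enum here is a simplified stand-in, and SearchScope and resolve_scope are illustrative names that do not exist in khoj.

```python
# Sketch: decide whose entries are searched, mirroring should_limit_to_agent_knowledge.
# All names below are illustrative stand-ins, not khoj code.
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class ConversationCommand(Enum):
    Notes = "notes"
    Default = "default"
    Online = "online"

@dataclass
class SearchScope:
    user: Optional[str]   # owner of personal entries, or None to exclude them
    agent: Optional[str]  # agent whose knowledge base is also searched

def resolve_scope(commands: set[ConversationCommand], user: str, agent: Optional[str]) -> SearchScope:
    # Restrict to the agent's knowledge base when neither /notes nor the default command is requested.
    limit_to_agent = (
        ConversationCommand.Notes not in commands
        and ConversationCommand.Default not in commands
    )
    return SearchScope(user=None if limit_to_agent else user, agent=agent)

if __name__ == "__main__":
    # Explicit /online command: personal notes are excluded, agent entries stay searchable.
    print(resolve_scope({ConversationCommand.Online}, user="alice", agent="researcher"))
    # Default chat: both the user's and the agent's entries are in scope.
    print(resolve_scope({ConversationCommand.Default}, user="alice", agent="researcher"))
```

This matches the call in the last hunk, where execute_search() receives `user if not should_limit_to_agent_knowledge else None` alongside `agent=agent`.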