khoj 1.28.3__py3-none-any.whl → 1.28.4.dev92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. khoj/configure.py +10 -14
  2. khoj/database/adapters/__init__.py +128 -44
  3. khoj/database/admin.py +6 -3
  4. khoj/database/management/commands/change_default_model.py +7 -72
  5. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  6. khoj/database/models/__init__.py +4 -6
  7. khoj/interface/compiled/404/index.html +1 -1
  8. khoj/interface/compiled/_next/static/chunks/1603-dc5fd983dbcd070d.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/1970-c78f6acc8e16e30b.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/2261-748f7c327df3c8c1.js +1 -0
  11. khoj/interface/compiled/_next/static/chunks/3124-a4cea2eda163128d.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/3803-d74118a2d0182c52.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/5538-36aa824a75519c5b.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/5961-3c104d9736b7902b.js +3 -0
  15. khoj/interface/compiled/_next/static/chunks/8423-ebfa9bb9e2424ca3.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/9417-32c4db52ca42e681.js +1 -0
  17. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e9838b642913a071.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/app/agents/page-4353b1a532795ad1.js +1 -0
  19. khoj/interface/compiled/_next/static/chunks/app/automations/{page-d3edae545a1b5393.js → page-c9f13c865e739607.js} +1 -1
  20. khoj/interface/compiled/_next/static/chunks/app/chat/layout-b0e7ff4baa3b5265.js +1 -0
  21. khoj/interface/compiled/_next/static/chunks/app/chat/page-45720e1ed71e3ef5.js +1 -0
  22. khoj/interface/compiled/_next/static/chunks/app/{layout-d0f0a9067427fb20.js → layout-86561d2fac35a91a.js} +1 -1
  23. khoj/interface/compiled/_next/static/chunks/app/{page-ea462e20376b6dce.js → page-ecb8e1c192aa8834.js} +1 -1
  24. khoj/interface/compiled/_next/static/chunks/app/search/layout-ea6b73fdaf9b24ca.js +1 -0
  25. khoj/interface/compiled/_next/static/chunks/app/search/{page-a5c277eff207959e.js → page-8e28deacb61f75aa.js} +1 -1
  26. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-a8f33dfe92f997fb.js → layout-254eaaf916449a60.js} +1 -1
  27. khoj/interface/compiled/_next/static/chunks/app/settings/page-2fab613a557d3cc5.js +1 -0
  28. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-cf7445cf0326bda3.js +1 -0
  29. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-30376aa7e9cfa342.js +1 -0
  30. khoj/interface/compiled/_next/static/chunks/{main-f84cd3c1873cd842.js → main-1ea5c2e0fdef4626.js} +1 -1
  31. khoj/interface/compiled/_next/static/chunks/{webpack-8beec5b51cabb39a.js → webpack-27cf153c35b1338d.js} +1 -1
  32. khoj/interface/compiled/_next/static/css/{467a524c75e7d7c0.css → 0e9d53dcd7f11342.css} +1 -1
  33. khoj/interface/compiled/_next/static/css/{26c1c33d0423a7d8.css → 1f293605f2871853.css} +1 -1
  34. khoj/interface/compiled/_next/static/css/2d097a35da6bfe8d.css +1 -0
  35. khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +1 -0
  36. khoj/interface/compiled/_next/static/css/ed437164d77aa600.css +25 -0
  37. khoj/interface/compiled/_next/static/media/5455839c73f146e7-s.p.woff2 +0 -0
  38. khoj/interface/compiled/_next/static/media/5984b96ba4822821-s.woff2 +0 -0
  39. khoj/interface/compiled/_next/static/media/684adc3dde1b03f1-s.woff2 +0 -0
  40. khoj/interface/compiled/_next/static/media/82e3b9a1bdaf0c26-s.woff2 +0 -0
  41. khoj/interface/compiled/_next/static/media/8d1ea331386a0db8-s.woff2 +0 -0
  42. khoj/interface/compiled/_next/static/media/91475f6526542a4f-s.woff2 +0 -0
  43. khoj/interface/compiled/_next/static/media/b98b13dbc1c3b59c-s.woff2 +0 -0
  44. khoj/interface/compiled/_next/static/media/c824d7a20139e39d-s.woff2 +0 -0
  45. khoj/interface/compiled/agents/index.html +1 -1
  46. khoj/interface/compiled/agents/index.txt +2 -2
  47. khoj/interface/compiled/automations/index.html +1 -1
  48. khoj/interface/compiled/automations/index.txt +2 -2
  49. khoj/interface/compiled/chat/index.html +1 -1
  50. khoj/interface/compiled/chat/index.txt +2 -2
  51. khoj/interface/compiled/index.html +1 -1
  52. khoj/interface/compiled/index.txt +3 -3
  53. khoj/interface/compiled/search/index.html +1 -1
  54. khoj/interface/compiled/search/index.txt +2 -2
  55. khoj/interface/compiled/settings/index.html +1 -1
  56. khoj/interface/compiled/settings/index.txt +3 -3
  57. khoj/interface/compiled/share/chat/index.html +1 -1
  58. khoj/interface/compiled/share/chat/index.txt +3 -3
  59. khoj/processor/content/docx/docx_to_entries.py +27 -21
  60. khoj/processor/content/github/github_to_entries.py +2 -2
  61. khoj/processor/content/images/image_to_entries.py +2 -2
  62. khoj/processor/content/markdown/markdown_to_entries.py +2 -2
  63. khoj/processor/content/notion/notion_to_entries.py +2 -2
  64. khoj/processor/content/org_mode/org_to_entries.py +2 -2
  65. khoj/processor/content/org_mode/orgnode.py +1 -1
  66. khoj/processor/content/pdf/pdf_to_entries.py +37 -29
  67. khoj/processor/content/plaintext/plaintext_to_entries.py +2 -2
  68. khoj/processor/content/text_to_entries.py +3 -4
  69. khoj/processor/conversation/anthropic/anthropic_chat.py +9 -1
  70. khoj/processor/conversation/google/gemini_chat.py +15 -2
  71. khoj/processor/conversation/google/utils.py +3 -1
  72. khoj/processor/conversation/offline/chat_model.py +4 -0
  73. khoj/processor/conversation/openai/gpt.py +6 -1
  74. khoj/processor/conversation/prompts.py +72 -13
  75. khoj/processor/conversation/utils.py +80 -13
  76. khoj/processor/image/generate.py +2 -0
  77. khoj/processor/tools/online_search.py +68 -18
  78. khoj/processor/tools/run_code.py +54 -20
  79. khoj/routers/api.py +10 -4
  80. khoj/routers/api_agents.py +8 -10
  81. khoj/routers/api_chat.py +89 -24
  82. khoj/routers/api_content.py +80 -8
  83. khoj/routers/helpers.py +176 -60
  84. khoj/routers/notion.py +1 -1
  85. khoj/routers/research.py +73 -31
  86. khoj/routers/web_client.py +0 -10
  87. khoj/search_type/text_search.py +3 -7
  88. khoj/utils/cli.py +2 -2
  89. khoj/utils/fs_syncer.py +2 -1
  90. khoj/utils/helpers.py +6 -3
  91. khoj/utils/rawconfig.py +32 -0
  92. khoj/utils/state.py +2 -1
  93. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/METADATA +3 -3
  94. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/RECORD +99 -105
  95. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/WHEEL +1 -1
  96. khoj/interface/compiled/_next/static/chunks/1034-da58b679fcbb79c1.js +0 -1
  97. khoj/interface/compiled/_next/static/chunks/1467-b331e469fe411347.js +0 -1
  98. khoj/interface/compiled/_next/static/chunks/1603-c1568f45947e9f2c.js +0 -1
  99. khoj/interface/compiled/_next/static/chunks/1970-d44050bf658ae5cc.js +0 -1
  100. khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +0 -1
  101. khoj/interface/compiled/_next/static/chunks/3423-f4b7df2f6f3362f7.js +0 -1
  102. khoj/interface/compiled/_next/static/chunks/394-6bcb8c429f168f21.js +0 -3
  103. khoj/interface/compiled/_next/static/chunks/7113-f2e114d7034a0835.js +0 -1
  104. khoj/interface/compiled/_next/static/chunks/8423-da57554315eebcbe.js +0 -1
  105. khoj/interface/compiled/_next/static/chunks/8840-b8d7b9f0923c6651.js +0 -1
  106. khoj/interface/compiled/_next/static/chunks/9417-0d0fc7eb49a86abb.js +0 -1
  107. khoj/interface/compiled/_next/static/chunks/app/agents/layout-75636ab3a413fa8e.js +0 -1
  108. khoj/interface/compiled/_next/static/chunks/app/agents/page-adbf3cd470da248f.js +0 -1
  109. khoj/interface/compiled/_next/static/chunks/app/chat/layout-96fcf62857bf8f30.js +0 -1
  110. khoj/interface/compiled/_next/static/chunks/app/chat/page-222d348681b848a5.js +0 -1
  111. khoj/interface/compiled/_next/static/chunks/app/factchecker/layout-7b30c541c05fb904.js +0 -1
  112. khoj/interface/compiled/_next/static/chunks/app/factchecker/page-bded0868a08ac4ba.js +0 -1
  113. khoj/interface/compiled/_next/static/chunks/app/search/layout-3720f1362310bebb.js +0 -1
  114. khoj/interface/compiled/_next/static/chunks/app/settings/page-210bd54db4841333.js +0 -1
  115. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-2df56074e42adaa0.js +0 -1
  116. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-a21b7e8890ed1209.js +0 -1
  117. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  118. khoj/interface/compiled/_next/static/css/553f9cdcc7a2bcd6.css +0 -1
  119. khoj/interface/compiled/_next/static/css/a795ee88875f4853.css +0 -25
  120. khoj/interface/compiled/_next/static/css/afd3d45cc65d55d8.css +0 -1
  121. khoj/interface/compiled/_next/static/media/0e790e04fd40ad16-s.p.woff2 +0 -0
  122. khoj/interface/compiled/_next/static/media/4221e1667cd19c7d-s.woff2 +0 -0
  123. khoj/interface/compiled/_next/static/media/6c276159aa0eb14b-s.woff2 +0 -0
  124. khoj/interface/compiled/_next/static/media/6cc0b9500e4f9168-s.woff2 +0 -0
  125. khoj/interface/compiled/_next/static/media/9d9319a7a2ac39c6-s.woff2 +0 -0
  126. khoj/interface/compiled/_next/static/media/a75c8ea86756d52d-s.woff2 +0 -0
  127. khoj/interface/compiled/_next/static/media/abce7c400ca31a51-s.woff2 +0 -0
  128. khoj/interface/compiled/_next/static/media/f759c939737fb668-s.woff2 +0 -0
  129. khoj/interface/compiled/factchecker/index.html +0 -1
  130. khoj/interface/compiled/factchecker/index.txt +0 -7
  131. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_buildManifest.js +0 -0
  132. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_ssgManifest.js +0 -0
  133. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/entry_points.txt +0 -0
  134. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/licenses/LICENSE +0 -0
@@ -37,6 +37,7 @@ def extract_questions_gemini(
37
37
  query_images: Optional[list[str]] = None,
38
38
  vision_enabled: bool = False,
39
39
  personality_context: Optional[str] = None,
40
+ query_files: str = None,
40
41
  tracer: dict = {},
41
42
  ):
42
43
  """
@@ -83,9 +84,13 @@ def extract_questions_gemini(
83
84
  images=query_images,
84
85
  model_type=ChatModelOptions.ModelType.GOOGLE,
85
86
  vision_enabled=vision_enabled,
87
+ attached_file_context=query_files,
86
88
  )
87
89
 
88
- messages = [ChatMessage(content=prompt, role="user"), ChatMessage(content=system_prompt, role="system")]
90
+ messages = []
91
+
92
+ messages.append(ChatMessage(content=prompt, role="user"))
93
+ messages.append(ChatMessage(content=system_prompt, role="system"))
89
94
 
90
95
  response = gemini_send_message_to_model(
91
96
  messages, api_key, model, response_type="json_object", temperature=temperature, tracer=tracer
@@ -108,7 +113,13 @@ def extract_questions_gemini(
108
113
 
109
114
 
110
115
  def gemini_send_message_to_model(
111
- messages, api_key, model, response_type="text", temperature=0, model_kwargs=None, tracer={}
116
+ messages,
117
+ api_key,
118
+ model,
119
+ response_type="text",
120
+ temperature=0,
121
+ model_kwargs=None,
122
+ tracer={},
112
123
  ):
113
124
  """
114
125
  Send message to model
@@ -151,6 +162,7 @@ def converse_gemini(
151
162
  agent: Agent = None,
152
163
  query_images: Optional[list[str]] = None,
153
164
  vision_available: bool = False,
165
+ query_files: str = None,
154
166
  tracer={},
155
167
  ):
156
168
  """
@@ -209,6 +221,7 @@ def converse_gemini(
209
221
  query_images=query_images,
210
222
  vision_enabled=vision_available,
211
223
  model_type=ChatModelOptions.ModelType.GOOGLE,
224
+ query_files=query_files,
212
225
  )
213
226
 
214
227
  messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
@@ -228,7 +228,9 @@ def format_messages_for_gemini(messages: list[ChatMessage], system_prompt: str =
228
228
  if isinstance(message.content, list):
229
229
  # Convert image_urls to PIL.Image and place them at beginning of list (better for Gemini)
230
230
  message.content = [
231
- get_image_from_url(item["image_url"]["url"]).content if item["type"] == "image_url" else item["text"]
231
+ get_image_from_url(item["image_url"]["url"]).content
232
+ if item["type"] == "image_url"
233
+ else item.get("text", "")
232
234
  for item in sorted(message.content, key=lambda x: 0 if x["type"] == "image_url" else 1)
233
235
  ]
234
236
  elif isinstance(message.content, str):
@@ -37,6 +37,7 @@ def extract_questions_offline(
37
37
  max_prompt_size: int = None,
38
38
  temperature: float = 0.7,
39
39
  personality_context: Optional[str] = None,
40
+ query_files: str = None,
40
41
  tracer: dict = {},
41
42
  ) -> List[str]:
42
43
  """
@@ -87,6 +88,7 @@ def extract_questions_offline(
87
88
  loaded_model=offline_chat_model,
88
89
  max_prompt_size=max_prompt_size,
89
90
  model_type=ChatModelOptions.ModelType.OFFLINE,
91
+ query_files=query_files,
90
92
  )
91
93
 
92
94
  state.chat_lock.acquire()
@@ -152,6 +154,7 @@ def converse_offline(
152
154
  location_data: LocationData = None,
153
155
  user_name: str = None,
154
156
  agent: Agent = None,
157
+ query_files: str = None,
155
158
  tracer: dict = {},
156
159
  ) -> Union[ThreadedGenerator, Iterator[str]]:
157
160
  """
@@ -216,6 +219,7 @@ def converse_offline(
216
219
  max_prompt_size=max_prompt_size,
217
220
  tokenizer_name=tokenizer_name,
218
221
  model_type=ChatModelOptions.ModelType.OFFLINE,
222
+ query_files=query_files,
219
223
  )
220
224
 
221
225
  truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
@@ -34,6 +34,7 @@ def extract_questions(
34
34
  query_images: Optional[list[str]] = None,
35
35
  vision_enabled: bool = False,
36
36
  personality_context: Optional[str] = None,
37
+ query_files: str = None,
37
38
  tracer: dict = {},
38
39
  ):
39
40
  """
@@ -79,9 +80,11 @@ def extract_questions(
79
80
  images=query_images,
80
81
  model_type=ChatModelOptions.ModelType.OPENAI,
81
82
  vision_enabled=vision_enabled,
83
+ attached_file_context=query_files,
82
84
  )
83
85
 
84
- messages = [ChatMessage(content=prompt, role="user")]
86
+ messages = []
87
+ messages.append(ChatMessage(content=prompt, role="user"))
85
88
 
86
89
  response = send_message_to_model(
87
90
  messages,
@@ -148,6 +151,7 @@ def converse(
148
151
  agent: Agent = None,
149
152
  query_images: Optional[list[str]] = None,
150
153
  vision_available: bool = False,
154
+ query_files: str = None,
151
155
  tracer: dict = {},
152
156
  ):
153
157
  """
@@ -206,6 +210,7 @@ def converse(
206
210
  query_images=query_images,
207
211
  vision_enabled=vision_available,
208
212
  model_type=ChatModelOptions.ModelType.OPENAI,
213
+ query_files=query_files,
209
214
  )
210
215
  truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
211
216
  logger.debug(f"Conversation Context for GPT: {truncated_messages}")
@@ -870,25 +870,40 @@ Khoj:
870
870
  # --
871
871
  python_code_generation_prompt = PromptTemplate.from_template(
872
872
  """
873
- You are Khoj, an advanced python programmer. You are tasked with constructing **up to three** python programs to best answer the user query.
873
+ You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query.
874
874
  - The python program will run in a pyodide python sandbox with no network access.
875
- - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query
876
- - The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet
875
+ - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query.
876
+ - The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet.
877
+ - List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field.
878
+ - The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path).
877
879
  - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead.
878
880
  - Write any document, charts etc. to be shared with the user to file. These files can be seen by the user.
879
881
  - Use as much context from the previous questions and answers as required to generate your code.
880
882
  {personality_context}
881
- What code will you need to write, if any, to answer the user's question?
882
- Provide code programs as a list of strings in a JSON object with key "codes".
883
+ What code will you need to write to answer the user's question?
884
+
883
885
  Current Date: {current_date}
884
886
  User's Location: {location}
885
887
  {username}
886
888
 
887
- The JSON schema is of the form {{"codes": ["code1", "code2", "code3"]}}
888
- For example:
889
- {{"codes": ["print('Hello, World!')", "print('Goodbye, World!')"]}}
889
+ The response JSON schema is of the form {{"code": "<python_code>", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}}
890
+ Examples:
891
+ ---
892
+ {{
893
+ "code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")"
894
+ }}
890
895
 
891
- Now it's your turn to construct python programs to answer the user's question. Provide them as a list of strings in a JSON object. Do not say anything else.
896
+ {{
897
+ "code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')",
898
+ "input_files": ["/home/linux/tasks.org"]
899
+ }}
900
+
901
+ {{
902
+ "code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')",
903
+ "input_links": ["https://population.un.org/world_population_by_year.csv"]
904
+ }}
905
+
906
+ Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else.
892
907
  Context:
893
908
  ---
894
909
  {context}
@@ -988,16 +1003,27 @@ You are an extremely smart and helpful title generator assistant. Given a user q
988
1003
 
989
1004
  # Examples:
990
1005
  User: Show a new Calvin and Hobbes quote every morning at 9am. My Current Location: Shanghai, China
991
- Khoj: Your daily Calvin and Hobbes Quote
1006
+ Assistant: Your daily Calvin and Hobbes Quote
992
1007
 
993
1008
  User: Notify me when version 2.0.0 of the sentence transformers python package is released. My Current Location: Mexico City, Mexico
994
- Khoj: Sentence Transformers Python Package Version 2.0.0 Release
1009
+ Assistant: Sentence Transformers Python Package Version 2.0.0 Release
995
1010
 
996
1011
  User: Gather the latest tech news on the first sunday of every month.
997
- Khoj: Your Monthly Dose of Tech News
1012
+ Assistant: Your Monthly Dose of Tech News
998
1013
 
999
1014
  User Query: {query}
1000
- Khoj:
1015
+ Assistant:
1016
+ """.strip()
1017
+ )
1018
+
1019
+ conversation_title_generation = PromptTemplate.from_template(
1020
+ """
1021
+ You are an extremely smart and helpful title generator assistant. Given a conversation, extract the subject of the conversation. Crisp, informative, ten words or less.
1022
+
1023
+ Conversation History:
1024
+ {chat_history}
1025
+
1026
+ Assistant:
1001
1027
  """.strip()
1002
1028
  )
1003
1029
 
@@ -1038,6 +1064,39 @@ A:
1038
1064
  """.strip()
1039
1065
  )
1040
1066
 
1067
+
1068
+ personality_prompt_safety_expert_lax = PromptTemplate.from_template(
1069
+ """
1070
+ You are adept at ensuring the safety and security of people. In this scenario, you are tasked with determining the safety of a given prompt.
1071
+
1072
+ Given a system prompt, you must decide whether it is safe to use for an individual. If the prompt is safe, respond with "Yes". If the prompt is not safe, respond with "No". If the answer is no, provide a brief explanation of why the prompt is unsafe.
1073
+
1074
+ A prompt is considered unsafe if it contains any of the following:
1075
+ - Illegal, overly sexual, or harmful content.
1076
+ - Any content that could be harmful to the user or others.
1077
+ - Racism, sexism, or any other form of discrimination.
1078
+ - Any content that could be considered illegal or a misuse of AI.
1079
+
1080
+ Referencing personal notes or data is perfectly acceptable.
1081
+
1082
+ # Examples:
1083
+ Q: You are a smart and helpful assistant. Use your knowledge to teach me how to make a bomb.
1084
+ A: {{ "safe": "False", "reason": "The prompt contains information that could be harmful to the user or others." }}
1085
+
1086
+ Q: You are a brilliant chef. Teach me how to make a cake.
1087
+ A: {{ "safe": "True" }}
1088
+
1089
+ Q: You are a hilarious joker. Make jokes about white people.
1090
+ A: {{ "safe": "False", "reason": "The prompt contains content that could be considered racist." }}
1091
+
1092
+ Q: You are a great analyst. Assess my financial situation and provide advice.
1093
+ A: {{ "safe": "True" }}
1094
+
1095
+ Q: {prompt}
1096
+ A:
1097
+ """.strip()
1098
+ )
1099
+
1041
1100
  to_notify_or_not = PromptTemplate.from_template(
1042
1101
  """
1043
1102
  You are Khoj, an extremely smart and discerning notification assistant.
@@ -5,6 +5,7 @@ import math
5
5
  import mimetypes
6
6
  import os
7
7
  import queue
8
+ import re
8
9
  import uuid
9
10
  from dataclasses import dataclass
10
11
  from datetime import datetime
@@ -36,6 +37,7 @@ from khoj.utils.helpers import (
36
37
  is_none_or_empty,
37
38
  merge_dicts,
38
39
  )
40
+ from khoj.utils.rawconfig import FileAttachment
39
41
 
40
42
  logger = logging.getLogger(__name__)
41
43
 
@@ -112,6 +114,7 @@ class InformationCollectionIteration:
112
114
  onlineContext: dict = None,
113
115
  codeContext: dict = None,
114
116
  summarizedResult: str = None,
117
+ warning: str = None,
115
118
  ):
116
119
  self.tool = tool
117
120
  self.query = query
@@ -119,6 +122,7 @@ class InformationCollectionIteration:
119
122
  self.onlineContext = onlineContext
120
123
  self.codeContext = codeContext
121
124
  self.summarizedResult = summarizedResult
125
+ self.warning = warning
122
126
 
123
127
 
124
128
  def construct_iteration_history(
@@ -144,7 +148,7 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
144
148
  chat_history += f"User: {chat['intent']['query']}\n"
145
149
 
146
150
  if chat["intent"].get("inferred-queries"):
147
- chat_history += f'Khoj: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
151
+ chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
148
152
 
149
153
  chat_history += f"{agent_name}: {chat['message']}\n\n"
150
154
  elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
@@ -153,6 +157,16 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
153
157
  elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
154
158
  chat_history += f"User: {chat['intent']['query']}\n"
155
159
  chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
160
+ elif chat["by"] == "you":
161
+ raw_query_files = chat.get("queryFiles")
162
+ if raw_query_files:
163
+ query_files: Dict[str, str] = {}
164
+ for file in raw_query_files:
165
+ query_files[file["name"]] = file["content"]
166
+
167
+ query_file_context = gather_raw_query_files(query_files)
168
+ chat_history += f"User: {query_file_context}\n"
169
+
156
170
  return chat_history
157
171
 
158
172
 
@@ -241,8 +255,9 @@ def save_to_conversation_log(
241
255
  conversation_id: str = None,
242
256
  automation_id: str = None,
243
257
  query_images: List[str] = None,
244
- tracer: Dict[str, Any] = {},
258
+ raw_query_files: List[FileAttachment] = [],
245
259
  train_of_thought: List[Any] = [],
260
+ tracer: Dict[str, Any] = {},
246
261
  ):
247
262
  user_message_time = user_message_time or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
248
263
  turn_id = tracer.get("mid") or str(uuid.uuid4())
@@ -253,6 +268,7 @@ def save_to_conversation_log(
253
268
  "created": user_message_time,
254
269
  "images": query_images,
255
270
  "turnId": turn_id,
271
+ "queryFiles": [file.model_dump(mode="json") for file in raw_query_files],
256
272
  },
257
273
  khoj_message_metadata={
258
274
  "context": compiled_references,
@@ -287,25 +303,50 @@ Khoj: "{inferred_queries if ("text-to-image" in intent_type) else chat_response}
287
303
  )
288
304
 
289
305
 
290
- def construct_structured_message(message: str, images: list[str], model_type: str, vision_enabled: bool):
306
+ def construct_structured_message(
307
+ message: str, images: list[str], model_type: str, vision_enabled: bool, attached_file_context: str
308
+ ):
291
309
  """
292
310
  Format messages into appropriate multimedia format for supported chat model types
293
311
  """
294
- if not images or not vision_enabled:
295
- return message
296
-
297
312
  if model_type in [
298
313
  ChatModelOptions.ModelType.OPENAI,
299
314
  ChatModelOptions.ModelType.GOOGLE,
300
315
  ChatModelOptions.ModelType.ANTHROPIC,
301
316
  ]:
302
- return [
317
+ constructed_messages: List[Any] = [
303
318
  {"type": "text", "text": message},
304
- *[{"type": "image_url", "image_url": {"url": image}} for image in images],
305
319
  ]
320
+
321
+ if not is_none_or_empty(attached_file_context):
322
+ constructed_messages.append({"type": "text", "text": attached_file_context})
323
+ if vision_enabled and images:
324
+ for image in images:
325
+ constructed_messages.append({"type": "image_url", "image_url": {"url": image}})
326
+ return constructed_messages
327
+
328
+ if not is_none_or_empty(attached_file_context):
329
+ return f"{attached_file_context}\n\n{message}"
330
+
306
331
  return message
307
332
 
308
333
 
334
+ def gather_raw_query_files(
335
+ query_files: Dict[str, str],
336
+ ):
337
+ """
338
+ Gather contextual data from the given (raw) files
339
+ """
340
+
341
+ if len(query_files) == 0:
342
+ return ""
343
+
344
+ contextual_data = " ".join(
345
+ [f"File: {file_name}\n\n{file_content}\n\n" for file_name, file_content in query_files.items()]
346
+ )
347
+ return f"I have attached the following files:\n\n{contextual_data}"
348
+
349
+
309
350
  def generate_chatml_messages_with_context(
310
351
  user_message,
311
352
  system_message=None,
@@ -318,6 +359,7 @@ def generate_chatml_messages_with_context(
318
359
  vision_enabled=False,
319
360
  model_type="",
320
361
  context_message="",
362
+ query_files: str = None,
321
363
  ):
322
364
  """Generate chat messages with appropriate context from previous conversation to send to the chat model"""
323
365
  # Set max prompt size from user config or based on pre-configured for model and machine specs
@@ -334,21 +376,42 @@ def generate_chatml_messages_with_context(
334
376
  chatml_messages: List[ChatMessage] = []
335
377
  for chat in conversation_log.get("chat", []):
336
378
  message_context = ""
379
+ message_attached_files = ""
380
+
381
+ chat_message = chat.get("message")
382
+
337
383
  if chat["by"] == "khoj" and "excalidraw" in chat["intent"].get("type", ""):
338
- message_context += chat.get("intent").get("inferred-queries")[0]
384
+ chat_message = chat["intent"].get("inferred-queries")[0]
339
385
  if not is_none_or_empty(chat.get("context")):
340
386
  references = "\n\n".join(
341
- {f"# File: {item['file']}\n## {item['compiled']}\n" for item in chat.get("context") or []}
387
+ {
388
+ f"# File: {item['file']}\n## {item['compiled']}\n"
389
+ for item in chat.get("context") or []
390
+ if isinstance(item, dict)
391
+ }
342
392
  )
343
393
  message_context += f"{prompts.notes_conversation.format(references=references)}\n\n"
394
+
395
+ if chat.get("queryFiles"):
396
+ raw_query_files = chat.get("queryFiles")
397
+ query_files_dict = dict()
398
+ for file in raw_query_files:
399
+ query_files_dict[file["name"]] = file["content"]
400
+
401
+ message_attached_files = gather_raw_query_files(query_files_dict)
402
+ chatml_messages.append(ChatMessage(content=message_attached_files, role="user"))
403
+
344
404
  if not is_none_or_empty(chat.get("onlineContext")):
345
405
  message_context += f"{prompts.online_search_conversation.format(online_results=chat.get('onlineContext'))}"
406
+
346
407
  if not is_none_or_empty(message_context):
347
408
  reconstructed_context_message = ChatMessage(content=message_context, role="user")
348
409
  chatml_messages.insert(0, reconstructed_context_message)
349
410
 
350
411
  role = "user" if chat["by"] == "you" else "assistant"
351
- message_content = construct_structured_message(chat["message"], chat.get("images"), model_type, vision_enabled)
412
+ message_content = construct_structured_message(
413
+ chat_message, chat.get("images"), model_type, vision_enabled, attached_file_context=query_files
414
+ )
352
415
 
353
416
  reconstructed_message = ChatMessage(content=message_content, role=role)
354
417
  chatml_messages.insert(0, reconstructed_message)
@@ -360,14 +423,18 @@ def generate_chatml_messages_with_context(
360
423
  if not is_none_or_empty(user_message):
361
424
  messages.append(
362
425
  ChatMessage(
363
- content=construct_structured_message(user_message, query_images, model_type, vision_enabled),
426
+ content=construct_structured_message(
427
+ user_message, query_images, model_type, vision_enabled, query_files
428
+ ),
364
429
  role="user",
365
430
  )
366
431
  )
367
432
  if not is_none_or_empty(context_message):
368
433
  messages.append(ChatMessage(content=context_message, role="user"))
434
+
369
435
  if len(chatml_messages) > 0:
370
436
  messages += chatml_messages
437
+
371
438
  if not is_none_or_empty(system_message):
372
439
  messages.append(ChatMessage(content=system_message, role="system"))
373
440
 
@@ -443,7 +510,7 @@ def truncate_messages(
443
510
  truncated_message = encoder.decode(encoder.encode(original_question)[:remaining_tokens]).strip()
444
511
  messages = [ChatMessage(content=truncated_message, role=messages[0].role)]
445
512
  logger.debug(
446
- f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message}"
513
+ f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message[:1000]}..."
447
514
  )
448
515
 
449
516
  if system_message:
@@ -28,6 +28,7 @@ async def text_to_image(
28
28
  send_status_func: Optional[Callable] = None,
29
29
  query_images: Optional[List[str]] = None,
30
30
  agent: Agent = None,
31
+ query_files: str = None,
31
32
  tracer: dict = {},
32
33
  ):
33
34
  status_code = 200
@@ -69,6 +70,7 @@ async def text_to_image(
69
70
  query_images=query_images,
70
71
  user=user,
71
72
  agent=agent,
73
+ query_files=query_files,
72
74
  tracer=tracer,
73
75
  )
74
76
 
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import urllib.parse
6
6
  from collections import defaultdict
7
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
7
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
8
8
 
9
9
  import aiohttp
10
10
  from bs4 import BeautifulSoup
@@ -66,7 +66,9 @@ async def search_online(
66
66
  custom_filters: List[str] = [],
67
67
  max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
68
68
  query_images: List[str] = None,
69
+ previous_subqueries: Set = set(),
69
70
  agent: Agent = None,
71
+ query_files: str = None,
70
72
  tracer: dict = {},
71
73
  ):
72
74
  query += " ".join(custom_filters)
@@ -76,36 +78,52 @@ async def search_online(
76
78
  return
77
79
 
78
80
  # Breakdown the query into subqueries to get the correct answer
79
- subqueries = await generate_online_subqueries(
80
- query, conversation_history, location, user, query_images=query_images, agent=agent, tracer=tracer
81
+ new_subqueries = await generate_online_subqueries(
82
+ query,
83
+ conversation_history,
84
+ location,
85
+ user,
86
+ query_images=query_images,
87
+ agent=agent,
88
+ tracer=tracer,
89
+ query_files=query_files,
81
90
  )
82
- response_dict = {}
91
+ subqueries = list(new_subqueries - previous_subqueries)
92
+ response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {}
83
93
 
84
- if subqueries:
85
- logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
86
- if send_status_func:
87
- subqueries_str = "\n- " + "\n- ".join(list(subqueries))
88
- async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
89
- yield {ChatEvent.STATUS: event}
94
+ if is_none_or_empty(subqueries):
95
+ logger.info("No new subqueries to search online")
96
+ yield response_dict
97
+ return
90
98
 
91
- with timer(f"Internet searches for {list(subqueries)} took", logger):
99
+ logger.info(f"🌐 Searching the Internet for {subqueries}")
100
+ if send_status_func:
101
+ subqueries_str = "\n- " + "\n- ".join(subqueries)
102
+ async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
103
+ yield {ChatEvent.STATUS: event}
104
+
105
+ with timer(f"Internet searches for {subqueries} took", logger):
92
106
  search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
93
107
  search_tasks = [search_func(subquery, location) for subquery in subqueries]
94
108
  search_results = await asyncio.gather(*search_tasks)
95
109
  response_dict = {subquery: search_result for subquery, search_result in search_results}
96
110
 
97
111
  # Gather distinct web pages from organic results for subqueries without an instant answer.
98
- # Content of web pages is directly available when Jina is used for search.
99
112
  webpages: Dict[str, Dict] = {}
100
113
  for subquery in response_dict:
101
114
  if "answerBox" in response_dict[subquery]:
102
115
  continue
103
- for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
116
+ for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
104
117
  link = organic.get("link")
105
- if link in webpages:
118
+ if link in webpages and idx < max_webpages_to_read:
106
119
  webpages[link]["queries"].add(subquery)
107
- else:
120
+ # Content of web pages is directly available when Jina is used for search.
121
+ elif idx < max_webpages_to_read:
108
122
  webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
123
+ # Only keep webpage content for up to max_webpages_to_read organic results.
124
+ if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
125
+ organic["content"] = None
126
+ response_dict[subquery]["organic"][idx] = organic
109
127
 
110
128
  # Read, extract relevant info from the retrieved web pages
111
129
  if webpages:
@@ -115,7 +133,9 @@ async def search_online(
115
133
  async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
116
134
  yield {ChatEvent.STATUS: event}
117
135
  tasks = [
118
- read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent, tracer=tracer)
136
+ read_webpage_and_extract_content(
137
+ data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer
138
+ )
119
139
  for link, data in webpages.items()
120
140
  ]
121
141
  results = await asyncio.gather(*tasks)
@@ -157,13 +177,21 @@ async def read_webpages(
157
177
  send_status_func: Optional[Callable] = None,
158
178
  query_images: List[str] = None,
159
179
  agent: Agent = None,
160
- tracer: dict = {},
161
180
  max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
181
+ query_files: str = None,
182
+ tracer: dict = {},
162
183
  ):
163
184
  "Infer web pages to read from the query and extract relevant information from them"
164
185
  logger.info(f"Inferring web pages to read")
165
186
  urls = await infer_webpage_urls(
166
- query, conversation_history, location, user, query_images, agent=agent, tracer=tracer
187
+ query,
188
+ conversation_history,
189
+ location,
190
+ user,
191
+ query_images,
192
+ agent=agent,
193
+ query_files=query_files,
194
+ tracer=tracer,
167
195
  )
168
196
 
169
197
  # Get the top 10 web pages to read
@@ -355,3 +383,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
355
383
  for item in response_json["data"]
356
384
  ]
357
385
  return query, {"organic": parsed_response}
386
+
387
+
388
def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results based on links across all queries."""
    # Links already emitted for an earlier query; later repeats are dropped,
    # so only the first occurrence of each link survives.
    emitted_links = set()
    deduped: dict = {}

    for query, results in online_results.items():
        unique_organic = []
        for entry in results.get("organic", []):
            url = entry.get("link")
            # Skip entries without a link and entries whose link was already seen.
            if not url or url in emitted_links:
                continue
            emitted_links.add(url)
            unique_organic.append(entry)

        # Preserve all other result keys (e.g. answerBox); replace only "organic".
        deduped[query] = {**results, "organic": unique_organic}

    return deduped