PyPI - khoj - Versions diffs - 1.27.2.dev18__py3-none-any.whl → 1.27.2.dev130__py3-none-any.whl - Mend

khoj 1.27.2.dev18__py3-none-any.whl → 1.27.2.dev130__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

khoj/routers/research.py ADDED Viewed

@@ -0,0 +1,321 @@
+import json
+import logging
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional
+import yaml
+from fastapi import Request
+from khoj.database.adapters import ConversationAdapters, EntryAdapters
+from khoj.database.models import Agent, KhojUser
+from khoj.processor.conversation import prompts
+from khoj.processor.conversation.utils import (
+    InformationCollectionIteration,
+    clean_json,
+    construct_iteration_history,
+    construct_tool_chat_history,
+)
+from khoj.processor.tools.online_search import read_webpages, search_online
+from khoj.processor.tools.run_code import run_code
+from khoj.routers.api import extract_references_and_questions
+from khoj.routers.helpers import (
+    ChatEvent,
+    construct_chat_history,
+    extract_relevant_info,
+    generate_summary_from_files,
+    send_message_to_model_wrapper,
+)
+from khoj.utils.helpers import (
+    ConversationCommand,
+    function_calling_description_for_llm,
+    is_none_or_empty,
+    timer,
+)
+from khoj.utils.rawconfig import LocationData
+logger = logging.getLogger(__name__)
+async def apick_next_tool(
+    query: str,
+    conversation_history: dict,
+    user: KhojUser = None,
+    query_images: Optional[List[str]] = None,
+    location: LocationData = None,
+    user_name: str = None,
+    agent: Agent = None,
+    previous_iterations_history: str = None,
+    max_iterations: int = 5,
+    send_status_func: Optional[Callable] = None,
+    tracer: Optional[dict] = None,
+):
+    """
+    Ask the chat model which of the available tools to use next to answer the query.
+
+    One tool is picked per call. The history of previous iterations is passed into the planning prompt
+    so subsequent calls can refine the answer.
+
+    Yields:
+        - {ChatEvent.STATUS: event} for every event produced by send_status_func, when one is given.
+        - An InformationCollectionIteration holding the selected tool and the generated query.
+          Both are None when the model response cannot be parsed.
+    """
+    # Create fresh containers per call. Mutable default arguments would be shared across calls.
+    query_images = query_images if query_images is not None else []
+    tracer = tracer if tracer is not None else {}
+    # List every tool when the agent has no tool restrictions, else only the tools the agent may use
+    agent_tools = (agent.input_tools if agent else None) or []
+    tool_options_str = "".join(
+        f'- "{tool.value}": "{description}"\n'
+        for tool, description in function_calling_description_for_llm.items()
+        if not agent_tools or tool.value in agent_tools
+    )
+    chat_history = construct_chat_history(conversation_history, agent_name=agent.name if agent else "Khoj")
+    if query_images:
+        query = f"[placeholder for user attached images]\n{query}"
+    personality_context = (
+        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
+    )
+    # Build the planning prompt from the tools, chat history and user context
+    today = datetime.today()
+    location_data = f"{location}" if location else "Unknown"
+    function_planning_prompt = prompts.plan_function_execution.format(
+        tools=tool_options_str,
+        chat_history=chat_history,
+        personality_context=personality_context,
+        current_date=today.strftime("%Y-%m-%d"),
+        day_of_week=today.strftime("%A"),
+        username=user_name or "Unknown",
+        location=location_data,
+        previous_iterations=previous_iterations_history,
+        max_iterations=max_iterations,
+    )
+    with timer("Chat actor: Infer information sources to refer", logger):
+        response = await send_message_to_model_wrapper(
+            query=query,
+            context=function_planning_prompt,
+            response_type="json_object",
+            user=user,
+            query_images=query_images,
+            tracer=tracer,
+        )
+    try:
+        response = clean_json(response)
+        response = json.loads(response)
+        selected_tool = response.get("tool", None)
+        generated_query = response.get("query", None)
+        scratchpad = response.get("scratchpad", None)
+        logger.info(f"Response for determining relevant tools: {response}")
+        if send_status_func:
+            # Show the model's reasoning when it gave one.
+            # Otherwise name the chosen tool, so the literal text "None" is never shown as status.
+            if scratchpad:
+                status_message = f"{scratchpad}"
+            else:
+                status_message = "**Determined Tool**: "
+                status_message += f"{selected_tool}({generated_query})." if selected_tool else "respond."
+            async for event in send_status_func(status_message):
+                yield {ChatEvent.STATUS: event}
+        yield InformationCollectionIteration(
+            tool=selected_tool,
+            query=generated_query,
+        )
+    except Exception as e:
+        # Fall back to an iteration without a tool. The caller treats it as its exit condition.
+        logger.error(f"Invalid response for determining relevant tools: {response}. {e}", exc_info=True)
+        yield InformationCollectionIteration(
+            tool=None,
+            query=None,
+        )
+async def execute_information_collection(
+    request: Request,
+    user: KhojUser,
+    query: str,
+    conversation_id: str,
+    conversation_history: dict,
+    query_images: List[str],
+    agent: Agent = None,
+    send_status_func: Optional[Callable] = None,
+    user_name: str = None,
+    location: LocationData = None,
+    file_filters: Optional[List[str]] = None,
+    tracer: Optional[dict] = None,
+):
+    """
+    Iteratively pick and run research tools to collect information for answering the query.
+
+    Each iteration asks apick_next_tool for the next tool, runs that tool and records its results on the iteration.
+    The loop ends after MAX_ITERATIONS iterations, or as soon as the picked tool matches none of the supported tools.
+
+    Yields:
+        - Status events from apick_next_tool, the tools and send_status_func, as they arrive.
+        - The InformationCollectionIteration of every iteration, including the last one where no tool was picked.
+    """
+    # Create fresh containers per call. Mutable default arguments would be shared across calls.
+    file_filters = file_filters if file_filters is not None else []
+    tracer = tracer if tracer is not None else {}
+    current_iteration = 0
+    MAX_ITERATIONS = 5
+    previous_iterations: List[InformationCollectionIteration] = []
+    while current_iteration < MAX_ITERATIONS:
+        online_results: Dict = dict()
+        code_results: Dict = dict()
+        document_results: List[Dict[str, str]] = []
+        summarize_files: str = ""
+        this_iteration = InformationCollectionIteration(tool=None, query=query)
+        previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration)
+        async for result in apick_next_tool(
+            query,
+            conversation_history,
+            user,
+            query_images,
+            location,
+            user_name,
+            agent,
+            previous_iterations_history,
+            MAX_ITERATIONS,
+            send_status_func,
+            tracer=tracer,
+        ):
+            if isinstance(result, dict) and ChatEvent.STATUS in result:
+                yield result[ChatEvent.STATUS]
+            elif isinstance(result, InformationCollectionIteration):
+                this_iteration = result
+        if this_iteration.tool == ConversationCommand.Notes:
+            this_iteration.context = []
+            document_results = []
+            async for result in extract_references_and_questions(
+                request,
+                construct_tool_chat_history(previous_iterations, ConversationCommand.Notes),
+                this_iteration.query,
+                7,
+                None,
+                conversation_id,
+                [ConversationCommand.Default],
+                location,
+                send_status_func,
+                query_images,
+                agent=agent,
+                tracer=tracer,
+            ):
+                if isinstance(result, dict) and ChatEvent.STATUS in result:
+                    yield result[ChatEvent.STATUS]
+                elif isinstance(result, tuple):
+                    document_results = result[0]
+                    this_iteration.context += document_results
+            # Report the notes found. This stays inside the Notes branch so that a notes search
+            # without results does not fall through to the exit condition below.
+            if send_status_func and not is_none_or_empty(document_results):
+                try:
+                    distinct_files = {d["file"] for d in document_results}
+                    # De-duplicate headings while keeping the order in which they were found
+                    distinct_headings = list(
+                        dict.fromkeys(d["compiled"].split("\n")[0] for d in document_results if "compiled" in d)
+                    )
+                    # Strip only leading # from headings
+                    headings_str = "\n- " + "\n- ".join(heading.lstrip("#") for heading in distinct_headings)
+                    async for result in send_status_func(
+                        f"**Found {len(distinct_headings)} Notes Across {len(distinct_files)} Files**: {headings_str}"
+                    ):
+                        yield result
+                except Exception as e:
+                    logger.error(f"Error extracting document references: {e}", exc_info=True)
+        elif this_iteration.tool == ConversationCommand.Online:
+            async for result in search_online(
+                this_iteration.query,
+                construct_tool_chat_history(previous_iterations, ConversationCommand.Online),
+                location,
+                user,
+                send_status_func,
+                [],
+                max_webpages_to_read=0,
+                query_images=query_images,
+                agent=agent,
+                tracer=tracer,
+            ):
+                if isinstance(result, dict) and ChatEvent.STATUS in result:
+                    yield result[ChatEvent.STATUS]
+                else:
+                    online_results: Dict[str, Dict] = result  # type: ignore
+                    this_iteration.onlineContext = online_results
+        elif this_iteration.tool == ConversationCommand.Webpage:
+            try:
+                async for result in read_webpages(
+                    this_iteration.query,
+                    construct_tool_chat_history(previous_iterations, ConversationCommand.Webpage),
+                    location,
+                    user,
+                    send_status_func,
+                    query_images=query_images,
+                    agent=agent,
+                    tracer=tracer,
+                ):
+                    if isinstance(result, dict) and ChatEvent.STATUS in result:
+                        yield result[ChatEvent.STATUS]
+                    else:
+                        direct_web_pages: Dict[str, Dict] = result  # type: ignore
+                        # Merge the read webpages into the online results, keyed by web query
+                        for web_query in direct_web_pages:
+                            if online_results.get(web_query):
+                                online_results[web_query]["webpages"] = direct_web_pages[web_query]["webpages"]
+                            else:
+                                online_results[web_query] = {"webpages": direct_web_pages[web_query]["webpages"]}
+                        this_iteration.onlineContext = online_results
+            except Exception as e:
+                logger.error(f"Error reading webpages: {e}", exc_info=True)
+        elif this_iteration.tool == ConversationCommand.Code:
+            try:
+                async for result in run_code(
+                    this_iteration.query,
+                    # Build the history for the code tool itself, not for the webpage tool
+                    construct_tool_chat_history(previous_iterations, ConversationCommand.Code),
+                    "",
+                    location,
+                    user,
+                    send_status_func,
+                    query_images=query_images,
+                    agent=agent,
+                    tracer=tracer,
+                ):
+                    if isinstance(result, dict) and ChatEvent.STATUS in result:
+                        yield result[ChatEvent.STATUS]
+                    else:
+                        code_results: Dict[str, Dict] = result  # type: ignore
+                        this_iteration.codeContext = code_results
+                if send_status_func:
+                    # Count the collected results. The iteration's code context is only set when code ran.
+                    async for result in send_status_func(f"**Ran code snippets**: {len(code_results)}"):
+                        yield result
+            except ValueError as e:
+                logger.warning(
+                    f"Failed to use code tool: {e}. Attempting to respond without code results",
+                    exc_info=True,
+                )
+        elif this_iteration.tool == ConversationCommand.Summarize:
+            try:
+                async for result in generate_summary_from_files(
+                    this_iteration.query,
+                    user,
+                    file_filters,
+                    construct_tool_chat_history(previous_iterations),
+                    query_images=query_images,
+                    agent=agent,
+                    send_status_func=send_status_func,
+                ):
+                    if isinstance(result, dict) and ChatEvent.STATUS in result:
+                        yield result[ChatEvent.STATUS]
+                    else:
+                        summarize_files = result  # type: ignore
+            except Exception as e:
+                logger.error(f"Error generating summary: {e}", exc_info=True)
+        else:
+            # No valid tools. This is our exit condition.
+            current_iteration = MAX_ITERATIONS
+        current_iteration += 1
+        if document_results or online_results or code_results or summarize_files:
+            labelled_results = [
+                ("Document References", document_results),
+                ("Online Results", online_results),
+                ("Code Results", code_results),
+                ("Summarized Files", summarize_files),
+            ]
+            results_data = "**Results**:\n"
+            for label, tool_result in labelled_results:
+                if tool_result:
+                    dumped = yaml.dump(tool_result, allow_unicode=True, sort_keys=False, default_flow_style=False)
+                    results_data += f"**{label}**: {dumped}\n"
+            this_iteration.summarizedResult = results_data
+        previous_iterations.append(this_iteration)
+        yield this_iteration

khoj/search_filter/date_filter.py CHANGED Viewed

@@ -7,8 +7,6 @@ from math import inf
 from typing import List, Tuple
 import dateparser as dtparse
-from dateparser.search import search_dates
-from dateparser_data.settings import default_parsers
 from dateutil.relativedelta import relativedelta
 from khoj.search_filter.base_filter import BaseFilter
@@ -23,7 +21,7 @@ class DateFilter(BaseFilter):
     # - dt>="yesterday" dt<"tomorrow"
     # - dt>="last week"
     # - dt:"2 years ago"
-    date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
+    date_regex = r"dt([:><=]{1,2})[\"'‘’](.*?)[\"'‘’]"
     def __init__(self, entry_key="compiled"):
         self.entry_key = entry_key

khoj/search_filter/file_filter.py CHANGED Viewed

@@ -1,11 +1,10 @@
-import fnmatch
 import logging
 import re
 from collections import defaultdict
 from typing import List
 from khoj.search_filter.base_filter import BaseFilter
-from khoj.utils.helpers import LRU, timer
+from khoj.utils.helpers import LRU
 logger = logging.getLogger(__name__)

khoj/search_type/text_search.py CHANGED Viewed

@@ -102,8 +102,8 @@ def load_embeddings(
 async def query(
-    user: KhojUser,
     raw_query: str,
+    user: KhojUser,
     type: SearchType = SearchType.All,
     question_embedding: Union[torch.Tensor, None] = None,
     max_distance: float = None,
@@ -130,12 +130,12 @@ async def query(
     top_k = 10
     with timer("Search Time", logger, state.device):
         hits = EntryAdapters.search_with_embeddings(
-            user=user,
+            raw_query=raw_query,
             embeddings=question_embedding,
             max_results=top_k,
             file_type_filter=file_type,
-            raw_query=raw_query,
             max_distance=max_distance,
+            user=user,
             agent=agent,
         ).all()
         hits = await sync_to_async(list)(hits)  # type: ignore[call-arg]

khoj/utils/helpers.py CHANGED Viewed

@@ -313,12 +313,14 @@ class ConversationCommand(str, Enum):
     Help = "help"
     Online = "online"
     Webpage = "webpage"
+    Code = "code"
     Image = "image"
     Text = "text"
     Automation = "automation"
     AutomatedTask = "automated_task"
     Summarize = "summarize"
     Diagram = "diagram"
+    Research = "research"
 command_descriptions = {
@@ -327,11 +329,13 @@ command_descriptions = {
     ConversationCommand.Default: "The default command when no command specified. It intelligently auto-switches between general and notes mode.",
     ConversationCommand.Online: "Search for information on the internet.",
     ConversationCommand.Webpage: "Get information from webpage suggested by you.",
+    ConversationCommand.Code: "Run Python code to parse information, run complex calculations, create documents and charts.",
     ConversationCommand.Image: "Generate illustrative, creative images by describing your imagination in words.",
     ConversationCommand.Automation: "Automatically run your query at a specified time or interval.",
     ConversationCommand.Help: "Get help with how to use or setup Khoj from the documentation",
     ConversationCommand.Summarize: "Get help with a question pertaining to an entire document.",
     ConversationCommand.Diagram: "Draw a flowchart, diagram, or any other visual representation best expressed with primitives like lines, rectangles, and text.",
+    ConversationCommand.Research: "Do deep research on a topic. This will take longer than usual, but give a more detailed, comprehensive answer.",
 }
 command_descriptions_for_agent = {
@@ -340,6 +344,7 @@ command_descriptions_for_agent = {
     ConversationCommand.Online: "Agent can search the internet for information.",
     ConversationCommand.Webpage: "Agent can read suggested web pages for information.",
     ConversationCommand.Summarize: "Agent can read an entire document. Agents knowledge base must be a single document.",
+    ConversationCommand.Research: "Agent can do deep research on a topic.",
 }
 tool_descriptions_for_llm = {
@@ -348,18 +353,26 @@ tool_descriptions_for_llm = {
     ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.",
     ConversationCommand.Online: "To search for the latest, up-to-date information from the internet. Note: **Questions about Khoj should always use this data source**",
     ConversationCommand.Webpage: "To use if the user has directly provided the webpage urls or you are certain of the webpage urls to read.",
+    ConversationCommand.Code: "To run Python code in a Pyodide sandbox with no network access. Helpful when need to parse information, run complex calculations, create documents and charts for user. Matplotlib, bs4, pandas, numpy, etc. are available.",
     ConversationCommand.Summarize: "To retrieve an answer that depends on the entire document or a large text.",
 }
+# Descriptions of the tools offered to the chat model when it plans the next research step.
+# apick_next_tool in khoj/routers/research.py lists these entries in its planning prompt,
+# restricted to the agent's input tools when the agent defines any.
+function_calling_description_for_llm = {
+    ConversationCommand.Notes: "To search the user's personal knowledge base. Especially helpful if the question expects context from the user's notes or documents.",
+    ConversationCommand.Online: "To search the internet for information. Provide all relevant context to ensure new searches, not previously run, are performed.",
+    ConversationCommand.Webpage: "To extract information from a webpage. Useful for more detailed research from the internet. Usually used when you know the webpage links to refer to. Share the webpage link and information to extract in your query.",
+    ConversationCommand.Code: "To run Python code in a Pyodide sandbox with no network access. Helpful when need to parse information, run complex calculations, create documents and charts for user. Matplotlib, bs4, pandas, numpy, etc. are available.",
+}
 mode_descriptions_for_llm = {
-    ConversationCommand.Image: "Use this if you are confident the user is requesting you to create a new picture based on their description.",
+    ConversationCommand.Image: "Use this if you are confident the user is requesting you to create a new picture based on their description. This does not support generating charts or graphs.",
     ConversationCommand.Automation: "Use this if you are confident the user is requesting a response at a scheduled date, time and frequency",
     ConversationCommand.Text: "Use this if a normal text response would be sufficient for accurately responding to the query.",
     ConversationCommand.Diagram: "Use this if the user is requesting a diagram or visual representation that requires primitives like lines, rectangles, and text.",
 }
 mode_descriptions_for_agent = {
-    ConversationCommand.Image: "Agent can generate image in response.",
+    ConversationCommand.Image: "Agent can generate images in response. It cannot not use this to generate charts and graphs.",
     ConversationCommand.Automation: "Agent can schedule a task to run at a scheduled date, time and frequency in response.",
     ConversationCommand.Text: "Agent can generate text in response.",
     ConversationCommand.Diagram: "Agent can generate a visual representation that requires primitives like lines, rectangles, and text.",

khoj/utils/yaml.py CHANGED Viewed

@@ -41,3 +41,7 @@ def parse_config_from_string(yaml_config: dict) -> FullConfig:
 def parse_config_from_file(yaml_config_file):
     "Parse and validate config in YML file"
     return parse_config_from_string(load_config_from_file(yaml_config_file))
+def yaml_dump(data):
+    "Serialize data to a block style YAML string, keeping unicode characters and the given key order"
+    dump_options = {"allow_unicode": True, "sort_keys": False, "default_flow_style": False}
+    return yaml.dump(data, **dump_options)

{khoj-1.27.2.dev18.dist-info → khoj-1.27.2.dev130.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: khoj
-Version: 1.27.2.dev18
+Version: 1.27.2.dev130
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev
@@ -78,6 +78,7 @@ Requires-Dist: black>=23.1.0; extra == 'dev'
 Requires-Dist: boto3>=1.34.57; extra == 'dev'
 Requires-Dist: factory-boy>=3.2.1; extra == 'dev'
 Requires-Dist: freezegun>=1.2.0; extra == 'dev'
+Requires-Dist: gitpython~=3.1.43; extra == 'dev'
 Requires-Dist: google-auth==2.23.3; extra == 'dev'
 Requires-Dist: gunicorn==22.0.0; extra == 'dev'
 Requires-Dist: mypy>=1.0.1; extra == 'dev'

khoj 1.27.2.dev18__py3-none-any.whl → 1.27.2.dev130__py3-none-any.whl

khoj 1.27.2.dev18__py3-none-any.whl → 1.27.2.dev130__py3-none-any.whl