khoj 2.0.0b14.dev9__py3-none-any.whl → 2.0.0b14.dev43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. khoj/interface/compiled/404/index.html +2 -2
  2. khoj/interface/compiled/_next/static/chunks/9808-c0742b05e1ef29ba.js +1 -0
  3. khoj/interface/compiled/_next/static/chunks/app/agents/{page-f04757fab73908a4.js → page-e291b49977f43880.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/app/automations/{page-fb0e9353e86acd25.js → page-1047097af99d31c7.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/app/chat/{page-fd693f65831a2f97.js → page-1b4893b1a9957220.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/{page-89f5654035b07c00.js → page-1567cac7b79a7c59.js} +1 -1
  7. khoj/interface/compiled/_next/static/chunks/app/search/{page-6ca71d3d56fc6935.js → page-3639e50ec3e9acfd.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/settings/{page-a798de3944f59629.js → page-6081362437c82470.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-8addeb8079c3215b.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-07d7ff92aee0bb69.js → page-819c6536c15e3d31.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/{webpack-8087292aa01e8e55.js → webpack-5393aad3d824e0cb.js} +1 -1
  12. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  13. khoj/interface/compiled/_next/static/css/5c7a72bad47e50b3.css +25 -0
  14. khoj/interface/compiled/_next/static/css/c34713c98384ee87.css +1 -0
  15. khoj/interface/compiled/_next/static/css/cea3bdfe98c144bd.css +1 -0
  16. khoj/interface/compiled/agents/index.html +2 -2
  17. khoj/interface/compiled/agents/index.txt +2 -2
  18. khoj/interface/compiled/automations/index.html +2 -2
  19. khoj/interface/compiled/automations/index.txt +3 -3
  20. khoj/interface/compiled/chat/index.html +2 -2
  21. khoj/interface/compiled/chat/index.txt +3 -3
  22. khoj/interface/compiled/index.html +2 -2
  23. khoj/interface/compiled/index.txt +2 -2
  24. khoj/interface/compiled/search/index.html +2 -2
  25. khoj/interface/compiled/search/index.txt +2 -2
  26. khoj/interface/compiled/settings/index.html +2 -2
  27. khoj/interface/compiled/settings/index.txt +4 -4
  28. khoj/interface/compiled/share/chat/index.html +2 -2
  29. khoj/interface/compiled/share/chat/index.txt +2 -2
  30. khoj/processor/conversation/google/utils.py +1 -1
  31. khoj/processor/conversation/openai/utils.py +35 -2
  32. khoj/processor/conversation/prompts.py +32 -21
  33. khoj/processor/tools/run_code.py +15 -22
  34. khoj/routers/api_chat.py +3 -1
  35. khoj/routers/helpers.py +44 -38
  36. khoj/utils/helpers.py +50 -10
  37. {khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/METADATA +1 -1
  38. {khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/RECORD +50 -50
  39. khoj/interface/compiled/_next/static/chunks/7127-9273a602fbda737e.js +0 -1
  40. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-9781b62e39ca7785.js +0 -1
  41. khoj/interface/compiled/_next/static/css/3090706713c12a32.css +0 -25
  42. khoj/interface/compiled/_next/static/css/a0c2fd63bb396f04.css +0 -1
  43. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
  44. khoj/interface/compiled/_next/static/css/fbacbdfd5e7f3f0e.css +0 -1
  45. /khoj/interface/compiled/_next/static/{D_w4o2vgOqOdUhGFnbYgh → OKbGpkzD6gHDfr1vAog6p}/_buildManifest.js +0 -0
  46. /khoj/interface/compiled/_next/static/{D_w4o2vgOqOdUhGFnbYgh → OKbGpkzD6gHDfr1vAog6p}/_ssgManifest.js +0 -0
  47. /khoj/interface/compiled/_next/static/chunks/{1327-511bb0a862efce80.js → 1327-e254819a9172cfa7.js} +0 -0
  48. /khoj/interface/compiled/_next/static/chunks/{1915-fbfe167c84ad60c5.js → 1915-5c6508f6ebb62a30.js} +0 -0
  49. /khoj/interface/compiled/_next/static/chunks/{2117-e78b6902ad6f75ec.js → 2117-080746c8e170c81a.js} +0 -0
  50. /khoj/interface/compiled/_next/static/chunks/{2939-4d4084c5b888b960.js → 2939-4af3fd24b8ffc9ad.js} +0 -0
  51. /khoj/interface/compiled/_next/static/chunks/{4447-d6cf93724d57e34b.js → 4447-cd95608f8e93e711.js} +0 -0
  52. /khoj/interface/compiled/_next/static/chunks/{8667-4b7790573b08c50d.js → 8667-50b03a89e82e0ba7.js} +0 -0
  53. /khoj/interface/compiled/_next/static/chunks/{9139-ce1ae935dac9c871.js → 9139-8ac4d9feb10f8869.js} +0 -0
  54. {khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/WHEEL +0 -0
  55. {khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/entry_points.txt +0 -0
  56. {khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/licenses/LICENSE +0 -0
khoj/processor/tools/run_code.py CHANGED
@@ -49,7 +49,7 @@ class GeneratedCode(NamedTuple):
 
 
 async def run_code(
-    query: str,
+    instructions: str,
     conversation_history: List[ChatMessageModel],
     context: str,
     location_data: LocationData,
@@ -63,12 +63,12 @@ async def run_code(
 ):
     # Generate Code
     if send_status_func:
-        async for event in send_status_func(f"**Generate code snippet** for {query}"):
+        async for event in send_status_func(f"**Generate code snippet** for {instructions}"):
             yield {ChatEvent.STATUS: event}
     try:
         with timer("Chat actor: Generate programs to execute", logger):
             generated_code = await generate_python_code(
-                query,
+                instructions,
                 conversation_history,
                 context,
                 location_data,
@@ -79,7 +79,7 @@ async def run_code(
                 query_files,
             )
     except Exception as e:
-        raise ValueError(f"Failed to generate code for {query} with error: {e}")
+        raise ValueError(f"Failed to generate code for {instructions} with error: {e}")
 
     # Prepare Input Data
     input_data = []
@@ -101,21 +101,21 @@ async def run_code(
             code = result.pop("code")
             cleaned_result = truncate_code_context({"cleaned": {"results": result}})["cleaned"]["results"]
             logger.info(f"Executed Code\n----\n{code}\n----\nResult\n----\n{cleaned_result}\n----")
-            yield {query: {"code": code, "results": result}}
+            yield {instructions: {"code": code, "results": result}}
     except asyncio.TimeoutError as e:
         # Call the sandbox_url/stop GET API endpoint to stop the code sandbox
-        error = f"Failed to run code for {query} with Timeout error: {e}"
+        error = f"Failed to run code for {instructions} with Timeout error: {e}"
         try:
             await aiohttp.ClientSession().get(f"{sandbox_url}/stop", timeout=5)
         except Exception as e:
            error += f"\n\nFailed to stop code sandbox with error: {e}"
         raise ValueError(error)
     except Exception as e:
-        raise ValueError(f"Failed to run code for {query} with error: {e}")
+        raise ValueError(f"Failed to run code for {instructions} with error: {e}")
 
 
 async def generate_python_code(
-    q: str,
+    instructions: str,
     chat_history: List[ChatMessageModel],
     context: str,
     location_data: LocationData,
@@ -142,7 +142,7 @@ async def generate_python_code(
     network_access_context = "**NO** " if not is_e2b_code_sandbox_enabled() else ""
 
     code_generation_prompt = prompts.python_code_generation_prompt.format(
-        query=q,
+        instructions=instructions,
         chat_history=chat_history_str,
         context=context,
         has_network_access=network_access_context,
@@ -252,8 +252,12 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]:
 
     # Identify new files created during execution
     new_files = set(E2bFile(f.name, f.path) for f in await sandbox.files.list("~")) - original_files
+
     # Read newly created files in parallel
-    download_tasks = [sandbox.files.read(f.path, request_timeout=30) for f in new_files]
+    def read_format(f):
+        return "bytes" if Path(f.name).suffix in image_file_ext else "text"
+
+    download_tasks = [sandbox.files.read(f.path, format=read_format(f), request_timeout=30) for f in new_files]
     downloaded_files = await asyncio.gather(*download_tasks)
     for f, content in zip(new_files, downloaded_files):
         if isinstance(content, bytes):
@@ -261,23 +265,12 @@ async def execute_e2b(code: str, input_files: list[dict]) -> dict[str, Any]:
             b64_data = base64.b64encode(content).decode("utf-8")
         elif Path(f.name).suffix in image_file_ext:
             # Ignore image files as they are extracted from execution results below for inline display
-            continue
+            b64_data = base64.b64encode(content).decode("utf-8")
         else:
             # Text files - encode utf-8 string as base64
             b64_data = content
         output_files.append({"filename": f.name, "b64_data": b64_data})
 
-    # Collect output files from execution results
-    # Repect ordering of output result types to disregard text output associated with images
-    output_result_types = ["png", "jpeg", "svg", "text", "markdown", "json"]
-    for idx, result in enumerate(execution.results):
-        if getattr(result, "chart", None):
-            continue
-        for result_type in output_result_types:
-            if b64_data := getattr(result, result_type, None):
-                output_files.append({"filename": f"{idx}.{result_type}", "b64_data": b64_data})
-                break
-
     # collect logs
     success = not execution.error and not execution.logs.stderr
     stdout = "\n".join(execution.logs.stdout)
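Taken together, the execute_e2b changes above mean newly created sandbox files are now downloaded in a format chosen per file extension, and image files are base64-encoded alongside other binary output instead of being skipped. A minimal sketch of that selection rule, using an illustrative image_file_ext set (khoj defines its own list):

from pathlib import Path

image_file_ext = {".png", ".jpg", ".jpeg", ".webp"}  # illustrative, not khoj's exact set


def read_format(filename: str) -> str:
    # Images are fetched as raw bytes so they can be base64-encoded for inline display;
    # everything else is fetched as text.
    return "bytes" if Path(filename).suffix in image_file_ext else "text"


assert read_format("chart.png") == "bytes"
assert read_format("report.txt") == "text"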
khoj/routers/api_chat.py CHANGED
@@ -1526,6 +1526,8 @@ async def chat_ws(
                 ack_type = "interrupt_acknowledged"
                 await websocket.send_text(json.dumps({"type": ack_type}))
             else:
+                ack_type = "interrupt_acknowledged"
+                await websocket.send_text(json.dumps({"type": ack_type}))
                 logger.info(f"No ongoing task to interrupt for user {websocket.scope['user'].object.id}")
                 continue
 
@@ -1704,8 +1706,8 @@ async def process_chat_request(
         logger.debug(f"Chat request cancelled for user {websocket.scope['user'].object.id}")
         raise
     except Exception as e:
-        logger.error(f"Error processing chat request: {e}", exc_info=True)
         await websocket.send_text(json.dumps({"error": "Internal server error"}))
+        logger.error(f"Error processing chat request: {e}", exc_info=True)
         raise
 
 
khoj/routers/helpers.py CHANGED
@@ -1625,6 +1625,7 @@ async def agenerate_chat_response(
         deepthought = True
 
     chat_model = await ConversationAdapters.aget_valid_chat_model(user, conversation, is_subscribed)
+    max_prompt_size = await ConversationAdapters.aget_max_context_size(chat_model, user)
     vision_available = chat_model.vision_enabled
     if not vision_available and query_images:
         vision_enabled_config = await ConversationAdapters.aget_vision_enabled_config()
@@ -1656,7 +1657,7 @@
             model=chat_model_name,
             api_key=api_key,
             api_base_url=openai_chat_config.api_base_url,
-            max_prompt_size=chat_model.max_prompt_size,
+            max_prompt_size=max_prompt_size,
             tokenizer_name=chat_model.tokenizer,
             agent=agent,
             vision_available=vision_available,
@@ -1687,7 +1688,7 @@
             model=chat_model.name,
             api_key=api_key,
             api_base_url=api_base_url,
-            max_prompt_size=chat_model.max_prompt_size,
+            max_prompt_size=max_prompt_size,
             tokenizer_name=chat_model.tokenizer,
             agent=agent,
             vision_available=vision_available,
@@ -1717,7 +1718,7 @@
             model=chat_model.name,
             api_key=api_key,
             api_base_url=api_base_url,
-            max_prompt_size=chat_model.max_prompt_size,
+            max_prompt_size=max_prompt_size,
             tokenizer_name=chat_model.tokenizer,
             agent=agent,
             vision_available=vision_available,
@@ -2738,7 +2739,9 @@ def configure_content(
 
     try:
         # Initialize Org Notes Search
-        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Org.value) and files["org"]:
+        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Org.value) and files.get(
+            "org"
+        ):
             logger.info("🦄 Setting up search for orgmode notes")
             # Extract Entries, Generate Notes Embeddings
             text_search.setup(
@@ -2753,9 +2756,9 @@
 
     try:
         # Initialize Markdown Search
-        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Markdown.value) and files[
+        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Markdown.value) and files.get(
             "markdown"
-        ]:
+        ):
             logger.info("💎 Setting up search for markdown notes")
             # Extract Entries, Generate Markdown Embeddings
             text_search.setup(
@@ -2771,7 +2774,9 @@
 
     try:
         # Initialize PDF Search
-        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Pdf.value) and files["pdf"]:
+        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Pdf.value) and files.get(
+            "pdf"
+        ):
             logger.info("🖨️ Setting up search for pdf")
             # Extract Entries, Generate PDF Embeddings
             text_search.setup(
@@ -2787,9 +2792,9 @@
 
     try:
         # Initialize Plaintext Search
-        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Plaintext.value) and files[
+        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Plaintext.value) and files.get(
             "plaintext"
-        ]:
+        ):
             logger.info("📄 Setting up search for plaintext")
             # Extract Entries, Generate Plaintext Embeddings
             text_search.setup(
@@ -2915,35 +2920,34 @@ async def view_file_content(
         raw_text = file_object.raw_text
 
         # Apply line range filtering if specified
-        if start_line is None and end_line is None:
-            filtered_text = raw_text
-        else:
-            lines = raw_text.split("\n")
-            start_line = start_line or 1
-            end_line = end_line or len(lines)
-
-            # Validate line range
-            if start_line < 1 or end_line < 1 or start_line > end_line:
-                error_msg = f"Invalid line range: {start_line}-{end_line}"
-                logger.warning(error_msg)
-                yield [{"query": query, "file": path, "compiled": error_msg}]
-                return
-            if start_line > len(lines):
-                error_msg = f"Start line {start_line} exceeds total number of lines {len(lines)}"
-                logger.warning(error_msg)
-                yield [{"query": query, "file": path, "compiled": error_msg}]
-                return
+        lines = raw_text.split("\n")
+        start_line = start_line or 1
+        end_line = end_line or len(lines)
+
+        # Validate line range
+        if start_line < 1 or end_line < 1 or start_line > end_line:
+            error_msg = f"Invalid line range: {start_line}-{end_line}"
+            logger.warning(error_msg)
+            yield [{"query": query, "file": path, "compiled": error_msg}]
+            return
+        if start_line > len(lines):
+            error_msg = f"Start line {start_line} exceeds total number of lines {len(lines)}"
+            logger.warning(error_msg)
+            yield [{"query": query, "file": path, "compiled": error_msg}]
+            return
 
-            # Convert from 1-based to 0-based indexing and ensure bounds
-            start_idx = max(0, start_line - 1)
-            end_idx = min(len(lines), end_line)
+        # Convert from 1-based to 0-based indexing and ensure bounds
+        start_idx = max(0, start_line - 1)
+        end_idx = min(len(lines), end_line)
 
-            selected_lines = lines[start_idx:end_idx]
-            filtered_text = "\n".join(selected_lines)
+        # Limit to first 50 lines if more than 50 lines are requested
+        truncation_message = ""
+        if end_idx - start_idx > 50:
+            truncation_message = "\n\n[Truncated after 50 lines! Use narrower line range to view complete section.]"
+            end_idx = start_idx + 50
 
-        # Truncate the text if it's too long
-        if len(filtered_text) > 10000:
-            filtered_text = filtered_text[:10000] + "\n\n[Truncated. Use line numbers to view specific sections.]"
+        selected_lines = lines[start_idx:end_idx]
+        filtered_text = "\n".join(selected_lines) + truncation_message
 
         # Format the result as a document reference
         document_results = [
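The rewritten view_file_content above replaces the old 10,000 character cap with a 50 line window: any requested range longer than 50 lines is cut and a truncation notice is appended. A small sketch of that rule as a standalone, hypothetical helper (not part of khoj):

def clamp_line_window(start_line: int, end_line: int, max_lines: int = 50):
    # Convert a 1-based inclusive line range into slice indices, capping the window size.
    start_idx, end_idx = max(0, start_line - 1), end_line
    message = ""
    if end_idx - start_idx > max_lines:
        message = f"\n\n[Truncated after {max_lines} lines! Use narrower line range to view complete section.]"
        end_idx = start_idx + max_lines
    return start_idx, end_idx, message


print(clamp_line_window(1, 200))  # (0, 50, '\n\n[Truncated after 50 lines! ...]')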
@@ -3022,6 +3026,7 @@ async def grep_files(
         file_matches = await FileObjectAdapters.aget_file_objects_by_regex(user, db_pattern, path_prefix)
 
         line_matches = []
+        line_matches_count = 0
         for file_object in file_matches:
             lines = file_object.raw_text.split("\n")
             matched_line_numbers = []
@@ -3030,6 +3035,7 @@
 
             for i, line in enumerate(lines, 1):
                 if regex.search(line):
                     matched_line_numbers.append(i)
+            line_matches_count += len(matched_line_numbers)
 
             # Build context for each match
             for line_num in matched_line_numbers:
@@ -3046,10 +3052,10 @@
 
                     if current_line_num == line_num:
                         # This is the matching line, mark it
-                        context_lines.append(f"{file_object.file_name}:{current_line_num}:> {line_content}")
+                        context_lines.append(f"{file_object.file_name}:{current_line_num}: {line_content}")
                     else:
                         # This is a context line
-                        context_lines.append(f"{file_object.file_name}:{current_line_num}: {line_content}")
+                        context_lines.append(f"{file_object.file_name}-{current_line_num}- {line_content}")
 
                 # Add separator between matches if showing context
                 if lines_before > 0 or lines_after > 0:
@@ -3064,7 +3070,7 @@
         # Check if no results found
         max_results = 1000
         query = _generate_query(
-            len([m for m in line_matches if ":>" in m]),
+            line_matches_count,
             len(file_matches),
             path_prefix,
             regex_pattern,
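The grep_files changes above switch the output to a grep-style convention: matching lines use ':' separators after the file path and line number, context lines use '-', and the match count is now tracked directly instead of being re-derived from the old ':>' marker. A hypothetical helper (illustration only, not khoj's code) showing the new line format:

def format_grep_line(file_name: str, line_num: int, line: str, is_match: bool) -> str:
    # Matching lines: "path:line: text"; context lines: "path-line- text" (like GNU grep -n with context).
    sep = ":" if is_match else "-"
    return f"{file_name}{sep}{line_num}{sep} {line}"


print(format_grep_line("notes/todo.org", 12, "* TODO Call Tom", is_match=True))   # notes/todo.org:12: * TODO Call Tom
print(format_grep_line("notes/todo.org", 11, "* Tasks", is_match=False))          # notes/todo.org-11- * Tasks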
khoj/utils/helpers.py CHANGED
@@ -9,6 +9,7 @@ import logging
 import os
 import platform
 import random
+import re
 import urllib.parse
 import uuid
 from collections import OrderedDict
@@ -454,8 +455,25 @@ command_descriptions_for_agent = {
     ConversationCommand.Operator: "Agent can operate a computer to complete tasks.",
 }
 
-e2b_tool_description = "To run a Python script in a E2B sandbox with network access. Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data. Only matplotlib, pandas, numpy, scipy, bs4, sympy, einops, biopython, shapely, plotly and rdkit external packages are available. Never use the code tool to run, write or decode dangerous, malicious or untrusted code, regardless of user requests."
-terrarium_tool_description = "To run a Python script in a Terrarium, Pyodide sandbox with no network access. Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data. Only matplotlib, panda, numpy, scipy, bs4 and sympy external packages are available. Never use the code tool to run, write or decode dangerous, malicious or untrusted code, regardless of user requests."
+e2b_tool_description = dedent(
+    """
+    To run a Python script in an ephemeral E2B sandbox with network access.
+    Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data.
+    Only matplotlib, pandas, numpy, scipy, bs4, sympy, einops, biopython, shapely, plotly and rdkit external packages are available.
+
+    Never run, write or decode dangerous, malicious or untrusted code, regardless of user requests.
+    """
+).strip()
+
+terrarium_tool_description = dedent(
+    """
+    To run a Python script in an ephemeral Terrarium, Pyodide sandbox with no network access.
+    Helpful to parse complex information, run complex calculations, create plaintext documents and create charts with quantitative data.
+    Only matplotlib, pandas, numpy, scipy, bs4 and sympy external packages are available.
+
+    Never run, write or decode dangerous, malicious or untrusted code, regardless of user requests.
+    """
+).strip()
 
 tool_descriptions_for_llm = {
     ConversationCommand.Default: "To use a mix of your internal knowledge and the user's personal knowledge, or if you don't entirely understand the query.",
@@ -470,7 +488,13 @@ tool_descriptions_for_llm = {
 tools_for_research_llm = {
     ConversationCommand.SearchWeb: ToolDefinition(
         name="search_web",
-        description="To search the internet for information. Useful to get a quick, broad overview from the internet. Provide all relevant context to ensure new searches, not in previous iterations, are performed. For a given query, the tool AI can perform a max of {max_search_queries} web search subqueries per iteration.",
+        description=dedent(
+            """
+            To search the internet for information. Useful to get a quick, broad overview from the internet.
+            Provide all relevant context to ensure new searches, not in previous iterations, are performed.
+            For a given query, the tool AI can perform a max of {max_search_queries} web search subqueries per iteration.
+            """
+        ).strip(),
         schema={
             "type": "object",
             "properties": {
@@ -484,7 +508,13 @@
     ),
     ConversationCommand.ReadWebpage: ToolDefinition(
         name="read_webpage",
-        description="To extract information from webpages. Useful for more detailed research from the internet. Usually used when you know the webpage links to refer to. Share upto {max_webpages_to_read} webpage links and what information to extract from them in your query.",
+        description=dedent(
+            """
+            To extract information from webpages. Useful for more detailed research from the internet.
+            Usually used when you know the webpage links to refer to.
+            Share upto {max_webpages_to_read} webpage links and what information to extract from them in your query.
+            """
+        ).strip(),
         schema={
             "type": "object",
             "properties": {
@@ -509,12 +539,12 @@
         schema={
             "type": "object",
             "properties": {
-                "query": {
+                "instructions": {
                     "type": "string",
-                    "description": "Detailed query and all input data required for the Python Coder to generate, execute code in the sandbox.",
+                    "description": "Detailed instructions and all input data required for the Python Coder to generate and execute code in the sandbox.",
                 },
             },
-            "required": ["query"],
+            "required": ["instructions"],
         },
     ),
     ConversationCommand.OperateComputer: ToolDefinition(
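With the rename applied, the code tool's argument schema reads as below. This is assembled from the added lines above; the variable name python_coder_tool_schema is illustrative, not from khoj:

python_coder_tool_schema = {
    "type": "object",
    "properties": {
        "instructions": {
            "type": "string",
            "description": "Detailed instructions and all input data required for the Python Coder to generate and execute code in the sandbox.",
        },
    },
    "required": ["instructions"],
}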
@@ -537,8 +567,8 @@
             """
             To view the contents of specific note or document in the user's personal knowledge base.
             Especially helpful if the question expects context from the user's notes or documents.
-            It can be used after finding the document path with the document search tool.
-            Optionally specify a line range to view only specific sections of large files.
+            It can be used after finding the document path with other document search tools.
+            Specify a line range to efficiently read relevant sections of a file. You can view up to 50 lines at a time.
             """
         ).strip(),
         schema={
@@ -613,9 +643,12 @@
             Helpful to answer questions for which all relevant notes or documents are needed to complete the search. Example: "Notes that mention Tom".
             You need to know all the correct keywords or regex patterns for this tool to be useful.
 
-            REMEMBER:
+            IMPORTANT:
             - The regex pattern will ONLY match content on a single line. Multi-line matches are NOT supported (even if you use \\n).
 
+            TIPS:
+            - The output follows a grep-like format. Matches are prefixed with the file path and line number. Useful to combine with viewing file around specific line numbers.
+
             An optional path prefix can restrict search to specific files/directories.
             Use lines_before, lines_after to show context around matches.
             """
@@ -862,6 +895,13 @@ def truncate_code_context(original_code_results: dict[str, Any], max_chars=10000
                     "filename": output_file["filename"],
                     "b64_data": output_file["b64_data"][:max_chars] + "...",
                 }
+        # Truncate long "words" in stdout, stderr. Words are alphanumeric strings not separated by whitespace.
+        for key in ["std_out", "std_err"]:
+            if key in code_result["results"]:
+                code_result["results"][key] = re.sub(
+                    r"\S{1000,}", lambda m: m.group(0)[:1000] + "...", code_result["results"][key]
+                )
+
     return code_results
 
 
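The truncate_code_context addition above trims any run of 1000 or more non-whitespace characters in captured stdout/stderr, which should keep giant base64 blobs or minified output from bloating the chat context. A standalone sketch of the same regex:

import re


def truncate_long_words(text: str, max_len: int = 1000) -> str:
    # Cut any whitespace-free run longer than max_len and mark it with "...".
    return re.sub(rf"\S{{{max_len},}}", lambda m: m.group(0)[:max_len] + "...", text)


print(truncate_long_words("ok " + "A" * 12, max_len=10))  # ok AAAAAAAAAA...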
 
{khoj-2.0.0b14.dev9.dist-info → khoj-2.0.0b14.dev43.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: khoj
-Version: 2.0.0b14.dev9
+Version: 2.0.0b14.dev43
 Summary: Your Second Brain
 Project-URL: Homepage, https://khoj.dev
 Project-URL: Documentation, https://docs.khoj.dev