PyPI - khoj - Versions diffs - 1.33.3.dev32__py3-none-any.whl - Mend

khoj 1.33.3.dev32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (393) hide show

khoj/processor/content/github/github_to_entries.py ADDED Viewed

@@ -0,0 +1,226 @@
+import logging
+import time
+from typing import Dict, List, Tuple
+import requests
+from magika import Magika
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import GithubConfig, KhojUser
+from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import is_none_or_empty, timer
+from khoj.utils.rawconfig import GithubContentConfig, GithubRepoConfig
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+# Single Magika instance created at import time. get_files uses it to identify the content type of repo files
+magika = Magika()
+class GithubToEntries(TextToEntries):
+    def __init__(self, config: GithubConfig):
+        super().__init__(config)
+        raw_repos = config.githubrepoconfig.all()
+        repos = []
+        for repo in raw_repos:
+            repos.append(
+                GithubRepoConfig(
+                    name=repo.name,
+                    owner=repo.owner,
+                    branch=repo.branch,
+                )
+            )
+        self.config = GithubContentConfig(
+            pat_token=config.pat_token,
+            repos=repos,
+        )
+        self.session = requests.Session()
+        if not is_none_or_empty(self.config.pat_token):
+            self.session.headers.update({"Authorization": f"token {self.config.pat_token}"})
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
+    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
+        if is_none_or_empty(self.config.pat_token):
+            logger.warning(
+                f"Github PAT token is not set. Private repositories cannot be indexed and lower rate limits apply."
+            )
+        current_entries = []
+        for repo in self.config.repos:
+            current_entries += self.process_repo(repo)
+        return self.update_entries_with_ids(current_entries, user=user)
+    def process_repo(self, repo: GithubRepoConfig):
+        repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
+        repo_shorthand = f"{repo.owner}/{repo.name}"
+        logger.info(f"Processing github repo {repo_shorthand}")
+        with timer("Download files from github repo", logger):
+            try:
+                markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
+            except ConnectionAbortedError as e:
+                logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
+                raise e
+            except Exception as e:
+                logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
+                raise e
+        logger.info(
+            f"Found {len(markdown_files)} md, {len(org_files)} org and {len(plaintext_files)} text files in github repo {repo_shorthand}"
+        )
+        current_entries = []
+        with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
+            )
+        with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )
+        with timer(f"Extract plaintext entries from github repo {repo_shorthand}", logger):
+            current_entries += PlaintextToEntries.convert_text_files_to_entries(
+                *GithubToEntries.extract_plaintext_entries(plaintext_files)
+            )
+        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+        return current_entries
+    def update_entries_with_ids(self, current_entries, user: KhojUser = None):
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                user,
+                current_entries,
+                DbEntry.EntryType.GITHUB,
+                DbEntry.EntrySource.GITHUB,
+                key="compiled",
+                logger=logger,
+            )
+        return num_new_embeddings, num_deleted_embeddings
+    def get_files(self, repo_url: str, repo: GithubRepoConfig):
+        # Get the contents of the repository
+        repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
+        headers = {}
+        if not is_none_or_empty(self.config.pat_token):
+            headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
+        contents = response.json()
+        # Raise exception if hit rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+        # Extract markdown files from the repository
+        markdown_files: List[Dict[str, str]] = []
+        org_files: List[Dict[str, str]] = []
+        plaintext_files: List[Dict[str, str]] = []
+        if "tree" not in contents:
+            return markdown_files, org_files, plaintext_files
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+            # Find all org files in the repository
+            elif item["type"] == "blob" and item["path"].endswith(".org"):
+                # Create URL for each org file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+                # Add org file contents and URL to list
+                org_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+            # Find, index remaining non-binary files in the repository
+            elif item["type"] == "blob":
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+                content_bytes = self.get_file_contents(item["url"], decode=False)
+                content_type, content_str = None, None
+                try:
+                    content_type = magika.identify_bytes(content_bytes).output.group
+                except:
+                    logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
+                    continue
+                # Add non-binary file contents and URL to list
+                if content_type in ["text", "code"]:
+                    try:
+                        content_str = content_bytes.decode("utf-8")
+                    except:
+                        logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
+                        continue
+                    plaintext_files += [{"content": content_str, "path": url_path}]
+        return markdown_files, org_files, plaintext_files
+    def get_file_contents(self, file_url, decode=True):
+        # Get text from each markdown file
+        headers = {"Accept": "application/vnd.github.v3.raw"}
+        response = self.session.get(file_url, headers=headers, stream=True)
+        # Stop indexing on hitting rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+        content = "" if decode else b""
+        for chunk in response.iter_content(chunk_size=2048):
+            if chunk:
+                try:
+                    content += chunk.decode("utf-8") if decode else chunk
+                except Exception as e:
+                    logger.error(f"Unable to decode chunk from {file_url}")
+                    logger.error(e)
+        return content
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in markdown_files:
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+    @staticmethod
+    def extract_org_entries(org_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in org_files:
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+    @staticmethod
+    def extract_plaintext_entries(plaintext_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in plaintext_files:
+            entries, entry_to_file_map = PlaintextToEntries.process_single_plaintext_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)

khoj/processor/content/images/__init__.py ADDED Viewed

File without changes

khoj/processor/content/images/image_to_entries.py ADDED Viewed

@@ -0,0 +1,117 @@
+import base64
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+class ImageToEntries(TextToEntries):
+    def __init__(self):
+        """Initialize through the TextToEntries initializer, passing no arguments."""
+        super().__init__()
+    # Define Functions
+    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
+        # Extract required fields from config
+        deletion_file_names = set([file for file in files if files[file] == b""])
+        files_to_process = set(files) - deletion_file_names
+        files = {file: files[file] for file in files_to_process}
+        # Extract Entries from specified image files
+        with timer("Extract entries from specified Image files", logger):
+            file_to_text_map, current_entries = ImageToEntries.extract_image_entries(files)
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                user,
+                current_entries,
+                DbEntry.EntryType.IMAGE,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+        return num_new_embeddings, num_deleted_embeddings
+    @staticmethod
+    def extract_image_entries(image_files) -> Tuple[Dict, List[Entry]]:  # important function
+        """Extract entries by page from specified image files"""
+        file_to_text_map = dict()
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+        for image_file in image_files:
+            try:
+                bytes = image_files[image_file]
+                # write the image to a temporary file
+                timestamp_now = datetime.utcnow().timestamp()
+                # use either png or jpg
+                if image_file.endswith(".png"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.png"
+                elif image_file.endswith(".jpg") or image_file.endswith(".jpeg"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.jpg"
+                elif image_file.endswith(".webp"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.webp"
+                with open(tmp_file, "wb") as f:
+                    bytes = image_files[image_file]
+                    f.write(bytes)
+                try:
+                    from rapidocr_onnxruntime import RapidOCR
+                    loader = RapidOCR()
+                    image_entries_per_file = ""
+                    result, _ = loader(tmp_file)
+                    if result:
+                        expanded_entries = [text[1] for text in result]
+                        image_entries_per_file = " ".join(expanded_entries)
+                except ImportError:
+                    logger.warning(
+                        f"Unable to process image or scanned file for text: {image_file}. This file will not be indexed."
+                    )
+                    continue
+                entry_to_location_map.append((image_entries_per_file, image_file))
+                entries.extend([image_entries_per_file])
+                file_to_text_map[image_file] = image_entries_per_file
+            except Exception as e:
+                logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(tmp_file):
+                    os.remove(tmp_file)
+        return file_to_text_map, ImageToEntries.convert_image_entries_to_maps(entries, dict(entry_to_location_map))
+    @staticmethod
+    def convert_image_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        "Convert each image entries into a dictionary"
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+        logger.debug(f"Converted {len(parsed_entries)} image entries to dictionaries")
+        return entries

khoj/processor/content/markdown/__init__.py ADDED Viewed

File without changes

khoj/processor/content/markdown/markdown_to_entries.py ADDED Viewed

@@ -0,0 +1,160 @@
+import logging
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+import urllib3.util
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+class MarkdownToEntries(TextToEntries):
+    def __init__(self):
+        """Initialize through the TextToEntries initializer, passing no arguments."""
+        super().__init__()
+    # Define Functions
+    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
+        # Extract required fields from config
+        deletion_file_names = set([file for file in files if files[file] == ""])
+        files_to_process = set(files) - deletion_file_names
+        files = {file: files[file] for file in files_to_process}
+        max_tokens = 256
+        # Extract Entries from specified Markdown files
+        with timer("Extract entries from specified Markdown files", logger):
+            file_to_text_map, current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens)
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                user,
+                current_entries,
+                DbEntry.EntryType.MARKDOWN,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+        return num_new_embeddings, num_deleted_embeddings
+    @staticmethod
+    def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
+        "Extract entries by heading from specified Markdown files"
+        entries: List[str] = []
+        entry_to_file_map: List[Tuple[str, str]] = []
+        file_to_text_map: Dict[str, str] = dict()
+        for markdown_file in markdown_files:
+            try:
+                markdown_content = markdown_files[markdown_file]
+                entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                    markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
+                )
+                file_to_text_map[markdown_file] = markdown_content
+            except Exception as e:
+                logger.error(
+                    f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True
+                )
+        return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map))
+    @staticmethod
+    def process_single_markdown_file(
+        markdown_content: str,
+        markdown_file: str,
+        entries: List[str],
+        entry_to_file_map: List[Tuple[str, str]],
+        max_tokens=256,
+        ancestry: Dict[int, str] = {},
+    ) -> Tuple[List[str], List[Tuple[str, str]]]:
+        # Prepend the markdown section's heading ancestry
+        ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
+        markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}"
+        # If content is small or content has no children headings, save it as a single entry
+        if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
+            rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
+        ):
+            entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
+            entries.extend([markdown_content_with_ancestry])
+            return entries, entry_to_file_map
+        # Split by next heading level present in the entry
+        next_heading_level = len(ancestry)
+        sections: List[str] = []
+        while len(sections) < 2:
+            next_heading_level += 1
+            sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
+        for section in sections:
+            # Skip empty sections
+            if section.strip() == "":
+                continue
+            # Extract the section body and (when present) the heading
+            current_ancestry = ancestry.copy()
+            first_line = [line for line in section.split("\n") if line.strip() != ""][0]
+            if re.search(rf"^#{{{next_heading_level}}} ", first_line):
+                # Extract the section body without the heading
+                current_section_body = "\n".join(section.split(first_line)[1:])
+                # Parse the section heading into current section ancestry
+                current_section_title = first_line[next_heading_level:].strip()
+                current_ancestry[next_heading_level] = current_section_title
+            else:
+                current_section_body = section
+            # Recurse down children of the current entry
+            MarkdownToEntries.process_single_markdown_file(
+                current_section_body,
+                markdown_file,
+                entries,
+                entry_to_file_map,
+                max_tokens,
+                current_ancestry,
+            )
+        return entries, entry_to_file_map
+    @staticmethod
+    def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
+        "Convert each Markdown entries into a dictionary"
+        entries: List[Entry] = []
+        for parsed_entry in parsed_entries:
+            raw_filename = entry_to_file_map[parsed_entry]
+            # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
+            if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
+                # Escape the URL to avoid issues with special characters
+                entry_filename = urllib3.util.parse_url(raw_filename).url
+            else:
+                entry_filename = raw_filename
+            heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
+            # Append base filename to compiled entry for context to model
+            # Increment heading level for heading entries and make filename as its top level heading
+            prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n"
+            compiled_entry = f"{prefix}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=f"{prefix}{heading}",
+                    file=entry_filename,
+                )
+            )
+        logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
+        return entries