khoj-1.16.1.dev15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/processor/content/markdown/markdown_to_entries.py
@@ -0,0 +1,165 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ import urllib3
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry
+
+ logger = logging.getLogger(__name__)
+
+
+ class MarkdownToEntries(TextToEntries):
+     def __init__(self):
+         super().__init__()
+
+     # Define Functions
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         # Extract required fields from config
+         if not full_corpus:
+             deletion_file_names = set([file for file in files if files[file] == ""])
+             files_to_process = set(files) - deletion_file_names
+             files = {file: files[file] for file in files_to_process}
+         else:
+             deletion_file_names = None
+
+         max_tokens = 256
+         # Extract Entries from specified Markdown files
+         with timer("Extract entries from specified Markdown files", logger):
+             file_to_text_map, current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)
+
+         # Split entries by max tokens supported by model
+         with timer("Split entries by max token size supported by model", logger):
+             current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens)
+
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.MARKDOWN,
+                 DbEntry.EntrySource.COMPUTER,
+                 "compiled",
+                 logger,
+                 deletion_file_names,
+                 user,
+                 regenerate=regenerate,
+                 file_to_text_map=file_to_text_map,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
+
+     @staticmethod
+     def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
+         "Extract entries by heading from specified Markdown files"
+         entries: List[str] = []
+         entry_to_file_map: List[Tuple[str, str]] = []
+         file_to_text_map = dict()
+         for markdown_file in markdown_files:
+             try:
+                 markdown_content = markdown_files[markdown_file]
+                 entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                     markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
+                 )
+                 file_to_text_map[markdown_file] = markdown_content
+             except Exception as e:
+                 logger.error(
+                     f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True
+                 )
+
+         return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map))
+
+     @staticmethod
+     def process_single_markdown_file(
+         markdown_content: str,
+         markdown_file: str,
+         entries: List[str],
+         entry_to_file_map: List[Tuple[str, str]],
+         max_tokens=256,
+         ancestry: Dict[int, str] = {},
+     ) -> Tuple[List[str], List[Tuple[str, str]]]:
+         # Prepend the markdown section's heading ancestry
+         ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
+         markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}"
+
+         # If content is small or content has no children headings, save it as a single entry
+         if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
+             rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
+         ):
+             entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
+             entries.extend([markdown_content_with_ancestry])
+             return entries, entry_to_file_map
+
+         # Split by next heading level present in the entry
+         next_heading_level = len(ancestry)
+         sections: List[str] = []
+         while len(sections) < 2:
+             next_heading_level += 1
+             sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
+
+         for section in sections:
+             # Skip empty sections
+             if section.strip() == "":
+                 continue
+
+             # Extract the section body and (when present) the heading
+             current_ancestry = ancestry.copy()
+             first_line = [line for line in section.split("\n") if line.strip() != ""][0]
+             if re.search(rf"^#{{{next_heading_level}}} ", first_line):
+                 # Extract the section body without the heading
+                 current_section_body = "\n".join(section.split(first_line)[1:])
+                 # Parse the section heading into current section ancestry
+                 current_section_title = first_line[next_heading_level:].strip()
+                 current_ancestry[next_heading_level] = current_section_title
+             else:
+                 current_section_body = section
+
+             # Recurse down children of the current entry
+             MarkdownToEntries.process_single_markdown_file(
+                 current_section_body,
+                 markdown_file,
+                 entries,
+                 entry_to_file_map,
+                 max_tokens,
+                 current_ancestry,
+             )
+
+         return entries, entry_to_file_map
+
+     @staticmethod
+     def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+         "Convert each parsed Markdown entry into an Entry object"
+         entries: List[Entry] = []
+         for parsed_entry in parsed_entries:
+             raw_filename = entry_to_file_map[parsed_entry]
+
+             # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
+             if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
+                 # Escape the URL to avoid issues with special characters
+                 entry_filename = urllib3.util.parse_url(raw_filename).url
+             else:
+                 entry_filename = str(Path(raw_filename))
+
+             heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
+             # Append base filename to compiled entry for context to model
+             # Increment heading level for heading entries and make filename as its top level heading
+             prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n"
+             compiled_entry = f"{prefix}{parsed_entry}"
+             entries.append(
+                 Entry(
+                     compiled=compiled_entry,
+                     raw=parsed_entry,
+                     heading=f"{prefix}{heading}",
+                     file=f"{entry_filename}",
+                 )
+             )
+
+         logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
+
+         return entries
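
For orientation, here is a minimal sketch of driving the heading-based chunker above, assuming the khoj package from this wheel is installed and that TextToEntries.tokenizer needs no further setup. The file name "notes.md" and the sample content are illustrative only.

from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries

# Map of file path -> file content, the shape process() and the extractors expect
notes = "# Projects\n## Khoj\nChunk notes by heading before embedding them.\n## Ideas\nSections over max_tokens are split recursively by the next heading level.\n"
files = {"notes.md": notes}

# Returns the file -> raw text map and the heading-chunked Entry objects
file_to_text_map, entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens=256)
for entry in entries:
    # Each compiled entry is prefixed with its source filename for model context
    print(entry.file, "->", entry.compiled.splitlines()[0])
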
khoj/processor/content/notion/notion_to_entries.py
@@ -0,0 +1,260 @@
+ import logging
+ from enum import Enum
+ from typing import Tuple
+
+ import requests
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser, NotionConfig
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry, NotionContentConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class NotionBlockType(Enum):
+     PARAGRAPH = "paragraph"
+     HEADING_1 = "heading_1"
+     HEADING_2 = "heading_2"
+     HEADING_3 = "heading_3"
+     BULLETED_LIST_ITEM = "bulleted_list_item"
+     NUMBERED_LIST_ITEM = "numbered_list_item"
+     TO_DO = "to_do"
+     TOGGLE = "toggle"
+     CHILD_PAGE = "child_page"
+     UNSUPPORTED = "unsupported"
+     BOOKMARK = "bookmark"
+     DIVIDER = "divider"
+     PDF = "pdf"
+     IMAGE = "image"
+     EMBED = "embed"
+     VIDEO = "video"
+     FILE = "file"
+     SYNCED_BLOCK = "synced_block"
+     TABLE_OF_CONTENTS = "table_of_contents"
+     COLUMN = "column"
+     EQUATION = "equation"
+     LINK_PREVIEW = "link_preview"
+     COLUMN_LIST = "column_list"
+     QUOTE = "quote"
+     BREADCRUMB = "breadcrumb"
+     LINK_TO_PAGE = "link_to_page"
+     CHILD_DATABASE = "child_database"
+     TEMPLATE = "template"
+     CALLOUT = "callout"
+
+
+ class NotionToEntries(TextToEntries):
+     def __init__(self, config: NotionConfig):
+         super().__init__(config)
+         self.config = NotionContentConfig(
+             token=config.token,
+         )
+         self.session = requests.Session()
+         self.session.headers.update({"Authorization": f"Bearer {config.token}", "Notion-Version": "2022-02-22"})
+         self.unsupported_block_types = [
+             NotionBlockType.BOOKMARK.value,
+             NotionBlockType.DIVIDER.value,
+             NotionBlockType.CHILD_DATABASE.value,
+             NotionBlockType.TEMPLATE.value,
+             NotionBlockType.CALLOUT.value,
+             NotionBlockType.UNSUPPORTED.value,
+         ]
+
+         self.display_block_block_types = [
+             NotionBlockType.PARAGRAPH.value,
+             NotionBlockType.HEADING_1.value,
+             NotionBlockType.HEADING_2.value,
+             NotionBlockType.HEADING_3.value,
+             NotionBlockType.BULLETED_LIST_ITEM.value,
+             NotionBlockType.NUMBERED_LIST_ITEM.value,
+             NotionBlockType.TO_DO.value,
+             NotionBlockType.TOGGLE.value,
+             NotionBlockType.CHILD_PAGE.value,
+             NotionBlockType.BOOKMARK.value,
+             NotionBlockType.DIVIDER.value,
+         ]
+
+         self.body_params = {"page_size": 100}
+
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         current_entries = []
+
+         # Get all pages
+         with timer("Getting all pages via search endpoint", logger=logger):
+             responses = []
+
+             while True:
+                 result = self.session.post(
+                     "https://api.notion.com/v1/search",
+                     json=self.body_params,
+                 ).json()
+                 responses.append(result)
+                 if result.get("has_more", False) == False:
+                     break
+                 else:
+                     self.body_params.update({"start_cursor": result["next_cursor"]})
+
+         for response in responses:
+             with timer("Processing response", logger=logger):
+                 pages_or_databases = response.get("results", [])
+
+                 # Get all pages content
+                 for p_or_d in pages_or_databases:
+                     with timer(f"Processing {p_or_d['object']} {p_or_d['id']}", logger=logger):
+                         if p_or_d["object"] == "database":
+                             # TODO: Handle databases
+                             continue
+                         elif p_or_d["object"] == "page":
+                             page_entries = self.process_page(p_or_d)
+                             current_entries.extend(page_entries)
+
+         current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+         return self.update_entries_with_ids(current_entries, user=user)
+
+     def process_page(self, page):
+         page_id = page["id"]
+         title, content = self.get_page_content(page_id)
+
+         if title == None or content == None:
+             return []
+
+         current_entries = []
+         curr_heading = ""
+         for block in content.get("results", []):
+             block_type = block.get("type")
+
+             if block_type == None:
+                 continue
+             block_data = block[block_type]
+
+             if block_data.get("rich_text") == None or len(block_data["rich_text"]) == 0:
+                 # There's no text to handle here.
+                 continue
+
+             raw_content = ""
+             if block_type in ["heading_1", "heading_2", "heading_3"]:
+                 # If the current block is a heading, we can consider the previous block processing completed.
+                 # Add it as an entry and move on to processing the next chunk of the page.
+                 if raw_content != "":
+                     current_entries.append(
+                         Entry(
+                             compiled=raw_content,
+                             raw=raw_content,
+                             heading=title,
+                             file=page["url"],
+                         )
+                     )
+                 curr_heading = block_data["rich_text"][0]["plain_text"]
+             else:
+                 if curr_heading != "":
+                     # Add the last known heading to the content for additional context
+                     raw_content = self.process_heading(curr_heading)
+                 for text in block_data["rich_text"]:
+                     raw_content += self.process_text(text)
+
+                 if block.get("has_children", True):
+                     raw_content += "\n"
+                     raw_content = self.process_nested_children(
+                         self.get_block_children(block["id"]), raw_content, block_type
+                     )
+
+                 if raw_content != "":
+                     current_entries.append(
+                         Entry(
+                             compiled=raw_content,
+                             raw=raw_content,
+                             heading=title,
+                             file=page["url"],
+                         )
+                     )
+         return current_entries
+
+     def process_heading(self, heading):
+         return f"\n<b>{heading}</b>\n"
+
+     def process_nested_children(self, children, raw_content, block_type=None):
+         results = children.get("results", [])
+         for child in results:
+             child_type = child.get("type")
+             if child_type == None:
+                 continue
+             child_data = child[child_type]
+             if child_data.get("rich_text") and len(child_data["rich_text"]) > 0:
+                 for text in child_data["rich_text"]:
+                     raw_content += self.process_text(text, block_type)
+             if child_data.get("has_children", True):
+                 return self.process_nested_children(self.get_block_children(child["id"]), raw_content, block_type)
+
+         return raw_content
+
+     def process_text(self, text, block_type=None):
+         text_type = text.get("type", None)
+         if text_type in self.unsupported_block_types:
+             return ""
+         if text.get("href", None):
+             return f"<a href='{text['href']}'>{text['plain_text']}</a>"
+         raw_text = text["plain_text"]
+         if text_type in self.display_block_block_types or block_type in self.display_block_block_types:
+             return f"\n{raw_text}\n"
+         return raw_text
+
+     def get_block_children(self, block_id):
+         try:
+             return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()
+         except Exception as e:
+             logger.error(f"Error getting children for block {block_id}: {e}")
+             return {}
+
+     def get_page(self, page_id):
+         return self.session.get(f"https://api.notion.com/v1/pages/{page_id}").json()
+
+     def get_page_children(self, page_id):
+         return self.session.get(f"https://api.notion.com/v1/blocks/{page_id}/children").json()
+
+     def get_page_content(self, page_id):
+         try:
+             page = self.get_page(page_id)
+             content = self.get_page_children(page_id)
+         except Exception as e:
+             logger.error(f"Error getting page {page_id}: {e}", exc_info=True)
+             return None, None
+         properties = page.get("properties", {})
+
+         title_field = "title"
+         if "Title" in properties:
+             title_field = "Title"
+         elif "Name" in properties:
+             title_field = "Name"
+         elif "Page" in properties:
+             title_field = "Page"
+         elif "Event" in properties:
+             title_field = "Event"
+         elif title_field not in properties:
+             logger.debug(f"Title field not found for page {page_id}. Setting title as None...")
+             title = None
+             return title, content
+         try:
+             title = page["properties"][title_field]["title"][0]["text"]["content"]
+         except Exception as e:
+             logger.warning(f"Error getting title for page {page_id}: {e}. Setting title as None...")
+             title = None
+         return title, content
+
+     def update_entries_with_ids(self, current_entries, user: KhojUser = None):
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.NOTION,
+                 DbEntry.EntrySource.NOTION,
+                 key="compiled",
+                 logger=logger,
+                 user=user,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
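
The while-loop in process() above pages through Notion's /v1/search endpoint with cursor-based pagination. Here is a standalone sketch of that same pattern using only requests, assuming the NOTION_TOKEN environment variable holds a valid Notion integration token; everything else mirrors the endpoint, headers, and body parameters shown in the diff.

import os

import requests

session = requests.Session()
session.headers.update({"Authorization": f"Bearer {os.environ['NOTION_TOKEN']}", "Notion-Version": "2022-02-22"})

body = {"page_size": 100}
pages = []
while True:
    result = session.post("https://api.notion.com/v1/search", json=body).json()
    # Collect page objects; database objects are skipped, as in process() above
    pages += [r for r in result.get("results", []) if r.get("object") == "page"]
    if not result.get("has_more", False):
        break
    # Resume the search from the cursor returned by the previous response
    body["start_cursor"] = result["next_cursor"]

print(f"{len(pages)} pages accessible to this integration")
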
khoj/processor/content/org_mode/org_to_entries.py
@@ -0,0 +1,231 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser
+ from khoj.processor.content.org_mode import orgnode
+ from khoj.processor.content.org_mode.orgnode import Orgnode
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils import state
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry
+
+ logger = logging.getLogger(__name__)
+
+
+ class OrgToEntries(TextToEntries):
+     def __init__(self):
+         super().__init__()
+
+     # Define Functions
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         if not full_corpus:
+             deletion_file_names = set([file for file in files if files[file] == ""])
+             files_to_process = set(files) - deletion_file_names
+             files = {file: files[file] for file in files_to_process}
+         else:
+             deletion_file_names = None
+
+         # Extract Entries from specified Org files
+         max_tokens = 256
+         with timer("Extract entries from specified Org files", logger):
+             file_to_text_map, current_entries = self.extract_org_entries(files, max_tokens=max_tokens)
+
+         with timer("Split entries by max token size supported by model", logger):
+             current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=max_tokens)
+
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.ORG,
+                 DbEntry.EntrySource.COMPUTER,
+                 "compiled",
+                 logger,
+                 deletion_file_names,
+                 user,
+                 regenerate=regenerate,
+                 file_to_text_map=file_to_text_map,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
+
+     @staticmethod
+     def extract_org_entries(
+         org_files: dict[str, str], index_heading_entries: bool = False, max_tokens=256
+     ) -> Tuple[Dict, List[Entry]]:
+         "Extract entries from specified Org files"
+         file_to_text_map, entries, entry_to_file_map = OrgToEntries.extract_org_nodes(org_files, max_tokens)
+         return file_to_text_map, OrgToEntries.convert_org_nodes_to_entries(
+             entries, entry_to_file_map, index_heading_entries
+         )
+
+     @staticmethod
+     def extract_org_nodes(
+         org_files: dict[str, str], max_tokens
+     ) -> Tuple[Dict, List[List[Orgnode]], Dict[Orgnode, str]]:
+         "Extract org nodes from specified org files"
+         entries: List[List[Orgnode]] = []
+         entry_to_file_map: List[Tuple[Orgnode, str]] = []
+         file_to_text_map = {}
+         for org_file in org_files:
+             try:
+                 org_content = org_files[org_file]
+                 entries, entry_to_file_map = OrgToEntries.process_single_org_file(
+                     org_content, org_file, entries, entry_to_file_map, max_tokens
+                 )
+                 file_to_text_map[org_file] = org_content
+             except Exception as e:
+                 logger.error(f"Unable to process file: {org_file}. Skipped indexing it.\nError: {e}", exc_info=True)
+
+         return file_to_text_map, entries, dict(entry_to_file_map)
+
+     @staticmethod
+     def process_single_org_file(
+         org_content: str,
+         org_file: str,
+         entries: List[List[Orgnode]],
+         entry_to_file_map: List[Tuple[Orgnode, str]],
+         max_tokens=256,
+         ancestry: Dict[int, str] = {},
+     ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
+         """Parse org_content from org_file into OrgNode entries
+
+         Recurse down org file entries, one heading level at a time,
+         until we reach a leaf entry or the current entry tree fits within max_tokens.
+
+         Parse each recursion-terminating entry tree into a list of OrgNode objects.
+         """
+         # Prepend the org section's heading ancestry
+         ancestry_string = "\n".join([f"{'*' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
+         org_content_with_ancestry = f"{ancestry_string}{org_content}"
+
+         # If content is small or content has no children headings, save it as a single entry
+         # Note: This is the terminating condition for this recursive function
+         if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
+             rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
+         ):
+             orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file)
+             entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
+             entries.extend([orgnode_content_with_ancestry])
+             return entries, entry_to_file_map
+
+         # Split this entry tree into sections by the next heading level in it
+         # Increment heading level until able to split entry into sections or reach max heading level
+         # A successful split will result in at least 2 sections
+         max_heading_level = 100
+         next_heading_level = len(ancestry)
+         sections: List[str] = []
+         while len(sections) < 2 and next_heading_level < max_heading_level:
+             next_heading_level += 1
+             sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
+
+         # If unable to split entry into sections, log error and skip indexing it
+         if next_heading_level == max_heading_level:
+             logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
+             return entries, entry_to_file_map
+
+         # Recurse down each non-empty section after parsing its body, heading and ancestry
+         for section in sections:
+             # Skip empty sections
+             if section.strip() == "":
+                 continue
+
+             # Extract the section body and (when present) the heading
+             current_ancestry = ancestry.copy()
+             first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
+             # If first non-empty line is a heading with expected heading level
+             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
+                 # Extract the section body without the heading
+                 current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
+                 # Parse the section heading into current section ancestry
+                 current_section_title = first_non_empty_line[next_heading_level:].strip()
+                 current_ancestry[next_heading_level] = current_section_title
+             # Else process the section as just body text
+             else:
+                 current_section_body = section
+
+             # Recurse down children of the current entry
+             OrgToEntries.process_single_org_file(
+                 current_section_body,
+                 org_file,
+                 entries,
+                 entry_to_file_map,
+                 max_tokens,
+                 current_ancestry,
+             )
+
+         return entries, entry_to_file_map
+
+     @staticmethod
+     def convert_org_nodes_to_entries(
+         parsed_entries: List[List[Orgnode]],
+         entry_to_file_map: Dict[Orgnode, str],
+         index_heading_entries: bool = False,
+     ) -> List[Entry]:
+         """
+         Convert OrgNode lists into a list of Entry objects
+
+         Each list of OrgNodes is a parsed parent org tree or leaf node.
+         Convert each list of these OrgNodes into a single Entry.
+         """
+         entries: List[Entry] = []
+         for entry_group in parsed_entries:
+             entry_heading, entry_compiled, entry_raw = "", "", ""
+             for parsed_entry in entry_group:
+                 if not parsed_entry.hasBody and not index_heading_entries:
+                     # Ignore title notes i.e notes with just headings and empty body
+                     continue
+
+                 todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
+
+                 # Set base level to current org-node tree's root heading level
+                 if not entry_heading and parsed_entry.level > 0:
+                     base_level = parsed_entry.level
+                 # Indent entry by 1 heading level as ancestry is prepended as top level heading
+                 heading = f"{'*' * (parsed_entry.level-base_level+2)} {todo_str}" if parsed_entry.level > 0 else ""
+                 if parsed_entry.heading:
+                     heading += f"{parsed_entry.heading}."
+
+                 # Prepend ancestor headings, filename as top heading to root parent entry for context
+                 # Children nodes do not need ancestors trail as root parent node will have it
+                 if not entry_heading:
+                     ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry])
+                     heading = f"* {ancestors_trail}\n{heading}" if heading else f"* {ancestors_trail}."
+
+                 compiled = heading
+
+                 if parsed_entry.tags:
+                     tags_str = " ".join(parsed_entry.tags)
+                     compiled += f"\t {tags_str}."
+
+                 if parsed_entry.closed:
+                     compiled += f'\n Closed on {parsed_entry.closed.strftime("%Y-%m-%d")}.'
+
+                 if parsed_entry.scheduled:
+                     compiled += f'\n Scheduled for {parsed_entry.scheduled.strftime("%Y-%m-%d")}.'
+
+                 if parsed_entry.hasBody:
+                     compiled += f"\n {parsed_entry.body}"
+
+                 # Add the sub-entry contents to the entry
+                 entry_compiled += f"{compiled}"
+                 entry_raw += f"{parsed_entry}"
+                 if not entry_heading:
+                     entry_heading = heading
+
+             if entry_compiled:
+                 entries.append(
+                     Entry(
+                         compiled=entry_compiled,
+                         raw=entry_raw,
+                         heading=f"{entry_heading}",
+                         file=f"{entry_to_file_map[parsed_entry]}",
+                     )
+                 )
+
+         return entries
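
As with the Markdown parser, the Org chunker can be exercised directly through its static entry point. A minimal sketch, assuming the khoj package from this wheel is installed and that TextToEntries.tokenizer needs no further setup; the file name "todo.org" and the sample content are illustrative only.

from khoj.processor.content.org_mode.org_to_entries import OrgToEntries

org = "* Tasks\n** TODO Index org files\nChunk each subtree by heading before embedding it.\n** Ideas\nPrepend the ancestor heading trail to each chunk.\n"
files = {"todo.org": org}

# Returns the file -> raw text map and the chunked Entry objects
file_to_text_map, entries = OrgToEntries.extract_org_entries(files, max_tokens=256)
for entry in entries:
    # Root entries carry the filename or ancestor trail as their top-level heading
    print(entry.heading)
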