khoj 1.16.1.dev15__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (242)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/processor/content/plaintext/plaintext_to_entries.py
@@ -0,0 +1,122 @@
+import logging
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import urllib3
+from bs4 import BeautifulSoup
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class PlaintextToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == ""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified plaintext files
+        with timer("Extract entries from specified Plaintext files", logger):
+            file_to_text_map, current_entries = PlaintextToEntries.extract_plaintext_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.PLAINTEXT,
+                DbEntry.EntrySource.COMPUTER,
+                key="compiled",
+                logger=logger,
+                deletion_filenames=deletion_file_names,
+                user=user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_html_content(markup_content: str, markup_type: str):
+        "Extract content from HTML"
+        if markup_type == "xml":
+            soup = BeautifulSoup(markup_content, "xml")
+        else:
+            soup = BeautifulSoup(markup_content, "html.parser")
+        return soup.get_text(strip=True, separator="\n")
+
+    @staticmethod
+    def extract_plaintext_entries(text_files: Dict[str, str]) -> Tuple[Dict, List[Entry]]:
+        entries: List[str] = []
+        entry_to_file_map: List[Tuple[str, str]] = []
+        file_to_text_map = dict()
+        for text_file in text_files:
+            try:
+                text_content = text_files[text_file]
+                entries, entry_to_file_map = PlaintextToEntries.process_single_plaintext_file(
+                    text_content, text_file, entries, entry_to_file_map
+                )
+                file_to_text_map[text_file] = text_content
+            except Exception as e:
+                logger.warning(f"Unable to read file: {text_file} as plaintext. Skipping file.")
+                logger.warning(e, exc_info=True)
+
+        # Extract Entries from specified plaintext files
+        return file_to_text_map, PlaintextToEntries.convert_text_files_to_entries(entries, dict(entry_to_file_map))
+
+    @staticmethod
+    def process_single_plaintext_file(
+        text_content: str,
+        text_file: str,
+        entries: List[str],
+        entry_to_file_map: List[Tuple[str, str]],
+    ) -> Tuple[List[str], List[Tuple[str, str]]]:
+        if text_file.endswith(("html", "htm", "xml")):
+            text_content = PlaintextToEntries.extract_html_content(text_content, text_file.split(".")[-1])
+        entry_to_file_map += [(text_content, text_file)]
+        entries.extend([text_content])
+        return entries, entry_to_file_map
+
+    @staticmethod
+    def convert_text_files_to_entries(parsed_entries: List[str], entry_to_file_map: dict[str, str]) -> List[Entry]:
+        "Convert each plaintext file into an entry"
+        entries: List[Entry] = []
+        for parsed_entry in parsed_entries:
+            raw_filename = entry_to_file_map[parsed_entry]
+            # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
+            if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
+                # Escape the URL to avoid issues with special characters
+                entry_filename = urllib3.util.parse_url(raw_filename).url
+            else:
+                entry_filename = raw_filename
+
+            # Append base filename to compiled entry for context to model
+            entries.append(
+                Entry(
+                    raw=parsed_entry,
+                    file=f"{entry_filename}",
+                    compiled=f"{entry_filename}\n{parsed_entry}",
+                    heading=entry_filename,
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} plaintext files to entries")
+        return entries
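For orientation, a minimal usage sketch of the static helpers in the file above. It is not part of the packaged diff; it assumes khoj 1.16.x and its dependencies (including bs4) are installed and that the Django settings khoj imports require are configured. The file names and contents are purely illustrative.

# Sketch: exercise the plaintext extraction helpers shown in the diff above.
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries

# Illustrative inputs: a plain text note and an HTML page, keyed by path/URL.
files = {
    "notes/todo.txt": "Buy groceries\nDraft the report",
    "https://example.org/page.html": "<html><body><p>Hello world</p></body></html>",
}

file_to_text_map, entries = PlaintextToEntries.extract_plaintext_entries(files)
for entry in entries:
    # Each Entry prepends its source filename/URL to the compiled text.
    print(entry.file, "->", entry.compiled.splitlines()[0])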
khoj/processor/content/text_to_entries.py
@@ -0,0 +1,297 @@
+import hashlib
+import logging
+import re
+import uuid
+from abc import ABC, abstractmethod
+from itertools import repeat
+from typing import Any, Callable, List, Set, Tuple
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from tqdm import tqdm
+
+from khoj.database.adapters import (
+    EntryAdapters,
+    FileObjectAdapters,
+    get_user_search_model_or_default,
+)
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import EntryDates, KhojUser
+from khoj.search_filter.date_filter import DateFilter
+from khoj.utils import state
+from khoj.utils.helpers import batcher, is_none_or_empty, timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class TextToEntries(ABC):
+    def __init__(self, config: Any = None):
+        self.embeddings_model = state.embeddings_model
+        self.config = config
+        self.date_filter = DateFilter()
+
+    @abstractmethod
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        ...
+
+    @staticmethod
+    def hash_func(key: str) -> Callable:
+        return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding="utf-8")).hexdigest()
+
+    @staticmethod
+    def remove_long_words(text: str, max_word_length: int = 500) -> str:
+        "Remove words longer than max_word_length from text."
+        # Split the string by words, keeping the delimiters
+        splits = re.split(r"(\s+)", text) + [""]
+        words_with_delimiters = list(zip(splits[::2], splits[1::2]))
+
+        # Filter out long words while preserving delimiters in text
+        filtered_text = [
+            f"{word}{delimiter}"
+            for word, delimiter in words_with_delimiters
+            if not word.strip() or len(word.strip()) <= max_word_length
+        ]
+
+        return "".join(filtered_text)
+
+    @staticmethod
+    def tokenizer(text: str) -> List[str]:
+        "Tokenize text into words."
+        return text.split()
+
+    @staticmethod
+    def split_entries_by_max_tokens(
+        entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False
+    ) -> List[Entry]:
+        "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
+        chunked_entries: List[Entry] = []
+        for entry in entries:
+            if is_none_or_empty(entry.compiled):
+                continue
+
+            # Split entry into chunks of max_tokens
+            # Use chunking preference order: paragraphs > sentences > words > characters
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=max_tokens,
+                separators=["\n\n", "\n", "!", "?", ".", " ", "\t", ""],
+                keep_separator=True,
+                length_function=lambda chunk: len(TextToEntries.tokenizer(chunk)),
+                chunk_overlap=0,
+            )
+            chunked_entry_chunks = text_splitter.split_text(entry.compiled)
+            corpus_id = uuid.uuid4()
+
+            # Create heading prefixed entry from each chunk
+            for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0 and entry.heading:
+                    # Snip heading to avoid crossing max_tokens limit
+                    # Keep last 100 characters of heading as entry heading more important than filename
+                    snipped_heading = entry.heading[-100:]
+                    # Prepend snipped heading
+                    compiled_entry_chunk = f"{snipped_heading}\n{compiled_entry_chunk}"
+
+                # Drop long words instead of having entry truncated to maintain quality of entry processed by models
+                compiled_entry_chunk = TextToEntries.remove_long_words(compiled_entry_chunk, max_word_length)
+
+                # Clean entry of unwanted characters like \0 character
+                compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk)
+                entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
+                entry.heading = TextToEntries.clean_field(entry.heading)
+                entry.file = TextToEntries.clean_field(entry.file)
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                        corpus_id=corpus_id,
+                    )
+                )
+
+        return chunked_entries
+
+    def update_embeddings(
+        self,
+        current_entries: List[Entry],
+        file_type: str,
+        file_source: str,
+        key="compiled",
+        logger: logging.Logger = None,
+        deletion_filenames: Set[str] = None,
+        user: KhojUser = None,
+        regenerate: bool = False,
+        file_to_text_map: dict[str, str] = None,
+    ):
+        with timer("Constructed current entry hashes in", logger):
+            hashes_by_file = dict[str, set[str]]()
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
+            for entry in tqdm(current_entries, desc="Hashing Entries"):
+                hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
+
+        num_deleted_entries = 0
+        if regenerate:
+            with timer("Cleared existing dataset for regeneration in", logger):
+                logger.debug(f"Deleting all entries for file type {file_type}")
+                num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type)
+
+        hashes_to_process = set()
+        with timer("Identified entries to add to database in", logger):
+            for file in tqdm(hashes_by_file, desc="Identify new entries"):
+                hashes_for_file = hashes_by_file[file]
+                existing_entries = DbEntry.objects.filter(
+                    user=user, hashed_value__in=hashes_for_file, file_type=file_type
+                )
+                existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
+                hashes_to_process |= hashes_for_file - existing_entry_hashes
+
+        embeddings = []
+        with timer("Generated embeddings for entries to add to database in", logger):
+            entries_to_process = [hash_to_current_entries[hashed_val] for hashed_val in hashes_to_process]
+            data_to_embed = [getattr(entry, key) for entry in entries_to_process]
+            model = get_user_search_model_or_default(user)
+            embeddings += self.embeddings_model[model.name].embed_documents(data_to_embed)
+
+        added_entries: list[DbEntry] = []
+        with timer("Added entries to database in", logger):
+            num_items = len(hashes_to_process)
+            assert num_items == len(embeddings)
+            batch_size = min(200, num_items)
+            entry_batches = zip(hashes_to_process, embeddings)
+
+            for entry_batch in tqdm(batcher(entry_batches, batch_size), desc="Add entries to database"):
+                batch_embeddings_to_create: List[DbEntry] = []
+                for entry_hash, new_entry in entry_batch:
+                    entry = hash_to_current_entries[entry_hash]
+                    batch_embeddings_to_create.append(
+                        DbEntry(
+                            user=user,
+                            embeddings=new_entry,
+                            raw=entry.raw,
+                            compiled=entry.compiled,
+                            heading=entry.heading[:1000],  # Truncate to max chars of field allowed
+                            file_path=entry.file,
+                            file_source=file_source,
+                            file_type=file_type,
+                            hashed_value=entry_hash,
+                            corpus_id=entry.corpus_id,
+                        )
+                    )
+                try:
+                    added_entries += DbEntry.objects.bulk_create(batch_embeddings_to_create)
+                except Exception as e:
+                    batch_indexing_error = "\n\n".join(
+                        f"file: {entry.file_path}\nheading: {entry.heading}\ncompiled: {entry.compiled[:100]}\nraw: {entry.raw[:100]}"
+                        for entry in batch_embeddings_to_create
+                    )
+                    logger.error(f"Error adding entries to database:\n{batch_indexing_error}\n---\n{e}", exc_info=True)
+        logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
+
+        if file_to_text_map:
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each updated file indexed on DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
+
+        new_dates = []
+        with timer("Indexed dates from added entries in", logger):
+            for added_entry in added_entries:
+                dates_in_entries = zip(self.date_filter.extract_dates(added_entry.compiled), repeat(added_entry))
+                dates_to_create = [
+                    EntryDates(date=date, entry=added_entry)
+                    for date, added_entry in dates_in_entries
+                    if not is_none_or_empty(date)
+                ]
+                new_dates += EntryDates.objects.bulk_create(dates_to_create)
+            logger.debug(f"Indexed {len(new_dates)} dates from added {file_type} entries")
+
+        with timer("Deleted entries identified by server from database in", logger):
+            for file in hashes_by_file:
+                existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file)
+                to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
+                num_deleted_entries += len(to_delete_entry_hashes)
+                EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes))
+
+        with timer("Deleted entries requested by clients from database in", logger):
+            if deletion_filenames is not None:
+                for file_path in deletion_filenames:
+                    deleted_count = EntryAdapters.delete_entry_by_file(user, file_path)
+                    num_deleted_entries += deleted_count
+                    FileObjectAdapters.delete_file_object_by_name(user, file_path)
+
+        return len(added_entries), num_deleted_entries
+
+    @staticmethod
+    def mark_entries_for_update(
+        current_entries: List[Entry],
+        previous_entries: List[Entry],
+        key="compiled",
+        logger: logging.Logger = None,
+        deletion_filenames: Set[str] = None,
+    ):
+        # Hash all current and previous entries to identify new entries
+        with timer("Hash previous, current entries", logger):
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
+            if deletion_filenames is not None:
+                deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
+                deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
+            else:
+                deletion_entry_hashes = []
+
+        with timer("Identify, Mark, Combine new, existing entries", logger):
+            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
+            hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
+
+            # All entries that did not exist in the previous set are to be added
+            new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
+            # All entries that exist in both current and previous sets are kept
+            existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
+            # All entries that exist in the previous set but not in the current set should be preserved
+            remaining_entry_hashes = set(previous_entry_hashes) - set(current_entry_hashes)
+            # All entries that exist in the previous set and also in the deletions set should be removed
+            to_delete_entry_hashes = set(previous_entry_hashes) & set(deletion_entry_hashes)
+
+            preserving_entry_hashes = existing_entry_hashes
+
+            if deletion_filenames is not None:
+                preserving_entry_hashes = (
+                    (existing_entry_hashes | remaining_entry_hashes)
+                    if len(deletion_entry_hashes) == 0
+                    else (set(previous_entry_hashes) - to_delete_entry_hashes)
+                )
+
+            # load new entries in the order in which they are processed for a stable sort
+            new_entries = [
+                (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
+                for entry_hash in new_entry_hashes
+            ]
+            new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
+            # Mark new entries with -1 id to flag for later embeddings generation
+            new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]

+            # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
+            existing_entries = [
+                (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
+                for entry_hash in preserving_entry_hashes
+            ]
+            existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
+
+            entries_with_ids = existing_entries_sorted + new_entries_sorted
+
+        return entries_with_ids
+
+    @staticmethod
+    def clean_field(field: str) -> str:
+        return field.replace("\0", "") if not is_none_or_empty(field) else ""
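A minimal sketch of the chunking helpers above, again not part of the packaged diff. It only calls the static methods, so no database or embedding model is needed, but khoj and langchain must be importable and the Django settings required by the khoj imports configured. The note text is illustrative; the Entry fields follow the usage shown in the diff.

# Sketch: split one oversized in-memory Entry with the helpers defined above.
from khoj.processor.content.text_to_entries import TextToEntries
from khoj.utils.rawconfig import Entry

long_text = "Meeting notes\n" + ("word " * 600)  # roughly 600 whitespace tokens
entry = Entry(raw=long_text, compiled=long_text, heading="Meeting notes", file="notes/meetings.txt")

chunks = TextToEntries.split_entries_by_max_tokens([entry], max_tokens=256)
print(len(chunks))               # several chunks, each within the 256-token budget
print(chunks[1].compiled[:30])   # later chunks get the (snipped) heading re-prepended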
khoj/processor/conversation/__init__.py: file without changes
khoj/processor/conversation/anthropic/__init__.py: file without changes
khoj/processor/conversation/anthropic/anthropic_chat.py
@@ -0,0 +1,206 @@
+import json
+import logging
+import re
+from datetime import datetime, timedelta
+from typing import Dict, Optional
+
+from langchain.schema import ChatMessage
+
+from khoj.database.models import Agent
+from khoj.processor.conversation import prompts
+from khoj.processor.conversation.anthropic.utils import (
+    anthropic_chat_completion_with_backoff,
+    anthropic_completion_with_backoff,
+)
+from khoj.processor.conversation.utils import generate_chatml_messages_with_context
+from khoj.utils.helpers import ConversationCommand, is_none_or_empty
+from khoj.utils.rawconfig import LocationData
+
+logger = logging.getLogger(__name__)
+
+
+def extract_questions_anthropic(
+    text,
+    model: Optional[str] = "claude-instant-1.2",
+    conversation_log={},
+    api_key=None,
+    temperature=0,
+    location_data: LocationData = None,
+):
+    """
+    Infer search queries to retrieve relevant notes to answer user query
+    """
+    # Extract Past User Message and Inferred Questions from Conversation Log
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+
+    # Extract Past User Message and Inferred Questions from Conversation Log
+    chat_history = "".join(
+        [
+            f'Q: {chat["intent"]["query"]}\nKhoj: {{"queries": {chat["intent"].get("inferred-queries") or list([chat["intent"]["query"]])}}}\nA: {chat["message"]}\n\n'
+            for chat in conversation_log.get("chat", [])[-4:]
+            if chat["by"] == "khoj" and "text-to-image" not in chat["intent"].get("type")
+        ]
+    )
+
+    # Get dates relative to today for prompt creation
+    today = datetime.today()
+    current_new_year = today.replace(month=1, day=1)
+    last_new_year = current_new_year.replace(year=today.year - 1)
+
+    system_prompt = prompts.extract_questions_anthropic_system_prompt.format(
+        current_date=today.strftime("%Y-%m-%d"),
+        day_of_week=today.strftime("%A"),
+        last_new_year=last_new_year.strftime("%Y"),
+        last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
+        current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
+        yesterday_date=(today - timedelta(days=1)).strftime("%Y-%m-%d"),
+        location=location,
+    )
+
+    prompt = prompts.extract_questions_anthropic_user_message.format(
+        chat_history=chat_history,
+        text=text,
+    )
+
+    messages = [ChatMessage(content=prompt, role="user")]
+
+    response = anthropic_completion_with_backoff(
+        messages=messages,
+        system_prompt=system_prompt,
+        model_name=model,
+        temperature=temperature,
+        api_key=api_key,
+    )
+
+    # Extract, Clean Message from Claude's Response
+    try:
+        response = response.strip()
+        match = re.search(r"\{.*?\}", response)
+        if match:
+            response = match.group()
+        response = json.loads(response)
+        response = [q.strip() for q in response["queries"] if q.strip()]
+        if not isinstance(response, list) or not response:
+            logger.error(f"Invalid response for constructing subqueries: {response}")
+            return [text]
+        return response
+    except:
+        logger.warning(f"Claude returned invalid JSON. Falling back to using user message as search query.\n{response}")
+        questions = [text]
+    logger.debug(f"Extracted Questions by Claude: {questions}")
+    return questions
+
+
+def anthropic_send_message_to_model(messages, api_key, model):
+    """
+    Send message to model
+    """
+    # Anthropic requires the first message to be a 'user' message, and the system prompt is not to be sent in the messages parameter
+    system_prompt = None
+
+    if len(messages) == 1:
+        messages[0].role = "user"
+    else:
+        system_prompt = ""
+        for message in messages.copy():
+            if message.role == "system":
+                system_prompt += message.content
+                messages.remove(message)
+
+    # Get Response from GPT. Don't use response_type because Anthropic doesn't support it.
+    return anthropic_completion_with_backoff(
+        messages=messages,
+        system_prompt=system_prompt,
+        model_name=model,
+        api_key=api_key,
+    )
+
+
+def converse_anthropic(
+    references,
+    user_query,
+    online_results: Optional[Dict[str, Dict]] = None,
+    conversation_log={},
+    model: Optional[str] = "claude-instant-1.2",
+    api_key: Optional[str] = None,
+    completion_func=None,
+    conversation_commands=[ConversationCommand.Default],
+    max_prompt_size=None,
+    tokenizer_name=None,
+    location_data: LocationData = None,
+    user_name: str = None,
+    agent: Agent = None,
+):
+    """
+    Converse with user using Anthropic's Claude
+    """
+    # Initialize Variables
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    compiled_references = "\n\n".join({f"# {item}" for item in references})
+
+    conversation_primer = prompts.query_prompt.format(query=user_query)
+
+    if agent and agent.personality:
+        system_prompt = prompts.custom_personality.format(
+            name=agent.name, bio=agent.personality, current_date=current_date
+        )
+    else:
+        system_prompt = prompts.personality.format(current_date=current_date)
+
+    if location_data:
+        location = f"{location_data.city}, {location_data.region}, {location_data.country}"
+        location_prompt = prompts.user_location.format(location=location)
+        system_prompt = f"{system_prompt}\n{location_prompt}"
+
+    if user_name:
+        user_name_prompt = prompts.user_name.format(name=user_name)
+        system_prompt = f"{system_prompt}\n{user_name_prompt}"
+
+    # Get Conversation Primer appropriate to Conversation Type
+    if conversation_commands == [ConversationCommand.Notes] and is_none_or_empty(compiled_references):
+        completion_func(chat_response=prompts.no_notes_found.format())
+        return iter([prompts.no_notes_found.format()])
+    elif conversation_commands == [ConversationCommand.Online] and is_none_or_empty(online_results):
+        completion_func(chat_response=prompts.no_online_results_found.format())
+        return iter([prompts.no_online_results_found.format()])
+
+    if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands:
+        conversation_primer = (
+            f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}"
+        )
+    if not is_none_or_empty(compiled_references):
+        conversation_primer = f"{prompts.notes_conversation.format(query=user_query, references=compiled_references)}\n\n{conversation_primer}"
+
+    # Setup Prompt with Primer or Conversation History
+    messages = generate_chatml_messages_with_context(
+        conversation_primer,
+        conversation_log=conversation_log,
+        model_name=model,
+        max_prompt_size=max_prompt_size,
+        tokenizer_name=tokenizer_name,
+    )
+
+    if len(messages) > 1:
+        if messages[0].role == "assistant":
+            messages = messages[1:]
+
+    for message in messages.copy():
+        if message.role == "system":
+            system_prompt += message.content
+            messages.remove(message)
+
+    truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
+    logger.debug(f"Conversation Context for Claude: {truncated_messages}")
+
+    # Get Response from Claude
+    return anthropic_chat_completion_with_backoff(
+        messages=messages,
+        compiled_references=references,
+        online_results=online_results,
+        model_name=model,
+        temperature=0,
+        api_key=api_key,
+        system_prompt=system_prompt,
+        completion_func=completion_func,
+        max_prompt_size=max_prompt_size,
+    )
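Finally, a hedged sketch of calling the query-extraction helper above directly. It is not part of the packaged diff; it assumes khoj is installed, its Django settings are configured, and a valid Anthropic API key is available (the key below is a placeholder). The question text is illustrative.

# Sketch: ask Claude to infer search queries for a user question.
from khoj.processor.conversation.anthropic.anthropic_chat import (
    extract_questions_anthropic,
)

queries = extract_questions_anthropic(
    "What did I write about gardening last spring?",
    model="claude-instant-1.2",        # default model name used in the module above
    api_key="ANTHROPIC_API_KEY_HERE",  # placeholder; supply a real key
)
print(queries)  # falls back to the raw question if Claude returns invalid JSON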