khoj 1.16.1.dev15__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (242)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/processor/content/plaintext/plaintext_to_entries.py
@@ -0,0 +1,122 @@
+import logging
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import urllib3
+from bs4 import BeautifulSoup
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class PlaintextToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == ""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified plaintext files
+        with timer("Extract entries from specified Plaintext files", logger):
+            file_to_text_map, current_entries = PlaintextToEntries.extract_plaintext_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256, raw_is_compiled=True)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.PLAINTEXT,
+                DbEntry.EntrySource.COMPUTER,
+                key="compiled",
+                logger=logger,
+                deletion_filenames=deletion_file_names,
+                user=user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_html_content(markup_content: str, markup_type: str):
+        "Extract content from HTML"
+        if markup_type == "xml":
+            soup = BeautifulSoup(markup_content, "xml")
+        else:
+            soup = BeautifulSoup(markup_content, "html.parser")
+        return soup.get_text(strip=True, separator="\n")
+
+    @staticmethod
+    def extract_plaintext_entries(text_files: Dict[str, str]) -> Tuple[Dict, List[Entry]]:
+        entries: List[str] = []
+        entry_to_file_map: List[Tuple[str, str]] = []
+        file_to_text_map = dict()
+        for text_file in text_files:
+            try:
+                text_content = text_files[text_file]
+                entries, entry_to_file_map = PlaintextToEntries.process_single_plaintext_file(
+                    text_content, text_file, entries, entry_to_file_map
+                )
+                file_to_text_map[text_file] = text_content
+            except Exception as e:
+                logger.warning(f"Unable to read file: {text_file} as plaintext. Skipping file.")
+                logger.warning(e, exc_info=True)
+
+        # Extract Entries from specified plaintext files
+        return file_to_text_map, PlaintextToEntries.convert_text_files_to_entries(entries, dict(entry_to_file_map))
+
+    @staticmethod
+    def process_single_plaintext_file(
+        text_content: str,
+        text_file: str,
+        entries: List[str],
+        entry_to_file_map: List[Tuple[str, str]],
+    ) -> Tuple[List[str], List[Tuple[str, str]]]:
+        if text_file.endswith(("html", "htm", "xml")):
+            text_content = PlaintextToEntries.extract_html_content(text_content, text_file.split(".")[-1])
+        entry_to_file_map += [(text_content, text_file)]
+        entries.extend([text_content])
+        return entries, entry_to_file_map
+
+    @staticmethod
+    def convert_text_files_to_entries(parsed_entries: List[str], entry_to_file_map: dict[str, str]) -> List[Entry]:
+        "Convert each plaintext file into an entry"
+        entries: List[Entry] = []
+        for parsed_entry in parsed_entries:
+            raw_filename = entry_to_file_map[parsed_entry]
+            # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
+            if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
+                # Escape the URL to avoid issues with special characters
+                entry_filename = urllib3.util.parse_url(raw_filename).url
+            else:
+                entry_filename = raw_filename
+
+            # Append base filename to compiled entry for context to model
+            entries.append(
+                Entry(
+                    raw=parsed_entry,
+                    file=f"{entry_filename}",
+                    compiled=f"{entry_filename}\n{parsed_entry}",
+                    heading=entry_filename,
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} plaintext files to entries")
+        return entries
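For orientation, a minimal usage sketch of the static helpers in the file above. It is not part of the packaged diff; it assumes khoj 1.16.x and its dependencies (including bs4) are installed and that the Django settings khoj imports require are configured. The file names and contents are purely illustrative.

# Sketch: exercise the plaintext extraction helpers shown in the diff above.
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries

# Illustrative inputs: a plain text note and an HTML page, keyed by path/URL.
files = {
    "notes/todo.txt": "Buy groceries\nDraft the report",
    "https://example.org/page.html": "<html><body><p>Hello world</p></body></html>",
}

file_to_text_map, entries = PlaintextToEntries.extract_plaintext_entries(files)
for entry in entries:
    # Each Entry prepends its source filename/URL to the compiled text.
    print(entry.file, "->", entry.compiled.splitlines()[0])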
khoj/processor/content/text_to_entries.py
@@ -0,0 +1,297 @@
+import hashlib
+import logging
+import re
+import uuid
+from abc import ABC, abstractmethod
+from itertools import repeat
+from typing import Any, Callable, List, Set, Tuple
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from tqdm import tqdm
+
+from khoj.database.adapters import (
+    EntryAdapters,
+    FileObjectAdapters,
+    get_user_search_model_or_default,
+)
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import EntryDates, KhojUser
+from khoj.search_filter.date_filter import DateFilter
+from khoj.utils import state
+from khoj.utils.helpers import batcher, is_none_or_empty, timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class TextToEntries(ABC):
+    def __init__(self, config: Any = None):
+        self.embeddings_model = state.embeddings_model
+        self.config = config
+        self.date_filter = DateFilter()
+
+    @abstractmethod
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        ...
+
+    @staticmethod
+    def hash_func(key: str) -> Callable:
+        return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding="utf-8")).hexdigest()
+
+    @staticmethod
+    def remove_long_words(text: str, max_word_length: int = 500) -> str:
+        "Remove words longer than max_word_length from text."
+        # Split the string by words, keeping the delimiters
+        splits = re.split(r"(\s+)", text) + [""]
+        words_with_delimiters = list(zip(splits[::2], splits[1::2]))
+
+        # Filter out long words while preserving delimiters in text
+        filtered_text = [
+            f"{word}{delimiter}"
+            for word, delimiter in words_with_delimiters
+            if not word.strip() or len(word.strip()) <= max_word_length
+        ]
+
+        return "".join(filtered_text)
+
+    @staticmethod
+    def tokenizer(text: str) -> List[str]:
+        "Tokenize text into words."
+        return text.split()
+
+    @staticmethod
+    def split_entries_by_max_tokens(
+        entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500, raw_is_compiled: bool = False
+    ) -> List[Entry]:
+        "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
+        chunked_entries: List[Entry] = []
+        for entry in entries:
+            if is_none_or_empty(entry.compiled):
+                continue
+
+            # Split entry into chunks of max_tokens
+            # Use chunking preference order: paragraphs > sentences > words > characters
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=max_tokens,
+                separators=["\n\n", "\n", "!", "?", ".", " ", "\t", ""],
+                keep_separator=True,
+                length_function=lambda chunk: len(TextToEntries.tokenizer(chunk)),
+                chunk_overlap=0,
+            )
+            chunked_entry_chunks = text_splitter.split_text(entry.compiled)
+            corpus_id = uuid.uuid4()
+
+            # Create heading prefixed entry from each chunk
+            for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks):
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0 and entry.heading:
+                    # Snip heading to avoid crossing max_tokens limit
+                    # Keep last 100 characters of heading as entry heading more important than filename
+                    snipped_heading = entry.heading[-100:]
+                    # Prepend snipped heading
+                    compiled_entry_chunk = f"{snipped_heading}\n{compiled_entry_chunk}"
+
+                # Drop long words instead of having entry truncated to maintain quality of entry processed by models
+                compiled_entry_chunk = TextToEntries.remove_long_words(compiled_entry_chunk, max_word_length)
+
+                # Clean entry of unwanted characters like \0 character
+                compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk)
+                entry.raw = compiled_entry_chunk if raw_is_compiled else TextToEntries.clean_field(entry.raw)
+                entry.heading = TextToEntries.clean_field(entry.heading)
+                entry.file = TextToEntries.clean_field(entry.file)
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                        corpus_id=corpus_id,
+                    )
+                )
+
+        return chunked_entries
+
+    def update_embeddings(
+        self,
+        current_entries: List[Entry],
+        file_type: str,
+        file_source: str,
+        key="compiled",
+        logger: logging.Logger = None,
+        deletion_filenames: Set[str] = None,
+        user: KhojUser = None,
+        regenerate: bool = False,
+        file_to_text_map: dict[str, str] = None,
+    ):
+        with timer("Constructed current entry hashes in", logger):
+            hashes_by_file = dict[str, set[str]]()
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
+            for entry in tqdm(current_entries, desc="Hashing Entries"):
+                hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
+
+        num_deleted_entries = 0
+        if regenerate:
+            with timer("Cleared existing dataset for regeneration in", logger):
+                logger.debug(f"Deleting all entries for file type {file_type}")
+                num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type)
+
+        hashes_to_process = set()
+        with timer("Identified entries to add to database in", logger):
+            for file in tqdm(hashes_by_file, desc="Identify new entries"):
+                hashes_for_file = hashes_by_file[file]
+                existing_entries = DbEntry.objects.filter(
+                    user=user, hashed_value__in=hashes_for_file, file_type=file_type
+                )
+                existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
+                hashes_to_process |= hashes_for_file - existing_entry_hashes
+
+        embeddings = []
+        with timer("Generated embeddings for entries to add to database in", logger):
+            entries_to_process = [hash_to_current_entries[hashed_val] for hashed_val in hashes_to_process]
+            data_to_embed = [getattr(entry, key) for entry in entries_to_process]
+            model = get_user_search_model_or_default(user)
+            embeddings += self.embeddings_model[model.name].embed_documents(data_to_embed)
+
+        added_entries: list[DbEntry] = []
+        with timer("Added entries to database in", logger):
+            num_items = len(hashes_to_process)
+            assert num_items == len(embeddings)
+            batch_size = min(200, num_items)
+            entry_batches = zip(hashes_to_process, embeddings)
+
+            for entry_batch in tqdm(batcher(entry_batches, batch_size), desc="Add entries to database"):
+                batch_embeddings_to_create: List[DbEntry] = []
+                for entry_hash, new_entry in entry_batch:
+                    entry = hash_to_current_entries[entry_hash]
+                    batch_embeddings_to_create.append(
+                        DbEntry(
+                            user=user,
+                            embeddings=new_entry,
+                            raw=entry.raw,
+                            compiled=entry.compiled,
+                            heading=entry.heading[:1000],  # Truncate to max chars of field allowed
+                            file_path=entry.file,
+                            file_source=file_source,
+                            file_type=file_type,
+                            hashed_value=entry_hash,
+                            corpus_id=entry.corpus_id,
+                        )
+                    )
+                try:
+                    added_entries += DbEntry.objects.bulk_create(batch_embeddings_to_create)
+                except Exception as e:
+                    batch_indexing_error = "\n\n".join(
+                        f"file: {entry.file_path}\nheading: {entry.heading}\ncompiled: {entry.compiled[:100]}\nraw: {entry.raw[:100]}"
+                        for entry in batch_embeddings_to_create
+                    )
+                    logger.error(f"Error adding entries to database:\n{batch_indexing_error}\n---\n{e}", exc_info=True)
+        logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
+
+        if file_to_text_map:
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each updated file indexed on DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
+
+        new_dates = []
+        with timer("Indexed dates from added entries in", logger):
+            for added_entry in added_entries:
+                dates_in_entries = zip(self.date_filter.extract_dates(added_entry.compiled), repeat(added_entry))
+                dates_to_create = [
+                    EntryDates(date=date, entry=added_entry)
+                    for date, added_entry in dates_in_entries
+                    if not is_none_or_empty(date)
+                ]
+                new_dates += EntryDates.objects.bulk_create(dates_to_create)
+            logger.debug(f"Indexed {len(new_dates)} dates from added {file_type} entries")
+
+        with timer("Deleted entries identified by server from database in", logger):
+            for file in hashes_by_file:
+                existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file)
+                to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
+                num_deleted_entries += len(to_delete_entry_hashes)
+                EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes))
+
+        with timer("Deleted entries requested by clients from database in", logger):
+            if deletion_filenames is not None:
+                for file_path in deletion_filenames:
+                    deleted_count = EntryAdapters.delete_entry_by_file(user, file_path)
+                    num_deleted_entries += deleted_count
+                    FileObjectAdapters.delete_file_object_by_name(user, file_path)
+
+        return len(added_entries), num_deleted_entries
+
+    @staticmethod
+    def mark_entries_for_update(
+        current_entries: List[Entry],
+        previous_entries: List[Entry],
+        key="compiled",
+        logger: logging.Logger = None,
+        deletion_filenames: Set[str] = None,
+    ):
+        # Hash all current and previous entries to identify new entries
+        with timer("Hash previous, current entries", logger):
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
+            if deletion_filenames is not None:
+                deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
+                deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
+            else:
+                deletion_entry_hashes = []
+
+        with timer("Identify, Mark, Combine new, existing entries", logger):
+            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
+            hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
+
+            # All entries that did not exist in the previous set are to be added
+            new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
+            # All entries that exist in both current and previous sets are kept
+            existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
+            # All entries that exist in the previous set but not in the current set should be preserved
+            remaining_entry_hashes = set(previous_entry_hashes) - set(current_entry_hashes)
+            # All entries that exist in the previous set and also in the deletions set should be removed
+            to_delete_entry_hashes = set(previous_entry_hashes) & set(deletion_entry_hashes)
+
+            preserving_entry_hashes = existing_entry_hashes
+
+            if deletion_filenames is not None:
+                preserving_entry_hashes = (
+                    (existing_entry_hashes | remaining_entry_hashes)
+                    if len(deletion_entry_hashes) == 0
+                    else (set(previous_entry_hashes) - to_delete_entry_hashes)
+                )
+
+            # load new entries in the order in which they are processed for a stable sort
+            new_entries = [
+                (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
+                for entry_hash in new_entry_hashes
+            ]
+            new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
+            # Mark new entries with -1 id to flag for later embeddings generation
+            new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]

+            # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
+            existing_entries = [
+                (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
+                for entry_hash in preserving_entry_hashes
+            ]
+            existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
+
+            entries_with_ids = existing_entries_sorted + new_entries_sorted
+
+        return entries_with_ids
+
+    @staticmethod
+    def clean_field(field: str) -> str:
+        return field.replace("\0", "") if not is_none_or_empty(field) else ""
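A minimal sketch of the chunking helpers above, again not part of the packaged diff. It only calls the static methods, so no database or embedding model is needed, but khoj and langchain must be importable and the Django settings required by the khoj imports configured. The note text is illustrative; the Entry fields follow the usage shown in the diff.

# Sketch: split one oversized in-memory Entry with the helpers defined above.
from khoj.processor.content.text_to_entries import TextToEntries
from khoj.utils.rawconfig import Entry

long_text = "Meeting notes\n" + ("word " * 600)  # roughly 600 whitespace tokens
entry = Entry(raw=long_text, compiled=long_text, heading="Meeting notes", file="notes/meetings.txt")

chunks = TextToEntries.split_entries_by_max_tokens([entry], max_tokens=256)
print(len(chunks))               # several chunks, each within the 256-token budget
print(chunks[1].compiled[:30])   # later chunks get the (snipped) heading re-prepended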
khoj/processor/conversation/__init__.py: file without changes
khoj/processor/conversation/anthropic/__init__.py: file without changes
khoj/processor/conversation/anthropic/anthropic_chat.py
@@ -0,0 +1,206 @@
+import json
+import logging
+import re
+from datetime import datetime, timedelta
+from typing import Dict, Optional
+
+from langchain.schema import ChatMessage
+
+from khoj.database.models import Agent
+from khoj.processor.conversation import prompts
+from khoj.processor.conversation.anthropic.utils import (
+    anthropic_chat_completion_with_backoff,
+    anthropic_completion_with_backoff,
+)
+from khoj.processor.conversation.utils import generate_chatml_messages_with_context
+from khoj.utils.helpers import ConversationCommand, is_none_or_empty
+from khoj.utils.rawconfig import LocationData
+
+logger = logging.getLogger(__name__)
+
+
+def extract_questions_anthropic(
+    text,
+    model: Optional[str] = "claude-instant-1.2",
+    conversation_log={},
+    api_key=None,
+    temperature=0,
+    location_data: LocationData = None,
+):
+    """
+    Infer search queries to retrieve relevant notes to answer user query
+    """
+    # Extract Past User Message and Inferred Questions from Conversation Log
+    location = f"{location_data.city}, {location_data.region}, {location_data.country}" if location_data else "Unknown"
+
+    # Extract Past User Message and Inferred Questions from Conversation Log
+    chat_history = "".join(
+        [
+            f'Q: {chat["intent"]["query"]}\nKhoj: {{"queries": {chat["intent"].get("inferred-queries") or list([chat["intent"]["query"]])}}}\nA: {chat["message"]}\n\n'
+            for chat in conversation_log.get("chat", [])[-4:]
+            if chat["by"] == "khoj" and "text-to-image" not in chat["intent"].get("type")
+        ]
+    )
+
+    # Get dates relative to today for prompt creation
+    today = datetime.today()
+    current_new_year = today.replace(month=1, day=1)
+    last_new_year = current_new_year.replace(year=today.year - 1)
+
+    system_prompt = prompts.extract_questions_anthropic_system_prompt.format(
+        current_date=today.strftime("%Y-%m-%d"),
+        day_of_week=today.strftime("%A"),
+        last_new_year=last_new_year.strftime("%Y"),
+        last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
+        current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
+        yesterday_date=(today - timedelta(days=1)).strftime("%Y-%m-%d"),
+        location=location,
+    )
+
+    prompt = prompts.extract_questions_anthropic_user_message.format(
+        chat_history=chat_history,
+        text=text,
+    )
+
+    messages = [ChatMessage(content=prompt, role="user")]
+
+    response = anthropic_completion_with_backoff(
+        messages=messages,
+        system_prompt=system_prompt,
+        model_name=model,
+        temperature=temperature,
+        api_key=api_key,
+    )
+
+    # Extract, Clean Message from Claude's Response
+    try:
+        response = response.strip()
+        match = re.search(r"\{.*?\}", response)
+        if match:
+            response = match.group()
+        response = json.loads(response)
+        response = [q.strip() for q in response["queries"] if q.strip()]
+        if not isinstance(response, list) or not response:
+            logger.error(f"Invalid response for constructing subqueries: {response}")
+            return [text]
+        return response
+    except:
+        logger.warning(f"Claude returned invalid JSON. Falling back to using user message as search query.\n{response}")
+        questions = [text]
+    logger.debug(f"Extracted Questions by Claude: {questions}")
+    return questions
+
+
+def anthropic_send_message_to_model(messages, api_key, model):
+    """
+    Send message to model
+    """
+    # Anthropic requires the first message to be a 'user' message, and the system prompt is not to be sent in the messages parameter
+    system_prompt = None
+
+    if len(messages) == 1:
+        messages[0].role = "user"
+    else:
+        system_prompt = ""
+        for message in messages.copy():
+            if message.role == "system":
+                system_prompt += message.content
+                messages.remove(message)
+
+    # Get Response from GPT. Don't use response_type because Anthropic doesn't support it.
+    return anthropic_completion_with_backoff(
+        messages=messages,
+        system_prompt=system_prompt,
+        model_name=model,
+        api_key=api_key,
+    )
+
+
+def converse_anthropic(
+    references,
+    user_query,
+    online_results: Optional[Dict[str, Dict]] = None,
+    conversation_log={},
+    model: Optional[str] = "claude-instant-1.2",
+    api_key: Optional[str] = None,
+    completion_func=None,
+    conversation_commands=[ConversationCommand.Default],
+    max_prompt_size=None,
+    tokenizer_name=None,
+    location_data: LocationData = None,
+    user_name: str = None,
+    agent: Agent = None,
+):
+    """
+    Converse with user using Anthropic's Claude
+    """
+    # Initialize Variables
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    compiled_references = "\n\n".join({f"# {item}" for item in references})
+
+    conversation_primer = prompts.query_prompt.format(query=user_query)
+
+    if agent and agent.personality:
+        system_prompt = prompts.custom_personality.format(
+            name=agent.name, bio=agent.personality, current_date=current_date
+        )
+    else:
+        system_prompt = prompts.personality.format(current_date=current_date)
+
+    if location_data:
+        location = f"{location_data.city}, {location_data.region}, {location_data.country}"
+        location_prompt = prompts.user_location.format(location=location)
+        system_prompt = f"{system_prompt}\n{location_prompt}"
+
+    if user_name:
+        user_name_prompt = prompts.user_name.format(name=user_name)
+        system_prompt = f"{system_prompt}\n{user_name_prompt}"
+
+    # Get Conversation Primer appropriate to Conversation Type
+    if conversation_commands == [ConversationCommand.Notes] and is_none_or_empty(compiled_references):
+        completion_func(chat_response=prompts.no_notes_found.format())
+        return iter([prompts.no_notes_found.format()])
+    elif conversation_commands == [ConversationCommand.Online] and is_none_or_empty(online_results):
+        completion_func(chat_response=prompts.no_online_results_found.format())
+        return iter([prompts.no_online_results_found.format()])
+
+    if ConversationCommand.Online in conversation_commands or ConversationCommand.Webpage in conversation_commands:
+        conversation_primer = (
+            f"{prompts.online_search_conversation.format(online_results=str(online_results))}\n{conversation_primer}"
+        )
+    if not is_none_or_empty(compiled_references):
+        conversation_primer = f"{prompts.notes_conversation.format(query=user_query, references=compiled_references)}\n\n{conversation_primer}"
+
+    # Setup Prompt with Primer or Conversation History
+    messages = generate_chatml_messages_with_context(
+        conversation_primer,
+        conversation_log=conversation_log,
+        model_name=model,
+        max_prompt_size=max_prompt_size,
+        tokenizer_name=tokenizer_name,
+    )
+
+    if len(messages) > 1:
+        if messages[0].role == "assistant":
+            messages = messages[1:]
+
+    for message in messages.copy():
+        if message.role == "system":
+            system_prompt += message.content
+            messages.remove(message)
+
+    truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
+    logger.debug(f"Conversation Context for Claude: {truncated_messages}")
+
+    # Get Response from Claude
+    return anthropic_chat_completion_with_backoff(
+        messages=messages,
+        compiled_references=references,
+        online_results=online_results,
+        model_name=model,
+        temperature=0,
+        api_key=api_key,
+        system_prompt=system_prompt,
+        completion_func=completion_func,
+        max_prompt_size=max_prompt_size,
+    )
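Finally, a hedged sketch of calling the query-extraction helper above directly. It is not part of the packaged diff; it assumes khoj is installed, its Django settings are configured, and a valid Anthropic API key is available (the key below is a placeholder). The question text is illustrative.

# Sketch: ask Claude to infer search queries for a user question.
from khoj.processor.conversation.anthropic.anthropic_chat import (
    extract_questions_anthropic,
)

queries = extract_questions_anthropic(
    "What did I write about gardening last spring?",
    model="claude-instant-1.2",        # default model name used in the module above
    api_key="ANTHROPIC_API_KEY_HERE",  # placeholder; supply a real key
)
print(queries)  # falls back to the raw question if Claude returns invalid JSON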