khoj-1.16.1.dev15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/processor/content/markdown/markdown_to_entries.py
@@ -0,0 +1,165 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ import urllib3
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry
+
+ logger = logging.getLogger(__name__)
+
+
+ class MarkdownToEntries(TextToEntries):
+     def __init__(self):
+         super().__init__()
+
+     # Define Functions
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         # Extract required fields from config
+         if not full_corpus:
+             deletion_file_names = set([file for file in files if files[file] == ""])
+             files_to_process = set(files) - deletion_file_names
+             files = {file: files[file] for file in files_to_process}
+         else:
+             deletion_file_names = None
+
+         max_tokens = 256
+         # Extract Entries from specified Markdown files
+         with timer("Extract entries from specified Markdown files", logger):
+             file_to_text_map, current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)
+
+         # Split entries by max tokens supported by model
+         with timer("Split entries by max token size supported by model", logger):
+             current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens)
+
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.MARKDOWN,
+                 DbEntry.EntrySource.COMPUTER,
+                 "compiled",
+                 logger,
+                 deletion_file_names,
+                 user,
+                 regenerate=regenerate,
+                 file_to_text_map=file_to_text_map,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
+
+     @staticmethod
+     def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
+         "Extract entries by heading from specified Markdown files"
+         entries: List[str] = []
+         entry_to_file_map: List[Tuple[str, str]] = []
+         file_to_text_map = dict()
+         for markdown_file in markdown_files:
+             try:
+                 markdown_content = markdown_files[markdown_file]
+                 entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                     markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
+                 )
+                 file_to_text_map[markdown_file] = markdown_content
+             except Exception as e:
+                 logger.error(
+                     f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True
+                 )
+
+         return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map))
+
+     @staticmethod
+     def process_single_markdown_file(
+         markdown_content: str,
+         markdown_file: str,
+         entries: List[str],
+         entry_to_file_map: List[Tuple[str, str]],
+         max_tokens=256,
+         ancestry: Dict[int, str] = {},
+     ) -> Tuple[List[str], List[Tuple[str, str]]]:
+         # Prepend the markdown section's heading ancestry
+         ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
+         markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}"
+
+         # If content is small or content has no children headings, save it as a single entry
+         if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
+             rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
+         ):
+             entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
+             entries.extend([markdown_content_with_ancestry])
+             return entries, entry_to_file_map
+
+         # Split by next heading level present in the entry
+         next_heading_level = len(ancestry)
+         sections: List[str] = []
+         while len(sections) < 2:
+             next_heading_level += 1
+             sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
+
+         for section in sections:
+             # Skip empty sections
+             if section.strip() == "":
+                 continue
+
+             # Extract the section body and (when present) the heading
+             current_ancestry = ancestry.copy()
+             first_line = [line for line in section.split("\n") if line.strip() != ""][0]
+             if re.search(rf"^#{{{next_heading_level}}} ", first_line):
+                 # Extract the section body without the heading
+                 current_section_body = "\n".join(section.split(first_line)[1:])
+                 # Parse the section heading into current section ancestry
+                 current_section_title = first_line[next_heading_level:].strip()
+                 current_ancestry[next_heading_level] = current_section_title
+             else:
+                 current_section_body = section
+
+             # Recurse down children of the current entry
+             MarkdownToEntries.process_single_markdown_file(
+                 current_section_body,
+                 markdown_file,
+                 entries,
+                 entry_to_file_map,
+                 max_tokens,
+                 current_ancestry,
+             )
+
+         return entries, entry_to_file_map
+
+     @staticmethod
+     def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+         "Convert each parsed Markdown entry into an Entry object"
+         entries: List[Entry] = []
+         for parsed_entry in parsed_entries:
+             raw_filename = entry_to_file_map[parsed_entry]
+
+             # Check if raw_filename is a URL. If so, save it as is. If not, convert it to a Path.
+             if type(raw_filename) == str and re.search(r"^https?://", raw_filename):
+                 # Escape the URL to avoid issues with special characters
+                 entry_filename = urllib3.util.parse_url(raw_filename).url
+             else:
+                 entry_filename = str(Path(raw_filename))
+
+             heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
+             # Append base filename to compiled entry for context to model
+             # Increment heading level for heading entries and make filename as its top level heading
+             prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n"
+             compiled_entry = f"{prefix}{parsed_entry}"
+             entries.append(
+                 Entry(
+                     compiled=compiled_entry,
+                     raw=parsed_entry,
+                     heading=f"{prefix}{heading}",
+                     file=f"{entry_filename}",
+                 )
+             )
+
+         logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
+
+         return entries
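
For orientation, here is a minimal sketch of driving the heading-based chunker above, assuming the khoj package from this wheel is installed and that TextToEntries.tokenizer needs no further setup. The file name "notes.md" and the sample content are illustrative only.

from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries

# Map of file path -> file content, the shape process() and the extractors expect
notes = "# Projects\n## Khoj\nChunk notes by heading before embedding them.\n## Ideas\nSections over max_tokens are split recursively by the next heading level.\n"
files = {"notes.md": notes}

# Returns the file -> raw text map and the heading-chunked Entry objects
file_to_text_map, entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens=256)
for entry in entries:
    # Each compiled entry is prefixed with its source filename for model context
    print(entry.file, "->", entry.compiled.splitlines()[0])
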
khoj/processor/content/notion/notion_to_entries.py
@@ -0,0 +1,260 @@
+ import logging
+ from enum import Enum
+ from typing import Tuple
+
+ import requests
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser, NotionConfig
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry, NotionContentConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class NotionBlockType(Enum):
+     PARAGRAPH = "paragraph"
+     HEADING_1 = "heading_1"
+     HEADING_2 = "heading_2"
+     HEADING_3 = "heading_3"
+     BULLETED_LIST_ITEM = "bulleted_list_item"
+     NUMBERED_LIST_ITEM = "numbered_list_item"
+     TO_DO = "to_do"
+     TOGGLE = "toggle"
+     CHILD_PAGE = "child_page"
+     UNSUPPORTED = "unsupported"
+     BOOKMARK = "bookmark"
+     DIVIDER = "divider"
+     PDF = "pdf"
+     IMAGE = "image"
+     EMBED = "embed"
+     VIDEO = "video"
+     FILE = "file"
+     SYNCED_BLOCK = "synced_block"
+     TABLE_OF_CONTENTS = "table_of_contents"
+     COLUMN = "column"
+     EQUATION = "equation"
+     LINK_PREVIEW = "link_preview"
+     COLUMN_LIST = "column_list"
+     QUOTE = "quote"
+     BREADCRUMB = "breadcrumb"
+     LINK_TO_PAGE = "link_to_page"
+     CHILD_DATABASE = "child_database"
+     TEMPLATE = "template"
+     CALLOUT = "callout"
+
+
+ class NotionToEntries(TextToEntries):
+     def __init__(self, config: NotionConfig):
+         super().__init__(config)
+         self.config = NotionContentConfig(
+             token=config.token,
+         )
+         self.session = requests.Session()
+         self.session.headers.update({"Authorization": f"Bearer {config.token}", "Notion-Version": "2022-02-22"})
+         self.unsupported_block_types = [
+             NotionBlockType.BOOKMARK.value,
+             NotionBlockType.DIVIDER.value,
+             NotionBlockType.CHILD_DATABASE.value,
+             NotionBlockType.TEMPLATE.value,
+             NotionBlockType.CALLOUT.value,
+             NotionBlockType.UNSUPPORTED.value,
+         ]
+
+         self.display_block_block_types = [
+             NotionBlockType.PARAGRAPH.value,
+             NotionBlockType.HEADING_1.value,
+             NotionBlockType.HEADING_2.value,
+             NotionBlockType.HEADING_3.value,
+             NotionBlockType.BULLETED_LIST_ITEM.value,
+             NotionBlockType.NUMBERED_LIST_ITEM.value,
+             NotionBlockType.TO_DO.value,
+             NotionBlockType.TOGGLE.value,
+             NotionBlockType.CHILD_PAGE.value,
+             NotionBlockType.BOOKMARK.value,
+             NotionBlockType.DIVIDER.value,
+         ]
+
+         self.body_params = {"page_size": 100}
+
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         current_entries = []
+
+         # Get all pages
+         with timer("Getting all pages via search endpoint", logger=logger):
+             responses = []
+
+             while True:
+                 result = self.session.post(
+                     "https://api.notion.com/v1/search",
+                     json=self.body_params,
+                 ).json()
+                 responses.append(result)
+                 if result.get("has_more", False) == False:
+                     break
+                 else:
+                     self.body_params.update({"start_cursor": result["next_cursor"]})
+
+         for response in responses:
+             with timer("Processing response", logger=logger):
+                 pages_or_databases = response.get("results", [])
+
+                 # Get all pages content
+                 for p_or_d in pages_or_databases:
+                     with timer(f"Processing {p_or_d['object']} {p_or_d['id']}", logger=logger):
+                         if p_or_d["object"] == "database":
+                             # TODO: Handle databases
+                             continue
+                         elif p_or_d["object"] == "page":
+                             page_entries = self.process_page(p_or_d)
+                             current_entries.extend(page_entries)
+
+         current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+         return self.update_entries_with_ids(current_entries, user=user)
+
+     def process_page(self, page):
+         page_id = page["id"]
+         title, content = self.get_page_content(page_id)
+
+         if title == None or content == None:
+             return []
+
+         current_entries = []
+         curr_heading = ""
+         for block in content.get("results", []):
+             block_type = block.get("type")
+
+             if block_type == None:
+                 continue
+             block_data = block[block_type]
+
+             if block_data.get("rich_text") == None or len(block_data["rich_text"]) == 0:
+                 # There's no text to handle here.
+                 continue
+
+             raw_content = ""
+             if block_type in ["heading_1", "heading_2", "heading_3"]:
+                 # If the current block is a heading, we can consider the previous block processing completed.
+                 # Add it as an entry and move on to processing the next chunk of the page.
+                 if raw_content != "":
+                     current_entries.append(
+                         Entry(
+                             compiled=raw_content,
+                             raw=raw_content,
+                             heading=title,
+                             file=page["url"],
+                         )
+                     )
+                 curr_heading = block_data["rich_text"][0]["plain_text"]
+             else:
+                 if curr_heading != "":
+                     # Add the last known heading to the content for additional context
+                     raw_content = self.process_heading(curr_heading)
+                 for text in block_data["rich_text"]:
+                     raw_content += self.process_text(text)
+
+                 if block.get("has_children", True):
+                     raw_content += "\n"
+                     raw_content = self.process_nested_children(
+                         self.get_block_children(block["id"]), raw_content, block_type
+                     )
+
+                 if raw_content != "":
+                     current_entries.append(
+                         Entry(
+                             compiled=raw_content,
+                             raw=raw_content,
+                             heading=title,
+                             file=page["url"],
+                         )
+                     )
+         return current_entries
+
+     def process_heading(self, heading):
+         return f"\n<b>{heading}</b>\n"
+
+     def process_nested_children(self, children, raw_content, block_type=None):
+         results = children.get("results", [])
+         for child in results:
+             child_type = child.get("type")
+             if child_type == None:
+                 continue
+             child_data = child[child_type]
+             if child_data.get("rich_text") and len(child_data["rich_text"]) > 0:
+                 for text in child_data["rich_text"]:
+                     raw_content += self.process_text(text, block_type)
+             if child_data.get("has_children", True):
+                 return self.process_nested_children(self.get_block_children(child["id"]), raw_content, block_type)
+
+         return raw_content
+
+     def process_text(self, text, block_type=None):
+         text_type = text.get("type", None)
+         if text_type in self.unsupported_block_types:
+             return ""
+         if text.get("href", None):
+             return f"<a href='{text['href']}'>{text['plain_text']}</a>"
+         raw_text = text["plain_text"]
+         if text_type in self.display_block_block_types or block_type in self.display_block_block_types:
+             return f"\n{raw_text}\n"
+         return raw_text
+
+     def get_block_children(self, block_id):
+         try:
+             return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()
+         except Exception as e:
+             logger.error(f"Error getting children for block {block_id}: {e}")
+             return {}
+
+     def get_page(self, page_id):
+         return self.session.get(f"https://api.notion.com/v1/pages/{page_id}").json()
+
+     def get_page_children(self, page_id):
+         return self.session.get(f"https://api.notion.com/v1/blocks/{page_id}/children").json()
+
+     def get_page_content(self, page_id):
+         try:
+             page = self.get_page(page_id)
+             content = self.get_page_children(page_id)
+         except Exception as e:
+             logger.error(f"Error getting page {page_id}: {e}", exc_info=True)
+             return None, None
+         properties = page.get("properties", {})
+
+         title_field = "title"
+         if "Title" in properties:
+             title_field = "Title"
+         elif "Name" in properties:
+             title_field = "Name"
+         elif "Page" in properties:
+             title_field = "Page"
+         elif "Event" in properties:
+             title_field = "Event"
+         elif title_field not in properties:
+             logger.debug(f"Title field not found for page {page_id}. Setting title as None...")
+             title = None
+             return title, content
+         try:
+             title = page["properties"][title_field]["title"][0]["text"]["content"]
+         except Exception as e:
+             logger.warning(f"Error getting title for page {page_id}: {e}. Setting title as None...")
+             title = None
+         return title, content
+
+     def update_entries_with_ids(self, current_entries, user: KhojUser = None):
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.NOTION,
+                 DbEntry.EntrySource.NOTION,
+                 key="compiled",
+                 logger=logger,
+                 user=user,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
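
The while-loop in process() above pages through Notion's /v1/search endpoint with cursor-based pagination. Here is a standalone sketch of that same pattern using only requests, assuming the NOTION_TOKEN environment variable holds a valid Notion integration token; everything else mirrors the endpoint, headers, and body parameters shown in the diff.

import os

import requests

session = requests.Session()
session.headers.update({"Authorization": f"Bearer {os.environ['NOTION_TOKEN']}", "Notion-Version": "2022-02-22"})

body = {"page_size": 100}
pages = []
while True:
    result = session.post("https://api.notion.com/v1/search", json=body).json()
    # Collect page objects; database objects are skipped, as in process() above
    pages += [r for r in result.get("results", []) if r.get("object") == "page"]
    if not result.get("has_more", False):
        break
    # Resume the search from the cursor returned by the previous response
    body["start_cursor"] = result["next_cursor"]

print(f"{len(pages)} pages accessible to this integration")
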
khoj/processor/content/org_mode/org_to_entries.py
@@ -0,0 +1,231 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ from khoj.database.models import Entry as DbEntry
+ from khoj.database.models import KhojUser
+ from khoj.processor.content.org_mode import orgnode
+ from khoj.processor.content.org_mode.orgnode import Orgnode
+ from khoj.processor.content.text_to_entries import TextToEntries
+ from khoj.utils import state
+ from khoj.utils.helpers import timer
+ from khoj.utils.rawconfig import Entry
+
+ logger = logging.getLogger(__name__)
+
+
+ class OrgToEntries(TextToEntries):
+     def __init__(self):
+         super().__init__()
+
+     # Define Functions
+     def process(
+         self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+     ) -> Tuple[int, int]:
+         if not full_corpus:
+             deletion_file_names = set([file for file in files if files[file] == ""])
+             files_to_process = set(files) - deletion_file_names
+             files = {file: files[file] for file in files_to_process}
+         else:
+             deletion_file_names = None
+
+         # Extract Entries from specified Org files
+         max_tokens = 256
+         with timer("Extract entries from specified Org files", logger):
+             file_to_text_map, current_entries = self.extract_org_entries(files, max_tokens=max_tokens)
+
+         with timer("Split entries by max token size supported by model", logger):
+             current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=max_tokens)
+
+         # Identify, mark and merge any new entries with previous entries
+         with timer("Identify new or updated entries", logger):
+             num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                 current_entries,
+                 DbEntry.EntryType.ORG,
+                 DbEntry.EntrySource.COMPUTER,
+                 "compiled",
+                 logger,
+                 deletion_file_names,
+                 user,
+                 regenerate=regenerate,
+                 file_to_text_map=file_to_text_map,
+             )
+
+         return num_new_embeddings, num_deleted_embeddings
+
+     @staticmethod
+     def extract_org_entries(
+         org_files: dict[str, str], index_heading_entries: bool = False, max_tokens=256
+     ) -> Tuple[Dict, List[Entry]]:
+         "Extract entries from specified Org files"
+         file_to_text_map, entries, entry_to_file_map = OrgToEntries.extract_org_nodes(org_files, max_tokens)
+         return file_to_text_map, OrgToEntries.convert_org_nodes_to_entries(
+             entries, entry_to_file_map, index_heading_entries
+         )
+
+     @staticmethod
+     def extract_org_nodes(
+         org_files: dict[str, str], max_tokens
+     ) -> Tuple[Dict, List[List[Orgnode]], Dict[Orgnode, str]]:
+         "Extract org nodes from specified org files"
+         entries: List[List[Orgnode]] = []
+         entry_to_file_map: List[Tuple[Orgnode, str]] = []
+         file_to_text_map = {}
+         for org_file in org_files:
+             try:
+                 org_content = org_files[org_file]
+                 entries, entry_to_file_map = OrgToEntries.process_single_org_file(
+                     org_content, org_file, entries, entry_to_file_map, max_tokens
+                 )
+                 file_to_text_map[org_file] = org_content
+             except Exception as e:
+                 logger.error(f"Unable to process file: {org_file}. Skipped indexing it.\nError: {e}", exc_info=True)
+
+         return file_to_text_map, entries, dict(entry_to_file_map)
+
+     @staticmethod
+     def process_single_org_file(
+         org_content: str,
+         org_file: str,
+         entries: List[List[Orgnode]],
+         entry_to_file_map: List[Tuple[Orgnode, str]],
+         max_tokens=256,
+         ancestry: Dict[int, str] = {},
+     ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]:
+         """Parse org_content from org_file into OrgNode entries
+
+         Recurse down org file entries, one heading level at a time,
+         until we reach a leaf entry or the current entry tree fits within max_tokens.
+
+         Parse each recursion-terminating entry tree into a list of OrgNode objects.
+         """
+         # Prepend the org section's heading ancestry
+         ancestry_string = "\n".join([f"{'*' * key} {ancestry[key]}" for key in sorted(ancestry.keys())])
+         org_content_with_ancestry = f"{ancestry_string}{org_content}"
+
+         # If content is small or content has no children headings, save it as a single entry
+         # Note: This is the terminating condition for this recursive function
+         if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search(
+             rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE
+         ):
+             orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file)
+             entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry))
+             entries.extend([orgnode_content_with_ancestry])
+             return entries, entry_to_file_map
+
+         # Split this entry tree into sections by the next heading level in it
+         # Increment heading level until able to split entry into sections or reach max heading level
+         # A successful split will result in at least 2 sections
+         max_heading_level = 100
+         next_heading_level = len(ancestry)
+         sections: List[str] = []
+         while len(sections) < 2 and next_heading_level < max_heading_level:
+             next_heading_level += 1
+             sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
+
+         # If unable to split entry into sections, log error and skip indexing it
+         if next_heading_level == max_heading_level:
+             logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
+             return entries, entry_to_file_map
+
+         # Recurse down each non-empty section after parsing its body, heading and ancestry
+         for section in sections:
+             # Skip empty sections
+             if section.strip() == "":
+                 continue
+
+             # Extract the section body and (when present) the heading
+             current_ancestry = ancestry.copy()
+             first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0]
+             # If first non-empty line is a heading with expected heading level
+             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
+                 # Extract the section body without the heading
+                 current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
+                 # Parse the section heading into current section ancestry
+                 current_section_title = first_non_empty_line[next_heading_level:].strip()
+                 current_ancestry[next_heading_level] = current_section_title
+             # Else process the section as just body text
+             else:
+                 current_section_body = section
+
+             # Recurse down children of the current entry
+             OrgToEntries.process_single_org_file(
+                 current_section_body,
+                 org_file,
+                 entries,
+                 entry_to_file_map,
+                 max_tokens,
+                 current_ancestry,
+             )
+
+         return entries, entry_to_file_map
+
+     @staticmethod
+     def convert_org_nodes_to_entries(
+         parsed_entries: List[List[Orgnode]],
+         entry_to_file_map: Dict[Orgnode, str],
+         index_heading_entries: bool = False,
+     ) -> List[Entry]:
+         """
+         Convert OrgNode lists into a list of Entry objects
+
+         Each list of OrgNodes is a parsed parent org tree or leaf node.
+         Convert each list of these OrgNodes into a single Entry.
+         """
+         entries: List[Entry] = []
+         for entry_group in parsed_entries:
+             entry_heading, entry_compiled, entry_raw = "", "", ""
+             for parsed_entry in entry_group:
+                 if not parsed_entry.hasBody and not index_heading_entries:
+                     # Ignore title notes i.e notes with just headings and empty body
+                     continue
+
+                 todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
+
+                 # Set base level to current org-node tree's root heading level
+                 if not entry_heading and parsed_entry.level > 0:
+                     base_level = parsed_entry.level
+                 # Indent entry by 1 heading level as ancestry is prepended as top level heading
+                 heading = f"{'*' * (parsed_entry.level-base_level+2)} {todo_str}" if parsed_entry.level > 0 else ""
+                 if parsed_entry.heading:
+                     heading += f"{parsed_entry.heading}."
+
+                 # Prepend ancestor headings, filename as top heading to root parent entry for context
+                 # Children nodes do not need ancestors trail as root parent node will have it
+                 if not entry_heading:
+                     ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry])
+                     heading = f"* {ancestors_trail}\n{heading}" if heading else f"* {ancestors_trail}."
+
+                 compiled = heading
+
+                 if parsed_entry.tags:
+                     tags_str = " ".join(parsed_entry.tags)
+                     compiled += f"\t {tags_str}."
+
+                 if parsed_entry.closed:
+                     compiled += f'\n Closed on {parsed_entry.closed.strftime("%Y-%m-%d")}.'
+
+                 if parsed_entry.scheduled:
+                     compiled += f'\n Scheduled for {parsed_entry.scheduled.strftime("%Y-%m-%d")}.'
+
+                 if parsed_entry.hasBody:
+                     compiled += f"\n {parsed_entry.body}"
+
+                 # Add the sub-entry contents to the entry
+                 entry_compiled += f"{compiled}"
+                 entry_raw += f"{parsed_entry}"
+                 if not entry_heading:
+                     entry_heading = heading
+
+             if entry_compiled:
+                 entries.append(
+                     Entry(
+                         compiled=entry_compiled,
+                         raw=entry_raw,
+                         heading=f"{entry_heading}",
+                         file=f"{entry_to_file_map[parsed_entry]}",
+                     )
+                 )
+
+         return entries
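
As with the Markdown parser, the Org chunker can be exercised directly through its static entry point. A minimal sketch, assuming the khoj package from this wheel is installed and that TextToEntries.tokenizer needs no further setup; the file name "todo.org" and the sample content are illustrative only.

from khoj.processor.content.org_mode.org_to_entries import OrgToEntries

org = "* Tasks\n** TODO Index org files\nChunk each subtree by heading before embedding it.\n** Ideas\nPrepend the ancestor heading trail to each chunk.\n"
files = {"todo.org": org}

# Returns the file -> raw text map and the chunked Entry objects
file_to_text_map, entries = OrgToEntries.extract_org_entries(files, max_tokens=256)
for entry in entries:
    # Root entries carry the filename or ancestor trail as their top-level heading
    print(entry.heading)
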