khoj-1.16.1.dev15-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (242)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/migrations/migrate_server_pg.py
@@ -0,0 +1,138 @@
+"""
+The application config currently looks like this:
+app:
+  should-log-telemetry: true
+content-type:
+  ...
+processor:
+  conversation:
+    conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
+    max-prompt-size: null
+    offline-chat:
+      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+      enable-offline-chat: false
+    openai:
+      api-key: sk-blah
+      chat-model: gpt-3.5-turbo
+    tokenizer: null
+search-type:
+  asymmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
+    encoder-type: null
+    model-directory: /Users/si/.khoj/search/asymmetric
+  image:
+    encoder: sentence-transformers/clip-ViT-B-32
+    encoder-type: null
+    model-directory: /Users/si/.khoj/search/image
+  symmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/all-MiniLM-L6-v2
+    encoder-type: null
+    model-directory: ~/.khoj/search/symmetric
+version: 0.14.0
+
+
+The new version will looks like this:
+app:
+  should-log-telemetry: true
+processor:
+  conversation:
+    offline-chat:
+      enabled: false
+    openai:
+      api-key: sk-blah
+    chat-model-options:
+      - chat-model: gpt-3.5-turbo
+        tokenizer: null
+        type: openai
+      - chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+        tokenizer: null
+        type: offline
+search-type:
+  asymmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
+version: 0.15.0
+"""
+
+import logging
+
+from packaging import version
+
+from khoj.database.models import (
+    ChatModelOptions,
+    OpenAIProcessorConversationConfig,
+    SearchModelConfig,
+)
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+logger = logging.getLogger(__name__)
+
+
+def migrate_server_pg(args):
+    schema_version = "0.15.0"
+    raw_config = load_config_from_file(args.config_file)
+    previous_version = raw_config.get("version")
+
+    if previous_version is None or version.parse(previous_version) < version.parse(schema_version):
+        logger.info(
+            f"Migrating configuration used for version {previous_version} to latest version for server with postgres in {args.version_no}"
+        )
+        raw_config["version"] = schema_version
+
+        if raw_config is None:
+            return args
+
+        if "search-type" in raw_config and raw_config["search-type"]:
+            if "asymmetric" in raw_config["search-type"]:
+                # Delete all existing search models
+                SearchModelConfig.objects.filter(model_type=SearchModelConfig.ModelType.TEXT).delete()
+                # Create new search model from existing Khoj YAML config
+                asymmetric_search = raw_config["search-type"]["asymmetric"]
+                SearchModelConfig.objects.create(
+                    name="default",
+                    model_type=SearchModelConfig.ModelType.TEXT,
+                    bi_encoder=asymmetric_search.get("encoder"),
+                    cross_encoder=asymmetric_search.get("cross-encoder"),
+                )
+
+        if "processor" in raw_config and raw_config["processor"] and "conversation" in raw_config["processor"]:
+            processor_conversation = raw_config["processor"]["conversation"]
+
+            if "offline-chat" in raw_config["processor"]["conversation"]:
+                offline_chat = raw_config["processor"]["conversation"]["offline-chat"]
+                ChatModelOptions.objects.create(
+                    chat_model=offline_chat.get("chat-model"),
+                    tokenizer=processor_conversation.get("tokenizer"),
+                    max_prompt_size=processor_conversation.get("max-prompt-size"),
+                    model_type=ChatModelOptions.ModelType.OFFLINE,
+                )
+
+            if (
+                "openai" in raw_config["processor"]["conversation"]
+                and raw_config["processor"]["conversation"]["openai"]
+            ):
+                openai = raw_config["processor"]["conversation"]["openai"]
+
+                if openai.get("api-key") is None:
+                    logger.error("OpenAI API Key is not set. Will not be migrating OpenAI config.")
+                else:
+                    if openai.get("chat-model") is None:
+                        openai["chat-model"] = "gpt-3.5-turbo"
+
+                    openai_config = OpenAIProcessorConversationConfig.objects.create(
+                        api_key=openai.get("api-key"), name="default"
+                    )
+
+                    ChatModelOptions.objects.create(
+                        chat_model=openai.get("chat-model"),
+                        tokenizer=processor_conversation.get("tokenizer"),
+                        max_prompt_size=processor_conversation.get("max-prompt-size"),
+                        model_type=ChatModelOptions.ModelType.OPENAI,
+                        openai_config=openai_config,
+                    )
+
+        save_config_to_file(raw_config, args.config_file)
+
+    return args
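For orientation, a minimal sketch (not part of the package) of how this migration could be driven, assuming Django is already configured for the khoj database app and that an argparse-style object carries the config_file and version_no attributes the function reads; the config path and version below are hypothetical:

from types import SimpleNamespace

from khoj.migrations.migrate_server_pg import migrate_server_pg

# Hypothetical args object; the khoj server normally assembles this at startup.
args = SimpleNamespace(config_file="~/.khoj/khoj.yml", version_no="1.16.1")

# Bumps the YAML schema version and seeds SearchModelConfig,
# OpenAIProcessorConversationConfig and ChatModelOptions rows from the old config.
args = migrate_server_pg(args)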
khoj/migrations/migrate_version.py
@@ -0,0 +1,17 @@
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+
+def migrate_config_to_version(args):
+    schema_version = "0.9.0"
+    raw_config = load_config_from_file(args.config_file)
+
+    # Add version to khoj config schema
+    if "version" not in raw_config:
+        raw_config["version"] = schema_version
+        save_config_to_file(raw_config, args.config_file)
+
+        # regenerate khoj index on first start of this version
+        # this should refresh index and apply index corruption fixes from #325
+        args.regenerate = True
+
+    return args
khoj/processor/__init__.py: File without changes
khoj/processor/content/__init__.py: File without changes
khoj/processor/content/docx/__init__.py: File without changes
khoj/processor/content/docx/docx_to_entries.py
@@ -0,0 +1,110 @@
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from langchain_community.document_loaders import Docx2txtLoader
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class DocxToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        # Extract required fields from config
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == b""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified Docx files
+        with timer("Extract entries from specified DOCX files", logger):
+            file_to_text_map, current_entries = DocxToEntries.extract_docx_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.DOCX,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_docx_entries(docx_files) -> Tuple[Dict, List[Entry]]:
+        """Extract entries from specified DOCX files"""
+
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+        file_to_text_map = dict()
+        for docx_file in docx_files:
+            try:
+                timestamp_now = datetime.utcnow().timestamp()
+                tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+                with open(tmp_file, "wb") as f:
+                    bytes_content = docx_files[docx_file]
+                    f.write(bytes_content)
+
+                # Load the content using Docx2txtLoader
+                loader = Docx2txtLoader(tmp_file)
+                docx_entries_per_file = loader.load()
+
+                # Convert the loaded entries into the desired format
+                docx_texts = [page.page_content for page in docx_entries_per_file]
+
+                entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
+                entries.extend(docx_texts)
+                file_to_text_map[docx_file] = docx_texts
+            except Exception as e:
+                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(f"{tmp_file}"):
+                    os.remove(f"{tmp_file}")
+        return file_to_text_map, DocxToEntries.convert_docx_entries_to_maps(entries, dict(entry_to_location_map))
+
+    @staticmethod
+    def convert_docx_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        """Convert each DOCX entry into a dictionary"""
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")
+
+        return entries
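As a rough usage sketch (not from the package), DocxToEntries.extract_docx_entries accepts a mapping of file path to raw DOCX bytes and returns the extracted text per file along with Entry objects; notes.docx below is a hypothetical input file:

from khoj.processor.content.docx.docx_to_entries import DocxToEntries

# Hypothetical input; the indexer normally supplies this path -> bytes mapping.
with open("notes.docx", "rb") as f:
    docx_files = {"notes.docx": f.read()}

file_to_text_map, entries = DocxToEntries.extract_docx_entries(docx_files)
print(f"Extracted {len(entries)} entries from {len(file_to_text_map)} file(s)")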
khoj/processor/content/github/__init__.py: File without changes
khoj/processor/content/github/github_to_entries.py
@@ -0,0 +1,224 @@
+import logging
+import time
+from typing import Any, Dict, List, Tuple
+
+import requests
+from magika import Magika
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import GithubConfig, KhojUser
+from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import GithubContentConfig, GithubRepoConfig
+
+logger = logging.getLogger(__name__)
+magika = Magika()
+
+
+class GithubToEntries(TextToEntries):
+    def __init__(self, config: GithubConfig):
+        super().__init__(config)
+        raw_repos = config.githubrepoconfig.all()
+        repos = []
+        for repo in raw_repos:
+            repos.append(
+                GithubRepoConfig(
+                    name=repo.name,
+                    owner=repo.owner,
+                    branch=repo.branch,
+                )
+            )
+        self.config = GithubContentConfig(
+            pat_token=config.pat_token,
+            repos=repos,
+        )
+        self.session = requests.Session()
+        self.session.headers.update({"Authorization": f"token {self.config.pat_token}"})
+
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
+
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        if self.config.pat_token is None or self.config.pat_token == "":
+            logger.error(f"Github PAT token is not set. Skipping github content")
+            raise ValueError("Github PAT token is not set. Skipping github content")
+        current_entries = []
+        for repo in self.config.repos:
+            current_entries += self.process_repo(repo)
+
+        return self.update_entries_with_ids(current_entries, user=user)
+
+    def process_repo(self, repo: GithubRepoConfig):
+        repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
+        repo_shorthand = f"{repo.owner}/{repo.name}"
+        logger.info(f"Processing github repo {repo_shorthand}")
+        with timer("Download files from github repo", logger):
+            try:
+                markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
+            except ConnectionAbortedError as e:
+                logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
+                raise e
+            except Exception as e:
+                logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
+                raise e
+
+        logger.info(
+            f"Found {len(markdown_files)} md, {len(org_files)} org and {len(plaintext_files)} text files in github repo {repo_shorthand}"
+        )
+        current_entries = []
+
+        with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
+            )
+
+        with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )
+
+        with timer(f"Extract plaintext entries from github repo {repo_shorthand}", logger):
+            current_entries += PlaintextToEntries.convert_text_files_to_entries(
+                *GithubToEntries.extract_plaintext_entries(plaintext_files)
+            )
+
+        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        return current_entries
+
+    def update_entries_with_ids(self, current_entries, user: KhojUser = None):
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.GITHUB,
+                DbEntry.EntrySource.GITHUB,
+                key="compiled",
+                logger=logger,
+                user=user,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    def get_files(self, repo_url: str, repo: GithubRepoConfig):
+        # Get the contents of the repository
+        repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
+        contents = response.json()
+
+        # Raise exception if hit rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+
+        # Extract markdown files from the repository
+        markdown_files: List[Dict[str, str]] = []
+        org_files: List[Dict[str, str]] = []
+        plaintext_files: List[Dict[str, str]] = []
+        if "tree" not in contents:
+            return markdown_files, org_files, plaintext_files
+
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+            # Find all org files in the repository
+            elif item["type"] == "blob" and item["path"].endswith(".org"):
+                # Create URL for each org file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+
+                # Add org file contents and URL to list
+                org_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+            # Find, index remaining non-binary files in the repository
+            elif item["type"] == "blob":
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+                content_bytes = self.get_file_contents(item["url"], decode=False)
+                content_type, content_str = None, None
+                try:
+                    content_type = magika.identify_bytes(content_bytes).output.group
+                except:
+                    logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
+                    continue
+
+                # Add non-binary file contents and URL to list
+                if content_type in ["text", "code"]:
+                    try:
+                        content_str = content_bytes.decode("utf-8")
+                    except:
+                        logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
+                        continue
+                    plaintext_files += [{"content": content_str, "path": url_path}]
+
+        return markdown_files, org_files, plaintext_files
+
+    def get_file_contents(self, file_url, decode=True):
+        # Get text from each markdown file
+        headers = {"Accept": "application/vnd.github.v3.raw"}
+        response = self.session.get(file_url, headers=headers, stream=True)
+
+        # Stop indexing on hitting rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+
+        content = "" if decode else b""
+        for chunk in response.iter_content(chunk_size=2048):
+            if chunk:
+                try:
+                    content += chunk.decode("utf-8") if decode else chunk
+                except Exception as e:
+                    logger.error(f"Unable to decode chunk from {file_url}")
+                    logger.error(e)
+
+        return content
+
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in markdown_files:
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_org_entries(org_files):
+        entries = []
+        entry_to_file_map = []
+
+        for doc in org_files:
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_plaintext_entries(plaintext_files):
+        entries = []
+        entry_to_file_map = []
+
+        for doc in plaintext_files:
+            entries, entry_to_file_map = PlaintextToEntries.process_single_plaintext_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
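The rate-limit handling above keys off GitHub's X-RateLimit-Remaining header when listing the repository tree. A standalone sketch of that check (not package code), with OWNER, REPO, BRANCH and the token as placeholders:

import requests

# Placeholder repository and personal access token; substitute real values.
tree_url = "https://api.github.com/repos/OWNER/REPO/git/trees/BRANCH"
response = requests.get(
    tree_url,
    headers={"Authorization": "token <pat_token>"},
    params={"recursive": "true"},
)

# Mirrors the check in GithubToEntries.get_files: abort indexing when rate limited.
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
    raise ConnectionAbortedError("Github rate limit reached")

files = response.json().get("tree", [])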
khoj/processor/content/images/__init__.py: File without changes
khoj/processor/content/images/image_to_entries.py
@@ -0,0 +1,118 @@
+import base64
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from rapidocr_onnxruntime import RapidOCR
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class ImageToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        # Extract required fields from config
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == b""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified image files
+        with timer("Extract entries from specified Image files", logger):
+            file_to_text_map, current_entries = ImageToEntries.extract_image_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.IMAGE,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_image_entries(image_files) -> Tuple[Dict, List[Entry]]:  # important function
+        """Extract entries by page from specified image files"""
+        file_to_text_map = dict()
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+        for image_file in image_files:
+            try:
+                loader = RapidOCR()
+                bytes = image_files[image_file]
+                # write the image to a temporary file
+                timestamp_now = datetime.utcnow().timestamp()
+                # use either png or jpg
+                if image_file.endswith(".png"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.png"
+                elif image_file.endswith(".jpg") or image_file.endswith(".jpeg"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.jpg"
+                with open(tmp_file, "wb") as f:
+                    bytes = image_files[image_file]
+                    f.write(bytes)
+                try:
+                    image_entries_per_file = ""
+                    result, _ = loader(tmp_file)
+                    if result:
+                        expanded_entries = [text[1] for text in result]
+                        image_entries_per_file = " ".join(expanded_entries)
+                except ImportError:
+                    logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
+                    continue
+                entry_to_location_map.append((image_entries_per_file, image_file))
+                entries.extend([image_entries_per_file])
+                file_to_text_map[image_file] = image_entries_per_file
+            except Exception as e:
+                logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(tmp_file):
+                    os.remove(tmp_file)
+        return file_to_text_map, ImageToEntries.convert_image_entries_to_maps(entries, dict(entry_to_location_map))
+
+    @staticmethod
+    def convert_image_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        "Convert each image entries into a dictionary"
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} image entries to dictionaries")
+
+        return entries
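Similar to the DOCX handler, a minimal sketch (not from the package) of calling the OCR extraction directly, assuming rapidocr_onnxruntime is installed and scan.png is a hypothetical image on disk:

from khoj.processor.content.images.image_to_entries import ImageToEntries

# Hypothetical input; the indexer passes a mapping of image path -> raw bytes.
with open("scan.png", "rb") as f:
    image_files = {"scan.png": f.read()}

file_to_text_map, entries = ImageToEntries.extract_image_entries(image_files)
print(f"OCR produced {len(entries)} entries")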
khoj/processor/content/markdown/__init__.py: File without changes