khoj 1.33.3.dev32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +218 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +452 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1821 -0
  10. khoj/database/admin.py +417 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_default_model.py +116 -0
  15. khoj/database/management/commands/change_generated_images_url.py +61 -0
  16. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  17. khoj/database/migrations/0001_khojuser.py +98 -0
  18. khoj/database/migrations/0002_googleuser.py +32 -0
  19. khoj/database/migrations/0003_vector_extension.py +10 -0
  20. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  21. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  22. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  23. khoj/database/migrations/0007_add_conversation.py +27 -0
  24. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  25. khoj/database/migrations/0009_khojapiuser.py +24 -0
  26. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  27. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  28. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  29. khoj/database/migrations/0012_entry_file_source.py +21 -0
  30. khoj/database/migrations/0013_subscription.py +37 -0
  31. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  32. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  33. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  34. khoj/database/migrations/0017_searchmodel.py +32 -0
  35. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  36. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  37. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  38. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  39. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  40. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  41. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  42. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  43. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  45. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  46. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  47. khoj/database/migrations/0029_userrequests.py +27 -0
  48. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  49. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  50. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  51. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  52. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  53. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  54. khoj/database/migrations/0035_processlock.py +26 -0
  55. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  56. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  57. khoj/database/migrations/0036_publicconversation.py +42 -0
  58. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  59. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  60. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  61. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  62. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  63. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  64. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  65. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  66. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  67. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  68. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  69. khoj/database/migrations/0045_fileobject.py +37 -0
  70. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  71. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  72. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  73. khoj/database/migrations/0049_datastore.py +38 -0
  74. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  75. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  76. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  77. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  78. khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
  79. khoj/database/migrations/0054_alter_agent_style_color.py +38 -0
  80. khoj/database/migrations/0055_alter_agent_style_icon.py +37 -0
  81. khoj/database/migrations/0056_chatmodeloptions_vision_enabled.py +17 -0
  82. khoj/database/migrations/0056_searchmodelconfig_cross_encoder_model_config.py +17 -0
  83. khoj/database/migrations/0057_merge_20240816_1409.py +13 -0
  84. khoj/database/migrations/0057_remove_serverchatsettings_default_model_and_more.py +51 -0
  85. khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py +17 -0
  86. khoj/database/migrations/0059_searchmodelconfig_bi_encoder_confidence_threshold.py +17 -0
  87. khoj/database/migrations/0060_merge_20240905_1828.py +14 -0
  88. khoj/database/migrations/0061_alter_chatmodeloptions_model_type.py +26 -0
  89. khoj/database/migrations/0061_alter_texttoimagemodelconfig_model_type.py +21 -0
  90. khoj/database/migrations/0062_merge_20240913_0222.py +14 -0
  91. khoj/database/migrations/0063_conversation_temp_id.py +36 -0
  92. khoj/database/migrations/0064_remove_conversation_temp_id_alter_conversation_id.py +86 -0
  93. khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
  94. khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
  95. khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
  96. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  97. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  98. khoj/database/migrations/0070_alter_agent_input_tools_alter_agent_output_modes.py +46 -0
  99. khoj/database/migrations/0071_subscription_enabled_trial_at_and_more.py +32 -0
  100. khoj/database/migrations/0072_entry_search_model.py +24 -0
  101. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  102. khoj/database/migrations/0074_alter_conversation_title.py +17 -0
  103. khoj/database/migrations/0075_migrate_generated_assets_and_validate.py +85 -0
  104. khoj/database/migrations/0076_rename_openaiprocessorconversationconfig_aimodelapi_and_more.py +26 -0
  105. khoj/database/migrations/0077_chatmodel_alter_agent_chat_model_and_more.py +62 -0
  106. khoj/database/migrations/0078_khojuser_email_verification_code_expiry.py +17 -0
  107. khoj/database/migrations/__init__.py +0 -0
  108. khoj/database/models/__init__.py +725 -0
  109. khoj/database/tests.py +3 -0
  110. khoj/interface/compiled/404/index.html +1 -0
  111. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_buildManifest.js +1 -0
  112. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_ssgManifest.js +1 -0
  113. khoj/interface/compiled/_next/static/chunks/1010-8f39bb4648b5ba10.js +1 -0
  114. khoj/interface/compiled/_next/static/chunks/182-f1c48a203dc91e0e.js +20 -0
  115. khoj/interface/compiled/_next/static/chunks/1915-d3c36ad6ce697ce7.js +1 -0
  116. khoj/interface/compiled/_next/static/chunks/2117-165ef4747a5b836b.js +2 -0
  117. khoj/interface/compiled/_next/static/chunks/2581-455000f8aeb08fc3.js +1 -0
  118. khoj/interface/compiled/_next/static/chunks/3727.dcea8f2193111552.js +1 -0
  119. khoj/interface/compiled/_next/static/chunks/3789-a09e37a819171a9d.js +1 -0
  120. khoj/interface/compiled/_next/static/chunks/4124-6c28322ce218d2d5.js +1 -0
  121. khoj/interface/compiled/_next/static/chunks/5427-b52d95253e692bfa.js +1 -0
  122. khoj/interface/compiled/_next/static/chunks/5473-b1cf56dedac6577a.js +1 -0
  123. khoj/interface/compiled/_next/static/chunks/5477-0bbddb79c25a54a7.js +1 -0
  124. khoj/interface/compiled/_next/static/chunks/6065-64db9ad305ba0bcd.js +1 -0
  125. khoj/interface/compiled/_next/static/chunks/6293-469dd16402ea8a6f.js +3 -0
  126. khoj/interface/compiled/_next/static/chunks/688-b5b4391bbc0376f1.js +1 -0
  127. khoj/interface/compiled/_next/static/chunks/8667-b6bf63c72b2d76eb.js +1 -0
  128. khoj/interface/compiled/_next/static/chunks/9259-1172dbaca0515237.js +1 -0
  129. khoj/interface/compiled/_next/static/chunks/94ca1967.1d9b42d929a1ee8c.js +1 -0
  130. khoj/interface/compiled/_next/static/chunks/9597.83583248dfbf6e73.js +1 -0
  131. khoj/interface/compiled/_next/static/chunks/964ecbae.51d6faf8801d15e6.js +1 -0
  132. khoj/interface/compiled/_next/static/chunks/9665-391df1e5c51c960a.js +1 -0
  133. khoj/interface/compiled/_next/static/chunks/app/_not-found/page-a834eddae3e235df.js +1 -0
  134. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  135. khoj/interface/compiled/_next/static/chunks/app/agents/page-28ce086a1129bca2.js +1 -0
  136. khoj/interface/compiled/_next/static/chunks/app/automations/layout-1fe1537449f43496.js +1 -0
  137. khoj/interface/compiled/_next/static/chunks/app/automations/page-bf365a60829d347f.js +1 -0
  138. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  139. khoj/interface/compiled/_next/static/chunks/app/chat/page-0e476e57eb2015e3.js +1 -0
  140. khoj/interface/compiled/_next/static/chunks/app/layout-30e7fda7262713ce.js +1 -0
  141. khoj/interface/compiled/_next/static/chunks/app/page-a5515ea71aec5ef0.js +1 -0
  142. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
  143. khoj/interface/compiled/_next/static/chunks/app/search/page-9140541e67ea307d.js +1 -0
  144. khoj/interface/compiled/_next/static/chunks/app/settings/layout-d09d6510a45cd4bd.js +1 -0
  145. khoj/interface/compiled/_next/static/chunks/app/settings/page-951ba40b5b94b23a.js +1 -0
  146. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
  147. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-1beb80d8d741c932.js +1 -0
  148. khoj/interface/compiled/_next/static/chunks/d3ac728e-44ebd2a0c99b12a0.js +1 -0
  149. khoj/interface/compiled/_next/static/chunks/fd9d1056-4482b99a36fd1673.js +1 -0
  150. khoj/interface/compiled/_next/static/chunks/framework-8e0e0f4a6b83a956.js +1 -0
  151. khoj/interface/compiled/_next/static/chunks/main-app-de1f09df97a3cfc7.js +1 -0
  152. khoj/interface/compiled/_next/static/chunks/main-db4bfac6b0a8d00b.js +1 -0
  153. khoj/interface/compiled/_next/static/chunks/pages/_app-3c9ca398d360b709.js +1 -0
  154. khoj/interface/compiled/_next/static/chunks/pages/_error-cf5ca766ac8f493f.js +1 -0
  155. khoj/interface/compiled/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  156. khoj/interface/compiled/_next/static/chunks/webpack-a03962458328b163.js +1 -0
  157. khoj/interface/compiled/_next/static/css/089de1d8526b96e9.css +1 -0
  158. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  159. khoj/interface/compiled/_next/static/css/4e4e6a4a1c920d06.css +1 -0
  160. khoj/interface/compiled/_next/static/css/8d02837c730f8d13.css +25 -0
  161. khoj/interface/compiled/_next/static/css/8e6a3ca11a60b189.css +1 -0
  162. khoj/interface/compiled/_next/static/css/9c164d9727dd8092.css +1 -0
  163. khoj/interface/compiled/_next/static/css/dac88c17aaee5fcf.css +1 -0
  164. khoj/interface/compiled/_next/static/css/df4b47a2d0d85eae.css +1 -0
  165. khoj/interface/compiled/_next/static/css/e4eb883b5265d372.css +1 -0
  166. khoj/interface/compiled/_next/static/media/1d8a05b60287ae6c-s.p.woff2 +0 -0
  167. khoj/interface/compiled/_next/static/media/6f22fce21a7c433c-s.woff2 +0 -0
  168. khoj/interface/compiled/_next/static/media/77c207b095007c34-s.p.woff2 +0 -0
  169. khoj/interface/compiled/_next/static/media/82ef96de0e8f4d8c-s.p.woff2 +0 -0
  170. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.1608a09b.woff +0 -0
  171. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.4aafdb68.ttf +0 -0
  172. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.a79f1c31.woff2 +0 -0
  173. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.b6770918.woff +0 -0
  174. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.cce5b8ec.ttf +0 -0
  175. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.ec17d132.woff2 +0 -0
  176. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.07ef19e7.ttf +0 -0
  177. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.55fac258.woff2 +0 -0
  178. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.dad44a7f.woff +0 -0
  179. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.9f256b85.woff +0 -0
  180. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.b18f59e1.ttf +0 -0
  181. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.d42a5579.woff2 +0 -0
  182. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.7c187121.woff +0 -0
  183. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.d3c882a6.woff2 +0 -0
  184. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.ed38e79f.ttf +0 -0
  185. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.b74a1a8b.ttf +0 -0
  186. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.c3fb5ac2.woff2 +0 -0
  187. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.d181c465.woff +0 -0
  188. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.6f2bb1df.woff2 +0 -0
  189. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.70d8b0a5.ttf +0 -0
  190. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.e3f82f9d.woff +0 -0
  191. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.47373d1e.ttf +0 -0
  192. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.8916142b.woff2 +0 -0
  193. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.9024d815.woff +0 -0
  194. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.0462f03b.woff2 +0 -0
  195. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.7f51fe03.woff +0 -0
  196. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.b7f8fe9b.ttf +0 -0
  197. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.572d331f.woff2 +0 -0
  198. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.a879cf83.ttf +0 -0
  199. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.f1035d8d.woff +0 -0
  200. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.5295ba48.woff +0 -0
  201. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.939bc644.ttf +0 -0
  202. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.f28c23ac.woff2 +0 -0
  203. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.8c5b5494.woff2 +0 -0
  204. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.94e1e8dc.ttf +0 -0
  205. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.bf59d231.woff +0 -0
  206. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.3b1e59b3.woff2 +0 -0
  207. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.7c9bc82b.woff +0 -0
  208. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.b4c20c84.ttf +0 -0
  209. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.74048478.woff +0 -0
  210. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.ba21ed5f.woff2 +0 -0
  211. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.d4d7ba48.ttf +0 -0
  212. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.03e9641d.woff2 +0 -0
  213. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.07505710.woff +0 -0
  214. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.fe9cbbe1.ttf +0 -0
  215. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.e1e279cb.woff +0 -0
  216. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.eae34984.woff2 +0 -0
  217. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.fabc004a.ttf +0 -0
  218. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.57727022.woff +0 -0
  219. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.5916a24f.woff2 +0 -0
  220. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.d6b476ec.ttf +0 -0
  221. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.9acaf01c.woff +0 -0
  222. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.a144ef58.ttf +0 -0
  223. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.b4230e7e.woff2 +0 -0
  224. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.10d95fd3.woff2 +0 -0
  225. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.7a996c9d.woff +0 -0
  226. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.fbccdabe.ttf +0 -0
  227. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.6258592b.woff +0 -0
  228. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.a8709e36.woff2 +0 -0
  229. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.d97aaf4a.ttf +0 -0
  230. khoj/interface/compiled/_next/static/media/a6ecd16fa044d500-s.p.woff2 +0 -0
  231. khoj/interface/compiled/_next/static/media/bd82c78e5b7b3fe9-s.p.woff2 +0 -0
  232. khoj/interface/compiled/_next/static/media/c32c8052c071fc42-s.woff2 +0 -0
  233. khoj/interface/compiled/_next/static/media/c4250770ab8708b6-s.p.woff2 +0 -0
  234. khoj/interface/compiled/_next/static/media/e098aaaecc9cfbb2-s.p.woff2 +0 -0
  235. khoj/interface/compiled/_next/static/media/flags.3afdda2f.webp +0 -0
  236. khoj/interface/compiled/_next/static/media/flags@2x.5fbe9fc1.webp +0 -0
  237. khoj/interface/compiled/_next/static/media/globe.98e105ca.webp +0 -0
  238. khoj/interface/compiled/_next/static/media/globe@2x.974df6f8.webp +0 -0
  239. khoj/interface/compiled/agents/index.html +1 -0
  240. khoj/interface/compiled/agents/index.txt +7 -0
  241. khoj/interface/compiled/agents.svg +6 -0
  242. khoj/interface/compiled/assets/icons/khoj_lantern.ico +0 -0
  243. khoj/interface/compiled/assets/icons/khoj_lantern.svg +100 -0
  244. khoj/interface/compiled/assets/icons/khoj_lantern_1200x1200.png +0 -0
  245. khoj/interface/compiled/assets/icons/khoj_lantern_128x128.png +0 -0
  246. khoj/interface/compiled/assets/icons/khoj_lantern_128x128_dark.png +0 -0
  247. khoj/interface/compiled/assets/icons/khoj_lantern_256x256.png +0 -0
  248. khoj/interface/compiled/assets/icons/khoj_lantern_512x512.png +0 -0
  249. khoj/interface/compiled/assets/icons/khoj_lantern_logomarktype_1200x630.png +0 -0
  250. khoj/interface/compiled/assets/samples/desktop-browse-draw-sample.png +0 -0
  251. khoj/interface/compiled/assets/samples/desktop-plain-chat-sample.png +0 -0
  252. khoj/interface/compiled/assets/samples/desktop-remember-plan-sample.png +0 -0
  253. khoj/interface/compiled/assets/samples/phone-browse-draw-sample.png +0 -0
  254. khoj/interface/compiled/assets/samples/phone-plain-chat-sample.png +0 -0
  255. khoj/interface/compiled/assets/samples/phone-remember-plan-sample.png +0 -0
  256. khoj/interface/compiled/automation.svg +37 -0
  257. khoj/interface/compiled/automations/index.html +1 -0
  258. khoj/interface/compiled/automations/index.txt +8 -0
  259. khoj/interface/compiled/chat/index.html +1 -0
  260. khoj/interface/compiled/chat/index.txt +7 -0
  261. khoj/interface/compiled/chat.svg +24 -0
  262. khoj/interface/compiled/close.svg +5 -0
  263. khoj/interface/compiled/copy-button-success.svg +6 -0
  264. khoj/interface/compiled/copy-button.svg +5 -0
  265. khoj/interface/compiled/index.html +1 -0
  266. khoj/interface/compiled/index.txt +7 -0
  267. khoj/interface/compiled/khoj.webmanifest +76 -0
  268. khoj/interface/compiled/logo.svg +24 -0
  269. khoj/interface/compiled/search/index.html +1 -0
  270. khoj/interface/compiled/search/index.txt +7 -0
  271. khoj/interface/compiled/send.svg +1 -0
  272. khoj/interface/compiled/settings/index.html +1 -0
  273. khoj/interface/compiled/settings/index.txt +9 -0
  274. khoj/interface/compiled/share/chat/index.html +1 -0
  275. khoj/interface/compiled/share/chat/index.txt +7 -0
  276. khoj/interface/compiled/share.svg +8 -0
  277. khoj/interface/compiled/thumbs-down.svg +6 -0
  278. khoj/interface/compiled/thumbs-up.svg +6 -0
  279. khoj/interface/email/feedback.html +34 -0
  280. khoj/interface/email/magic_link.html +40 -0
  281. khoj/interface/email/task.html +37 -0
  282. khoj/interface/email/welcome.html +90 -0
  283. khoj/interface/web/.well-known/assetlinks.json +11 -0
  284. khoj/interface/web/assets/icons/agents.svg +19 -0
  285. khoj/interface/web/assets/icons/automation.svg +43 -0
  286. khoj/interface/web/assets/icons/chat.svg +24 -0
  287. khoj/interface/web/assets/icons/github.svg +1 -0
  288. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  289. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  290. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +32 -0
  291. khoj/interface/web/assets/icons/khoj.svg +26 -0
  292. khoj/interface/web/assets/icons/logotype.svg +1 -0
  293. khoj/interface/web/assets/icons/search.svg +57 -0
  294. khoj/interface/web/assets/icons/sync.svg +4 -0
  295. khoj/interface/web/assets/khoj.css +237 -0
  296. khoj/interface/web/assets/utils.js +33 -0
  297. khoj/interface/web/base_config.html +445 -0
  298. khoj/interface/web/content_source_github_input.html +208 -0
  299. khoj/interface/web/login.html +310 -0
  300. khoj/interface/web/utils.html +48 -0
  301. khoj/main.py +249 -0
  302. khoj/manage.py +22 -0
  303. khoj/migrations/__init__.py +0 -0
  304. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  305. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  306. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  307. khoj/migrations/migrate_offline_model.py +29 -0
  308. khoj/migrations/migrate_processor_config_openai.py +67 -0
  309. khoj/migrations/migrate_server_pg.py +132 -0
  310. khoj/migrations/migrate_version.py +17 -0
  311. khoj/processor/__init__.py +0 -0
  312. khoj/processor/content/__init__.py +0 -0
  313. khoj/processor/content/docx/__init__.py +0 -0
  314. khoj/processor/content/docx/docx_to_entries.py +111 -0
  315. khoj/processor/content/github/__init__.py +0 -0
  316. khoj/processor/content/github/github_to_entries.py +226 -0
  317. khoj/processor/content/images/__init__.py +0 -0
  318. khoj/processor/content/images/image_to_entries.py +117 -0
  319. khoj/processor/content/markdown/__init__.py +0 -0
  320. khoj/processor/content/markdown/markdown_to_entries.py +160 -0
  321. khoj/processor/content/notion/notion_to_entries.py +259 -0
  322. khoj/processor/content/org_mode/__init__.py +0 -0
  323. khoj/processor/content/org_mode/org_to_entries.py +226 -0
  324. khoj/processor/content/org_mode/orgnode.py +532 -0
  325. khoj/processor/content/pdf/__init__.py +0 -0
  326. khoj/processor/content/pdf/pdf_to_entries.py +119 -0
  327. khoj/processor/content/plaintext/__init__.py +0 -0
  328. khoj/processor/content/plaintext/plaintext_to_entries.py +117 -0
  329. khoj/processor/content/text_to_entries.py +296 -0
  330. khoj/processor/conversation/__init__.py +0 -0
  331. khoj/processor/conversation/anthropic/__init__.py +0 -0
  332. khoj/processor/conversation/anthropic/anthropic_chat.py +243 -0
  333. khoj/processor/conversation/anthropic/utils.py +217 -0
  334. khoj/processor/conversation/google/__init__.py +0 -0
  335. khoj/processor/conversation/google/gemini_chat.py +253 -0
  336. khoj/processor/conversation/google/utils.py +260 -0
  337. khoj/processor/conversation/offline/__init__.py +0 -0
  338. khoj/processor/conversation/offline/chat_model.py +308 -0
  339. khoj/processor/conversation/offline/utils.py +80 -0
  340. khoj/processor/conversation/offline/whisper.py +15 -0
  341. khoj/processor/conversation/openai/__init__.py +0 -0
  342. khoj/processor/conversation/openai/gpt.py +243 -0
  343. khoj/processor/conversation/openai/utils.py +232 -0
  344. khoj/processor/conversation/openai/whisper.py +13 -0
  345. khoj/processor/conversation/prompts.py +1188 -0
  346. khoj/processor/conversation/utils.py +867 -0
  347. khoj/processor/embeddings.py +122 -0
  348. khoj/processor/image/generate.py +215 -0
  349. khoj/processor/speech/__init__.py +0 -0
  350. khoj/processor/speech/text_to_speech.py +51 -0
  351. khoj/processor/tools/__init__.py +0 -0
  352. khoj/processor/tools/online_search.py +472 -0
  353. khoj/processor/tools/run_code.py +179 -0
  354. khoj/routers/__init__.py +0 -0
  355. khoj/routers/api.py +760 -0
  356. khoj/routers/api_agents.py +295 -0
  357. khoj/routers/api_chat.py +1273 -0
  358. khoj/routers/api_content.py +634 -0
  359. khoj/routers/api_model.py +123 -0
  360. khoj/routers/api_phone.py +86 -0
  361. khoj/routers/api_subscription.py +144 -0
  362. khoj/routers/auth.py +307 -0
  363. khoj/routers/email.py +135 -0
  364. khoj/routers/helpers.py +2333 -0
  365. khoj/routers/notion.py +85 -0
  366. khoj/routers/research.py +364 -0
  367. khoj/routers/storage.py +63 -0
  368. khoj/routers/twilio.py +36 -0
  369. khoj/routers/web_client.py +141 -0
  370. khoj/search_filter/__init__.py +0 -0
  371. khoj/search_filter/base_filter.py +15 -0
  372. khoj/search_filter/date_filter.py +215 -0
  373. khoj/search_filter/file_filter.py +32 -0
  374. khoj/search_filter/word_filter.py +29 -0
  375. khoj/search_type/__init__.py +0 -0
  376. khoj/search_type/text_search.py +255 -0
  377. khoj/utils/__init__.py +0 -0
  378. khoj/utils/cli.py +101 -0
  379. khoj/utils/config.py +81 -0
  380. khoj/utils/constants.py +51 -0
  381. khoj/utils/fs_syncer.py +252 -0
  382. khoj/utils/helpers.py +627 -0
  383. khoj/utils/initialization.py +301 -0
  384. khoj/utils/jsonl.py +43 -0
  385. khoj/utils/models.py +47 -0
  386. khoj/utils/rawconfig.py +208 -0
  387. khoj/utils/state.py +48 -0
  388. khoj/utils/yaml.py +47 -0
  389. khoj-1.33.3.dev32.dist-info/METADATA +190 -0
  390. khoj-1.33.3.dev32.dist-info/RECORD +393 -0
  391. khoj-1.33.3.dev32.dist-info/WHEEL +4 -0
  392. khoj-1.33.3.dev32.dist-info/entry_points.txt +2 -0
  393. khoj-1.33.3.dev32.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,226 @@
1
+ import logging
2
+ import time
3
+ from typing import Dict, List, Tuple
4
+
5
+ import requests
6
+ from magika import Magika
7
+
8
+ from khoj.database.models import Entry as DbEntry
9
+ from khoj.database.models import GithubConfig, KhojUser
10
+ from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
11
+ from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
12
+ from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
13
+ from khoj.processor.content.text_to_entries import TextToEntries
14
+ from khoj.utils.helpers import is_none_or_empty, timer
15
+ from khoj.utils.rawconfig import GithubContentConfig, GithubRepoConfig
16
+
17
+ logger = logging.getLogger(__name__)
18
+ magika = Magika()
19
+
20
+
21
+ class GithubToEntries(TextToEntries):
22
def __init__(self, config: GithubConfig):
    """Set up the Github indexer from a stored ``GithubConfig`` record.

    Copies the name, owner and branch of each related repo record into a
    ``GithubRepoConfig``, bundles those with the PAT token in a
    ``GithubContentConfig`` and creates a ``requests.Session`` that carries
    the token as an ``Authorization`` header when a token is configured.
    """
    super().__init__(config)

    # Mirror every related repo record into a plain config object.
    repo_configs = [
        GithubRepoConfig(name=row.name, owner=row.owner, branch=row.branch)
        for row in config.githubrepoconfig.all()
    ]
    self.config = GithubContentConfig(pat_token=config.pat_token, repos=repo_configs)

    self.session = requests.Session()
    token_missing = is_none_or_empty(self.config.pat_token)
    if not token_missing:
        # Requests sent through this session are authenticated with the token.
        auth_header = {"Authorization": f"token {self.config.pat_token}"}
        self.session.headers.update(auth_header)
41
+
42
@staticmethod
def wait_for_rate_limit_reset(response, func, *args, **kwargs):
    """Sleep until Github's rate limit resets, then retry ``func``.

    Args:
        response: HTTP response to inspect. Its ``status_code`` and the
            ``X-RateLimit-Remaining`` / ``X-RateLimit-Reset`` headers are read.
        func: Callable invoked once the wait is over.
        *args: Positional arguments forwarded unchanged to ``func``.
        **kwargs: Keyword arguments forwarded unchanged to ``func``.

    Returns:
        The result of ``func(*args, **kwargs)`` when the response is a
        non-200 with no remaining quota, otherwise ``None``.
    """
    rate_limited = response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0"
    if not rate_limited:
        return None

    # The reset header is compared against time.time(), i.e. treated as epoch
    # seconds. When that moment has already passed the difference is negative
    # and time.sleep() raises ValueError on negative values, so clamp to zero.
    wait_time = max(0, int(response.headers.get("X-RateLimit-Reset")) - int(time.time()))
    logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
    time.sleep(wait_time)
    return func(*args, **kwargs)
51
+
52
def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
    """Index every configured Github repo for ``user``.

    ``files`` and ``regenerate`` are accepted but not read by this method.
    Entries from all repos in ``self.config.repos`` are gathered via
    ``process_repo`` and handed to ``update_entries_with_ids``, whose result
    is returned.
    """
    token_missing = is_none_or_empty(self.config.pat_token)
    if token_missing:
        logger.warning(
            "Github PAT token is not set. Private repositories cannot be indexed and lower rate limits apply."
        )

    # Accumulate entries repo by repo before syncing them in one go.
    collected = []
    for repo_config in self.config.repos:
        collected.extend(self.process_repo(repo_config))

    return self.update_entries_with_ids(collected, user=user)
62
+
63
def process_repo(self, repo: GithubRepoConfig):
    """Download one Github repo's files and convert them into entries.

    Fetches markdown, org and plaintext files through ``get_files``, converts
    each group with its matching converter, then splits the combined entries
    with ``max_tokens=256``. Any exception raised while downloading is logged
    and re-raised to the caller.
    """
    repo_shorthand = f"{repo.owner}/{repo.name}"
    repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
    logger.info(f"Processing github repo {repo_shorthand}")

    with timer("Download files from github repo", logger):
        try:
            md_files, org_docs, text_files = self.get_files(repo_url, repo)
        except ConnectionAbortedError as err:
            logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
            raise err
        except Exception as err:
            logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
            raise err

    logger.info(
        f"Found {len(md_files)} md, {len(org_docs)} org and {len(text_files)} text files in github repo {repo_shorthand}"
    )

    # Markdown entries seed the list; org and plaintext entries are added to it.
    with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
        markdown_parts = GithubToEntries.extract_markdown_entries(md_files)
        entries = MarkdownToEntries.convert_markdown_entries_to_maps(*markdown_parts)

    with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
        org_parts = GithubToEntries.extract_org_entries(org_docs)
        entries += OrgToEntries.convert_org_nodes_to_entries(*org_parts)

    with timer(f"Extract plaintext entries from github repo {repo_shorthand}", logger):
        plaintext_parts = GithubToEntries.extract_plaintext_entries(text_files)
        entries += PlaintextToEntries.convert_text_files_to_entries(*plaintext_parts)

    with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
        entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=256)

    return entries
101
+
102
def update_entries_with_ids(self, current_entries, user: KhojUser = None):
    """Sync ``current_entries`` into the stored embeddings as Github entries.

    Delegates to ``self.update_embeddings`` with the Github entry type and
    source, comparing entries on their ``"compiled"`` key, and returns the
    pair of counts it reports: (new embeddings, deleted embeddings).
    """
    github_type = DbEntry.EntryType.GITHUB
    github_source = DbEntry.EntrySource.GITHUB

    # Work out which entries are new or changed and merge them with existing ones.
    with timer("Identify new or updated entries", logger):
        added_count, deleted_count = self.update_embeddings(
            user,
            current_entries,
            github_type,
            github_source,
            key="compiled",
            logger=logger,
        )

    return added_count, deleted_count
115
+
116
def get_files(self, repo_url: str, repo: GithubRepoConfig):
    """Download the markdown, org and other text files of a Github repository.

    Lists the git tree of `repo.branch` recursively via the Github API, then
    downloads every blob in it:
      - `.md` blobs are collected as markdown files
      - `.org` blobs are collected as org files
      - any other blob is collected as plaintext only when magika classifies
        its bytes into the "text" or "code" group and it decodes as UTF-8

    Args:
        repo_url: Github API URL of the repository.
        repo: Repository config providing `owner`, `name` and `branch`.

    Returns:
        Tuple `(markdown_files, org_files, plaintext_files)`. Each item is a
        dict with the file "content" and its github.com blob URL as "path".
        All three lists are empty when the API response has no "tree" key.

    Raises:
        ConnectionAbortedError: When a request fails and Github reports no
            remaining rate limit.
    """
    # Get the contents of the repository
    repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
    headers = {}
    if not is_none_or_empty(self.config.pat_token):
        headers = {"Authorization": f"token {self.config.pat_token}"}
    params = {"recursive": "true"}
    response = requests.get(repo_content_url, headers=headers, params=params)

    # Raise exception if hit rate limit.
    # Checked before parsing the body so an unparsable error response cannot mask the rate limit signal
    if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
        raise ConnectionAbortedError("Github rate limit reached")

    contents = response.json()

    markdown_files: List[Dict[str, str]] = []
    org_files: List[Dict[str, str]] = []
    plaintext_files: List[Dict[str, str]] = []
    if "tree" not in contents:
        return markdown_files, org_files, plaintext_files

    for item in contents["tree"]:
        # Only blobs hold file content. Skip sub-trees and other item types
        if item["type"] != "blob":
            continue

        # Create URL for the file on Github
        url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'

        if item["path"].endswith(".md"):
            # Add markdown file contents and URL to list
            markdown_files.append({"content": self.get_file_contents(item["url"]), "path": url_path})
            continue

        if item["path"].endswith(".org"):
            # Add org file contents and URL to list
            org_files.append({"content": self.get_file_contents(item["url"]), "path": url_path})
            continue

        # Find, index remaining non-binary files in the repository
        content_bytes = self.get_file_contents(item["url"], decode=False)
        try:
            content_type = magika.identify_bytes(content_bytes).output.group
        except Exception:
            # Best effort: skip this file but keep indexing the rest of the repository.
            # Exception (not a bare except) lets KeyboardInterrupt and SystemExit propagate
            logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
            continue

        if content_type not in ["text", "code"]:
            continue

        # Add non-binary file contents and URL to list
        try:
            content_str = content_bytes.decode("utf-8")
        except Exception:
            logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
            continue
        plaintext_files.append({"content": content_str, "path": url_path})

    return markdown_files, org_files, plaintext_files
175
+
176
def get_file_contents(self, file_url, decode=True):
    """Download the raw content of a single file from Github.

    Args:
        file_url: URL to request the file from, via `self.session`.
        decode: When True return the content as a UTF-8 decoded `str`,
            otherwise return the raw `bytes`.

    Returns:
        The file content. Empty (`""` or `b""`) when the response has no body.

    Raises:
        ConnectionAbortedError: When the request fails and Github reports no
            remaining rate limit.
    """
    headers = {"Accept": "application/vnd.github.v3.raw"}
    response = self.session.get(file_url, headers=headers, stream=True)

    # Stop indexing on hitting rate limit
    if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
        raise ConnectionAbortedError("Github rate limit reached")

    # Assemble the whole body before decoding. Decoding chunk by chunk breaks
    # any multi-byte UTF-8 character that straddles a chunk boundary
    content_bytes = b"".join(chunk for chunk in response.iter_content(chunk_size=2048) if chunk)
    if not decode:
        return content_bytes

    try:
        return content_bytes.decode("utf-8")
    except UnicodeDecodeError as e:
        # Best effort: keep everything that is decodable instead of failing the whole file
        logger.error(f"Unable to fully decode content from {file_url} as UTF-8. Replacing undecodable bytes")
        logger.error(e)
        return content_bytes.decode("utf-8", errors="replace")
195
+
196
@staticmethod
def extract_markdown_entries(markdown_files):
    """Split downloaded markdown files into entries.

    Each item of `markdown_files` is read via its "content" and "path" keys and
    fed through `MarkdownToEntries.process_single_markdown_file`, which
    accumulates results across files.

    Returns:
        Tuple of the accumulated entries and a dict built from the
        accumulated (entry, path) pairs.
    """
    parsed_entries, entry_path_pairs = [], []
    for markdown_doc in markdown_files:
        parsed_entries, entry_path_pairs = MarkdownToEntries.process_single_markdown_file(
            markdown_doc["content"], markdown_doc["path"], parsed_entries, entry_path_pairs
        )

    entry_to_path = dict(entry_path_pairs)
    return parsed_entries, entry_to_path
205
+
206
@staticmethod
def extract_org_entries(org_files):
    """Split downloaded org files into entries.

    Each item of `org_files` is read via its "content" and "path" keys and fed
    through `OrgToEntries.process_single_org_file`, threading the running
    results through every call.

    Returns:
        Tuple of the accumulated entries and a dict built from the
        accumulated entry-to-file map.
    """
    parsed_entries, entry_path_pairs = [], []
    for org_doc in org_files:
        parsed_entries, entry_path_pairs = OrgToEntries.process_single_org_file(
            org_doc["content"], org_doc["path"], parsed_entries, entry_path_pairs
        )

    entry_to_path = dict(entry_path_pairs)
    return parsed_entries, entry_to_path
216
+
217
@staticmethod
def extract_plaintext_entries(plaintext_files):
    """Split downloaded plaintext files into entries.

    Each item of `plaintext_files` is read via its "content" and "path" keys
    and fed through `PlaintextToEntries.process_single_plaintext_file`,
    threading the running results through every call.

    Returns:
        Tuple of the accumulated entries and a dict built from the
        accumulated entry-to-file map.
    """
    parsed_entries, entry_path_pairs = [], []
    for plaintext_doc in plaintext_files:
        parsed_entries, entry_path_pairs = PlaintextToEntries.process_single_plaintext_file(
            plaintext_doc["content"], plaintext_doc["path"], parsed_entries, entry_path_pairs
        )

    entry_to_path = dict(entry_path_pairs)
    return parsed_entries, entry_to_path
File without changes
@@ -0,0 +1,117 @@
1
+ import base64
2
+ import logging
3
+ import os
4
+ from datetime import datetime
5
+ from typing import Dict, List, Tuple
6
+
7
+ from khoj.database.models import Entry as DbEntry
8
+ from khoj.database.models import KhojUser
9
+ from khoj.processor.content.text_to_entries import TextToEntries
10
+ from khoj.utils.helpers import timer
11
+ from khoj.utils.rawconfig import Entry
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ImageToEntries(TextToEntries):
    """Index image files by the text that OCR extracts from them."""

    def __init__(self):
        super().__init__()

    # Define Functions
    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
        """Extract text from the given image files and sync it with the user's stored entries.

        Args:
            files: Map of file name to image content. Files whose content equals
                `b""` are treated as deleted rather than processed.
            user: User the entries belong to.
            regenerate: Passed through to `update_embeddings`.

        Returns:
            The `(num_new_embeddings, num_deleted_embeddings)` pair reported by
            `update_embeddings`.
        """
        # Separate files to delete (empty content) from files to index
        deletion_file_names = {file for file in files if files[file] == b""}
        files_to_process = set(files) - deletion_file_names
        files = {file: files[file] for file in files_to_process}

        # Extract Entries from specified image files
        with timer("Extract entries from specified Image files", logger):
            file_to_text_map, current_entries = ImageToEntries.extract_image_entries(files)

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
                user,
                current_entries,
                DbEntry.EntryType.IMAGE,
                DbEntry.EntrySource.COMPUTER,
                "compiled",
                logger,
                deletion_file_names,
                regenerate=regenerate,
                file_to_text_map=file_to_text_map,
            )

        return num_new_embeddings, num_deleted_embeddings

    @staticmethod
    def extract_image_entries(image_files) -> Tuple[Dict, List[Entry]]:  # important function
        """Extract one entry per image file from the text OCR finds in it.

        Each image is written to a temporary file, passed to RapidOCR and the
        recognized text fragments are joined with spaces. Files that cannot be
        processed are logged and skipped.

        Args:
            image_files: Map of image file name to image bytes. Only names ending
                in .png, .jpg, .jpeg or .webp (any letter case) are processed.

        Returns:
            Tuple of a map from file name to its extracted text and the list of
            entries, one per successfully processed file.
        """
        import tempfile

        file_to_text_map = dict()
        entry_to_location_map: List[Tuple[str, str]] = []
        ocr_engine = None  # Created on first use, then reused for every file
        for image_file in image_files:
            # Reset per file so cleanup never touches a previous file's path
            tmp_file = None
            try:
                # use either png or jpg
                lowercase_name = image_file.lower()
                if lowercase_name.endswith(".png"):
                    suffix = ".png"
                elif lowercase_name.endswith((".jpg", ".jpeg")):
                    suffix = ".jpg"
                elif lowercase_name.endswith(".webp"):
                    suffix = ".webp"
                else:
                    logger.warning(
                        f"Unsupported image format for file: {image_file}. This file will not be indexed."
                    )
                    continue

                # write the image to a uniquely named temporary file
                with tempfile.NamedTemporaryFile(prefix="tmp_image_file_", suffix=suffix, delete=False) as f:
                    tmp_file = f.name
                    f.write(image_files[image_file])

                try:
                    if ocr_engine is None:
                        from rapidocr_onnxruntime import RapidOCR

                        ocr_engine = RapidOCR()
                    image_entries_per_file = ""
                    result, _ = ocr_engine(tmp_file)
                    if result:
                        # The recognized text is the second item of each OCR result
                        image_entries_per_file = " ".join(text[1] for text in result)
                except ImportError:
                    logger.warning(
                        f"Unable to process image or scanned file for text: {image_file}. This file will not be indexed."
                    )
                    continue
                entry_to_location_map.append((image_entries_per_file, image_file))
                file_to_text_map[image_file] = image_entries_per_file
            except Exception as e:
                logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
                logger.warning(e, exc_info=True)
            finally:
                if tmp_file is not None and os.path.exists(tmp_file):
                    os.remove(tmp_file)

        # Build entries straight from the (text, file) pairs. A dict keyed by text would
        # attribute all images with identical (e.g. empty) text to a single file
        entries = [ImageToEntries._to_entry(text, file) for text, file in entry_to_location_map]
        logger.debug(f"Converted {len(entries)} image entries to dictionaries")
        return file_to_text_map, entries

    @staticmethod
    def convert_image_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        """Convert each extracted image text into an Entry.

        Args:
            parsed_entries: Extracted texts, one per image.
            entry_to_file_map: Map from extracted text to the file it came from.
                Must contain every item of `parsed_entries`.
        """
        entries = [
            ImageToEntries._to_entry(parsed_entry, entry_to_file_map[parsed_entry]) for parsed_entry in parsed_entries
        ]

        logger.debug(f"Converted {len(parsed_entries)} image entries to dictionaries")

        return entries

    @staticmethod
    def _to_entry(parsed_entry: str, entry_filename) -> Entry:
        "Create the Entry for text extracted from the given image file"
        # Prepend filename to compiled entry for context to model
        heading = f"{entry_filename}\n"
        return Entry(
            compiled=f"{heading}{parsed_entry}",
            raw=parsed_entry,
            heading=heading,
            file=f"{entry_filename}",
        )
File without changes
@@ -0,0 +1,160 @@
1
+ import logging
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple
5
+
6
+ import urllib3.util
7
+
8
+ from khoj.database.models import Entry as DbEntry
9
+ from khoj.database.models import KhojUser
10
+ from khoj.processor.content.text_to_entries import TextToEntries
11
+ from khoj.utils.helpers import timer
12
+ from khoj.utils.rawconfig import Entry
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class MarkdownToEntries(TextToEntries):
    """Index Markdown files by splitting them into entries along their headings."""

    def __init__(self):
        super().__init__()

    # Define Functions
    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
        """Extract entries from the given Markdown files and sync them with the user's stored entries.

        Args:
            files: Map of file name to Markdown content. Files with empty
                content are treated as deleted rather than processed.
            user: User the entries belong to.
            regenerate: Passed through to `update_embeddings`.

        Returns:
            The `(num_new_embeddings, num_deleted_embeddings)` pair reported by
            `update_embeddings`.
        """
        # Separate files to delete (empty content) from files to index
        deletion_file_names = {file for file in files if files[file] == ""}
        files_to_process = set(files) - deletion_file_names
        files = {file: files[file] for file in files_to_process}

        max_tokens = 256
        # Extract Entries from specified Markdown files
        with timer("Extract entries from specified Markdown files", logger):
            file_to_text_map, current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens)

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
                user,
                current_entries,
                DbEntry.EntryType.MARKDOWN,
                DbEntry.EntrySource.COMPUTER,
                "compiled",
                logger,
                deletion_file_names,
                regenerate=regenerate,
                file_to_text_map=file_to_text_map,
            )

        return num_new_embeddings, num_deleted_embeddings

    @staticmethod
    def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
        """Extract entries by heading from specified Markdown files.

        Files that fail to process are logged and left out of the result.

        Returns:
            Tuple of a map from file name to its full Markdown content and the
            list of entries extracted across all files.
        """
        entries: List[str] = []
        entry_to_file_map: List[Tuple[str, str]] = []
        file_to_text_map: Dict[str, str] = dict()
        for markdown_file in markdown_files:
            try:
                markdown_content = markdown_files[markdown_file]
                entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                    markdown_content, markdown_file, entries, entry_to_file_map, max_tokens
                )
                file_to_text_map[markdown_file] = markdown_content
            except Exception as e:
                logger.error(
                    f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True
                )

        # NOTE: The dict is keyed by entry text, so identical entries from
        # different files resolve to the file that was processed last
        return file_to_text_map, MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map))

    @staticmethod
    def process_single_markdown_file(
        markdown_content: str,
        markdown_file: str,
        entries: List[str],
        entry_to_file_map: List[Tuple[str, str]],
        max_tokens=256,
        ancestry: Dict[int, str] = None,
    ) -> Tuple[List[str], List[Tuple[str, str]]]:
        """Recursively split Markdown content into entries along its headings.

        Content is saved as a single entry when it fits within `max_tokens` or
        has no child heading to split on. Otherwise it is split at the
        shallowest child heading level present and each section is processed
        recursively, with its heading recorded in the section's ancestry.

        Args:
            markdown_content: Markdown text to split.
            markdown_file: File the content belongs to.
            entries: List that extracted entries are appended to, in place.
            entry_to_file_map: List that (entry, file) pairs are appended to, in place.
            max_tokens: Token count up to which content is kept as one entry.
            ancestry: Map of heading level to heading title of the parent
                headings of the content. Defaults to no ancestry.

        Returns:
            The `entries` and `entry_to_file_map` lists passed in.
        """
        ancestry = {} if ancestry is None else ancestry

        # Prepend the markdown section's heading ancestry
        ancestry_string = "\n".join(f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry))
        markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}"

        # Find the child heading levels that the content can be split on.
        # Skipped when content is small or has no children headings
        min_child_level = len(ancestry) + 1
        splittable_levels: List[int] = []
        if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) > max_tokens and re.search(
            rf"^#{{{min_child_level},}}\s", markdown_content, flags=re.MULTILINE
        ):
            # Same heading shape as the split pattern below: hashes, a space, then some text.
            # Lines like "#\tTitle" or an empty "# " pass the search above but cannot be split on
            splittable_levels = [
                len(hashes)
                for hashes in re.findall(rf"^(#{{{min_child_level},}}) .", markdown_content, flags=re.MULTILINE)
            ]

        # Save content as a single entry when there is no need or no way to split it further
        if not splittable_levels:
            entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
            entries.extend([markdown_content_with_ancestry])
            return entries, entry_to_file_map

        # Split by next heading level present in the entry
        next_heading_level = min(splittable_levels)
        sections: List[str] = re.split(
            rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE
        )

        for section in sections:
            # Skip empty sections
            if section.strip() == "":
                continue

            # Extract the section body and (when present) the heading
            current_ancestry = ancestry.copy()
            first_line = [line for line in section.split("\n") if line.strip() != ""][0]
            if re.search(rf"^#{{{next_heading_level}}} ", first_line):
                # Extract the section body without the heading.
                # Split once only, so later occurrences of the heading text stay in the body
                current_section_body = section.split(first_line, 1)[1]
                # Parse the section heading into current section ancestry
                current_section_title = first_line[next_heading_level:].strip()
                current_ancestry[next_heading_level] = current_section_title
            else:
                current_section_body = section

            # Recurse down children of the current entry
            MarkdownToEntries.process_single_markdown_file(
                current_section_body,
                markdown_file,
                entries,
                entry_to_file_map,
                max_tokens,
                current_ancestry,
            )

        return entries, entry_to_file_map

    @staticmethod
    def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
        """Convert each Markdown entry into an Entry.

        Args:
            parsed_entries: Extracted Markdown entries.
            entry_to_file_map: Map from entry text to the file or URL it came
                from. Must contain every item of `parsed_entries`.
        """
        entries: List[Entry] = []
        for parsed_entry in parsed_entries:
            raw_filename = entry_to_file_map[parsed_entry]

            # Check if raw_filename is a URL. If so, normalize it. If not, use it as is.
            if isinstance(raw_filename, str) and re.search(r"^https?://", raw_filename):
                # Escape the URL to avoid issues with special characters
                entry_filename = urllib3.util.parse_url(raw_filename).url
            else:
                entry_filename = raw_filename

            heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
            # Append base filename to compiled entry for context to model
            # Increment heading level for heading entries and make filename as its top level heading
            prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n"
            compiled_entry = f"{prefix}{parsed_entry}"
            entries.append(
                Entry(
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=f"{prefix}{heading}",
                    file=entry_filename,
                )
            )

        logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")

        return entries