khoj 1.33.3.dev32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +218 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +452 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1821 -0
  10. khoj/database/admin.py +417 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_default_model.py +116 -0
  15. khoj/database/management/commands/change_generated_images_url.py +61 -0
  16. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  17. khoj/database/migrations/0001_khojuser.py +98 -0
  18. khoj/database/migrations/0002_googleuser.py +32 -0
  19. khoj/database/migrations/0003_vector_extension.py +10 -0
  20. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  21. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  22. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  23. khoj/database/migrations/0007_add_conversation.py +27 -0
  24. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  25. khoj/database/migrations/0009_khojapiuser.py +24 -0
  26. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  27. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  28. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  29. khoj/database/migrations/0012_entry_file_source.py +21 -0
  30. khoj/database/migrations/0013_subscription.py +37 -0
  31. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  32. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  33. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  34. khoj/database/migrations/0017_searchmodel.py +32 -0
  35. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  36. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  37. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  38. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  39. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  40. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  41. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  42. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  43. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  45. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  46. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  47. khoj/database/migrations/0029_userrequests.py +27 -0
  48. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  49. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  50. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  51. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  52. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  53. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  54. khoj/database/migrations/0035_processlock.py +26 -0
  55. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  56. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  57. khoj/database/migrations/0036_publicconversation.py +42 -0
  58. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  59. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  60. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  61. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  62. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  63. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  64. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  65. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  66. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  67. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  68. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  69. khoj/database/migrations/0045_fileobject.py +37 -0
  70. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  71. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  72. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  73. khoj/database/migrations/0049_datastore.py +38 -0
  74. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  75. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  76. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  77. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  78. khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
  79. khoj/database/migrations/0054_alter_agent_style_color.py +38 -0
  80. khoj/database/migrations/0055_alter_agent_style_icon.py +37 -0
  81. khoj/database/migrations/0056_chatmodeloptions_vision_enabled.py +17 -0
  82. khoj/database/migrations/0056_searchmodelconfig_cross_encoder_model_config.py +17 -0
  83. khoj/database/migrations/0057_merge_20240816_1409.py +13 -0
  84. khoj/database/migrations/0057_remove_serverchatsettings_default_model_and_more.py +51 -0
  85. khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py +17 -0
  86. khoj/database/migrations/0059_searchmodelconfig_bi_encoder_confidence_threshold.py +17 -0
  87. khoj/database/migrations/0060_merge_20240905_1828.py +14 -0
  88. khoj/database/migrations/0061_alter_chatmodeloptions_model_type.py +26 -0
  89. khoj/database/migrations/0061_alter_texttoimagemodelconfig_model_type.py +21 -0
  90. khoj/database/migrations/0062_merge_20240913_0222.py +14 -0
  91. khoj/database/migrations/0063_conversation_temp_id.py +36 -0
  92. khoj/database/migrations/0064_remove_conversation_temp_id_alter_conversation_id.py +86 -0
  93. khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
  94. khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
  95. khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
  96. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  97. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  98. khoj/database/migrations/0070_alter_agent_input_tools_alter_agent_output_modes.py +46 -0
  99. khoj/database/migrations/0071_subscription_enabled_trial_at_and_more.py +32 -0
  100. khoj/database/migrations/0072_entry_search_model.py +24 -0
  101. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  102. khoj/database/migrations/0074_alter_conversation_title.py +17 -0
  103. khoj/database/migrations/0075_migrate_generated_assets_and_validate.py +85 -0
  104. khoj/database/migrations/0076_rename_openaiprocessorconversationconfig_aimodelapi_and_more.py +26 -0
  105. khoj/database/migrations/0077_chatmodel_alter_agent_chat_model_and_more.py +62 -0
  106. khoj/database/migrations/0078_khojuser_email_verification_code_expiry.py +17 -0
  107. khoj/database/migrations/__init__.py +0 -0
  108. khoj/database/models/__init__.py +725 -0
  109. khoj/database/tests.py +3 -0
  110. khoj/interface/compiled/404/index.html +1 -0
  111. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_buildManifest.js +1 -0
  112. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_ssgManifest.js +1 -0
  113. khoj/interface/compiled/_next/static/chunks/1010-8f39bb4648b5ba10.js +1 -0
  114. khoj/interface/compiled/_next/static/chunks/182-f1c48a203dc91e0e.js +20 -0
  115. khoj/interface/compiled/_next/static/chunks/1915-d3c36ad6ce697ce7.js +1 -0
  116. khoj/interface/compiled/_next/static/chunks/2117-165ef4747a5b836b.js +2 -0
  117. khoj/interface/compiled/_next/static/chunks/2581-455000f8aeb08fc3.js +1 -0
  118. khoj/interface/compiled/_next/static/chunks/3727.dcea8f2193111552.js +1 -0
  119. khoj/interface/compiled/_next/static/chunks/3789-a09e37a819171a9d.js +1 -0
  120. khoj/interface/compiled/_next/static/chunks/4124-6c28322ce218d2d5.js +1 -0
  121. khoj/interface/compiled/_next/static/chunks/5427-b52d95253e692bfa.js +1 -0
  122. khoj/interface/compiled/_next/static/chunks/5473-b1cf56dedac6577a.js +1 -0
  123. khoj/interface/compiled/_next/static/chunks/5477-0bbddb79c25a54a7.js +1 -0
  124. khoj/interface/compiled/_next/static/chunks/6065-64db9ad305ba0bcd.js +1 -0
  125. khoj/interface/compiled/_next/static/chunks/6293-469dd16402ea8a6f.js +3 -0
  126. khoj/interface/compiled/_next/static/chunks/688-b5b4391bbc0376f1.js +1 -0
  127. khoj/interface/compiled/_next/static/chunks/8667-b6bf63c72b2d76eb.js +1 -0
  128. khoj/interface/compiled/_next/static/chunks/9259-1172dbaca0515237.js +1 -0
  129. khoj/interface/compiled/_next/static/chunks/94ca1967.1d9b42d929a1ee8c.js +1 -0
  130. khoj/interface/compiled/_next/static/chunks/9597.83583248dfbf6e73.js +1 -0
  131. khoj/interface/compiled/_next/static/chunks/964ecbae.51d6faf8801d15e6.js +1 -0
  132. khoj/interface/compiled/_next/static/chunks/9665-391df1e5c51c960a.js +1 -0
  133. khoj/interface/compiled/_next/static/chunks/app/_not-found/page-a834eddae3e235df.js +1 -0
  134. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  135. khoj/interface/compiled/_next/static/chunks/app/agents/page-28ce086a1129bca2.js +1 -0
  136. khoj/interface/compiled/_next/static/chunks/app/automations/layout-1fe1537449f43496.js +1 -0
  137. khoj/interface/compiled/_next/static/chunks/app/automations/page-bf365a60829d347f.js +1 -0
  138. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  139. khoj/interface/compiled/_next/static/chunks/app/chat/page-0e476e57eb2015e3.js +1 -0
  140. khoj/interface/compiled/_next/static/chunks/app/layout-30e7fda7262713ce.js +1 -0
  141. khoj/interface/compiled/_next/static/chunks/app/page-a5515ea71aec5ef0.js +1 -0
  142. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
  143. khoj/interface/compiled/_next/static/chunks/app/search/page-9140541e67ea307d.js +1 -0
  144. khoj/interface/compiled/_next/static/chunks/app/settings/layout-d09d6510a45cd4bd.js +1 -0
  145. khoj/interface/compiled/_next/static/chunks/app/settings/page-951ba40b5b94b23a.js +1 -0
  146. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
  147. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-1beb80d8d741c932.js +1 -0
  148. khoj/interface/compiled/_next/static/chunks/d3ac728e-44ebd2a0c99b12a0.js +1 -0
  149. khoj/interface/compiled/_next/static/chunks/fd9d1056-4482b99a36fd1673.js +1 -0
  150. khoj/interface/compiled/_next/static/chunks/framework-8e0e0f4a6b83a956.js +1 -0
  151. khoj/interface/compiled/_next/static/chunks/main-app-de1f09df97a3cfc7.js +1 -0
  152. khoj/interface/compiled/_next/static/chunks/main-db4bfac6b0a8d00b.js +1 -0
  153. khoj/interface/compiled/_next/static/chunks/pages/_app-3c9ca398d360b709.js +1 -0
  154. khoj/interface/compiled/_next/static/chunks/pages/_error-cf5ca766ac8f493f.js +1 -0
  155. khoj/interface/compiled/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  156. khoj/interface/compiled/_next/static/chunks/webpack-a03962458328b163.js +1 -0
  157. khoj/interface/compiled/_next/static/css/089de1d8526b96e9.css +1 -0
  158. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  159. khoj/interface/compiled/_next/static/css/4e4e6a4a1c920d06.css +1 -0
  160. khoj/interface/compiled/_next/static/css/8d02837c730f8d13.css +25 -0
  161. khoj/interface/compiled/_next/static/css/8e6a3ca11a60b189.css +1 -0
  162. khoj/interface/compiled/_next/static/css/9c164d9727dd8092.css +1 -0
  163. khoj/interface/compiled/_next/static/css/dac88c17aaee5fcf.css +1 -0
  164. khoj/interface/compiled/_next/static/css/df4b47a2d0d85eae.css +1 -0
  165. khoj/interface/compiled/_next/static/css/e4eb883b5265d372.css +1 -0
  166. khoj/interface/compiled/_next/static/media/1d8a05b60287ae6c-s.p.woff2 +0 -0
  167. khoj/interface/compiled/_next/static/media/6f22fce21a7c433c-s.woff2 +0 -0
  168. khoj/interface/compiled/_next/static/media/77c207b095007c34-s.p.woff2 +0 -0
  169. khoj/interface/compiled/_next/static/media/82ef96de0e8f4d8c-s.p.woff2 +0 -0
  170. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.1608a09b.woff +0 -0
  171. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.4aafdb68.ttf +0 -0
  172. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.a79f1c31.woff2 +0 -0
  173. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.b6770918.woff +0 -0
  174. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.cce5b8ec.ttf +0 -0
  175. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.ec17d132.woff2 +0 -0
  176. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.07ef19e7.ttf +0 -0
  177. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.55fac258.woff2 +0 -0
  178. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.dad44a7f.woff +0 -0
  179. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.9f256b85.woff +0 -0
  180. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.b18f59e1.ttf +0 -0
  181. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.d42a5579.woff2 +0 -0
  182. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.7c187121.woff +0 -0
  183. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.d3c882a6.woff2 +0 -0
  184. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.ed38e79f.ttf +0 -0
  185. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.b74a1a8b.ttf +0 -0
  186. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.c3fb5ac2.woff2 +0 -0
  187. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.d181c465.woff +0 -0
  188. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.6f2bb1df.woff2 +0 -0
  189. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.70d8b0a5.ttf +0 -0
  190. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.e3f82f9d.woff +0 -0
  191. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.47373d1e.ttf +0 -0
  192. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.8916142b.woff2 +0 -0
  193. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.9024d815.woff +0 -0
  194. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.0462f03b.woff2 +0 -0
  195. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.7f51fe03.woff +0 -0
  196. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.b7f8fe9b.ttf +0 -0
  197. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.572d331f.woff2 +0 -0
  198. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.a879cf83.ttf +0 -0
  199. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.f1035d8d.woff +0 -0
  200. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.5295ba48.woff +0 -0
  201. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.939bc644.ttf +0 -0
  202. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.f28c23ac.woff2 +0 -0
  203. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.8c5b5494.woff2 +0 -0
  204. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.94e1e8dc.ttf +0 -0
  205. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.bf59d231.woff +0 -0
  206. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.3b1e59b3.woff2 +0 -0
  207. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.7c9bc82b.woff +0 -0
  208. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.b4c20c84.ttf +0 -0
  209. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.74048478.woff +0 -0
  210. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.ba21ed5f.woff2 +0 -0
  211. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.d4d7ba48.ttf +0 -0
  212. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.03e9641d.woff2 +0 -0
  213. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.07505710.woff +0 -0
  214. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.fe9cbbe1.ttf +0 -0
  215. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.e1e279cb.woff +0 -0
  216. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.eae34984.woff2 +0 -0
  217. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.fabc004a.ttf +0 -0
  218. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.57727022.woff +0 -0
  219. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.5916a24f.woff2 +0 -0
  220. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.d6b476ec.ttf +0 -0
  221. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.9acaf01c.woff +0 -0
  222. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.a144ef58.ttf +0 -0
  223. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.b4230e7e.woff2 +0 -0
  224. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.10d95fd3.woff2 +0 -0
  225. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.7a996c9d.woff +0 -0
  226. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.fbccdabe.ttf +0 -0
  227. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.6258592b.woff +0 -0
  228. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.a8709e36.woff2 +0 -0
  229. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.d97aaf4a.ttf +0 -0
  230. khoj/interface/compiled/_next/static/media/a6ecd16fa044d500-s.p.woff2 +0 -0
  231. khoj/interface/compiled/_next/static/media/bd82c78e5b7b3fe9-s.p.woff2 +0 -0
  232. khoj/interface/compiled/_next/static/media/c32c8052c071fc42-s.woff2 +0 -0
  233. khoj/interface/compiled/_next/static/media/c4250770ab8708b6-s.p.woff2 +0 -0
  234. khoj/interface/compiled/_next/static/media/e098aaaecc9cfbb2-s.p.woff2 +0 -0
  235. khoj/interface/compiled/_next/static/media/flags.3afdda2f.webp +0 -0
  236. khoj/interface/compiled/_next/static/media/flags@2x.5fbe9fc1.webp +0 -0
  237. khoj/interface/compiled/_next/static/media/globe.98e105ca.webp +0 -0
  238. khoj/interface/compiled/_next/static/media/globe@2x.974df6f8.webp +0 -0
  239. khoj/interface/compiled/agents/index.html +1 -0
  240. khoj/interface/compiled/agents/index.txt +7 -0
  241. khoj/interface/compiled/agents.svg +6 -0
  242. khoj/interface/compiled/assets/icons/khoj_lantern.ico +0 -0
  243. khoj/interface/compiled/assets/icons/khoj_lantern.svg +100 -0
  244. khoj/interface/compiled/assets/icons/khoj_lantern_1200x1200.png +0 -0
  245. khoj/interface/compiled/assets/icons/khoj_lantern_128x128.png +0 -0
  246. khoj/interface/compiled/assets/icons/khoj_lantern_128x128_dark.png +0 -0
  247. khoj/interface/compiled/assets/icons/khoj_lantern_256x256.png +0 -0
  248. khoj/interface/compiled/assets/icons/khoj_lantern_512x512.png +0 -0
  249. khoj/interface/compiled/assets/icons/khoj_lantern_logomarktype_1200x630.png +0 -0
  250. khoj/interface/compiled/assets/samples/desktop-browse-draw-sample.png +0 -0
  251. khoj/interface/compiled/assets/samples/desktop-plain-chat-sample.png +0 -0
  252. khoj/interface/compiled/assets/samples/desktop-remember-plan-sample.png +0 -0
  253. khoj/interface/compiled/assets/samples/phone-browse-draw-sample.png +0 -0
  254. khoj/interface/compiled/assets/samples/phone-plain-chat-sample.png +0 -0
  255. khoj/interface/compiled/assets/samples/phone-remember-plan-sample.png +0 -0
  256. khoj/interface/compiled/automation.svg +37 -0
  257. khoj/interface/compiled/automations/index.html +1 -0
  258. khoj/interface/compiled/automations/index.txt +8 -0
  259. khoj/interface/compiled/chat/index.html +1 -0
  260. khoj/interface/compiled/chat/index.txt +7 -0
  261. khoj/interface/compiled/chat.svg +24 -0
  262. khoj/interface/compiled/close.svg +5 -0
  263. khoj/interface/compiled/copy-button-success.svg +6 -0
  264. khoj/interface/compiled/copy-button.svg +5 -0
  265. khoj/interface/compiled/index.html +1 -0
  266. khoj/interface/compiled/index.txt +7 -0
  267. khoj/interface/compiled/khoj.webmanifest +76 -0
  268. khoj/interface/compiled/logo.svg +24 -0
  269. khoj/interface/compiled/search/index.html +1 -0
  270. khoj/interface/compiled/search/index.txt +7 -0
  271. khoj/interface/compiled/send.svg +1 -0
  272. khoj/interface/compiled/settings/index.html +1 -0
  273. khoj/interface/compiled/settings/index.txt +9 -0
  274. khoj/interface/compiled/share/chat/index.html +1 -0
  275. khoj/interface/compiled/share/chat/index.txt +7 -0
  276. khoj/interface/compiled/share.svg +8 -0
  277. khoj/interface/compiled/thumbs-down.svg +6 -0
  278. khoj/interface/compiled/thumbs-up.svg +6 -0
  279. khoj/interface/email/feedback.html +34 -0
  280. khoj/interface/email/magic_link.html +40 -0
  281. khoj/interface/email/task.html +37 -0
  282. khoj/interface/email/welcome.html +90 -0
  283. khoj/interface/web/.well-known/assetlinks.json +11 -0
  284. khoj/interface/web/assets/icons/agents.svg +19 -0
  285. khoj/interface/web/assets/icons/automation.svg +43 -0
  286. khoj/interface/web/assets/icons/chat.svg +24 -0
  287. khoj/interface/web/assets/icons/github.svg +1 -0
  288. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  289. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  290. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +32 -0
  291. khoj/interface/web/assets/icons/khoj.svg +26 -0
  292. khoj/interface/web/assets/icons/logotype.svg +1 -0
  293. khoj/interface/web/assets/icons/search.svg +57 -0
  294. khoj/interface/web/assets/icons/sync.svg +4 -0
  295. khoj/interface/web/assets/khoj.css +237 -0
  296. khoj/interface/web/assets/utils.js +33 -0
  297. khoj/interface/web/base_config.html +445 -0
  298. khoj/interface/web/content_source_github_input.html +208 -0
  299. khoj/interface/web/login.html +310 -0
  300. khoj/interface/web/utils.html +48 -0
  301. khoj/main.py +249 -0
  302. khoj/manage.py +22 -0
  303. khoj/migrations/__init__.py +0 -0
  304. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  305. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  306. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  307. khoj/migrations/migrate_offline_model.py +29 -0
  308. khoj/migrations/migrate_processor_config_openai.py +67 -0
  309. khoj/migrations/migrate_server_pg.py +132 -0
  310. khoj/migrations/migrate_version.py +17 -0
  311. khoj/processor/__init__.py +0 -0
  312. khoj/processor/content/__init__.py +0 -0
  313. khoj/processor/content/docx/__init__.py +0 -0
  314. khoj/processor/content/docx/docx_to_entries.py +111 -0
  315. khoj/processor/content/github/__init__.py +0 -0
  316. khoj/processor/content/github/github_to_entries.py +226 -0
  317. khoj/processor/content/images/__init__.py +0 -0
  318. khoj/processor/content/images/image_to_entries.py +117 -0
  319. khoj/processor/content/markdown/__init__.py +0 -0
  320. khoj/processor/content/markdown/markdown_to_entries.py +160 -0
  321. khoj/processor/content/notion/notion_to_entries.py +259 -0
  322. khoj/processor/content/org_mode/__init__.py +0 -0
  323. khoj/processor/content/org_mode/org_to_entries.py +226 -0
  324. khoj/processor/content/org_mode/orgnode.py +532 -0
  325. khoj/processor/content/pdf/__init__.py +0 -0
  326. khoj/processor/content/pdf/pdf_to_entries.py +119 -0
  327. khoj/processor/content/plaintext/__init__.py +0 -0
  328. khoj/processor/content/plaintext/plaintext_to_entries.py +117 -0
  329. khoj/processor/content/text_to_entries.py +296 -0
  330. khoj/processor/conversation/__init__.py +0 -0
  331. khoj/processor/conversation/anthropic/__init__.py +0 -0
  332. khoj/processor/conversation/anthropic/anthropic_chat.py +243 -0
  333. khoj/processor/conversation/anthropic/utils.py +217 -0
  334. khoj/processor/conversation/google/__init__.py +0 -0
  335. khoj/processor/conversation/google/gemini_chat.py +253 -0
  336. khoj/processor/conversation/google/utils.py +260 -0
  337. khoj/processor/conversation/offline/__init__.py +0 -0
  338. khoj/processor/conversation/offline/chat_model.py +308 -0
  339. khoj/processor/conversation/offline/utils.py +80 -0
  340. khoj/processor/conversation/offline/whisper.py +15 -0
  341. khoj/processor/conversation/openai/__init__.py +0 -0
  342. khoj/processor/conversation/openai/gpt.py +243 -0
  343. khoj/processor/conversation/openai/utils.py +232 -0
  344. khoj/processor/conversation/openai/whisper.py +13 -0
  345. khoj/processor/conversation/prompts.py +1188 -0
  346. khoj/processor/conversation/utils.py +867 -0
  347. khoj/processor/embeddings.py +122 -0
  348. khoj/processor/image/generate.py +215 -0
  349. khoj/processor/speech/__init__.py +0 -0
  350. khoj/processor/speech/text_to_speech.py +51 -0
  351. khoj/processor/tools/__init__.py +0 -0
  352. khoj/processor/tools/online_search.py +472 -0
  353. khoj/processor/tools/run_code.py +179 -0
  354. khoj/routers/__init__.py +0 -0
  355. khoj/routers/api.py +760 -0
  356. khoj/routers/api_agents.py +295 -0
  357. khoj/routers/api_chat.py +1273 -0
  358. khoj/routers/api_content.py +634 -0
  359. khoj/routers/api_model.py +123 -0
  360. khoj/routers/api_phone.py +86 -0
  361. khoj/routers/api_subscription.py +144 -0
  362. khoj/routers/auth.py +307 -0
  363. khoj/routers/email.py +135 -0
  364. khoj/routers/helpers.py +2333 -0
  365. khoj/routers/notion.py +85 -0
  366. khoj/routers/research.py +364 -0
  367. khoj/routers/storage.py +63 -0
  368. khoj/routers/twilio.py +36 -0
  369. khoj/routers/web_client.py +141 -0
  370. khoj/search_filter/__init__.py +0 -0
  371. khoj/search_filter/base_filter.py +15 -0
  372. khoj/search_filter/date_filter.py +215 -0
  373. khoj/search_filter/file_filter.py +32 -0
  374. khoj/search_filter/word_filter.py +29 -0
  375. khoj/search_type/__init__.py +0 -0
  376. khoj/search_type/text_search.py +255 -0
  377. khoj/utils/__init__.py +0 -0
  378. khoj/utils/cli.py +101 -0
  379. khoj/utils/config.py +81 -0
  380. khoj/utils/constants.py +51 -0
  381. khoj/utils/fs_syncer.py +252 -0
  382. khoj/utils/helpers.py +627 -0
  383. khoj/utils/initialization.py +301 -0
  384. khoj/utils/jsonl.py +43 -0
  385. khoj/utils/models.py +47 -0
  386. khoj/utils/rawconfig.py +208 -0
  387. khoj/utils/state.py +48 -0
  388. khoj/utils/yaml.py +47 -0
  389. khoj-1.33.3.dev32.dist-info/METADATA +190 -0
  390. khoj-1.33.3.dev32.dist-info/RECORD +393 -0
  391. khoj-1.33.3.dev32.dist-info/WHEEL +4 -0
  392. khoj-1.33.3.dev32.dist-info/entry_points.txt +2 -0
  393. khoj-1.33.3.dev32.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,532 @@
1
+ # Copyright (c) 2010 Charles Cave
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person
4
+ # obtaining a copy of this software and associated documentation
5
+ # files (the "Software"), to deal in the Software without
6
+ # restriction, including without limitation the rights to use, copy,
7
+ # modify, merge, publish, distribute, sublicense, and/or sell copies
8
+ # of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18
+ # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19
+ # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # Program written by Charles Cave (charlesweb@optusnet.com.au)
24
+ # February - March 2009
25
+ # Version 2 - June 2009
26
+ # Added support for all tags, TODO priority and checking existence of a tag
27
+ # More information at
28
+ # http://members.optusnet.com.au/~charles57/GTD
29
+
30
+ """
31
+ The Orgnode module consists of the Orgnode class for representing a
32
+ headline and associated text from an org-mode file, and routines for
33
+ constructing data structures of these classes.
34
+ """
35
+
36
+ import datetime
37
+ import re
38
+ from os.path import relpath
39
+ from pathlib import Path
40
+ from typing import Dict, List, Tuple
41
+
42
+ indent_regex = re.compile(r"^ *")
43
+
44
+
45
def normalize_filename(filename):
    """Normalize and escape a filename for rendering.

    An absolute filename is used as given. A relative filename is rewritten
    as a "~/"-prefixed path computed relative to the user's home directory
    (``relpath`` resolves the relative input against the current working
    directory first). Square brackets in the result are backslash-escaped.
    """
    if Path(filename).is_absolute():
        display_name = f"{filename}"
    else:
        # Express the path relative to the home directory, not the working directory
        home_relative = relpath(filename, start=Path.home())
        display_name = f"~/{home_relative}"
    return display_name.replace("[", r"\[").replace("]", r"\]")
54
+
55
+
56
def makelist_with_filepath(filename):
    """Open the org-mode file at `filename` and parse it with makelist.

    Args:
        filename: Path of the file to open for reading in text mode. It is
            also forwarded to makelist as the file's name.

    Returns:
        Whatever makelist returns for the opened file.
    """
    # makelist consumes the handle while building its result, so the file can
    # be closed as soon as it returns instead of leaking the open descriptor.
    with open(filename, "r") as f:
        return makelist(f, filename)
59
+
60
+
61
+ def makelist(file, filename) -> List["Orgnode"]:
62
+ """
63
+ Read an org-mode file and return a list of Orgnode objects
64
+ created from this file.
65
+ """
66
+ ctr = 0
67
+
68
+ if type(file) == str:
69
+ f = file.split("\n")
70
+ else:
71
+ f = file
72
+
73
+ todos = {
74
+ "TODO": "",
75
+ "WAITING": "",
76
+ "ACTIVE": "",
77
+ "DONE": "",
78
+ "CANCELLED": "",
79
+ "FAILED": "",
80
+ } # populated from #+SEQ_TODO line
81
+ level = ""
82
+ heading = ""
83
+ ancestor_headings: List[str] = []
84
+ bodytext = ""
85
+ introtext = ""
86
+ tags: List[str] = list() # set of all tags in headline
87
+ closed_date: datetime.date = None
88
+ sched_date: datetime.date = None
89
+ deadline_date: datetime.date = None
90
+ logbook: List[Tuple[datetime.datetime, datetime.datetime]] = list()
91
+ nodelist: List[Orgnode] = list()
92
+ property_map: Dict[str, str] = dict()
93
+ in_properties_drawer = False
94
+ in_logbook_drawer = False
95
+ file_title = f"{filename}"
96
+
97
+ for line in f:
98
+ ctr += 1
99
+ heading_search = re.search(r"^(\*+)\s(.*?)\s*$", line)
100
+ if heading_search: # we are processing a heading line
101
+ if heading: # if we are on the second heading, append the first heading to the headings list
102
+ thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
103
+ if closed_date:
104
+ thisNode.closed = closed_date
105
+ closed_date = None
106
+ if sched_date:
107
+ thisNode.scheduled = sched_date
108
+ sched_date = None
109
+ if deadline_date:
110
+ thisNode.deadline = deadline_date
111
+ deadline_date = None
112
+ if logbook:
113
+ thisNode.logbook = logbook
114
+ logbook = list()
115
+ thisNode.properties = property_map
116
+ nodelist.append(thisNode)
117
+ property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
118
+ previous_level = level
119
+ previous_heading: str = heading
120
+ level = heading_search.group(1)
121
+ heading = heading_search.group(2)
122
+ bodytext = ""
123
+ tags = list() # set of all tags in headline
124
+ tag_search = re.search(r"(.*?)\s*:([a-zA-Z0-9].*?):$", heading)
125
+ if tag_search:
126
+ heading = tag_search.group(1)
127
+ parsedtags = tag_search.group(2)
128
+ if parsedtags:
129
+ for parsedtag in parsedtags.split(":"):
130
+ if parsedtag != "":
131
+ tags.append(parsedtag)
132
+
133
+ # Add previous heading to ancestors if current heading is deeper than previous level
134
+ if len(level) > len(previous_level) and previous_heading:
135
+ ancestor_headings.append(previous_heading)
136
+ # Remove last ancestor(s) if current heading is shallower than previous level
137
+ elif len(level) < len(previous_level):
138
+ for _ in range(len(level), len(previous_level)):
139
+ if not ancestor_headings or len(ancestor_headings) == 0:
140
+ break
141
+ ancestor_headings.pop()
142
+
143
+ else: # we are processing a non-heading line
144
+ if line[:10] == "#+SEQ_TODO":
145
+ kwlist = re.findall(r"([A-Z]+)\(", line)
146
+ for kw in kwlist:
147
+ todos[kw] = ""
148
+
149
+ # Set file title to TITLE property, if it exists
150
+ title_search = re.search(r"^#\+TITLE:\s*(.*)$", line)
151
+ if title_search and title_search.group(1).strip() != "":
152
+ title_text = title_search.group(1).strip()
153
+ if file_title == f"{filename}":
154
+ file_title = title_text
155
+ else:
156
+ file_title += f" {title_text}"
157
+ continue
158
+
159
+ # Ignore Properties Drawer Start, End Lines
160
+ if re.search(":PROPERTIES:", line):
161
+ in_properties_drawer = True
162
+ continue
163
+ if in_properties_drawer and re.search(":END:", line):
164
+ in_properties_drawer = False
165
+ continue
166
+
167
+ # Ignore Logbook Drawer Start, End Lines
168
+ if re.search(":LOGBOOK:", line):
169
+ in_logbook_drawer = True
170
+ continue
171
+ if in_logbook_drawer and re.search(":END:", line):
172
+ in_logbook_drawer = False
173
+ continue
174
+
175
+ # Extract Clocking Lines
176
+ clocked_re = re.search(
177
+ r"CLOCK:\s*\[([0-9]{4}-[0-9]{2}-[0-9]{2} [a-zA-Z]{3} [0-9]{2}:[0-9]{2})\]--\[([0-9]{4}-[0-9]{2}-[0-9]{2} [a-zA-Z]{3} [0-9]{2}:[0-9]{2})\]",
178
+ line,
179
+ )
180
+ if clocked_re:
181
+ # convert clock in, clock out strings to datetime objects
182
+ clocked_in = datetime.datetime.strptime(clocked_re.group(1), "%Y-%m-%d %a %H:%M")
183
+ clocked_out = datetime.datetime.strptime(clocked_re.group(2), "%Y-%m-%d %a %H:%M")
184
+ # add clocked time to the entries logbook list
185
+ logbook += [(clocked_in, clocked_out)]
186
+ line = ""
187
+
188
+ property_search = re.search(r"^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$", line)
189
+ if property_search:
190
+ # Set ID property to an id based org-mode link to the entry
191
+ if property_search.group(1) == "ID":
192
+ property_map["ID"] = f"id:{property_search.group(2)}"
193
+ else:
194
+ property_map[property_search.group(1)] = property_search.group(2)
195
+ continue
196
+
197
+ cd_re = re.search(r"CLOSED:\s*\[([0-9]{4})-([0-9]{2})-([0-9]{2})", line)
198
+ if cd_re:
199
+ closed_date = datetime.date(int(cd_re.group(1)), int(cd_re.group(2)), int(cd_re.group(3)))
200
+ sd_re = re.search(r"SCHEDULED:\s*<([0-9]+)\-([0-9]+)\-([0-9]+)", line)
201
+ if sd_re:
202
+ sched_date = datetime.date(int(sd_re.group(1)), int(sd_re.group(2)), int(sd_re.group(3)))
203
+ dd_re = re.search(r"DEADLINE:\s*<(\d+)\-(\d+)\-(\d+)", line)
204
+ if dd_re:
205
+ deadline_date = datetime.date(int(dd_re.group(1)), int(dd_re.group(2)), int(dd_re.group(3)))
206
+
207
+ # Ignore property drawer, scheduled, closed, deadline, logbook entries and # lines from body
208
+ if (
209
+ not in_properties_drawer
210
+ and not cd_re
211
+ and not sd_re
212
+ and not dd_re
213
+ and not clocked_re
214
+ and line[:1] != "#"
215
+ ):
216
+ # if we are in a heading
217
+ if heading:
218
+ # add the line to the bodytext
219
+ bodytext += line.rstrip() + "\n\n" if line.strip() else ""
220
+ # bodytext += line + "\n" if line.strip() else "\n"
221
+ # else we are in the pre heading portion of the file
222
+ elif line.strip():
223
+ # so add the line to the introtext
224
+ introtext += line
225
+
226
+ # write out intro node before headings
227
+ # this is done at the end to allow collating all title lines
228
+ if introtext:
229
+ thisNode = Orgnode(level, file_title, introtext, tags)
230
+ nodelist = [thisNode] + nodelist
231
+ # write out last heading node
232
+ if heading:
233
+ thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
234
+ thisNode.properties = property_map
235
+ if sched_date:
236
+ thisNode.scheduled = sched_date
237
+ if deadline_date:
238
+ thisNode.deadline = deadline_date
239
+ if closed_date:
240
+ thisNode.closed = closed_date
241
+ if logbook:
242
+ thisNode.logbook = logbook
243
+ nodelist.append(thisNode)
244
+
245
+ # using the list of TODO keywords found in the file
246
+ # process the headings searching for TODO keywords
247
+ for n in nodelist:
248
+ todo_search = re.search(r"([A-Z]+)\s(.*?)$", n.heading)
249
+ if todo_search:
250
+ if todo_search.group(1) in todos:
251
+ n.heading = todo_search.group(2)
252
+ n.todo = todo_search.group(1)
253
+
254
+ # extract, set priority from heading, update heading if necessary
255
+ priority_search = re.search(r"^\[\#(A|B|C)\] (.*?)$", n.heading)
256
+ if priority_search:
257
+ n.priority = priority_search.group(1)
258
+ n.heading = priority_search.group(2)
259
+
260
+ # Prefix filepath/title to ancestors
261
+ n.ancestors = [file_title] + n.ancestors
262
+
263
+ # Set SOURCE property to a file+heading based org-mode link to the entry
264
+ if n.level == 0:
265
+ n.properties["LINE"] = f"file:{normalize_filename(filename)}::0"
266
+ n.properties["SOURCE"] = f"[[file:{normalize_filename(filename)}]]"
267
+ else:
268
+ escaped_heading = n.heading.replace("[", "\\[").replace("]", "\\]")
269
+ n.properties["SOURCE"] = f"[[file:{normalize_filename(filename)}::*{escaped_heading}]]"
270
+
271
+ return nodelist
272
+
273
+
274
+ ######################
275
class Orgnode(object):
    """
    Orgnode class represents a headline, tags and text associated
    with the headline.
    """

    def __init__(self, level, headline, body, tags, ancestor_headings=None):
        """
        Create an Orgnode object given the parameters of level (as the
        raw asterisks), headline text (including the TODO tag), body text,
        the list of tags and, optionally, the ancestor headings. The makelist
        routine postprocesses the list to identify TODO tags and updates
        headline and todo fields.
        """
        self._level = len(level)
        self._heading = headline
        self._body = body
        self._tags = tags  # All tags in the headline
        self._todo = ""
        self._priority = ""  # empty or A, B or C
        self._scheduled = ""  # Scheduled date
        self._deadline = ""  # Deadline date
        self._closed = ""  # Closed date
        self._properties = dict()
        self._logbook = list()  # List of clock-in, clock-out tuples representing logbook entries
        # Keep a private copy so later changes to the caller's list do not leak into this node
        self._ancestor_headings = list(ancestor_headings) if ancestor_headings is not None else []

    @property
    def ancestors(self) -> List[str]:
        """
        Return the ancestor headings of the node
        """
        return self._ancestor_headings

    @ancestors.setter
    def ancestors(self, new_ancestors):
        """
        Update the ancestor headings of the node
        """
        self._ancestor_headings = new_ancestors

    @property
    def heading(self):
        """
        Return the Heading text of the node without the TODO tag
        """
        return self._heading

    @heading.setter
    def heading(self, newhdng):
        """
        Change the heading to the supplied string
        """
        self._heading = newhdng

    @property
    def body(self):
        """
        Returns all lines of text of the body of this node except the
        Property Drawer
        """
        return self._body

    @property
    def hasBody(self):
        """
        Returns True if node has non empty body, else False.
        A body made up only of spaces, tabs, newlines and carriage returns is empty.
        """
        return bool(self._body) and self._body.strip("\n\t\r ") != ""

    @property
    def level(self):
        """
        Returns an integer corresponding to the level of the node.
        Top level (one asterisk) has a level of 1.
        """
        return self._level

    @property
    def priority(self):
        """
        Returns the priority of this headline: 'A', 'B', 'C' or empty
        string if priority has not been set.
        """
        return self._priority

    @priority.setter
    def priority(self, new_priority):
        """
        Change the value of the priority of this headline.
        Valid values are '', 'A', 'B', 'C'
        """
        self._priority = new_priority

    @property
    def tags(self):
        """
        Returns the list of all tags
        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
        """
        return self._tags

    @tags.setter
    def tags(self, newtags):
        """
        Store all the tags found in the headline.
        """
        self._tags = newtags

    def hasTag(self, tag):
        """
        Returns True if the supplied tag is present in this headline
        For example, hasTag('COMPUTER') on a headline containing
        :HOME:COMPUTER: would return True.
        """
        return tag in self._tags

    @property
    def todo(self):
        """
        Return the value of the TODO tag
        """
        return self._todo

    @todo.setter
    def todo(self, new_todo):
        """
        Set the value of the TODO tag to the supplied string
        """
        self._todo = new_todo

    @property
    def properties(self):
        """
        Return the dictionary of properties
        """
        return self._properties

    @properties.setter
    def properties(self, new_properties):
        """
        Sets all properties using the supplied dictionary of
        name/value pairs
        """
        self._properties = new_properties

    def Property(self, property_key):
        """
        Returns the value of the requested property or an empty string if
        the property does not exist.
        """
        return self._properties.get(property_key, "")

    @property
    def scheduled(self):
        """
        Return the scheduled date
        """
        return self._scheduled

    @scheduled.setter
    def scheduled(self, new_scheduled):
        """
        Set the scheduled date to the new scheduled date
        """
        self._scheduled = new_scheduled

    @property
    def deadline(self):
        """
        Return the deadline date
        """
        return self._deadline

    @deadline.setter
    def deadline(self, new_deadline):
        """
        Set the deadline (due) date to the new deadline date
        """
        self._deadline = new_deadline

    @property
    def closed(self):
        """
        Return the closed date
        """
        return self._closed

    @closed.setter
    def closed(self, new_closed):
        """
        Set the closed date to the new closed date
        """
        self._closed = new_closed

    @property
    def logbook(self):
        """
        Return the logbook with all clocked-in, clocked-out date object pairs or empty list if nonexistent
        """
        return self._logbook

    @logbook.setter
    def logbook(self, new_logbook):
        """
        Set the logbook with list of clocked-in, clocked-out tuples for the entry
        """
        self._logbook = new_logbook

    def __repr__(self):
        """
        Return the org-mode text of the node: heading line (level, TODO,
        priority, heading, tags), planning dates, property drawer and body.
        """
        # Output heading line
        n = "*" * self._level + " "
        if self._todo:
            n += self._todo + " "
        if self._priority:
            n += "[#" + self._priority + "] "
        n += self._heading
        if self._tags:
            n = "%-60s " % n  # hack - tags will start in column 62
            n += ":" + ":".join(self._tags) + ":"
        n += "\n"

        # Get body indentation from first line of body
        # NOTE(review): indent_regex is a module-level pattern defined outside this class
        indent = indent_regex.match(self._body).group()

        # Output Closed Date, Scheduled Date, Deadline Date on a single indented line
        has_planning_dates = self._closed or self._scheduled or self._deadline
        if has_planning_dates:
            n += indent
        if self._closed:
            n += f'CLOSED: [{self._closed.strftime("%Y-%m-%d %a")}] '
        if self._scheduled:
            n += f'SCHEDULED: <{self._scheduled.strftime("%Y-%m-%d %a")}> '
        if self._deadline:
            n += f'DEADLINE: <{self._deadline.strftime("%Y-%m-%d %a")}> '
        if has_planning_dates:
            n += "\n"

        # Output Property Drawer
        n += indent + ":PROPERTIES:\n"
        for key, value in self._properties.items():
            n += indent + f":{key}: {value}\n"
        n += indent + ":END:\n"

        # Output Body
        if self.hasBody:
            n += self._body

        return n
File without changes
@@ -0,0 +1,119 @@
1
+ import logging
2
+ import tempfile
3
+ from typing import Dict, Final, List, Tuple
4
+
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+
7
+ from khoj.database.models import Entry as DbEntry
8
+ from khoj.database.models import KhojUser
9
+ from khoj.processor.content.text_to_entries import TextToEntries
10
+ from khoj.utils.helpers import timer
11
+ from khoj.utils.rawconfig import Entry
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class PdfToEntries(TextToEntries):
    """Extract one entry per page from PDF files and index them for a user."""

    # Class-level constant translation table that deletes NUL characters
    NULL_TRANSLATOR: Final = str.maketrans("", "", "\x00")

    def __init__(self):
        super().__init__()

    # Define Functions
    def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = False) -> Tuple[int, int]:
        """
        Index the given PDF files for the user.

        Args:
            files: Maps file name to file content. Files whose content equals
                b"" are treated as deleted rather than processed.
                NOTE(review): the values are compared with b"" and written to a
                binary temp file, so they look like bytes despite the annotation.
            user: User the entries are indexed for.
            regenerate: Forwarded as-is to update_embeddings.

        Returns:
            Tuple of (number of new embeddings, number of deleted embeddings).
        """
        # Extract required fields from config
        deletion_file_names = {file for file in files if files[file] == b""}
        files_to_process = set(files) - deletion_file_names
        files = {file: files[file] for file in files_to_process}

        # Extract Entries from specified Pdf files
        with timer("Extract entries from specified PDF files", logger):
            file_to_text_map, current_entries = PdfToEntries.extract_pdf_entries(files)

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
                user,
                current_entries,
                DbEntry.EntryType.PDF,
                DbEntry.EntrySource.COMPUTER,
                "compiled",
                logger,
                deletion_file_names,
                regenerate=regenerate,
                file_to_text_map=file_to_text_map,
            )

        return num_new_embeddings, num_deleted_embeddings

    @staticmethod
    def extract_pdf_entries(pdf_files) -> Tuple[Dict, List[Entry]]:  # important function
        """
        Extract entries by page from specified PDF files

        Args:
            pdf_files: Maps file name to the PDF content of that file.

        Returns:
            Tuple of (map of file name to its list of page texts, entries of
            all files in file then page order). Files whose text cannot be
            extracted are logged and left out of both.
        """
        file_to_text_map = dict()
        entries: List[Entry] = []
        for pdf_file in pdf_files:
            try:
                pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
            except Exception as e:
                logger.warning(f"Unable to extract entries from file: {pdf_file}")
                logger.warning(e, exc_info=True)
                continue

            # Convert the pages of each file with a map scoped to that file. A single
            # map keyed by page text across all files would attribute identical pages
            # (e.g. blank pages) found in several PDFs to whichever file came last.
            entries.extend(
                PdfToEntries.convert_pdf_entries_to_maps(
                    pdf_entries_per_file, dict.fromkeys(pdf_entries_per_file, pdf_file)
                )
            )
            file_to_text_map[pdf_file] = pdf_entries_per_file

        return file_to_text_map, entries

    @staticmethod
    def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        """
        Convert each PDF entry into an Entry

        Args:
            parsed_entries: Page texts to convert.
            entry_to_file_map: Maps each page text to the name of its file.

        Returns:
            One Entry per page text, in the same order.
        """
        entries = []
        for parsed_entry in parsed_entries:
            entry_filename = entry_to_file_map[parsed_entry]
            # Append base filename to compiled entry for context to model
            heading = f"{entry_filename}\n"
            compiled_entry = f"{heading}{parsed_entry}"
            entries.append(
                Entry(
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=heading,
                    file=f"{entry_filename}",
                )
            )

        logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")

        return entries

    @staticmethod
    def extract_text(pdf_file):
        """
        Extract text from the specified PDF content

        Args:
            pdf_file: Content of the PDF, written as-is to a temporary .pdf file.

        Returns:
            List with the cleaned text of each page.

        Raises:
            Exception: Whatever error writing or loading the PDF raised. It is
                logged here and re-raised so the caller can skip the file.
        """
        try:
            # Create temp file with .pdf extension that gets auto-deleted
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
                tmpf.write(pdf_file)
                tmpf.flush()  # Ensure all data is written

                # Load the content using PyMuPDFLoader
                loader = PyMuPDFLoader(tmpf.name)
                pdf_entries_per_file = loader.load()

                # Convert the loaded entries into the desired format
                return [PdfToEntries.clean_text(page.page_content) for page in pdf_entries_per_file]
        except Exception as e:
            # Report the size only: interpolating pdf_file would dump the raw PDF content into the log
            size = f"{len(pdf_file)} bytes" if isinstance(pdf_file, (bytes, bytearray)) else "unknown size"
            logger.warning(f"Unable to process PDF file ({size}). This file will not be indexed.")
            logger.warning(e, exc_info=True)
            # Re-raise the real error. Falling through to return the page list would fail
            # with an UnboundLocalError that hides the cause, as the list was never built.
            raise

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean PDF text by removing null (\\x00) characters."""
        # Use faster translation table instead of replace
        return text.translate(PdfToEntries.NULL_TRANSLATOR)
File without changes