khoj 1.33.3.dev32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393)
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +218 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +452 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1821 -0
  10. khoj/database/admin.py +417 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_default_model.py +116 -0
  15. khoj/database/management/commands/change_generated_images_url.py +61 -0
  16. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  17. khoj/database/migrations/0001_khojuser.py +98 -0
  18. khoj/database/migrations/0002_googleuser.py +32 -0
  19. khoj/database/migrations/0003_vector_extension.py +10 -0
  20. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  21. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  22. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  23. khoj/database/migrations/0007_add_conversation.py +27 -0
  24. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  25. khoj/database/migrations/0009_khojapiuser.py +24 -0
  26. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  27. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  28. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  29. khoj/database/migrations/0012_entry_file_source.py +21 -0
  30. khoj/database/migrations/0013_subscription.py +37 -0
  31. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  32. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  33. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  34. khoj/database/migrations/0017_searchmodel.py +32 -0
  35. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  36. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  37. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  38. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  39. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  40. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  41. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  42. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  43. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  45. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  46. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  47. khoj/database/migrations/0029_userrequests.py +27 -0
  48. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  49. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  50. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  51. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  52. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  53. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  54. khoj/database/migrations/0035_processlock.py +26 -0
  55. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  56. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  57. khoj/database/migrations/0036_publicconversation.py +42 -0
  58. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  59. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  60. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  61. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  62. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  63. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  64. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  65. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  66. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  67. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  68. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  69. khoj/database/migrations/0045_fileobject.py +37 -0
  70. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  71. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  72. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  73. khoj/database/migrations/0049_datastore.py +38 -0
  74. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  75. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  76. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  77. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  78. khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
  79. khoj/database/migrations/0054_alter_agent_style_color.py +38 -0
  80. khoj/database/migrations/0055_alter_agent_style_icon.py +37 -0
  81. khoj/database/migrations/0056_chatmodeloptions_vision_enabled.py +17 -0
  82. khoj/database/migrations/0056_searchmodelconfig_cross_encoder_model_config.py +17 -0
  83. khoj/database/migrations/0057_merge_20240816_1409.py +13 -0
  84. khoj/database/migrations/0057_remove_serverchatsettings_default_model_and_more.py +51 -0
  85. khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py +17 -0
  86. khoj/database/migrations/0059_searchmodelconfig_bi_encoder_confidence_threshold.py +17 -0
  87. khoj/database/migrations/0060_merge_20240905_1828.py +14 -0
  88. khoj/database/migrations/0061_alter_chatmodeloptions_model_type.py +26 -0
  89. khoj/database/migrations/0061_alter_texttoimagemodelconfig_model_type.py +21 -0
  90. khoj/database/migrations/0062_merge_20240913_0222.py +14 -0
  91. khoj/database/migrations/0063_conversation_temp_id.py +36 -0
  92. khoj/database/migrations/0064_remove_conversation_temp_id_alter_conversation_id.py +86 -0
  93. khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
  94. khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
  95. khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
  96. khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
  97. khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
  98. khoj/database/migrations/0070_alter_agent_input_tools_alter_agent_output_modes.py +46 -0
  99. khoj/database/migrations/0071_subscription_enabled_trial_at_and_more.py +32 -0
  100. khoj/database/migrations/0072_entry_search_model.py +24 -0
  101. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  102. khoj/database/migrations/0074_alter_conversation_title.py +17 -0
  103. khoj/database/migrations/0075_migrate_generated_assets_and_validate.py +85 -0
  104. khoj/database/migrations/0076_rename_openaiprocessorconversationconfig_aimodelapi_and_more.py +26 -0
  105. khoj/database/migrations/0077_chatmodel_alter_agent_chat_model_and_more.py +62 -0
  106. khoj/database/migrations/0078_khojuser_email_verification_code_expiry.py +17 -0
  107. khoj/database/migrations/__init__.py +0 -0
  108. khoj/database/models/__init__.py +725 -0
  109. khoj/database/tests.py +3 -0
  110. khoj/interface/compiled/404/index.html +1 -0
  111. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_buildManifest.js +1 -0
  112. khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_ssgManifest.js +1 -0
  113. khoj/interface/compiled/_next/static/chunks/1010-8f39bb4648b5ba10.js +1 -0
  114. khoj/interface/compiled/_next/static/chunks/182-f1c48a203dc91e0e.js +20 -0
  115. khoj/interface/compiled/_next/static/chunks/1915-d3c36ad6ce697ce7.js +1 -0
  116. khoj/interface/compiled/_next/static/chunks/2117-165ef4747a5b836b.js +2 -0
  117. khoj/interface/compiled/_next/static/chunks/2581-455000f8aeb08fc3.js +1 -0
  118. khoj/interface/compiled/_next/static/chunks/3727.dcea8f2193111552.js +1 -0
  119. khoj/interface/compiled/_next/static/chunks/3789-a09e37a819171a9d.js +1 -0
  120. khoj/interface/compiled/_next/static/chunks/4124-6c28322ce218d2d5.js +1 -0
  121. khoj/interface/compiled/_next/static/chunks/5427-b52d95253e692bfa.js +1 -0
  122. khoj/interface/compiled/_next/static/chunks/5473-b1cf56dedac6577a.js +1 -0
  123. khoj/interface/compiled/_next/static/chunks/5477-0bbddb79c25a54a7.js +1 -0
  124. khoj/interface/compiled/_next/static/chunks/6065-64db9ad305ba0bcd.js +1 -0
  125. khoj/interface/compiled/_next/static/chunks/6293-469dd16402ea8a6f.js +3 -0
  126. khoj/interface/compiled/_next/static/chunks/688-b5b4391bbc0376f1.js +1 -0
  127. khoj/interface/compiled/_next/static/chunks/8667-b6bf63c72b2d76eb.js +1 -0
  128. khoj/interface/compiled/_next/static/chunks/9259-1172dbaca0515237.js +1 -0
  129. khoj/interface/compiled/_next/static/chunks/94ca1967.1d9b42d929a1ee8c.js +1 -0
  130. khoj/interface/compiled/_next/static/chunks/9597.83583248dfbf6e73.js +1 -0
  131. khoj/interface/compiled/_next/static/chunks/964ecbae.51d6faf8801d15e6.js +1 -0
  132. khoj/interface/compiled/_next/static/chunks/9665-391df1e5c51c960a.js +1 -0
  133. khoj/interface/compiled/_next/static/chunks/app/_not-found/page-a834eddae3e235df.js +1 -0
  134. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  135. khoj/interface/compiled/_next/static/chunks/app/agents/page-28ce086a1129bca2.js +1 -0
  136. khoj/interface/compiled/_next/static/chunks/app/automations/layout-1fe1537449f43496.js +1 -0
  137. khoj/interface/compiled/_next/static/chunks/app/automations/page-bf365a60829d347f.js +1 -0
  138. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  139. khoj/interface/compiled/_next/static/chunks/app/chat/page-0e476e57eb2015e3.js +1 -0
  140. khoj/interface/compiled/_next/static/chunks/app/layout-30e7fda7262713ce.js +1 -0
  141. khoj/interface/compiled/_next/static/chunks/app/page-a5515ea71aec5ef0.js +1 -0
  142. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
  143. khoj/interface/compiled/_next/static/chunks/app/search/page-9140541e67ea307d.js +1 -0
  144. khoj/interface/compiled/_next/static/chunks/app/settings/layout-d09d6510a45cd4bd.js +1 -0
  145. khoj/interface/compiled/_next/static/chunks/app/settings/page-951ba40b5b94b23a.js +1 -0
  146. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
  147. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-1beb80d8d741c932.js +1 -0
  148. khoj/interface/compiled/_next/static/chunks/d3ac728e-44ebd2a0c99b12a0.js +1 -0
  149. khoj/interface/compiled/_next/static/chunks/fd9d1056-4482b99a36fd1673.js +1 -0
  150. khoj/interface/compiled/_next/static/chunks/framework-8e0e0f4a6b83a956.js +1 -0
  151. khoj/interface/compiled/_next/static/chunks/main-app-de1f09df97a3cfc7.js +1 -0
  152. khoj/interface/compiled/_next/static/chunks/main-db4bfac6b0a8d00b.js +1 -0
  153. khoj/interface/compiled/_next/static/chunks/pages/_app-3c9ca398d360b709.js +1 -0
  154. khoj/interface/compiled/_next/static/chunks/pages/_error-cf5ca766ac8f493f.js +1 -0
  155. khoj/interface/compiled/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  156. khoj/interface/compiled/_next/static/chunks/webpack-a03962458328b163.js +1 -0
  157. khoj/interface/compiled/_next/static/css/089de1d8526b96e9.css +1 -0
  158. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  159. khoj/interface/compiled/_next/static/css/4e4e6a4a1c920d06.css +1 -0
  160. khoj/interface/compiled/_next/static/css/8d02837c730f8d13.css +25 -0
  161. khoj/interface/compiled/_next/static/css/8e6a3ca11a60b189.css +1 -0
  162. khoj/interface/compiled/_next/static/css/9c164d9727dd8092.css +1 -0
  163. khoj/interface/compiled/_next/static/css/dac88c17aaee5fcf.css +1 -0
  164. khoj/interface/compiled/_next/static/css/df4b47a2d0d85eae.css +1 -0
  165. khoj/interface/compiled/_next/static/css/e4eb883b5265d372.css +1 -0
  166. khoj/interface/compiled/_next/static/media/1d8a05b60287ae6c-s.p.woff2 +0 -0
  167. khoj/interface/compiled/_next/static/media/6f22fce21a7c433c-s.woff2 +0 -0
  168. khoj/interface/compiled/_next/static/media/77c207b095007c34-s.p.woff2 +0 -0
  169. khoj/interface/compiled/_next/static/media/82ef96de0e8f4d8c-s.p.woff2 +0 -0
  170. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.1608a09b.woff +0 -0
  171. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.4aafdb68.ttf +0 -0
  172. khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.a79f1c31.woff2 +0 -0
  173. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.b6770918.woff +0 -0
  174. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.cce5b8ec.ttf +0 -0
  175. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.ec17d132.woff2 +0 -0
  176. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.07ef19e7.ttf +0 -0
  177. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.55fac258.woff2 +0 -0
  178. khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.dad44a7f.woff +0 -0
  179. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.9f256b85.woff +0 -0
  180. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.b18f59e1.ttf +0 -0
  181. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.d42a5579.woff2 +0 -0
  182. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.7c187121.woff +0 -0
  183. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.d3c882a6.woff2 +0 -0
  184. khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.ed38e79f.ttf +0 -0
  185. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.b74a1a8b.ttf +0 -0
  186. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.c3fb5ac2.woff2 +0 -0
  187. khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.d181c465.woff +0 -0
  188. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.6f2bb1df.woff2 +0 -0
  189. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.70d8b0a5.ttf +0 -0
  190. khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.e3f82f9d.woff +0 -0
  191. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.47373d1e.ttf +0 -0
  192. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.8916142b.woff2 +0 -0
  193. khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.9024d815.woff +0 -0
  194. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.0462f03b.woff2 +0 -0
  195. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.7f51fe03.woff +0 -0
  196. khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.b7f8fe9b.ttf +0 -0
  197. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.572d331f.woff2 +0 -0
  198. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.a879cf83.ttf +0 -0
  199. khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.f1035d8d.woff +0 -0
  200. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.5295ba48.woff +0 -0
  201. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.939bc644.ttf +0 -0
  202. khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.f28c23ac.woff2 +0 -0
  203. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.8c5b5494.woff2 +0 -0
  204. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.94e1e8dc.ttf +0 -0
  205. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.bf59d231.woff +0 -0
  206. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.3b1e59b3.woff2 +0 -0
  207. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.7c9bc82b.woff +0 -0
  208. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.b4c20c84.ttf +0 -0
  209. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.74048478.woff +0 -0
  210. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.ba21ed5f.woff2 +0 -0
  211. khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.d4d7ba48.ttf +0 -0
  212. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.03e9641d.woff2 +0 -0
  213. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.07505710.woff +0 -0
  214. khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.fe9cbbe1.ttf +0 -0
  215. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.e1e279cb.woff +0 -0
  216. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.eae34984.woff2 +0 -0
  217. khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.fabc004a.ttf +0 -0
  218. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.57727022.woff +0 -0
  219. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.5916a24f.woff2 +0 -0
  220. khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.d6b476ec.ttf +0 -0
  221. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.9acaf01c.woff +0 -0
  222. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.a144ef58.ttf +0 -0
  223. khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.b4230e7e.woff2 +0 -0
  224. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.10d95fd3.woff2 +0 -0
  225. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.7a996c9d.woff +0 -0
  226. khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.fbccdabe.ttf +0 -0
  227. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.6258592b.woff +0 -0
  228. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.a8709e36.woff2 +0 -0
  229. khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.d97aaf4a.ttf +0 -0
  230. khoj/interface/compiled/_next/static/media/a6ecd16fa044d500-s.p.woff2 +0 -0
  231. khoj/interface/compiled/_next/static/media/bd82c78e5b7b3fe9-s.p.woff2 +0 -0
  232. khoj/interface/compiled/_next/static/media/c32c8052c071fc42-s.woff2 +0 -0
  233. khoj/interface/compiled/_next/static/media/c4250770ab8708b6-s.p.woff2 +0 -0
  234. khoj/interface/compiled/_next/static/media/e098aaaecc9cfbb2-s.p.woff2 +0 -0
  235. khoj/interface/compiled/_next/static/media/flags.3afdda2f.webp +0 -0
  236. khoj/interface/compiled/_next/static/media/flags@2x.5fbe9fc1.webp +0 -0
  237. khoj/interface/compiled/_next/static/media/globe.98e105ca.webp +0 -0
  238. khoj/interface/compiled/_next/static/media/globe@2x.974df6f8.webp +0 -0
  239. khoj/interface/compiled/agents/index.html +1 -0
  240. khoj/interface/compiled/agents/index.txt +7 -0
  241. khoj/interface/compiled/agents.svg +6 -0
  242. khoj/interface/compiled/assets/icons/khoj_lantern.ico +0 -0
  243. khoj/interface/compiled/assets/icons/khoj_lantern.svg +100 -0
  244. khoj/interface/compiled/assets/icons/khoj_lantern_1200x1200.png +0 -0
  245. khoj/interface/compiled/assets/icons/khoj_lantern_128x128.png +0 -0
  246. khoj/interface/compiled/assets/icons/khoj_lantern_128x128_dark.png +0 -0
  247. khoj/interface/compiled/assets/icons/khoj_lantern_256x256.png +0 -0
  248. khoj/interface/compiled/assets/icons/khoj_lantern_512x512.png +0 -0
  249. khoj/interface/compiled/assets/icons/khoj_lantern_logomarktype_1200x630.png +0 -0
  250. khoj/interface/compiled/assets/samples/desktop-browse-draw-sample.png +0 -0
  251. khoj/interface/compiled/assets/samples/desktop-plain-chat-sample.png +0 -0
  252. khoj/interface/compiled/assets/samples/desktop-remember-plan-sample.png +0 -0
  253. khoj/interface/compiled/assets/samples/phone-browse-draw-sample.png +0 -0
  254. khoj/interface/compiled/assets/samples/phone-plain-chat-sample.png +0 -0
  255. khoj/interface/compiled/assets/samples/phone-remember-plan-sample.png +0 -0
  256. khoj/interface/compiled/automation.svg +37 -0
  257. khoj/interface/compiled/automations/index.html +1 -0
  258. khoj/interface/compiled/automations/index.txt +8 -0
  259. khoj/interface/compiled/chat/index.html +1 -0
  260. khoj/interface/compiled/chat/index.txt +7 -0
  261. khoj/interface/compiled/chat.svg +24 -0
  262. khoj/interface/compiled/close.svg +5 -0
  263. khoj/interface/compiled/copy-button-success.svg +6 -0
  264. khoj/interface/compiled/copy-button.svg +5 -0
  265. khoj/interface/compiled/index.html +1 -0
  266. khoj/interface/compiled/index.txt +7 -0
  267. khoj/interface/compiled/khoj.webmanifest +76 -0
  268. khoj/interface/compiled/logo.svg +24 -0
  269. khoj/interface/compiled/search/index.html +1 -0
  270. khoj/interface/compiled/search/index.txt +7 -0
  271. khoj/interface/compiled/send.svg +1 -0
  272. khoj/interface/compiled/settings/index.html +1 -0
  273. khoj/interface/compiled/settings/index.txt +9 -0
  274. khoj/interface/compiled/share/chat/index.html +1 -0
  275. khoj/interface/compiled/share/chat/index.txt +7 -0
  276. khoj/interface/compiled/share.svg +8 -0
  277. khoj/interface/compiled/thumbs-down.svg +6 -0
  278. khoj/interface/compiled/thumbs-up.svg +6 -0
  279. khoj/interface/email/feedback.html +34 -0
  280. khoj/interface/email/magic_link.html +40 -0
  281. khoj/interface/email/task.html +37 -0
  282. khoj/interface/email/welcome.html +90 -0
  283. khoj/interface/web/.well-known/assetlinks.json +11 -0
  284. khoj/interface/web/assets/icons/agents.svg +19 -0
  285. khoj/interface/web/assets/icons/automation.svg +43 -0
  286. khoj/interface/web/assets/icons/chat.svg +24 -0
  287. khoj/interface/web/assets/icons/github.svg +1 -0
  288. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  289. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  290. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +32 -0
  291. khoj/interface/web/assets/icons/khoj.svg +26 -0
  292. khoj/interface/web/assets/icons/logotype.svg +1 -0
  293. khoj/interface/web/assets/icons/search.svg +57 -0
  294. khoj/interface/web/assets/icons/sync.svg +4 -0
  295. khoj/interface/web/assets/khoj.css +237 -0
  296. khoj/interface/web/assets/utils.js +33 -0
  297. khoj/interface/web/base_config.html +445 -0
  298. khoj/interface/web/content_source_github_input.html +208 -0
  299. khoj/interface/web/login.html +310 -0
  300. khoj/interface/web/utils.html +48 -0
  301. khoj/main.py +249 -0
  302. khoj/manage.py +22 -0
  303. khoj/migrations/__init__.py +0 -0
  304. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  305. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  306. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  307. khoj/migrations/migrate_offline_model.py +29 -0
  308. khoj/migrations/migrate_processor_config_openai.py +67 -0
  309. khoj/migrations/migrate_server_pg.py +132 -0
  310. khoj/migrations/migrate_version.py +17 -0
  311. khoj/processor/__init__.py +0 -0
  312. khoj/processor/content/__init__.py +0 -0
  313. khoj/processor/content/docx/__init__.py +0 -0
  314. khoj/processor/content/docx/docx_to_entries.py +111 -0
  315. khoj/processor/content/github/__init__.py +0 -0
  316. khoj/processor/content/github/github_to_entries.py +226 -0
  317. khoj/processor/content/images/__init__.py +0 -0
  318. khoj/processor/content/images/image_to_entries.py +117 -0
  319. khoj/processor/content/markdown/__init__.py +0 -0
  320. khoj/processor/content/markdown/markdown_to_entries.py +160 -0
  321. khoj/processor/content/notion/notion_to_entries.py +259 -0
  322. khoj/processor/content/org_mode/__init__.py +0 -0
  323. khoj/processor/content/org_mode/org_to_entries.py +226 -0
  324. khoj/processor/content/org_mode/orgnode.py +532 -0
  325. khoj/processor/content/pdf/__init__.py +0 -0
  326. khoj/processor/content/pdf/pdf_to_entries.py +119 -0
  327. khoj/processor/content/plaintext/__init__.py +0 -0
  328. khoj/processor/content/plaintext/plaintext_to_entries.py +117 -0
  329. khoj/processor/content/text_to_entries.py +296 -0
  330. khoj/processor/conversation/__init__.py +0 -0
  331. khoj/processor/conversation/anthropic/__init__.py +0 -0
  332. khoj/processor/conversation/anthropic/anthropic_chat.py +243 -0
  333. khoj/processor/conversation/anthropic/utils.py +217 -0
  334. khoj/processor/conversation/google/__init__.py +0 -0
  335. khoj/processor/conversation/google/gemini_chat.py +253 -0
  336. khoj/processor/conversation/google/utils.py +260 -0
  337. khoj/processor/conversation/offline/__init__.py +0 -0
  338. khoj/processor/conversation/offline/chat_model.py +308 -0
  339. khoj/processor/conversation/offline/utils.py +80 -0
  340. khoj/processor/conversation/offline/whisper.py +15 -0
  341. khoj/processor/conversation/openai/__init__.py +0 -0
  342. khoj/processor/conversation/openai/gpt.py +243 -0
  343. khoj/processor/conversation/openai/utils.py +232 -0
  344. khoj/processor/conversation/openai/whisper.py +13 -0
  345. khoj/processor/conversation/prompts.py +1188 -0
  346. khoj/processor/conversation/utils.py +867 -0
  347. khoj/processor/embeddings.py +122 -0
  348. khoj/processor/image/generate.py +215 -0
  349. khoj/processor/speech/__init__.py +0 -0
  350. khoj/processor/speech/text_to_speech.py +51 -0
  351. khoj/processor/tools/__init__.py +0 -0
  352. khoj/processor/tools/online_search.py +472 -0
  353. khoj/processor/tools/run_code.py +179 -0
  354. khoj/routers/__init__.py +0 -0
  355. khoj/routers/api.py +760 -0
  356. khoj/routers/api_agents.py +295 -0
  357. khoj/routers/api_chat.py +1273 -0
  358. khoj/routers/api_content.py +634 -0
  359. khoj/routers/api_model.py +123 -0
  360. khoj/routers/api_phone.py +86 -0
  361. khoj/routers/api_subscription.py +144 -0
  362. khoj/routers/auth.py +307 -0
  363. khoj/routers/email.py +135 -0
  364. khoj/routers/helpers.py +2333 -0
  365. khoj/routers/notion.py +85 -0
  366. khoj/routers/research.py +364 -0
  367. khoj/routers/storage.py +63 -0
  368. khoj/routers/twilio.py +36 -0
  369. khoj/routers/web_client.py +141 -0
  370. khoj/search_filter/__init__.py +0 -0
  371. khoj/search_filter/base_filter.py +15 -0
  372. khoj/search_filter/date_filter.py +215 -0
  373. khoj/search_filter/file_filter.py +32 -0
  374. khoj/search_filter/word_filter.py +29 -0
  375. khoj/search_type/__init__.py +0 -0
  376. khoj/search_type/text_search.py +255 -0
  377. khoj/utils/__init__.py +0 -0
  378. khoj/utils/cli.py +101 -0
  379. khoj/utils/config.py +81 -0
  380. khoj/utils/constants.py +51 -0
  381. khoj/utils/fs_syncer.py +252 -0
  382. khoj/utils/helpers.py +627 -0
  383. khoj/utils/initialization.py +301 -0
  384. khoj/utils/jsonl.py +43 -0
  385. khoj/utils/models.py +47 -0
  386. khoj/utils/rawconfig.py +208 -0
  387. khoj/utils/state.py +48 -0
  388. khoj/utils/yaml.py +47 -0
  389. khoj-1.33.3.dev32.dist-info/METADATA +190 -0
  390. khoj-1.33.3.dev32.dist-info/RECORD +393 -0
  391. khoj-1.33.3.dev32.dist-info/WHEEL +4 -0
  392. khoj-1.33.3.dev32.dist-info/entry_points.txt +2 -0
  393. khoj-1.33.3.dev32.dist-info/licenses/LICENSE +661 -0
khoj/utils/config.py ADDED
@@ -0,0 +1,81 @@
1
+ # System Packages
2
+ from __future__ import annotations # to avoid quoting type hints
3
+
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
8
+
9
+ import torch
10
+
11
+ from khoj.processor.conversation.offline.utils import download_model
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ if TYPE_CHECKING:
17
+ from sentence_transformers import CrossEncoder
18
+
19
+ from khoj.utils.models import BaseEncoder
20
+
21
+
22
class SearchType(str, Enum):
    """Names of the content types a search can be scoped to.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (e.g. ``SearchType.Org == "org"``).
    """

    # Selects every content type at once.
    All = "all"

    # Individual content types.
    Org = "org"
    Markdown = "markdown"
    Image = "image"
    Pdf = "pdf"
    Github = "github"
    Notion = "notion"
    Plaintext = "plaintext"
    Docx = "docx"
32
+
33
+
34
class ProcessorType(str, Enum):
    """Names of the available processor kinds.

    Members subclass ``str`` and therefore compare equal to their string value.
    """

    Conversation = "conversation"
36
+
37
+
38
@dataclass
class TextContent:
    """Record carrying a single on/off flag for text content."""

    # Required; no default is provided.
    enabled: bool
41
+
42
+
43
@dataclass
class ImageContent:
    """Record bundling a list of image names with two torch tensors.

    All three fields are required positional fields (no defaults).
    """

    image_names: List[str]
    # NOTE(review): presumably one embedding row per entry in image_names - confirm.
    image_embeddings: torch.Tensor
    image_metadata_embeddings: torch.Tensor
48
+
49
+
50
@dataclass
class TextSearchModel:
    """Record grouping the encoders and result limit used for text search."""

    # Required encoder.
    bi_encoder: BaseEncoder
    # Optional second encoder; absent (None) unless supplied.
    cross_encoder: Optional[CrossEncoder] = None
    # Defaults to 15.
    # NOTE(review): presumably the number of top results to keep - confirm against callers.
    top_k: Optional[int] = 15
55
+
56
+
57
@dataclass
class ImageSearchModel:
    """Record holding the single encoder used for image search."""

    # Required; no default is provided.
    image_encoder: BaseEncoder
60
+
61
+
62
@dataclass
class SearchModels:
    """Record holding the search models; currently only the text one."""

    # None until a TextSearchModel is assigned.
    text_search: Optional[TextSearchModel] = None
65
+
66
+
67
@dataclass
class OfflineChatProcessorConfig:
    """Record holding a reference to a loaded offline chat model, if any."""

    # Untyped on purpose (Any); None means nothing has been stored here.
    loaded_model: Union[Any, None] = None
70
+
71
+
72
class OfflineChatProcessorModel:
    """Load an offline chat model at construction time and keep a handle to it.

    Attributes:
        chat_model: The model identifier handed to ``download_model``.
        loaded_model: Whatever ``download_model`` returned. It stays ``None``
            if loading raised.
    """

    def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: Optional[int] = None):
        """Download/load ``chat_model`` via ``download_model``.

        Args:
            chat_model: Model identifier forwarded to ``download_model``.
                NOTE(review): the default looks like a Hugging Face repo id - confirm.
            max_tokens: Forwarded unchanged to ``download_model`` as the
                ``max_tokens`` keyword; ``None`` when the caller gives no value.

        Raises:
            ValueError: Re-raised after being logged when ``download_model``
                raises it. Other exception types propagate without logging.
        """
        self.chat_model = chat_model
        # Set first so the attribute exists, as None, even if loading fails.
        self.loaded_model = None
        try:
            self.loaded_model = download_model(self.chat_model, max_tokens=max_tokens)
        except ValueError as e:
            logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
            # Bare raise re-raises the active exception.
            raise
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+ from typing import Dict
3
+
4
# --- Filesystem locations -------------------------------------------------
# Three directory levels above this file.
app_root_directory = Path(__file__).parent.parent.parent
web_directory = app_root_directory / "khoj" / "interface" / "web"
next_js_directory = app_root_directory / "khoj" / "interface" / "built"
pypi_static_directory = app_root_directory / "khoj" / "interface" / "compiled"
assetlinks_file_path = web_directory / ".well-known" / "assetlinks.json"

# Alternation of newline, carriage return, tab and space.
empty_escape_sequences = "\n|\r|\t| "

# Paths kept as "~"-prefixed strings (not expanded here).
app_env_filepath = "~/.khoj/env"
content_directory = "~/.khoj/content/"

telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"

# --- Default chat model names, grouped by provider --------------------------
default_offline_chat_models = [
    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    "bartowski/Llama-3.2-3B-Instruct-GGUF",
    "bartowski/gemma-2-9b-it-GGUF",
    "bartowski/gemma-2-2b-it-GGUF",
    "Qwen/Qwen2.5-14B-Instruct-GGUF",
]
default_openai_chat_models = [
    "gpt-4o-mini",
    "gpt-4o",
]
default_gemini_chat_models = [
    "gemini-1.5-flash",
    "gemini-1.5-pro",
]
default_anthropic_chat_models = [
    "claude-3-5-sonnet-20241022",
    "claude-3-5-haiku-20241022",
]

# --- App config templates -------------------------------------------------
# empty_config and default_config hold equal contents but are deliberately
# separate dict objects, so mutating one never affects the other.
empty_config = {
    "search-type": {
        "image": {
            "encoder": "sentence-transformers/clip-ViT-B-32",
            "model_directory": "~/.khoj/search/image/",
        },
    },
}

# default app config to use
default_config = {
    "search-type": {
        "image": {
            "encoder": "sentence-transformers/clip-ViT-B-32",
            "model_directory": "~/.khoj/search/image/",
        },
    },
}

# --- Per-model "input"/"output" cost table ----------------------------------
# NOTE(review): units are not stated here; presumably USD per million tokens
# as on the linked pricing pages - confirm.
model_to_cost: Dict[str, Dict[str, float]] = {
    # OpenAI Pricing: https://openai.com/api/pricing/
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "o1": {"input": 15.0, "output": 60.00},
    "o1-mini": {"input": 3.0, "output": 12.0},
    # Gemini Pricing: https://ai.google.dev/pricing
    "gemini-1.5-flash": {"input": 0.075, "output": 0.30},
    "gemini-1.5-flash-002": {"input": 0.075, "output": 0.30},
    "gemini-1.5-pro": {"input": 1.25, "output": 5.00},
    "gemini-1.5-pro-002": {"input": 1.25, "output": 5.00},
    # Anthropic Pricing: https://www.anthropic.com/pricing#anthropic-api_
    "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0},
    "claude-3-5-haiku-20241022": {"input": 1.0, "output": 5.0},
}
@@ -0,0 +1,252 @@
1
+ import glob
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from bs4 import BeautifulSoup
8
+ from magika import Magika
9
+
10
+ from khoj.database.models import (
11
+ KhojUser,
12
+ LocalMarkdownConfig,
13
+ LocalOrgConfig,
14
+ LocalPdfConfig,
15
+ LocalPlaintextConfig,
16
+ )
17
+ from khoj.utils.config import SearchType
18
+ from khoj.utils.helpers import get_absolute_path, is_none_or_empty
19
+ from khoj.utils.rawconfig import TextContentConfig
20
+
21
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Single Magika instance created at import time; get_plaintext_files uses it
# to identify the content group of candidate files.
magika = Magika()
23
+
24
+
25
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Collect the user's local files, grouped by content type.

    For each of org, markdown, plaintext and pdf that `search_type` selects
    (`SearchType.All` selects all four), the user's first stored local config
    of that kind is converted to a `TextContentConfig` and handed to the
    matching `get_*_files` reader. A content type without a stored config maps
    to an empty dict; a content type not selected is left out of the result.

    The "image" and "docx" entries are always present and always empty.
    """
    collected: dict[str, dict] = {"docx": {}, "image": {}}
    wants_everything = search_type == SearchType.All

    # (result key, selecting search type, config model, file reader), in the
    # order the content types are processed.
    sources = (
        ("org", SearchType.Org, LocalOrgConfig, get_org_files),
        ("markdown", SearchType.Markdown, LocalMarkdownConfig, get_markdown_files),
        ("plaintext", SearchType.Plaintext, LocalPlaintextConfig, get_plaintext_files),
        ("pdf", SearchType.Pdf, LocalPdfConfig, get_pdf_files),
    )

    for key, selecting_type, config_model, read_files in sources:
        if not (wants_everything or search_type == selecting_type):
            continue
        stored_config = config_model.objects.filter(user=user).first()
        if stored_config:
            collected[key] = read_files(construct_config_from_db(stored_config))
        else:
            collected[key] = {}

    # These content types are not read from the local filesystem here.
    collected["image"] = {}
    collected["docx"] = {}
    return collected
43
+
44
+
45
def construct_config_from_db(db_config) -> TextContentConfig:
    """Build a `TextContentConfig` from a stored local content config.

    Copies the `input_files`, `input_filter` and `index_heading_entries`
    attributes of `db_config` onto the new config object unchanged.
    """
    copied_fields = {
        "input_files": db_config.input_files,
        "input_filter": db_config.input_filter,
        "index_heading_entries": db_config.index_heading_entries,
    }
    return TextContentConfig(**copied_fields)
51
+
52
+
53
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Read the plaintext files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (each passed through
    `get_absolute_path`) and from `config.input_filter` glob patterns
    (expanded recursively, keeping regular files only). Candidates that are
    neither recognised by extension nor identified as text/code by Magika are
    logged and skipped. HTML/XML files are reduced to their text content.

    Files that cannot be opened or decoded as UTF-8 are logged and skipped
    rather than aborting the whole run.

    Returns:
        Dict mapping each processed file path to its text content, or an empty
        dict when neither input files nor input filters are configured.
    """

    def is_plaintextfile(file: str):
        "Check if file is plaintext file"
        # Accept well-known text extensions outright. Each extension needs its
        # own tuple entry: adjacent string literals are silently concatenated.
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        if file.endswith(valid_text_file_extensions):
            return True
        # Otherwise fall back to identifying the file by its content
        content_group = magika.identify_path(Path(file)).output.group
        return content_group in ["text", "code"]

    def extract_html_content(html_content: str):
        "Extract content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = config.input_files, config.input_filter

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(plaintext_file) for plaintext_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if files_with_no_plaintext_extensions:
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        # Filter in place of a set difference so the sorted order is kept
        all_target_files = [
            target_file
            for target_file in all_target_files
            if target_file not in files_with_no_plaintext_extensions
        ]

    logger.debug(f"Processing files: {all_target_files}")

    filename_to_content_map = {}
    for file in all_target_files:
        # Open inside the try block so missing or unreadable files are skipped
        # like undecodable ones, instead of propagating out of the loop.
        try:
            with open(file, "r", encoding="utf8") as f:
                plaintext_content = f.read()
            if file.endswith(("html", "htm", "xml")):
                plaintext_content = extract_html_content(plaintext_content)
            filename_to_content_map[file] = plaintext_content
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
114
+
115
+
116
def get_org_files(config: TextContentConfig):
    """Read the org files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (each passed through
    `get_absolute_path`) and from `config.input_filter` glob patterns
    (expanded recursively, keeping regular files only). Files without a
    ".org" extension are reported in a warning but still read.

    Files that cannot be opened or decoded as UTF-8 are logged and skipped
    rather than aborting the whole run.

    Returns:
        Dict mapping each processed file path to its text content, or an empty
        dict when neither input files nor input filters are configured.
    """
    # Extract required fields from config
    org_files, org_file_filters = config.input_files, config.input_filter

    # Input Validation
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Get Org files to process
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
    if org_file_filters:
        filtered_org_files = {
            filtered_file
            for org_file_filter in org_file_filters
            for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_org_files = sorted(absolute_org_files | filtered_org_files)

    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if files_with_non_org_extensions:
        logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")

    logger.debug(f"Processing files: {all_org_files}")

    filename_to_content_map = {}
    for file in all_org_files:
        # Open inside the try block so missing or unreadable files are skipped
        # like undecodable ones, instead of propagating out of the loop.
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as org. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
158
+
159
+
160
def get_markdown_files(config: TextContentConfig):
    """Read the markdown files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (each passed through
    `get_absolute_path`) and from `config.input_filter` glob patterns
    (expanded recursively, keeping regular files only). Files without a
    ".md" or ".markdown" extension are reported in a warning but still read.

    Files that cannot be opened or decoded as UTF-8 are logged and skipped
    rather than aborting the whole run.

    Returns:
        Dict mapping each processed file path to its text content, or an empty
        dict when neither input files nor input filters are configured.
    """
    # Extract required fields from config
    markdown_files, markdown_file_filters = config.input_files, config.input_filter

    # Input Validation
    if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Get markdown files to process
    absolute_markdown_files, filtered_markdown_files = set(), set()
    if markdown_files:
        absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}

    if markdown_file_filters:
        filtered_markdown_files = {
            filtered_file
            for markdown_file_filter in markdown_file_filters
            for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

    files_with_non_markdown_extensions = {
        md_file for md_file in all_markdown_files if not md_file.endswith((".md", ".markdown"))
    }

    if files_with_non_markdown_extensions:
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    filename_to_content_map = {}
    for file in all_markdown_files:
        # Open inside the try block so missing or unreadable files are skipped
        # like undecodable ones, instead of propagating out of the loop.
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as markdown. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
208
+
209
+
210
def get_pdf_files(config: TextContentConfig):
    """Read the PDF files selected by `config` into a path -> bytes map.

    Candidate files come from `config.input_files` (each passed through
    `get_absolute_path`) and from `config.input_filter` glob patterns
    (expanded recursively, keeping regular files only). Files without a
    ".pdf" extension are reported in a warning but still read.

    Files are read in binary mode. Files that cannot be opened or read are
    logged and skipped rather than aborting the whole run.

    Returns:
        Dict mapping each processed file path to its raw bytes, or an empty
        dict when neither input files nor input filters are configured.
    """
    # Extract required fields from config
    pdf_files, pdf_file_filters = config.input_files, config.input_filter

    # Input Validation
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Get PDF files to process
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
    if pdf_file_filters:
        filtered_pdf_files = {
            filtered_file
            for pdf_file_filter in pdf_file_filters
            for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}

    if files_with_non_pdf_extensions:
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # Open inside the try block so missing or unreadable files are skipped
        # instead of propagating out of the loop.
        try:
            with open(file, "rb") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map