khoj 1.33.3.dev32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/__init__.py +0 -0
- khoj/app/README.md +94 -0
- khoj/app/__init__.py +0 -0
- khoj/app/asgi.py +16 -0
- khoj/app/settings.py +218 -0
- khoj/app/urls.py +25 -0
- khoj/configure.py +452 -0
- khoj/database/__init__.py +0 -0
- khoj/database/adapters/__init__.py +1821 -0
- khoj/database/admin.py +417 -0
- khoj/database/apps.py +6 -0
- khoj/database/management/__init__.py +0 -0
- khoj/database/management/commands/__init__.py +0 -0
- khoj/database/management/commands/change_default_model.py +116 -0
- khoj/database/management/commands/change_generated_images_url.py +61 -0
- khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
- khoj/database/migrations/0001_khojuser.py +98 -0
- khoj/database/migrations/0002_googleuser.py +32 -0
- khoj/database/migrations/0003_vector_extension.py +10 -0
- khoj/database/migrations/0004_content_types_and_more.py +181 -0
- khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
- khoj/database/migrations/0006_embeddingsdates.py +33 -0
- khoj/database/migrations/0007_add_conversation.py +27 -0
- khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
- khoj/database/migrations/0009_khojapiuser.py +24 -0
- khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
- khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
- khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
- khoj/database/migrations/0012_entry_file_source.py +21 -0
- khoj/database/migrations/0013_subscription.py +37 -0
- khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
- khoj/database/migrations/0015_alter_subscription_user.py +21 -0
- khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
- khoj/database/migrations/0017_searchmodel.py +32 -0
- khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
- khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
- khoj/database/migrations/0020_reflectivequestion.py +36 -0
- khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
- khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
- khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
- khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
- khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
- khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
- khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
- khoj/database/migrations/0029_userrequests.py +27 -0
- khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
- khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
- khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
- khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
- khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
- khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
- khoj/database/migrations/0035_processlock.py +26 -0
- khoj/database/migrations/0036_alter_processlock_name.py +19 -0
- khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
- khoj/database/migrations/0036_publicconversation.py +42 -0
- khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
- khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
- khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
- khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
- khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
- khoj/database/migrations/0040_alter_processlock_name.py +26 -0
- khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
- khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
- khoj/database/migrations/0042_serverchatsettings.py +46 -0
- khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
- khoj/database/migrations/0044_conversation_file_filters.py +17 -0
- khoj/database/migrations/0045_fileobject.py +37 -0
- khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
- khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
- khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
- khoj/database/migrations/0049_datastore.py +38 -0
- khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
- khoj/database/migrations/0050_alter_processlock_name.py +25 -0
- khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
- khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
- khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
- khoj/database/migrations/0054_alter_agent_style_color.py +38 -0
- khoj/database/migrations/0055_alter_agent_style_icon.py +37 -0
- khoj/database/migrations/0056_chatmodeloptions_vision_enabled.py +17 -0
- khoj/database/migrations/0056_searchmodelconfig_cross_encoder_model_config.py +17 -0
- khoj/database/migrations/0057_merge_20240816_1409.py +13 -0
- khoj/database/migrations/0057_remove_serverchatsettings_default_model_and_more.py +51 -0
- khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py +17 -0
- khoj/database/migrations/0059_searchmodelconfig_bi_encoder_confidence_threshold.py +17 -0
- khoj/database/migrations/0060_merge_20240905_1828.py +14 -0
- khoj/database/migrations/0061_alter_chatmodeloptions_model_type.py +26 -0
- khoj/database/migrations/0061_alter_texttoimagemodelconfig_model_type.py +21 -0
- khoj/database/migrations/0062_merge_20240913_0222.py +14 -0
- khoj/database/migrations/0063_conversation_temp_id.py +36 -0
- khoj/database/migrations/0064_remove_conversation_temp_id_alter_conversation_id.py +86 -0
- khoj/database/migrations/0065_remove_agent_avatar_remove_agent_public_and_more.py +49 -0
- khoj/database/migrations/0066_remove_agent_tools_agent_input_tools_and_more.py +69 -0
- khoj/database/migrations/0067_alter_agent_style_icon.py +50 -0
- khoj/database/migrations/0068_alter_agent_output_modes.py +24 -0
- khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py +89 -0
- khoj/database/migrations/0070_alter_agent_input_tools_alter_agent_output_modes.py +46 -0
- khoj/database/migrations/0071_subscription_enabled_trial_at_and_more.py +32 -0
- khoj/database/migrations/0072_entry_search_model.py +24 -0
- khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
- khoj/database/migrations/0074_alter_conversation_title.py +17 -0
- khoj/database/migrations/0075_migrate_generated_assets_and_validate.py +85 -0
- khoj/database/migrations/0076_rename_openaiprocessorconversationconfig_aimodelapi_and_more.py +26 -0
- khoj/database/migrations/0077_chatmodel_alter_agent_chat_model_and_more.py +62 -0
- khoj/database/migrations/0078_khojuser_email_verification_code_expiry.py +17 -0
- khoj/database/migrations/__init__.py +0 -0
- khoj/database/models/__init__.py +725 -0
- khoj/database/tests.py +3 -0
- khoj/interface/compiled/404/index.html +1 -0
- khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_buildManifest.js +1 -0
- khoj/interface/compiled/_next/static/Tg-vU1p1B-YKT5Qv8KSHt/_ssgManifest.js +1 -0
- khoj/interface/compiled/_next/static/chunks/1010-8f39bb4648b5ba10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/182-f1c48a203dc91e0e.js +20 -0
- khoj/interface/compiled/_next/static/chunks/1915-d3c36ad6ce697ce7.js +1 -0
- khoj/interface/compiled/_next/static/chunks/2117-165ef4747a5b836b.js +2 -0
- khoj/interface/compiled/_next/static/chunks/2581-455000f8aeb08fc3.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3727.dcea8f2193111552.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3789-a09e37a819171a9d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/4124-6c28322ce218d2d5.js +1 -0
- khoj/interface/compiled/_next/static/chunks/5427-b52d95253e692bfa.js +1 -0
- khoj/interface/compiled/_next/static/chunks/5473-b1cf56dedac6577a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/5477-0bbddb79c25a54a7.js +1 -0
- khoj/interface/compiled/_next/static/chunks/6065-64db9ad305ba0bcd.js +1 -0
- khoj/interface/compiled/_next/static/chunks/6293-469dd16402ea8a6f.js +3 -0
- khoj/interface/compiled/_next/static/chunks/688-b5b4391bbc0376f1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/8667-b6bf63c72b2d76eb.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9259-1172dbaca0515237.js +1 -0
- khoj/interface/compiled/_next/static/chunks/94ca1967.1d9b42d929a1ee8c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9597.83583248dfbf6e73.js +1 -0
- khoj/interface/compiled/_next/static/chunks/964ecbae.51d6faf8801d15e6.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9665-391df1e5c51c960a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/_not-found/page-a834eddae3e235df.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/page-28ce086a1129bca2.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/layout-1fe1537449f43496.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/page-bf365a60829d347f.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/page-0e476e57eb2015e3.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/layout-30e7fda7262713ce.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/page-a5515ea71aec5ef0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/page-9140541e67ea307d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/settings/layout-d09d6510a45cd4bd.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/settings/page-951ba40b5b94b23a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-1beb80d8d741c932.js +1 -0
- khoj/interface/compiled/_next/static/chunks/d3ac728e-44ebd2a0c99b12a0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/fd9d1056-4482b99a36fd1673.js +1 -0
- khoj/interface/compiled/_next/static/chunks/framework-8e0e0f4a6b83a956.js +1 -0
- khoj/interface/compiled/_next/static/chunks/main-app-de1f09df97a3cfc7.js +1 -0
- khoj/interface/compiled/_next/static/chunks/main-db4bfac6b0a8d00b.js +1 -0
- khoj/interface/compiled/_next/static/chunks/pages/_app-3c9ca398d360b709.js +1 -0
- khoj/interface/compiled/_next/static/chunks/pages/_error-cf5ca766ac8f493f.js +1 -0
- khoj/interface/compiled/_next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
- khoj/interface/compiled/_next/static/chunks/webpack-a03962458328b163.js +1 -0
- khoj/interface/compiled/_next/static/css/089de1d8526b96e9.css +1 -0
- khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
- khoj/interface/compiled/_next/static/css/4e4e6a4a1c920d06.css +1 -0
- khoj/interface/compiled/_next/static/css/8d02837c730f8d13.css +25 -0
- khoj/interface/compiled/_next/static/css/8e6a3ca11a60b189.css +1 -0
- khoj/interface/compiled/_next/static/css/9c164d9727dd8092.css +1 -0
- khoj/interface/compiled/_next/static/css/dac88c17aaee5fcf.css +1 -0
- khoj/interface/compiled/_next/static/css/df4b47a2d0d85eae.css +1 -0
- khoj/interface/compiled/_next/static/css/e4eb883b5265d372.css +1 -0
- khoj/interface/compiled/_next/static/media/1d8a05b60287ae6c-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/6f22fce21a7c433c-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/77c207b095007c34-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/82ef96de0e8f4d8c-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.1608a09b.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.4aafdb68.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_AMS-Regular.a79f1c31.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.b6770918.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.cce5b8ec.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Bold.ec17d132.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.07ef19e7.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.55fac258.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Caligraphic-Regular.dad44a7f.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.9f256b85.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.b18f59e1.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Bold.d42a5579.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.7c187121.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.d3c882a6.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Fraktur-Regular.ed38e79f.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.b74a1a8b.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.c3fb5ac2.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Bold.d181c465.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.6f2bb1df.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.70d8b0a5.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-BoldItalic.e3f82f9d.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.47373d1e.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.8916142b.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Italic.9024d815.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.0462f03b.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.7f51fe03.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Main-Regular.b7f8fe9b.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.572d331f.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.a879cf83.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-BoldItalic.f1035d8d.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.5295ba48.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.939bc644.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Math-Italic.f28c23ac.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.8c5b5494.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.94e1e8dc.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Bold.bf59d231.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.3b1e59b3.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.7c9bc82b.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Italic.b4c20c84.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.74048478.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.ba21ed5f.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_SansSerif-Regular.d4d7ba48.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.03e9641d.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.07505710.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Script-Regular.fe9cbbe1.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.e1e279cb.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.eae34984.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size1-Regular.fabc004a.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.57727022.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.5916a24f.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size2-Regular.d6b476ec.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.9acaf01c.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.a144ef58.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size3-Regular.b4230e7e.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.10d95fd3.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.7a996c9d.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Size4-Regular.fbccdabe.ttf +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.6258592b.woff +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.a8709e36.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/KaTeX_Typewriter-Regular.d97aaf4a.ttf +0 -0
- khoj/interface/compiled/_next/static/media/a6ecd16fa044d500-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/bd82c78e5b7b3fe9-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/c32c8052c071fc42-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/c4250770ab8708b6-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/e098aaaecc9cfbb2-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/flags.3afdda2f.webp +0 -0
- khoj/interface/compiled/_next/static/media/flags@2x.5fbe9fc1.webp +0 -0
- khoj/interface/compiled/_next/static/media/globe.98e105ca.webp +0 -0
- khoj/interface/compiled/_next/static/media/globe@2x.974df6f8.webp +0 -0
- khoj/interface/compiled/agents/index.html +1 -0
- khoj/interface/compiled/agents/index.txt +7 -0
- khoj/interface/compiled/agents.svg +6 -0
- khoj/interface/compiled/assets/icons/khoj_lantern.ico +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern.svg +100 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_1200x1200.png +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_128x128.png +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_128x128_dark.png +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_256x256.png +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_512x512.png +0 -0
- khoj/interface/compiled/assets/icons/khoj_lantern_logomarktype_1200x630.png +0 -0
- khoj/interface/compiled/assets/samples/desktop-browse-draw-sample.png +0 -0
- khoj/interface/compiled/assets/samples/desktop-plain-chat-sample.png +0 -0
- khoj/interface/compiled/assets/samples/desktop-remember-plan-sample.png +0 -0
- khoj/interface/compiled/assets/samples/phone-browse-draw-sample.png +0 -0
- khoj/interface/compiled/assets/samples/phone-plain-chat-sample.png +0 -0
- khoj/interface/compiled/assets/samples/phone-remember-plan-sample.png +0 -0
- khoj/interface/compiled/automation.svg +37 -0
- khoj/interface/compiled/automations/index.html +1 -0
- khoj/interface/compiled/automations/index.txt +8 -0
- khoj/interface/compiled/chat/index.html +1 -0
- khoj/interface/compiled/chat/index.txt +7 -0
- khoj/interface/compiled/chat.svg +24 -0
- khoj/interface/compiled/close.svg +5 -0
- khoj/interface/compiled/copy-button-success.svg +6 -0
- khoj/interface/compiled/copy-button.svg +5 -0
- khoj/interface/compiled/index.html +1 -0
- khoj/interface/compiled/index.txt +7 -0
- khoj/interface/compiled/khoj.webmanifest +76 -0
- khoj/interface/compiled/logo.svg +24 -0
- khoj/interface/compiled/search/index.html +1 -0
- khoj/interface/compiled/search/index.txt +7 -0
- khoj/interface/compiled/send.svg +1 -0
- khoj/interface/compiled/settings/index.html +1 -0
- khoj/interface/compiled/settings/index.txt +9 -0
- khoj/interface/compiled/share/chat/index.html +1 -0
- khoj/interface/compiled/share/chat/index.txt +7 -0
- khoj/interface/compiled/share.svg +8 -0
- khoj/interface/compiled/thumbs-down.svg +6 -0
- khoj/interface/compiled/thumbs-up.svg +6 -0
- khoj/interface/email/feedback.html +34 -0
- khoj/interface/email/magic_link.html +40 -0
- khoj/interface/email/task.html +37 -0
- khoj/interface/email/welcome.html +90 -0
- khoj/interface/web/.well-known/assetlinks.json +11 -0
- khoj/interface/web/assets/icons/agents.svg +19 -0
- khoj/interface/web/assets/icons/automation.svg +43 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/assets/icons/github.svg +1 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways.svg +32 -0
- khoj/interface/web/assets/icons/khoj.svg +26 -0
- khoj/interface/web/assets/icons/logotype.svg +1 -0
- khoj/interface/web/assets/icons/search.svg +57 -0
- khoj/interface/web/assets/icons/sync.svg +4 -0
- khoj/interface/web/assets/khoj.css +237 -0
- khoj/interface/web/assets/utils.js +33 -0
- khoj/interface/web/base_config.html +445 -0
- khoj/interface/web/content_source_github_input.html +208 -0
- khoj/interface/web/login.html +310 -0
- khoj/interface/web/utils.html +48 -0
- khoj/main.py +249 -0
- khoj/manage.py +22 -0
- khoj/migrations/__init__.py +0 -0
- khoj/migrations/migrate_offline_chat_default_model.py +69 -0
- khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
- khoj/migrations/migrate_offline_chat_schema.py +83 -0
- khoj/migrations/migrate_offline_model.py +29 -0
- khoj/migrations/migrate_processor_config_openai.py +67 -0
- khoj/migrations/migrate_server_pg.py +132 -0
- khoj/migrations/migrate_version.py +17 -0
- khoj/processor/__init__.py +0 -0
- khoj/processor/content/__init__.py +0 -0
- khoj/processor/content/docx/__init__.py +0 -0
- khoj/processor/content/docx/docx_to_entries.py +111 -0
- khoj/processor/content/github/__init__.py +0 -0
- khoj/processor/content/github/github_to_entries.py +226 -0
- khoj/processor/content/images/__init__.py +0 -0
- khoj/processor/content/images/image_to_entries.py +117 -0
- khoj/processor/content/markdown/__init__.py +0 -0
- khoj/processor/content/markdown/markdown_to_entries.py +160 -0
- khoj/processor/content/notion/notion_to_entries.py +259 -0
- khoj/processor/content/org_mode/__init__.py +0 -0
- khoj/processor/content/org_mode/org_to_entries.py +226 -0
- khoj/processor/content/org_mode/orgnode.py +532 -0
- khoj/processor/content/pdf/__init__.py +0 -0
- khoj/processor/content/pdf/pdf_to_entries.py +119 -0
- khoj/processor/content/plaintext/__init__.py +0 -0
- khoj/processor/content/plaintext/plaintext_to_entries.py +117 -0
- khoj/processor/content/text_to_entries.py +296 -0
- khoj/processor/conversation/__init__.py +0 -0
- khoj/processor/conversation/anthropic/__init__.py +0 -0
- khoj/processor/conversation/anthropic/anthropic_chat.py +243 -0
- khoj/processor/conversation/anthropic/utils.py +217 -0
- khoj/processor/conversation/google/__init__.py +0 -0
- khoj/processor/conversation/google/gemini_chat.py +253 -0
- khoj/processor/conversation/google/utils.py +260 -0
- khoj/processor/conversation/offline/__init__.py +0 -0
- khoj/processor/conversation/offline/chat_model.py +308 -0
- khoj/processor/conversation/offline/utils.py +80 -0
- khoj/processor/conversation/offline/whisper.py +15 -0
- khoj/processor/conversation/openai/__init__.py +0 -0
- khoj/processor/conversation/openai/gpt.py +243 -0
- khoj/processor/conversation/openai/utils.py +232 -0
- khoj/processor/conversation/openai/whisper.py +13 -0
- khoj/processor/conversation/prompts.py +1188 -0
- khoj/processor/conversation/utils.py +867 -0
- khoj/processor/embeddings.py +122 -0
- khoj/processor/image/generate.py +215 -0
- khoj/processor/speech/__init__.py +0 -0
- khoj/processor/speech/text_to_speech.py +51 -0
- khoj/processor/tools/__init__.py +0 -0
- khoj/processor/tools/online_search.py +472 -0
- khoj/processor/tools/run_code.py +179 -0
- khoj/routers/__init__.py +0 -0
- khoj/routers/api.py +760 -0
- khoj/routers/api_agents.py +295 -0
- khoj/routers/api_chat.py +1273 -0
- khoj/routers/api_content.py +634 -0
- khoj/routers/api_model.py +123 -0
- khoj/routers/api_phone.py +86 -0
- khoj/routers/api_subscription.py +144 -0
- khoj/routers/auth.py +307 -0
- khoj/routers/email.py +135 -0
- khoj/routers/helpers.py +2333 -0
- khoj/routers/notion.py +85 -0
- khoj/routers/research.py +364 -0
- khoj/routers/storage.py +63 -0
- khoj/routers/twilio.py +36 -0
- khoj/routers/web_client.py +141 -0
- khoj/search_filter/__init__.py +0 -0
- khoj/search_filter/base_filter.py +15 -0
- khoj/search_filter/date_filter.py +215 -0
- khoj/search_filter/file_filter.py +32 -0
- khoj/search_filter/word_filter.py +29 -0
- khoj/search_type/__init__.py +0 -0
- khoj/search_type/text_search.py +255 -0
- khoj/utils/__init__.py +0 -0
- khoj/utils/cli.py +101 -0
- khoj/utils/config.py +81 -0
- khoj/utils/constants.py +51 -0
- khoj/utils/fs_syncer.py +252 -0
- khoj/utils/helpers.py +627 -0
- khoj/utils/initialization.py +301 -0
- khoj/utils/jsonl.py +43 -0
- khoj/utils/models.py +47 -0
- khoj/utils/rawconfig.py +208 -0
- khoj/utils/state.py +48 -0
- khoj/utils/yaml.py +47 -0
- khoj-1.33.3.dev32.dist-info/METADATA +190 -0
- khoj-1.33.3.dev32.dist-info/RECORD +393 -0
- khoj-1.33.3.dev32.dist-info/WHEEL +4 -0
- khoj-1.33.3.dev32.dist-info/entry_points.txt +2 -0
- khoj-1.33.3.dev32.dist-info/licenses/LICENSE +661 -0
khoj/utils/config.py
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# System Packages
|
2
|
+
from __future__ import annotations # to avoid quoting type hints
|
3
|
+
|
4
|
+
import logging
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from enum import Enum
|
7
|
+
from typing import TYPE_CHECKING, Any, List, Optional, Union
|
8
|
+
|
9
|
+
import torch
|
10
|
+
|
11
|
+
from khoj.processor.conversation.offline.utils import download_model
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from sentence_transformers import CrossEncoder
|
18
|
+
|
19
|
+
from khoj.utils.models import BaseEncoder
|
20
|
+
|
21
|
+
|
22
|
+
class SearchType(str, Enum):
    """String-valued enumeration of the content types that can be searched.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string value (e.g. ``SearchType.Org == "org"``).
    """

    # Selects every content type at once
    All = "all"

    # Individual content types
    Org = "org"
    Markdown = "markdown"
    Image = "image"
    Pdf = "pdf"
    Github = "github"
    Notion = "notion"
    Plaintext = "plaintext"
    Docx = "docx"
|
32
|
+
|
33
|
+
|
34
|
+
class ProcessorType(str, Enum):
    """String-valued enumeration of processor kinds; only one is defined here."""

    Conversation = "conversation"
|
36
|
+
|
37
|
+
|
38
|
+
@dataclass
class TextContent:
    """Record carrying a single flag for text content."""

    # Required flag; no default is provided
    enabled: bool
|
41
|
+
|
42
|
+
|
43
|
+
@dataclass
class ImageContent:
    """Record grouping image names with two sets of embedding tensors.

    All three fields are required and positional in the order declared.
    """

    # Names of the images
    image_names: List[str]
    # Embeddings for the images (tensor shape not specified here)
    image_embeddings: torch.Tensor
    # Embeddings for the image metadata (tensor shape not specified here)
    image_metadata_embeddings: torch.Tensor
|
48
|
+
|
49
|
+
|
50
|
+
@dataclass
class TextSearchModel:
    """Record bundling the encoders and settings used for text search."""

    # Required bi-encoder
    bi_encoder: BaseEncoder
    # Optional cross-encoder; absent (None) unless supplied
    cross_encoder: Optional[CrossEncoder] = None
    # Defaults to 15; presumably the number of top results to keep — confirm with callers
    top_k: Optional[int] = 15
|
55
|
+
|
56
|
+
|
57
|
+
@dataclass
class ImageSearchModel:
    """Record holding the encoder used for image search."""

    # Required image encoder
    image_encoder: BaseEncoder
|
60
|
+
|
61
|
+
|
62
|
+
@dataclass
class SearchModels:
    """Container for the search models; currently only a text search slot."""

    # Unset (None) until a text search model is assigned
    text_search: Optional[TextSearchModel] = None
|
65
|
+
|
66
|
+
|
67
|
+
@dataclass
class OfflineChatProcessorConfig:
    """Record holding a reference to a loaded offline chat model, if any."""

    # Any object, or None when no model is loaded
    loaded_model: Union[Any, None] = None
|
70
|
+
|
71
|
+
|
72
|
+
class OfflineChatProcessorModel:
    """Offline chat model that is loaded eagerly when the object is constructed.

    Attributes:
        chat_model: Model identifier handed to ``download_model``.
        loaded_model: Whatever ``download_model`` returned; stays ``None`` if
            loading raised ``ValueError``.
    """

    def __init__(
        self,
        chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
        max_tokens: Optional[int] = None,
    ):
        """Load ``chat_model`` via ``download_model``.

        Args:
            chat_model: Identifier of the model to load.
            max_tokens: Forwarded unchanged to ``download_model``; ``None``
                leaves the choice to that function.

        Raises:
            ValueError: Propagated from ``download_model`` after being logged.
        """
        self.chat_model = chat_model
        # Pre-set so the attribute exists (as None) even when loading fails below
        self.loaded_model = None
        try:
            self.loaded_model = download_model(self.chat_model, max_tokens=max_tokens)
        except ValueError as e:
            logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
            # Bare raise keeps the original traceback intact
            raise
|
khoj/utils/constants.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Dict
|
3
|
+
|
4
|
+
# Filesystem locations, anchored three directory levels above this file
app_root_directory = Path(__file__).parent.parent.parent
web_directory = app_root_directory / "khoj/interface/web/"
next_js_directory = app_root_directory / "khoj/interface/built/"
pypi_static_directory = app_root_directory / "khoj/interface/compiled/"
assetlinks_file_path = web_directory / ".well-known/assetlinks.json"

# Pipe-separated alternation of newline, carriage return, tab and space
empty_escape_sequences = "\n|\r|\t| "

# User-level paths (left unexpanded; "~" is kept literally here)
app_env_filepath = "~/.khoj/env"
content_directory = "~/.khoj/content/"

# Telemetry endpoint URL
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"

# Default chat model names, grouped by provider
default_offline_chat_models = [
    "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    "bartowski/Llama-3.2-3B-Instruct-GGUF",
    "bartowski/gemma-2-9b-it-GGUF",
    "bartowski/gemma-2-2b-it-GGUF",
    "Qwen/Qwen2.5-14B-Instruct-GGUF",
]
default_openai_chat_models = ["gpt-4o-mini", "gpt-4o"]
default_gemini_chat_models = ["gemini-1.5-flash", "gemini-1.5-pro"]
default_anthropic_chat_models = ["claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022"]
|
23
|
+
|
24
|
+
# Baseline config; holds only the image search-type settings
empty_config = {
    "search-type": {
        "image": {
            "encoder": "sentence-transformers/clip-ViT-B-32",
            "model_directory": "~/.khoj/search/image/",
        },
    },
}

# default app config to use (a separate literal with the same contents as empty_config)
default_config = {
    "search-type": {
        "image": {
            "encoder": "sentence-transformers/clip-ViT-B-32",
            "model_directory": "~/.khoj/search/image/",
        },
    },
}
|
36
|
+
|
37
|
+
# Per-model "input" and "output" prices, keyed by model name.
# NOTE(review): units are not stated here — presumably USD per million tokens; confirm against the linked pricing pages.
model_to_cost: Dict[str, Dict[str, float]] = {
    # OpenAI Pricing: https://openai.com/api/pricing/
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "o1": {"input": 15.0, "output": 60.00},
    "o1-mini": {"input": 3.0, "output": 12.0},
    # Gemini Pricing: https://ai.google.dev/pricing
    "gemini-1.5-flash": {"input": 0.075, "output": 0.30},
    "gemini-1.5-flash-002": {"input": 0.075, "output": 0.30},
    "gemini-1.5-pro": {"input": 1.25, "output": 5.00},
    "gemini-1.5-pro-002": {"input": 1.25, "output": 5.00},
    # Anthropic Pricing: https://www.anthropic.com/pricing#anthropic-api_
    "claude-3-5-sonnet-20241022": {"input": 3.0, "output": 15.0},
    "claude-3-5-haiku-20241022": {"input": 1.0, "output": 5.0},
}
|
khoj/utils/fs_syncer.py
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
import glob
|
2
|
+
import logging
|
3
|
+
import os
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
from bs4 import BeautifulSoup
|
8
|
+
from magika import Magika
|
9
|
+
|
10
|
+
from khoj.database.models import (
|
11
|
+
KhojUser,
|
12
|
+
LocalMarkdownConfig,
|
13
|
+
LocalOrgConfig,
|
14
|
+
LocalPdfConfig,
|
15
|
+
LocalPlaintextConfig,
|
16
|
+
)
|
17
|
+
from khoj.utils.config import SearchType
|
18
|
+
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
|
19
|
+
from khoj.utils.rawconfig import TextContentConfig
|
20
|
+
|
21
|
+
logger = logging.getLogger(__name__)
|
22
|
+
magika = Magika()
|
23
|
+
|
24
|
+
|
25
|
+
def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
    """Gather the local files configured for `user`, grouped by content type.

    For each of org, markdown, plaintext and pdf, the user's stored local config
    is looked up and its files are read when `search_type` is `SearchType.All`
    or that specific type. A requested type with no stored config maps to an
    empty dict. The "image" and "docx" entries are always present and empty.
    """
    collected: dict[str, dict] = {"docx": {}, "image": {}}

    # (result key, search type, config model, file reader), processed in this order
    sources = (
        ("org", SearchType.Org, LocalOrgConfig, get_org_files),
        ("markdown", SearchType.Markdown, LocalMarkdownConfig, get_markdown_files),
        ("plaintext", SearchType.Plaintext, LocalPlaintextConfig, get_plaintext_files),
        ("pdf", SearchType.Pdf, LocalPdfConfig, get_pdf_files),
    )
    for key, content_type, config_model, read_files in sources:
        if search_type not in (SearchType.All, content_type):
            continue
        db_config = config_model.objects.filter(user=user).first()
        if db_config:
            collected[key] = read_files(construct_config_from_db(db_config))
        else:
            collected[key] = {}

    # Image and docx files are not read from the filesystem here
    collected["image"] = {}
    collected["docx"] = {}
    return collected
|
43
|
+
|
44
|
+
|
45
|
+
def construct_config_from_db(db_config) -> TextContentConfig:
    """Build a TextContentConfig from the matching fields of a stored config record.

    Copies `input_files`, `input_filter` and `index_heading_entries` from
    `db_config` onto a new TextContentConfig.
    """
    copied_fields = ("input_files", "input_filter", "index_heading_entries")
    return TextContentConfig(**{name: getattr(db_config, name) for name in copied_fields})
|
51
|
+
|
52
|
+
|
53
|
+
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    """Read the plaintext files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (explicit paths) and
    `config.input_filter` (glob patterns, expanded recursively; only regular
    files are kept). Candidates that are neither recognised by file extension
    nor identified as text/code by magika are skipped with a warning.
    HTML and XML files are reduced to their text content.

    Returns:
        Mapping of file path to file content, in sorted path order. Empty when
        neither input files nor input filters are configured. Files that cannot
        be opened or read as UTF-8 are logged and left out of the result.
    """

    def is_plaintextfile(file: str):
        "Check if file is plaintext file"
        # Identify the content type of the file from its contents
        content_group = magika.identify_path(Path(file)).output.group
        # Use file extension to decide plaintext if file content is not identifiable
        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]

    def extract_html_content(html_content: str):
        "Extract content from HTML"
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text(strip=True, separator="\n")

    # Extract required fields from config
    input_files, input_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input-files or input-file-filter is required to be specified")
        return {}

    # Get all plain text files to process
    absolute_plaintext_files, filtered_plaintext_files = set(), set()
    if input_files:
        absolute_plaintext_files = {get_absolute_path(plaintext_file) for plaintext_file in input_files}
    if input_filters:
        filtered_plaintext_files = {
            filtered_file
            for plaintext_file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)

    files_with_no_plaintext_extensions = {
        target_file for target_file in all_target_files if not is_plaintextfile(target_file)
    }
    if files_with_no_plaintext_extensions:
        logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
        # Filter in place of a set difference so the sorted order is kept
        all_target_files = [
            target_file for target_file in all_target_files if target_file not in files_with_no_plaintext_extensions
        ]

    logger.debug(f"Processing files: {all_target_files}")

    filename_to_content_map = {}
    for file in all_target_files:
        # Open inside the try so a missing or unreadable file is skipped
        # like any other read failure instead of aborting the whole run
        try:
            with open(file, "r", encoding="utf8") as f:
                plaintext_content = f.read()
            if file.endswith(("html", "htm", "xml")):
                plaintext_content = extract_html_content(plaintext_content)
            filename_to_content_map[file] = plaintext_content
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
|
114
|
+
|
115
|
+
|
116
|
+
def get_org_files(config: TextContentConfig):
    """Read the org-mode files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (explicit paths) and
    `config.input_filter` (glob patterns, expanded recursively; only regular
    files are kept). Files without a `.org` extension are still read, but a
    warning lists them.

    Returns:
        Mapping of file path to file content, in sorted path order. Empty when
        neither input files nor input filters are configured. Files that cannot
        be opened or read as UTF-8 are logged and left out of the result.
    """
    # Extract required fields from config
    org_files, org_file_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
        logger.debug("At least one of org-files or org-file-filter is required to be specified")
        return {}

    # Get Org files to process
    absolute_org_files, filtered_org_files = set(), set()
    if org_files:
        absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
    if org_file_filters:
        filtered_org_files = {
            filtered_file
            for org_file_filter in org_file_filters
            for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_org_files = sorted(absolute_org_files | filtered_org_files)

    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if files_with_non_org_extensions:
        logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")

    logger.debug(f"Processing files: {all_org_files}")

    filename_to_content_map = {}
    for file in all_org_files:
        # Open inside the try so a missing or unreadable file is skipped
        # like any other read failure instead of aborting the whole run
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as org. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
|
158
|
+
|
159
|
+
|
160
|
+
def get_markdown_files(config: TextContentConfig):
    """Read the markdown files selected by `config` into a path -> content map.

    Candidate files come from `config.input_files` (explicit paths) and
    `config.input_filter` (glob patterns, expanded recursively; only regular
    files are kept). Files without a `.md` or `.markdown` extension are still
    read, but a warning lists them.

    Returns:
        Mapping of file path to file content, in sorted path order. Empty when
        neither input files nor input filters are configured. Files that cannot
        be opened or read as UTF-8 are logged and left out of the result.
    """
    # Extract required fields from config
    markdown_files, markdown_file_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
        logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
        return {}

    # Get markdown files to process
    absolute_markdown_files, filtered_markdown_files = set(), set()
    if markdown_files:
        absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}

    if markdown_file_filters:
        filtered_markdown_files = {
            filtered_file
            for markdown_file_filter in markdown_file_filters
            for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

    files_with_non_markdown_extensions = {
        md_file for md_file in all_markdown_files if not md_file.endswith((".md", ".markdown"))
    }

    if files_with_non_markdown_extensions:
        logger.warning(
            f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
        )

    logger.debug(f"Processing files: {all_markdown_files}")

    filename_to_content_map = {}
    for file in all_markdown_files:
        # Open inside the try so a missing or unreadable file is skipped
        # like any other read failure instead of aborting the whole run
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as markdown. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
|
208
|
+
|
209
|
+
|
210
|
+
def get_pdf_files(config: TextContentConfig):
    """Read the PDF files selected by `config` into a path -> raw bytes map.

    Candidate files come from `config.input_files` (explicit paths) and
    `config.input_filter` (glob patterns, expanded recursively; only regular
    files are kept). Files without a `.pdf` extension are still read, but a
    warning lists them.

    Returns:
        Mapping of file path to the file's bytes, in sorted path order. Empty
        when neither input files nor input filters are configured. Files that
        cannot be opened or read are logged and left out of the result.
    """
    # Extract required fields from config
    pdf_files, pdf_file_filters = (
        config.input_files,
        config.input_filter,
    )

    # Input Validation
    if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
        logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
        return {}

    # Get PDF files to process
    absolute_pdf_files, filtered_pdf_files = set(), set()
    if pdf_files:
        absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
    if pdf_file_filters:
        filtered_pdf_files = {
            filtered_file
            for pdf_file_filter in pdf_file_filters
            for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

    files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}

    if files_with_non_pdf_extensions:
        logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")

    logger.debug(f"Processing files: {all_pdf_files}")

    filename_to_content_map = {}
    for file in all_pdf_files:
        # Open inside the try so a missing or unreadable file is skipped
        # like any other read failure instead of aborting the whole run
        try:
            # PDFs are read as raw bytes; no text decoding happens here
            with open(file, "rb") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
|