khoj 1.16.1.dev25__py3-none-any.whl → 1.17.1.dev216__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. khoj/configure.py +6 -6
  2. khoj/database/adapters/__init__.py +55 -26
  3. khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
  4. khoj/database/models/__init__.py +35 -0
  5. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  6. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  7. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  8. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  9. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +31 -5384
  10. khoj/interface/web/assets/icons/khoj.svg +26 -0
  11. khoj/interface/web/chat.html +191 -301
  12. khoj/interface/web/content_source_computer_input.html +3 -3
  13. khoj/interface/web/content_source_github_input.html +1 -1
  14. khoj/interface/web/content_source_notion_input.html +1 -1
  15. khoj/interface/web/public_conversation.html +1 -1
  16. khoj/interface/web/search.html +2 -2
  17. khoj/interface/web/{config.html → settings.html} +30 -30
  18. khoj/interface/web/utils.html +1 -1
  19. khoj/processor/content/docx/docx_to_entries.py +4 -9
  20. khoj/processor/content/github/github_to_entries.py +1 -3
  21. khoj/processor/content/images/image_to_entries.py +4 -9
  22. khoj/processor/content/markdown/markdown_to_entries.py +4 -9
  23. khoj/processor/content/notion/notion_to_entries.py +1 -3
  24. khoj/processor/content/org_mode/org_to_entries.py +4 -9
  25. khoj/processor/content/pdf/pdf_to_entries.py +4 -9
  26. khoj/processor/content/plaintext/plaintext_to_entries.py +4 -9
  27. khoj/processor/content/text_to_entries.py +1 -3
  28. khoj/processor/conversation/utils.py +0 -4
  29. khoj/processor/tools/online_search.py +13 -7
  30. khoj/routers/api.py +58 -9
  31. khoj/routers/api_agents.py +3 -1
  32. khoj/routers/api_chat.py +335 -562
  33. khoj/routers/api_content.py +538 -0
  34. khoj/routers/api_model.py +156 -0
  35. khoj/routers/helpers.py +338 -23
  36. khoj/routers/notion.py +2 -8
  37. khoj/routers/web_client.py +43 -256
  38. khoj/search_type/text_search.py +5 -4
  39. khoj/utils/fs_syncer.py +4 -2
  40. khoj/utils/rawconfig.py +6 -1
  41. {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/METADATA +2 -2
  42. {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/RECORD +45 -43
  43. khoj/routers/api_config.py +0 -434
  44. khoj/routers/indexer.py +0 -349
  45. {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/WHEEL +0 -0
  46. {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/entry_points.txt +0 -0
  47. {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/licenses/LICENSE +0 -0
@@ -165,7 +165,7 @@
165
165
 
166
166
  // Save Github config on server
167
167
  const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
168
- fetch('/api/config/data/content-source/github', {
168
+ fetch('/api/content/github', {
169
169
  method: 'POST',
170
170
  headers: {
171
171
  'Content-Type': 'application/json',
@@ -45,7 +45,7 @@
45
45
 
46
46
  // Save Notion config on server
47
47
  const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
48
- fetch('/api/config/data/content-source/notion', {
48
+ fetch('/api/content/notion', {
49
49
  method: 'POST',
50
50
  headers: {
51
51
  'Content-Type': 'application/json',
@@ -34,7 +34,7 @@ Hi, I am Khoj, your open, personal AI 👋🏽. I can:
34
34
  - 📚 Understand files you drag & drop here
35
35
  - 👩🏾‍🚀 Be tuned to your conversation needs via [agents](./agents)
36
36
 
37
- Get the Khoj [Desktop](https://khoj.dev/downloads), [Obsidian](https://docs.khoj.dev/clients/obsidian#setup), [Emacs](https://docs.khoj.dev/clients/emacs#setup) apps to search, chat with your 🖥️ computer docs. You can manage all the files you've shared with me at any time by going to [your settings](/config/content-source/computer/).
37
+ Get the Khoj [Desktop](https://khoj.dev/downloads), [Obsidian](https://docs.khoj.dev/clients/obsidian#setup), [Emacs](https://docs.khoj.dev/clients/emacs#setup) apps to search, chat with your 🖥️ computer docs. You can manage all the files you've shared with me at any time by going to [your settings](/settings/content/computer/).
38
38
 
39
39
  To get started, just start typing below. You can also type / to see a list of commands.
40
40
  `.trim()
@@ -209,12 +209,12 @@
209
209
 
210
210
  function populate_type_dropdown() {
211
211
  // Populate type dropdown field with enabled content types only
212
- fetch("/api/config/types")
212
+ fetch("/api/content/types")
213
213
  .then(response => response.json())
214
214
  .then(enabled_types => {
215
215
  // Show warning if no content types are enabled, or just one ("all")
216
216
  if (enabled_types[0] === "all" && enabled_types.length === 1) {
217
- document.getElementById("results").innerHTML = "<div id='results-error'>To use Khoj search, setup your content plugins on the Khoj <a class='inline-chat-link' href='/config'>settings page</a>.</div>";
217
+ document.getElementById("results").innerHTML = "<div id='results-error'>To use Khoj search, setup your content plugins on the Khoj <a class='inline-chat-link' href='/settings'>settings page</a>.</div>";
218
218
  document.getElementById("query").setAttribute("disabled", "disabled");
219
219
  document.getElementById("query").setAttribute("placeholder", "Configure Khoj to enable search");
220
220
  return [];
@@ -34,7 +34,7 @@
34
34
  <h3 id="card-title-computer" class="card-title">
35
35
  <span>Files</span>
36
36
  <img id="configured-icon-computer"
37
- style="display: {% if not current_model_state.computer %}none{% endif %}"
37
+ style="display: {% if not enabled_content_source.computer %}none{% endif %}"
38
38
  class="configured-icon"
39
39
  src="/static/assets/icons/confirm-icon.svg"
40
40
  alt="Configured">
@@ -44,8 +44,8 @@
44
44
  <p class="card-description">Manage files from your computer</p>
45
45
  </div>
46
46
  <div class="card-action-row">
47
- <a class="card-button" href="/config/content-source/computer">
48
- {% if current_model_state.computer %}
47
+ <a class="card-button" href="/settings/content/computer">
48
+ {% if enabled_content_source.computer %}
49
49
  Update
50
50
  {% else %}
51
51
  Setup
@@ -53,7 +53,7 @@
53
53
  <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
54
54
  </a>
55
55
  <div id="clear-computer" class="card-action-row"
56
- style="display: {% if not current_model_state.computer %}none{% endif %}">
56
+ style="display: {% if not enabled_content_source.computer %}none{% endif %}">
57
57
  <button class="card-button" onclick="clearContentType('computer')">
58
58
  Disable
59
59
  </button>
@@ -69,15 +69,15 @@
69
69
  class="configured-icon"
70
70
  src="/static/assets/icons/confirm-icon.svg"
71
71
  alt="Configured"
72
- style="display: {% if not current_model_state.github %}none{% endif %}">
72
+ style="display: {% if not enabled_content_source.github %}none{% endif %}">
73
73
  </h3>
74
74
  </div>
75
75
  <div class="card-description-row">
76
76
  <p class="card-description">Set repositories to index</p>
77
77
  </div>
78
78
  <div class="card-action-row">
79
- <a class="card-button" href="/config/content-source/github">
80
- {% if current_model_state.github %}
79
+ <a class="card-button" href="/settings/content/github">
80
+ {% if enabled_content_source.github %}
81
81
  Update
82
82
  {% else %}
83
83
  Setup
@@ -86,7 +86,7 @@
86
86
  </a>
87
87
  <div id="clear-github"
88
88
  class="card-action-row"
89
- style="display: {% if not current_model_state.github %}none{% endif %}">
89
+ style="display: {% if not enabled_content_source.github %}none{% endif %}">
90
90
  <button class="card-button" onclick="clearContentType('github')">
91
91
  Disable
92
92
  </button>
@@ -102,15 +102,15 @@
102
102
  class="configured-icon"
103
103
  src="/static/assets/icons/confirm-icon.svg"
104
104
  alt="Configured"
105
- style="display: {% if not current_model_state.notion %}none{% endif %}">
105
+ style="display: {% if not enabled_content_source.notion %}none{% endif %}">
106
106
  </h3>
107
107
  </div>
108
108
  <div class="card-description-row">
109
109
  <p class="card-description">Sync your Notion pages</p>
110
110
  </div>
111
111
  <div class="card-action-row">
112
- {% if current_model_state.notion %}
113
- <a class="card-button" href="/config/content-source/notion">
112
+ {% if enabled_content_source.notion %}
113
+ <a class="card-button" href="/settings/content/notion">
114
114
  Update
115
115
  <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
116
116
  </a>
@@ -120,7 +120,7 @@
120
120
  <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
121
121
  </a>
122
122
  {% else %}
123
- <a class="card-button" href="/config/content-source/notion">
123
+ <a class="card-button" href="/settings/content/notion">
124
124
  Setup
125
125
  <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
126
126
  </a>
@@ -128,7 +128,7 @@
128
128
 
129
129
  <div id="clear-notion"
130
130
  class="card-action-row"
131
- style="display: {% if not current_model_state.notion %}none{% endif %}">
131
+ style="display: {% if not enabled_content_source.notion %}none{% endif %}">
132
132
  <button class="card-button" onclick="clearContentType('notion')">
133
133
  Disable
134
134
  </button>
@@ -181,8 +181,8 @@
181
181
  </div>
182
182
  <div class="card-description-row">
183
183
  <select id="chat-models">
184
- {% for option in conversation_options %}
185
- <option value="{{ option.id }}" {% if option.id == selected_conversation_config %}selected{% endif %}>{{ option.chat_model }}</option>
184
+ {% for option in chat_model_options %}
185
+ <option value="{{ option.id }}" {% if option.id == selected_chat_model_config %}selected{% endif %}>{{ option.name }}</option>
186
186
  {% endfor %}
187
187
  </select>
188
188
  </div>
@@ -208,7 +208,7 @@
208
208
  <div class="card-description-row">
209
209
  <select id="paint-models">
210
210
  {% for option in paint_model_options %}
211
- <option value="{{ option.id }}" {% if option.id == selected_paint_model_config %}selected{% endif %}>{{ option.model_name }}</option>
211
+ <option value="{{ option.id }}" {% if option.id == selected_paint_model_config %}selected{% endif %}>{{ option.name }}</option>
212
212
  {% endfor %}
213
213
  </select>
214
214
  </div>
@@ -235,7 +235,7 @@
235
235
  <div class="card-description-row">
236
236
  <select id="voice-models">
237
237
  {% for option in voice_model_options %}
238
- <option value="{{ option.id }}" {% if option.id == selected_voice_config %}selected{% endif %}>{{ option.name }}</option>
238
+ <option value="{{ option.id }}" {% if option.id == selected_voice_model_config %}selected{% endif %}>{{ option.name }}</option>
239
239
  {% endfor %}
240
240
  </select>
241
241
  </div>
@@ -394,8 +394,8 @@
394
394
 
395
395
  function saveProfileGivenName() {
396
396
  const givenName = document.getElementById("profile_given_name").value;
397
- fetch('/api/config/user/name?name=' + givenName, {
398
- method: 'POST',
397
+ fetch('/api/user/name?name=' + givenName, {
398
+ method: 'PATCH',
399
399
  headers: {
400
400
  'Content-Type': 'application/json',
401
401
  }
@@ -421,7 +421,7 @@
421
421
  saveVoiceModelButton.disabled = true;
422
422
  saveVoiceModelButton.textContent = "Saving...";
423
423
 
424
- fetch('/api/config/data/voice/model?id=' + voiceModel, {
424
+ fetch('/api/model/voice?id=' + voiceModel, {
425
425
  method: 'POST',
426
426
  headers: {
427
427
  'Content-Type': 'application/json',
@@ -455,7 +455,7 @@
455
455
  saveModelButton.innerHTML = "";
456
456
  saveModelButton.textContent = "Saving...";
457
457
 
458
- fetch('/api/config/data/conversation/model?id=' + chatModel, {
458
+ fetch('/api/model/chat?id=' + chatModel, {
459
459
  method: 'POST',
460
460
  headers: {
461
461
  'Content-Type': 'application/json',
@@ -494,7 +494,7 @@
494
494
  saveSearchModelButton.disabled = true;
495
495
  saveSearchModelButton.textContent = "Saving...";
496
496
 
497
- fetch('/api/config/data/search/model?id=' + searchModel, {
497
+ fetch('/api/model/search?id=' + searchModel, {
498
498
  method: 'POST',
499
499
  headers: {
500
500
  'Content-Type': 'application/json',
@@ -526,7 +526,7 @@
526
526
  saveModelButton.disabled = true;
527
527
  saveModelButton.innerHTML = "Saving...";
528
528
 
529
- fetch('/api/config/data/paint/model?id=' + paintModel, {
529
+ fetch('/api/model/paint?id=' + paintModel, {
530
530
  method: 'POST',
531
531
  headers: {
532
532
  'Content-Type': 'application/json',
@@ -553,7 +553,7 @@
553
553
  };
554
554
 
555
555
  function clearContentType(content_source) {
556
- fetch('/api/config/data/content-source/' + content_source, {
556
+ fetch('/api/content/' + content_source, {
557
557
  method: 'DELETE',
558
558
  headers: {
559
559
  'Content-Type': 'application/json',
@@ -676,7 +676,7 @@
676
676
 
677
677
  content_sources = ["computer", "github", "notion"];
678
678
  content_sources.forEach(content_source => {
679
- fetch(`/api/config/data/${content_source}`, {
679
+ fetch(`/api/content/${content_source}`, {
680
680
  method: 'GET',
681
681
  headers: {
682
682
  'Content-Type': 'application/json',
@@ -807,7 +807,7 @@
807
807
 
808
808
  function getIndexedDataSize() {
809
809
  document.getElementById("indexed-data-size").textContent = "Calculating...";
810
- fetch('/api/config/index/size')
810
+ fetch('/api/content/size')
811
811
  .then(response => response.json())
812
812
  .then(data => {
813
813
  document.getElementById("indexed-data-size").textContent = data.indexed_data_size_in_mb + " MB used";
@@ -815,7 +815,7 @@
815
815
  }
816
816
 
817
817
  function removeFile(path) {
818
- fetch('/api/config/data/file?filename=' + path, {
818
+ fetch('/api/content/file?filename=' + path, {
819
819
  method: 'DELETE',
820
820
  headers: {
821
821
  'Content-Type': 'application/json',
@@ -890,7 +890,7 @@
890
890
  })
891
891
 
892
892
  phonenumberRemoveButton.addEventListener("click", () => {
893
- fetch('/api/config/phone', {
893
+ fetch('/api/phone', {
894
894
  method: 'DELETE',
895
895
  headers: {
896
896
  'Content-Type': 'application/json',
@@ -917,7 +917,7 @@
917
917
  }, 5000);
918
918
  } else {
919
919
  const mobileNumber = iti.getNumber();
920
- fetch('/api/config/phone?phone_number=' + mobileNumber, {
920
+ fetch('/api/phone?phone_number=' + mobileNumber, {
921
921
  method: 'POST',
922
922
  headers: {
923
923
  'Content-Type': 'application/json',
@@ -970,7 +970,7 @@
970
970
  return;
971
971
  }
972
972
 
973
- fetch('/api/config/phone/verify?code=' + otp, {
973
+ fetch('/api/phone/verify?code=' + otp, {
974
974
  method: 'POST',
975
975
  headers: {
976
976
  'Content-Type': 'application/json',
@@ -36,7 +36,7 @@
36
36
  {% endif %}
37
37
  <div id="khoj-nav-menu" class="khoj-nav-dropdown-content">
38
38
  <div class="khoj-nav-username"> {{ username }} </div>
39
- <a id="settings-nav" class="khoj-nav" href="/config">Settings</a>
39
+ <a id="settings-nav" class="khoj-nav" href="/settings">Settings</a>
40
40
  <a id="github-nav" class="khoj-nav" href="https://github.com/khoj-ai/khoj">GitHub</a>
41
41
  <a id="help-nav" class="khoj-nav" href="https://docs.khoj.dev" target="_blank">Help</a>
42
42
  <a class="khoj-nav" href="/auth/logout">Logout</a>
@@ -19,16 +19,11 @@ class DocxToEntries(TextToEntries):
19
19
  super().__init__()
20
20
 
21
21
  # Define Functions
22
- def process(
23
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
24
- ) -> Tuple[int, int]:
22
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
25
23
  # Extract required fields from config
26
- if not full_corpus:
27
- deletion_file_names = set([file for file in files if files[file] == b""])
28
- files_to_process = set(files) - deletion_file_names
29
- files = {file: files[file] for file in files_to_process}
30
- else:
31
- deletion_file_names = None
24
+ deletion_file_names = set([file for file in files if files[file] == b""])
25
+ files_to_process = set(files) - deletion_file_names
26
+ files = {file: files[file] for file in files_to_process}
32
27
 
33
28
  # Extract Entries from specified Docx files
34
29
  with timer("Extract entries from specified DOCX files", logger):
@@ -48,9 +48,7 @@ class GithubToEntries(TextToEntries):
48
48
  else:
49
49
  return
50
50
 
51
- def process(
52
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
53
- ) -> Tuple[int, int]:
51
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
54
52
  if self.config.pat_token is None or self.config.pat_token == "":
55
53
  logger.error(f"Github PAT token is not set. Skipping github content")
56
54
  raise ValueError("Github PAT token is not set. Skipping github content")
@@ -20,16 +20,11 @@ class ImageToEntries(TextToEntries):
20
20
  super().__init__()
21
21
 
22
22
  # Define Functions
23
- def process(
24
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
25
- ) -> Tuple[int, int]:
23
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
26
24
  # Extract required fields from config
27
- if not full_corpus:
28
- deletion_file_names = set([file for file in files if files[file] == b""])
29
- files_to_process = set(files) - deletion_file_names
30
- files = {file: files[file] for file in files_to_process}
31
- else:
32
- deletion_file_names = None
25
+ deletion_file_names = set([file for file in files if files[file] == b""])
26
+ files_to_process = set(files) - deletion_file_names
27
+ files = {file: files[file] for file in files_to_process}
33
28
 
34
29
  # Extract Entries from specified image files
35
30
  with timer("Extract entries from specified Image files", logger):
@@ -19,16 +19,11 @@ class MarkdownToEntries(TextToEntries):
19
19
  super().__init__()
20
20
 
21
21
  # Define Functions
22
- def process(
23
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
24
- ) -> Tuple[int, int]:
22
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
25
23
  # Extract required fields from config
26
- if not full_corpus:
27
- deletion_file_names = set([file for file in files if files[file] == ""])
28
- files_to_process = set(files) - deletion_file_names
29
- files = {file: files[file] for file in files_to_process}
30
- else:
31
- deletion_file_names = None
24
+ deletion_file_names = set([file for file in files if files[file] == ""])
25
+ files_to_process = set(files) - deletion_file_names
26
+ files = {file: files[file] for file in files_to_process}
32
27
 
33
28
  max_tokens = 256
34
29
  # Extract Entries from specified Markdown files
@@ -78,9 +78,7 @@ class NotionToEntries(TextToEntries):
78
78
 
79
79
  self.body_params = {"page_size": 100}
80
80
 
81
- def process(
82
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
83
- ) -> Tuple[int, int]:
81
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
84
82
  current_entries = []
85
83
 
86
84
  # Get all pages
@@ -20,15 +20,10 @@ class OrgToEntries(TextToEntries):
20
20
  super().__init__()
21
21
 
22
22
  # Define Functions
23
- def process(
24
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
25
- ) -> Tuple[int, int]:
26
- if not full_corpus:
27
- deletion_file_names = set([file for file in files if files[file] == ""])
28
- files_to_process = set(files) - deletion_file_names
29
- files = {file: files[file] for file in files_to_process}
30
- else:
31
- deletion_file_names = None
23
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
24
+ deletion_file_names = set([file for file in files if files[file] == ""])
25
+ files_to_process = set(files) - deletion_file_names
26
+ files = {file: files[file] for file in files_to_process}
32
27
 
33
28
  # Extract Entries from specified Org files
34
29
  max_tokens = 256
@@ -22,16 +22,11 @@ class PdfToEntries(TextToEntries):
22
22
  super().__init__()
23
23
 
24
24
  # Define Functions
25
- def process(
26
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
27
- ) -> Tuple[int, int]:
25
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
28
26
  # Extract required fields from config
29
- if not full_corpus:
30
- deletion_file_names = set([file for file in files if files[file] == b""])
31
- files_to_process = set(files) - deletion_file_names
32
- files = {file: files[file] for file in files_to_process}
33
- else:
34
- deletion_file_names = None
27
+ deletion_file_names = set([file for file in files if files[file] == b""])
28
+ files_to_process = set(files) - deletion_file_names
29
+ files = {file: files[file] for file in files_to_process}
35
30
 
36
31
  # Extract Entries from specified Pdf files
37
32
  with timer("Extract entries from specified PDF files", logger):
@@ -20,15 +20,10 @@ class PlaintextToEntries(TextToEntries):
20
20
  super().__init__()
21
21
 
22
22
  # Define Functions
23
- def process(
24
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
25
- ) -> Tuple[int, int]:
26
- if not full_corpus:
27
- deletion_file_names = set([file for file in files if files[file] == ""])
28
- files_to_process = set(files) - deletion_file_names
29
- files = {file: files[file] for file in files_to_process}
30
- else:
31
- deletion_file_names = None
23
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
24
+ deletion_file_names = set([file for file in files if files[file] == ""])
25
+ files_to_process = set(files) - deletion_file_names
26
+ files = {file: files[file] for file in files_to_process}
32
27
 
33
28
  # Extract Entries from specified plaintext files
34
29
  with timer("Extract entries from specified Plaintext files", logger):
@@ -31,9 +31,7 @@ class TextToEntries(ABC):
31
31
  self.date_filter = DateFilter()
32
32
 
33
33
  @abstractmethod
34
- def process(
35
- self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
36
- ) -> Tuple[int, int]:
34
+ def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
37
35
  ...
38
36
 
39
37
  @staticmethod
@@ -62,10 +62,6 @@ class ThreadedGenerator:
62
62
  self.queue.put(data)
63
63
 
64
64
  def close(self):
65
- if self.compiled_references and len(self.compiled_references) > 0:
66
- self.queue.put(f"### compiled references:{json.dumps(self.compiled_references)}")
67
- if self.online_results and len(self.online_results) > 0:
68
- self.queue.put(f"### compiled references:{json.dumps(self.online_results)}")
69
65
  self.queue.put(StopIteration)
70
66
 
71
67
 
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
11
11
  from markdownify import markdownify
12
12
 
13
13
  from khoj.routers.helpers import (
14
+ ChatEvent,
14
15
  extract_relevant_info,
15
16
  generate_online_subqueries,
16
17
  infer_webpage_urls,
@@ -56,7 +57,8 @@ async def search_online(
56
57
  query += " ".join(custom_filters)
57
58
  if not is_internet_connected():
58
59
  logger.warn("Cannot search online as not connected to internet")
59
- return {}
60
+ yield {}
61
+ return
60
62
 
61
63
  # Breakdown the query into subqueries to get the correct answer
62
64
  subqueries = await generate_online_subqueries(query, conversation_history, location)
@@ -66,7 +68,8 @@ async def search_online(
66
68
  logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
67
69
  if send_status_func:
68
70
  subqueries_str = "\n- " + "\n- ".join(list(subqueries))
69
- await send_status_func(f"**🌐 Searching the Internet for**: {subqueries_str}")
71
+ async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
72
+ yield {ChatEvent.STATUS: event}
70
73
 
71
74
  with timer(f"Internet searches for {list(subqueries)} took", logger):
72
75
  search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
@@ -89,7 +92,8 @@ async def search_online(
89
92
  logger.info(f"🌐👀 Reading web pages at: {list(webpage_links)}")
90
93
  if send_status_func:
91
94
  webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
92
- await send_status_func(f"**📖 Reading web pages**: {webpage_links_str}")
95
+ async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
96
+ yield {ChatEvent.STATUS: event}
93
97
  tasks = [read_webpage_and_extract_content(subquery, link, content) for link, subquery, content in webpages]
94
98
  results = await asyncio.gather(*tasks)
95
99
 
@@ -98,7 +102,7 @@ async def search_online(
98
102
  if webpage_extract is not None:
99
103
  response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
100
104
 
101
- return response_dict
105
+ yield response_dict
102
106
 
103
107
 
104
108
  async def search_with_google(query: str) -> Tuple[str, Dict[str, List[Dict]]]:
@@ -127,13 +131,15 @@ async def read_webpages(
127
131
  "Infer web pages to read from the query and extract relevant information from them"
128
132
  logger.info(f"Inferring web pages to read")
129
133
  if send_status_func:
130
- await send_status_func(f"**🧐 Inferring web pages to read**")
134
+ async for event in send_status_func(f"**Inferring web pages to read**"):
135
+ yield {ChatEvent.STATUS: event}
131
136
  urls = await infer_webpage_urls(query, conversation_history, location)
132
137
 
133
138
  logger.info(f"Reading web pages at: {urls}")
134
139
  if send_status_func:
135
140
  webpage_links_str = "\n- " + "\n- ".join(list(urls))
136
- await send_status_func(f"**📖 Reading web pages**: {webpage_links_str}")
141
+ async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
142
+ yield {ChatEvent.STATUS: event}
137
143
  tasks = [read_webpage_and_extract_content(query, url) for url in urls]
138
144
  results = await asyncio.gather(*tasks)
139
145
 
@@ -141,7 +147,7 @@ async def read_webpages(
141
147
  response[query]["webpages"] = [
142
148
  {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
143
149
  ]
144
- return response
150
+ yield response
145
151
 
146
152
 
147
153
  async def read_webpage_and_extract_content(