khoj 1.16.1.dev25__py3-none-any.whl → 1.17.1.dev216__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/configure.py +6 -6
- khoj/database/adapters/__init__.py +55 -26
- khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
- khoj/database/models/__init__.py +35 -0
- khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
- khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways.svg +31 -5384
- khoj/interface/web/assets/icons/khoj.svg +26 -0
- khoj/interface/web/chat.html +191 -301
- khoj/interface/web/content_source_computer_input.html +3 -3
- khoj/interface/web/content_source_github_input.html +1 -1
- khoj/interface/web/content_source_notion_input.html +1 -1
- khoj/interface/web/public_conversation.html +1 -1
- khoj/interface/web/search.html +2 -2
- khoj/interface/web/{config.html → settings.html} +30 -30
- khoj/interface/web/utils.html +1 -1
- khoj/processor/content/docx/docx_to_entries.py +4 -9
- khoj/processor/content/github/github_to_entries.py +1 -3
- khoj/processor/content/images/image_to_entries.py +4 -9
- khoj/processor/content/markdown/markdown_to_entries.py +4 -9
- khoj/processor/content/notion/notion_to_entries.py +1 -3
- khoj/processor/content/org_mode/org_to_entries.py +4 -9
- khoj/processor/content/pdf/pdf_to_entries.py +4 -9
- khoj/processor/content/plaintext/plaintext_to_entries.py +4 -9
- khoj/processor/content/text_to_entries.py +1 -3
- khoj/processor/conversation/utils.py +0 -4
- khoj/processor/tools/online_search.py +13 -7
- khoj/routers/api.py +58 -9
- khoj/routers/api_agents.py +3 -1
- khoj/routers/api_chat.py +335 -562
- khoj/routers/api_content.py +538 -0
- khoj/routers/api_model.py +156 -0
- khoj/routers/helpers.py +338 -23
- khoj/routers/notion.py +2 -8
- khoj/routers/web_client.py +43 -256
- khoj/search_type/text_search.py +5 -4
- khoj/utils/fs_syncer.py +4 -2
- khoj/utils/rawconfig.py +6 -1
- {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/METADATA +2 -2
- {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/RECORD +45 -43
- khoj/routers/api_config.py +0 -434
- khoj/routers/indexer.py +0 -349
- {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/WHEEL +0 -0
- {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/entry_points.txt +0 -0
- {khoj-1.16.1.dev25.dist-info → khoj-1.17.1.dev216.dist-info}/licenses/LICENSE +0 -0
|
@@ -165,7 +165,7 @@
|
|
|
165
165
|
|
|
166
166
|
// Save Github config on server
|
|
167
167
|
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
|
168
|
-
fetch('/api/
|
|
168
|
+
fetch('/api/content/github', {
|
|
169
169
|
method: 'POST',
|
|
170
170
|
headers: {
|
|
171
171
|
'Content-Type': 'application/json',
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
|
|
46
46
|
// Save Notion config on server
|
|
47
47
|
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
|
48
|
-
fetch('/api/
|
|
48
|
+
fetch('/api/content/notion', {
|
|
49
49
|
method: 'POST',
|
|
50
50
|
headers: {
|
|
51
51
|
'Content-Type': 'application/json',
|
|
@@ -34,7 +34,7 @@ Hi, I am Khoj, your open, personal AI 👋🏽. I can:
|
|
|
34
34
|
- 📚 Understand files you drag & drop here
|
|
35
35
|
- 👩🏾🚀 Be tuned to your conversation needs via [agents](./agents)
|
|
36
36
|
|
|
37
|
-
Get the Khoj [Desktop](https://khoj.dev/downloads), [Obsidian](https://docs.khoj.dev/clients/obsidian#setup), [Emacs](https://docs.khoj.dev/clients/emacs#setup) apps to search, chat with your 🖥️ computer docs. You can manage all the files you've shared with me at any time by going to [your settings](/
|
|
37
|
+
Get the Khoj [Desktop](https://khoj.dev/downloads), [Obsidian](https://docs.khoj.dev/clients/obsidian#setup), [Emacs](https://docs.khoj.dev/clients/emacs#setup) apps to search, chat with your 🖥️ computer docs. You can manage all the files you've shared with me at any time by going to [your settings](/settings/content/computer/).
|
|
38
38
|
|
|
39
39
|
To get started, just start typing below. You can also type / to see a list of commands.
|
|
40
40
|
`.trim()
|
khoj/interface/web/search.html
CHANGED
|
@@ -209,12 +209,12 @@
|
|
|
209
209
|
|
|
210
210
|
function populate_type_dropdown() {
|
|
211
211
|
// Populate type dropdown field with enabled content types only
|
|
212
|
-
fetch("/api/
|
|
212
|
+
fetch("/api/content/types")
|
|
213
213
|
.then(response => response.json())
|
|
214
214
|
.then(enabled_types => {
|
|
215
215
|
// Show warning if no content types are enabled, or just one ("all")
|
|
216
216
|
if (enabled_types[0] === "all" && enabled_types.length === 1) {
|
|
217
|
-
document.getElementById("results").innerHTML = "<div id='results-error'>To use Khoj search, setup your content plugins on the Khoj <a class='inline-chat-link' href='/
|
|
217
|
+
document.getElementById("results").innerHTML = "<div id='results-error'>To use Khoj search, setup your content plugins on the Khoj <a class='inline-chat-link' href='/settings'>settings page</a>.</div>";
|
|
218
218
|
document.getElementById("query").setAttribute("disabled", "disabled");
|
|
219
219
|
document.getElementById("query").setAttribute("placeholder", "Configure Khoj to enable search");
|
|
220
220
|
return [];
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
<h3 id="card-title-computer" class="card-title">
|
|
35
35
|
<span>Files</span>
|
|
36
36
|
<img id="configured-icon-computer"
|
|
37
|
-
style="display: {% if not
|
|
37
|
+
style="display: {% if not enabled_content_source.computer %}none{% endif %}"
|
|
38
38
|
class="configured-icon"
|
|
39
39
|
src="/static/assets/icons/confirm-icon.svg"
|
|
40
40
|
alt="Configured">
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
<p class="card-description">Manage files from your computer</p>
|
|
45
45
|
</div>
|
|
46
46
|
<div class="card-action-row">
|
|
47
|
-
<a class="card-button" href="/
|
|
48
|
-
{% if
|
|
47
|
+
<a class="card-button" href="/settings/content/computer">
|
|
48
|
+
{% if enabled_content_source.computer %}
|
|
49
49
|
Update
|
|
50
50
|
{% else %}
|
|
51
51
|
Setup
|
|
@@ -53,7 +53,7 @@
|
|
|
53
53
|
<svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
|
|
54
54
|
</a>
|
|
55
55
|
<div id="clear-computer" class="card-action-row"
|
|
56
|
-
style="display: {% if not
|
|
56
|
+
style="display: {% if not enabled_content_source.computer %}none{% endif %}">
|
|
57
57
|
<button class="card-button" onclick="clearContentType('computer')">
|
|
58
58
|
Disable
|
|
59
59
|
</button>
|
|
@@ -69,15 +69,15 @@
|
|
|
69
69
|
class="configured-icon"
|
|
70
70
|
src="/static/assets/icons/confirm-icon.svg"
|
|
71
71
|
alt="Configured"
|
|
72
|
-
style="display: {% if not
|
|
72
|
+
style="display: {% if not enabled_content_source.github %}none{% endif %}">
|
|
73
73
|
</h3>
|
|
74
74
|
</div>
|
|
75
75
|
<div class="card-description-row">
|
|
76
76
|
<p class="card-description">Set repositories to index</p>
|
|
77
77
|
</div>
|
|
78
78
|
<div class="card-action-row">
|
|
79
|
-
<a class="card-button" href="/
|
|
80
|
-
{% if
|
|
79
|
+
<a class="card-button" href="/settings/content/github">
|
|
80
|
+
{% if enabled_content_source.github %}
|
|
81
81
|
Update
|
|
82
82
|
{% else %}
|
|
83
83
|
Setup
|
|
@@ -86,7 +86,7 @@
|
|
|
86
86
|
</a>
|
|
87
87
|
<div id="clear-github"
|
|
88
88
|
class="card-action-row"
|
|
89
|
-
style="display: {% if not
|
|
89
|
+
style="display: {% if not enabled_content_source.github %}none{% endif %}">
|
|
90
90
|
<button class="card-button" onclick="clearContentType('github')">
|
|
91
91
|
Disable
|
|
92
92
|
</button>
|
|
@@ -102,15 +102,15 @@
|
|
|
102
102
|
class="configured-icon"
|
|
103
103
|
src="/static/assets/icons/confirm-icon.svg"
|
|
104
104
|
alt="Configured"
|
|
105
|
-
style="display: {% if not
|
|
105
|
+
style="display: {% if not enabled_content_source.notion %}none{% endif %}">
|
|
106
106
|
</h3>
|
|
107
107
|
</div>
|
|
108
108
|
<div class="card-description-row">
|
|
109
109
|
<p class="card-description">Sync your Notion pages</p>
|
|
110
110
|
</div>
|
|
111
111
|
<div class="card-action-row">
|
|
112
|
-
{% if
|
|
113
|
-
<a class="card-button" href="/
|
|
112
|
+
{% if enabled_content_source.notion %}
|
|
113
|
+
<a class="card-button" href="/settings/content/notion">
|
|
114
114
|
Update
|
|
115
115
|
<svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
|
|
116
116
|
</a>
|
|
@@ -120,7 +120,7 @@
|
|
|
120
120
|
<svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
|
|
121
121
|
</a>
|
|
122
122
|
{% else %}
|
|
123
|
-
<a class="card-button" href="/
|
|
123
|
+
<a class="card-button" href="/settings/content/notion">
|
|
124
124
|
Setup
|
|
125
125
|
<svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
|
|
126
126
|
</a>
|
|
@@ -128,7 +128,7 @@
|
|
|
128
128
|
|
|
129
129
|
<div id="clear-notion"
|
|
130
130
|
class="card-action-row"
|
|
131
|
-
style="display: {% if not
|
|
131
|
+
style="display: {% if not enabled_content_source.notion %}none{% endif %}">
|
|
132
132
|
<button class="card-button" onclick="clearContentType('notion')">
|
|
133
133
|
Disable
|
|
134
134
|
</button>
|
|
@@ -181,8 +181,8 @@
|
|
|
181
181
|
</div>
|
|
182
182
|
<div class="card-description-row">
|
|
183
183
|
<select id="chat-models">
|
|
184
|
-
{% for option in
|
|
185
|
-
<option value="{{ option.id }}" {% if option.id ==
|
|
184
|
+
{% for option in chat_model_options %}
|
|
185
|
+
<option value="{{ option.id }}" {% if option.id == selected_chat_model_config %}selected{% endif %}>{{ option.name }}</option>
|
|
186
186
|
{% endfor %}
|
|
187
187
|
</select>
|
|
188
188
|
</div>
|
|
@@ -208,7 +208,7 @@
|
|
|
208
208
|
<div class="card-description-row">
|
|
209
209
|
<select id="paint-models">
|
|
210
210
|
{% for option in paint_model_options %}
|
|
211
|
-
<option value="{{ option.id }}" {% if option.id == selected_paint_model_config %}selected{% endif %}>{{ option.
|
|
211
|
+
<option value="{{ option.id }}" {% if option.id == selected_paint_model_config %}selected{% endif %}>{{ option.name }}</option>
|
|
212
212
|
{% endfor %}
|
|
213
213
|
</select>
|
|
214
214
|
</div>
|
|
@@ -235,7 +235,7 @@
|
|
|
235
235
|
<div class="card-description-row">
|
|
236
236
|
<select id="voice-models">
|
|
237
237
|
{% for option in voice_model_options %}
|
|
238
|
-
<option value="{{ option.id }}" {% if option.id ==
|
|
238
|
+
<option value="{{ option.id }}" {% if option.id == selected_voice_model_config %}selected{% endif %}>{{ option.name }}</option>
|
|
239
239
|
{% endfor %}
|
|
240
240
|
</select>
|
|
241
241
|
</div>
|
|
@@ -394,8 +394,8 @@
|
|
|
394
394
|
|
|
395
395
|
function saveProfileGivenName() {
|
|
396
396
|
const givenName = document.getElementById("profile_given_name").value;
|
|
397
|
-
fetch('/api/
|
|
398
|
-
method: '
|
|
397
|
+
fetch('/api/user/name?name=' + givenName, {
|
|
398
|
+
method: 'PATCH',
|
|
399
399
|
headers: {
|
|
400
400
|
'Content-Type': 'application/json',
|
|
401
401
|
}
|
|
@@ -421,7 +421,7 @@
|
|
|
421
421
|
saveVoiceModelButton.disabled = true;
|
|
422
422
|
saveVoiceModelButton.textContent = "Saving...";
|
|
423
423
|
|
|
424
|
-
fetch('/api/
|
|
424
|
+
fetch('/api/model/voice?id=' + voiceModel, {
|
|
425
425
|
method: 'POST',
|
|
426
426
|
headers: {
|
|
427
427
|
'Content-Type': 'application/json',
|
|
@@ -455,7 +455,7 @@
|
|
|
455
455
|
saveModelButton.innerHTML = "";
|
|
456
456
|
saveModelButton.textContent = "Saving...";
|
|
457
457
|
|
|
458
|
-
fetch('/api/
|
|
458
|
+
fetch('/api/model/chat?id=' + chatModel, {
|
|
459
459
|
method: 'POST',
|
|
460
460
|
headers: {
|
|
461
461
|
'Content-Type': 'application/json',
|
|
@@ -494,7 +494,7 @@
|
|
|
494
494
|
saveSearchModelButton.disabled = true;
|
|
495
495
|
saveSearchModelButton.textContent = "Saving...";
|
|
496
496
|
|
|
497
|
-
fetch('/api/
|
|
497
|
+
fetch('/api/model/search?id=' + searchModel, {
|
|
498
498
|
method: 'POST',
|
|
499
499
|
headers: {
|
|
500
500
|
'Content-Type': 'application/json',
|
|
@@ -526,7 +526,7 @@
|
|
|
526
526
|
saveModelButton.disabled = true;
|
|
527
527
|
saveModelButton.innerHTML = "Saving...";
|
|
528
528
|
|
|
529
|
-
fetch('/api/
|
|
529
|
+
fetch('/api/model/paint?id=' + paintModel, {
|
|
530
530
|
method: 'POST',
|
|
531
531
|
headers: {
|
|
532
532
|
'Content-Type': 'application/json',
|
|
@@ -553,7 +553,7 @@
|
|
|
553
553
|
};
|
|
554
554
|
|
|
555
555
|
function clearContentType(content_source) {
|
|
556
|
-
fetch('/api/
|
|
556
|
+
fetch('/api/content/' + content_source, {
|
|
557
557
|
method: 'DELETE',
|
|
558
558
|
headers: {
|
|
559
559
|
'Content-Type': 'application/json',
|
|
@@ -676,7 +676,7 @@
|
|
|
676
676
|
|
|
677
677
|
content_sources = ["computer", "github", "notion"];
|
|
678
678
|
content_sources.forEach(content_source => {
|
|
679
|
-
fetch(`/api/
|
|
679
|
+
fetch(`/api/content/${content_source}`, {
|
|
680
680
|
method: 'GET',
|
|
681
681
|
headers: {
|
|
682
682
|
'Content-Type': 'application/json',
|
|
@@ -807,7 +807,7 @@
|
|
|
807
807
|
|
|
808
808
|
function getIndexedDataSize() {
|
|
809
809
|
document.getElementById("indexed-data-size").textContent = "Calculating...";
|
|
810
|
-
fetch('/api/
|
|
810
|
+
fetch('/api/content/size')
|
|
811
811
|
.then(response => response.json())
|
|
812
812
|
.then(data => {
|
|
813
813
|
document.getElementById("indexed-data-size").textContent = data.indexed_data_size_in_mb + " MB used";
|
|
@@ -815,7 +815,7 @@
|
|
|
815
815
|
}
|
|
816
816
|
|
|
817
817
|
function removeFile(path) {
|
|
818
|
-
fetch('/api/
|
|
818
|
+
fetch('/api/content/file?filename=' + path, {
|
|
819
819
|
method: 'DELETE',
|
|
820
820
|
headers: {
|
|
821
821
|
'Content-Type': 'application/json',
|
|
@@ -890,7 +890,7 @@
|
|
|
890
890
|
})
|
|
891
891
|
|
|
892
892
|
phonenumberRemoveButton.addEventListener("click", () => {
|
|
893
|
-
fetch('/api/
|
|
893
|
+
fetch('/api/phone', {
|
|
894
894
|
method: 'DELETE',
|
|
895
895
|
headers: {
|
|
896
896
|
'Content-Type': 'application/json',
|
|
@@ -917,7 +917,7 @@
|
|
|
917
917
|
}, 5000);
|
|
918
918
|
} else {
|
|
919
919
|
const mobileNumber = iti.getNumber();
|
|
920
|
-
fetch('/api/
|
|
920
|
+
fetch('/api/phone?phone_number=' + mobileNumber, {
|
|
921
921
|
method: 'POST',
|
|
922
922
|
headers: {
|
|
923
923
|
'Content-Type': 'application/json',
|
|
@@ -970,7 +970,7 @@
|
|
|
970
970
|
return;
|
|
971
971
|
}
|
|
972
972
|
|
|
973
|
-
fetch('/api/
|
|
973
|
+
fetch('/api/phone/verify?code=' + otp, {
|
|
974
974
|
method: 'POST',
|
|
975
975
|
headers: {
|
|
976
976
|
'Content-Type': 'application/json',
|
khoj/interface/web/utils.html
CHANGED
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
{% endif %}
|
|
37
37
|
<div id="khoj-nav-menu" class="khoj-nav-dropdown-content">
|
|
38
38
|
<div class="khoj-nav-username"> {{ username }} </div>
|
|
39
|
-
<a id="settings-nav" class="khoj-nav" href="/
|
|
39
|
+
<a id="settings-nav" class="khoj-nav" href="/settings">Settings</a>
|
|
40
40
|
<a id="github-nav" class="khoj-nav" href="https://github.com/khoj-ai/khoj">GitHub</a>
|
|
41
41
|
<a id="help-nav" class="khoj-nav" href="https://docs.khoj.dev" target="_blank">Help</a>
|
|
42
42
|
<a class="khoj-nav" href="/auth/logout">Logout</a>
|
|
@@ -19,16 +19,11 @@ class DocxToEntries(TextToEntries):
|
|
|
19
19
|
super().__init__()
|
|
20
20
|
|
|
21
21
|
# Define Functions
|
|
22
|
-
def process(
|
|
23
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
24
|
-
) -> Tuple[int, int]:
|
|
22
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
25
23
|
# Extract required fields from config
|
|
26
|
-
if
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
files = {file: files[file] for file in files_to_process}
|
|
30
|
-
else:
|
|
31
|
-
deletion_file_names = None
|
|
24
|
+
deletion_file_names = set([file for file in files if files[file] == b""])
|
|
25
|
+
files_to_process = set(files) - deletion_file_names
|
|
26
|
+
files = {file: files[file] for file in files_to_process}
|
|
32
27
|
|
|
33
28
|
# Extract Entries from specified Docx files
|
|
34
29
|
with timer("Extract entries from specified DOCX files", logger):
|
|
@@ -48,9 +48,7 @@ class GithubToEntries(TextToEntries):
|
|
|
48
48
|
else:
|
|
49
49
|
return
|
|
50
50
|
|
|
51
|
-
def process(
|
|
52
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
53
|
-
) -> Tuple[int, int]:
|
|
51
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
54
52
|
if self.config.pat_token is None or self.config.pat_token == "":
|
|
55
53
|
logger.error(f"Github PAT token is not set. Skipping github content")
|
|
56
54
|
raise ValueError("Github PAT token is not set. Skipping github content")
|
|
@@ -20,16 +20,11 @@ class ImageToEntries(TextToEntries):
|
|
|
20
20
|
super().__init__()
|
|
21
21
|
|
|
22
22
|
# Define Functions
|
|
23
|
-
def process(
|
|
24
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
25
|
-
) -> Tuple[int, int]:
|
|
23
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
26
24
|
# Extract required fields from config
|
|
27
|
-
if
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
files = {file: files[file] for file in files_to_process}
|
|
31
|
-
else:
|
|
32
|
-
deletion_file_names = None
|
|
25
|
+
deletion_file_names = set([file for file in files if files[file] == b""])
|
|
26
|
+
files_to_process = set(files) - deletion_file_names
|
|
27
|
+
files = {file: files[file] for file in files_to_process}
|
|
33
28
|
|
|
34
29
|
# Extract Entries from specified image files
|
|
35
30
|
with timer("Extract entries from specified Image files", logger):
|
|
@@ -19,16 +19,11 @@ class MarkdownToEntries(TextToEntries):
|
|
|
19
19
|
super().__init__()
|
|
20
20
|
|
|
21
21
|
# Define Functions
|
|
22
|
-
def process(
|
|
23
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
24
|
-
) -> Tuple[int, int]:
|
|
22
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
25
23
|
# Extract required fields from config
|
|
26
|
-
if
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
files = {file: files[file] for file in files_to_process}
|
|
30
|
-
else:
|
|
31
|
-
deletion_file_names = None
|
|
24
|
+
deletion_file_names = set([file for file in files if files[file] == ""])
|
|
25
|
+
files_to_process = set(files) - deletion_file_names
|
|
26
|
+
files = {file: files[file] for file in files_to_process}
|
|
32
27
|
|
|
33
28
|
max_tokens = 256
|
|
34
29
|
# Extract Entries from specified Markdown files
|
|
@@ -78,9 +78,7 @@ class NotionToEntries(TextToEntries):
|
|
|
78
78
|
|
|
79
79
|
self.body_params = {"page_size": 100}
|
|
80
80
|
|
|
81
|
-
def process(
|
|
82
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
83
|
-
) -> Tuple[int, int]:
|
|
81
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
84
82
|
current_entries = []
|
|
85
83
|
|
|
86
84
|
# Get all pages
|
|
@@ -20,15 +20,10 @@ class OrgToEntries(TextToEntries):
|
|
|
20
20
|
super().__init__()
|
|
21
21
|
|
|
22
22
|
# Define Functions
|
|
23
|
-
def process(
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
deletion_file_names = set([file for file in files if files[file] == ""])
|
|
28
|
-
files_to_process = set(files) - deletion_file_names
|
|
29
|
-
files = {file: files[file] for file in files_to_process}
|
|
30
|
-
else:
|
|
31
|
-
deletion_file_names = None
|
|
23
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
24
|
+
deletion_file_names = set([file for file in files if files[file] == ""])
|
|
25
|
+
files_to_process = set(files) - deletion_file_names
|
|
26
|
+
files = {file: files[file] for file in files_to_process}
|
|
32
27
|
|
|
33
28
|
# Extract Entries from specified Org files
|
|
34
29
|
max_tokens = 256
|
|
@@ -22,16 +22,11 @@ class PdfToEntries(TextToEntries):
|
|
|
22
22
|
super().__init__()
|
|
23
23
|
|
|
24
24
|
# Define Functions
|
|
25
|
-
def process(
|
|
26
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
27
|
-
) -> Tuple[int, int]:
|
|
25
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
28
26
|
# Extract required fields from config
|
|
29
|
-
if
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
files = {file: files[file] for file in files_to_process}
|
|
33
|
-
else:
|
|
34
|
-
deletion_file_names = None
|
|
27
|
+
deletion_file_names = set([file for file in files if files[file] == b""])
|
|
28
|
+
files_to_process = set(files) - deletion_file_names
|
|
29
|
+
files = {file: files[file] for file in files_to_process}
|
|
35
30
|
|
|
36
31
|
# Extract Entries from specified Pdf files
|
|
37
32
|
with timer("Extract entries from specified PDF files", logger):
|
|
@@ -20,15 +20,10 @@ class PlaintextToEntries(TextToEntries):
|
|
|
20
20
|
super().__init__()
|
|
21
21
|
|
|
22
22
|
# Define Functions
|
|
23
|
-
def process(
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
deletion_file_names = set([file for file in files if files[file] == ""])
|
|
28
|
-
files_to_process = set(files) - deletion_file_names
|
|
29
|
-
files = {file: files[file] for file in files_to_process}
|
|
30
|
-
else:
|
|
31
|
-
deletion_file_names = None
|
|
23
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
24
|
+
deletion_file_names = set([file for file in files if files[file] == ""])
|
|
25
|
+
files_to_process = set(files) - deletion_file_names
|
|
26
|
+
files = {file: files[file] for file in files_to_process}
|
|
32
27
|
|
|
33
28
|
# Extract Entries from specified plaintext files
|
|
34
29
|
with timer("Extract entries from specified Plaintext files", logger):
|
|
@@ -31,9 +31,7 @@ class TextToEntries(ABC):
|
|
|
31
31
|
self.date_filter = DateFilter()
|
|
32
32
|
|
|
33
33
|
@abstractmethod
|
|
34
|
-
def process(
|
|
35
|
-
self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
|
|
36
|
-
) -> Tuple[int, int]:
|
|
34
|
+
def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]:
|
|
37
35
|
...
|
|
38
36
|
|
|
39
37
|
@staticmethod
|
|
@@ -62,10 +62,6 @@ class ThreadedGenerator:
|
|
|
62
62
|
self.queue.put(data)
|
|
63
63
|
|
|
64
64
|
def close(self):
|
|
65
|
-
if self.compiled_references and len(self.compiled_references) > 0:
|
|
66
|
-
self.queue.put(f"### compiled references:{json.dumps(self.compiled_references)}")
|
|
67
|
-
if self.online_results and len(self.online_results) > 0:
|
|
68
|
-
self.queue.put(f"### compiled references:{json.dumps(self.online_results)}")
|
|
69
65
|
self.queue.put(StopIteration)
|
|
70
66
|
|
|
71
67
|
|
|
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
|
|
|
11
11
|
from markdownify import markdownify
|
|
12
12
|
|
|
13
13
|
from khoj.routers.helpers import (
|
|
14
|
+
ChatEvent,
|
|
14
15
|
extract_relevant_info,
|
|
15
16
|
generate_online_subqueries,
|
|
16
17
|
infer_webpage_urls,
|
|
@@ -56,7 +57,8 @@ async def search_online(
|
|
|
56
57
|
query += " ".join(custom_filters)
|
|
57
58
|
if not is_internet_connected():
|
|
58
59
|
logger.warn("Cannot search online as not connected to internet")
|
|
59
|
-
|
|
60
|
+
yield {}
|
|
61
|
+
return
|
|
60
62
|
|
|
61
63
|
# Breakdown the query into subqueries to get the correct answer
|
|
62
64
|
subqueries = await generate_online_subqueries(query, conversation_history, location)
|
|
@@ -66,7 +68,8 @@ async def search_online(
|
|
|
66
68
|
logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
|
|
67
69
|
if send_status_func:
|
|
68
70
|
subqueries_str = "\n- " + "\n- ".join(list(subqueries))
|
|
69
|
-
|
|
71
|
+
async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
|
|
72
|
+
yield {ChatEvent.STATUS: event}
|
|
70
73
|
|
|
71
74
|
with timer(f"Internet searches for {list(subqueries)} took", logger):
|
|
72
75
|
search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
|
|
@@ -89,7 +92,8 @@ async def search_online(
|
|
|
89
92
|
logger.info(f"🌐👀 Reading web pages at: {list(webpage_links)}")
|
|
90
93
|
if send_status_func:
|
|
91
94
|
webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
|
|
92
|
-
|
|
95
|
+
async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
|
|
96
|
+
yield {ChatEvent.STATUS: event}
|
|
93
97
|
tasks = [read_webpage_and_extract_content(subquery, link, content) for link, subquery, content in webpages]
|
|
94
98
|
results = await asyncio.gather(*tasks)
|
|
95
99
|
|
|
@@ -98,7 +102,7 @@ async def search_online(
|
|
|
98
102
|
if webpage_extract is not None:
|
|
99
103
|
response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}
|
|
100
104
|
|
|
101
|
-
|
|
105
|
+
yield response_dict
|
|
102
106
|
|
|
103
107
|
|
|
104
108
|
async def search_with_google(query: str) -> Tuple[str, Dict[str, List[Dict]]]:
|
|
@@ -127,13 +131,15 @@ async def read_webpages(
|
|
|
127
131
|
"Infer web pages to read from the query and extract relevant information from them"
|
|
128
132
|
logger.info(f"Inferring web pages to read")
|
|
129
133
|
if send_status_func:
|
|
130
|
-
|
|
134
|
+
async for event in send_status_func(f"**Inferring web pages to read**"):
|
|
135
|
+
yield {ChatEvent.STATUS: event}
|
|
131
136
|
urls = await infer_webpage_urls(query, conversation_history, location)
|
|
132
137
|
|
|
133
138
|
logger.info(f"Reading web pages at: {urls}")
|
|
134
139
|
if send_status_func:
|
|
135
140
|
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
|
136
|
-
|
|
141
|
+
async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
|
|
142
|
+
yield {ChatEvent.STATUS: event}
|
|
137
143
|
tasks = [read_webpage_and_extract_content(query, url) for url in urls]
|
|
138
144
|
results = await asyncio.gather(*tasks)
|
|
139
145
|
|
|
@@ -141,7 +147,7 @@ async def read_webpages(
|
|
|
141
147
|
response[query]["webpages"] = [
|
|
142
148
|
{"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
|
|
143
149
|
]
|
|
144
|
-
|
|
150
|
+
yield response
|
|
145
151
|
|
|
146
152
|
|
|
147
153
|
async def read_webpage_and_extract_content(
|