khoj 1.16.1.dev15__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- khoj/__init__.py +0 -0
- khoj/app/README.md +94 -0
- khoj/app/__init__.py +0 -0
- khoj/app/asgi.py +16 -0
- khoj/app/settings.py +192 -0
- khoj/app/urls.py +25 -0
- khoj/configure.py +424 -0
- khoj/database/__init__.py +0 -0
- khoj/database/adapters/__init__.py +1234 -0
- khoj/database/admin.py +290 -0
- khoj/database/apps.py +6 -0
- khoj/database/management/__init__.py +0 -0
- khoj/database/management/commands/__init__.py +0 -0
- khoj/database/management/commands/change_generated_images_url.py +61 -0
- khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
- khoj/database/migrations/0001_khojuser.py +98 -0
- khoj/database/migrations/0002_googleuser.py +32 -0
- khoj/database/migrations/0003_vector_extension.py +10 -0
- khoj/database/migrations/0004_content_types_and_more.py +181 -0
- khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
- khoj/database/migrations/0006_embeddingsdates.py +33 -0
- khoj/database/migrations/0007_add_conversation.py +27 -0
- khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
- khoj/database/migrations/0009_khojapiuser.py +24 -0
- khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
- khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
- khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
- khoj/database/migrations/0012_entry_file_source.py +21 -0
- khoj/database/migrations/0013_subscription.py +37 -0
- khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
- khoj/database/migrations/0015_alter_subscription_user.py +21 -0
- khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
- khoj/database/migrations/0017_searchmodel.py +32 -0
- khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
- khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
- khoj/database/migrations/0020_reflectivequestion.py +36 -0
- khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
- khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
- khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
- khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
- khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
- khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
- khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
- khoj/database/migrations/0029_userrequests.py +27 -0
- khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
- khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
- khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
- khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
- khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
- khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
- khoj/database/migrations/0035_processlock.py +26 -0
- khoj/database/migrations/0036_alter_processlock_name.py +19 -0
- khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
- khoj/database/migrations/0036_publicconversation.py +42 -0
- khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
- khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
- khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
- khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
- khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
- khoj/database/migrations/0040_alter_processlock_name.py +26 -0
- khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
- khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
- khoj/database/migrations/0042_serverchatsettings.py +46 -0
- khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
- khoj/database/migrations/0044_conversation_file_filters.py +17 -0
- khoj/database/migrations/0045_fileobject.py +37 -0
- khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
- khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
- khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
- khoj/database/migrations/0049_datastore.py +38 -0
- khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
- khoj/database/migrations/0050_alter_processlock_name.py +25 -0
- khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
- khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
- khoj/database/migrations/__init__.py +0 -0
- khoj/database/models/__init__.py +402 -0
- khoj/database/tests.py +3 -0
- khoj/interface/email/feedback.html +34 -0
- khoj/interface/email/magic_link.html +17 -0
- khoj/interface/email/task.html +40 -0
- khoj/interface/email/welcome.html +61 -0
- khoj/interface/web/404.html +56 -0
- khoj/interface/web/agent.html +312 -0
- khoj/interface/web/agents.html +276 -0
- khoj/interface/web/assets/icons/agents.svg +6 -0
- khoj/interface/web/assets/icons/automation.svg +37 -0
- khoj/interface/web/assets/icons/cancel.svg +3 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/assets/icons/collapse.svg +17 -0
- khoj/interface/web/assets/icons/computer.png +0 -0
- khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
- khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
- khoj/interface/web/assets/icons/copy-button.svg +5 -0
- khoj/interface/web/assets/icons/credit-card.png +0 -0
- khoj/interface/web/assets/icons/delete.svg +26 -0
- khoj/interface/web/assets/icons/docx.svg +7 -0
- khoj/interface/web/assets/icons/edit.svg +4 -0
- khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
- khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
- khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
- khoj/interface/web/assets/icons/favicon.icns +0 -0
- khoj/interface/web/assets/icons/github.svg +1 -0
- khoj/interface/web/assets/icons/key.svg +4 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
- khoj/interface/web/assets/icons/logotype.svg +1 -0
- khoj/interface/web/assets/icons/markdown.svg +1 -0
- khoj/interface/web/assets/icons/new.svg +23 -0
- khoj/interface/web/assets/icons/notion.svg +4 -0
- khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
- khoj/interface/web/assets/icons/org.svg +1 -0
- khoj/interface/web/assets/icons/pdf.svg +23 -0
- khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
- khoj/interface/web/assets/icons/plaintext.svg +1 -0
- khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
- khoj/interface/web/assets/icons/search.svg +25 -0
- khoj/interface/web/assets/icons/send.svg +1 -0
- khoj/interface/web/assets/icons/share.svg +8 -0
- khoj/interface/web/assets/icons/speaker.svg +4 -0
- khoj/interface/web/assets/icons/stop-solid.svg +37 -0
- khoj/interface/web/assets/icons/sync.svg +4 -0
- khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
- khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
- khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
- khoj/interface/web/assets/icons/voice.svg +8 -0
- khoj/interface/web/assets/icons/web.svg +2 -0
- khoj/interface/web/assets/icons/whatsapp.svg +17 -0
- khoj/interface/web/assets/khoj.css +237 -0
- khoj/interface/web/assets/markdown-it.min.js +8476 -0
- khoj/interface/web/assets/natural-cron.min.js +1 -0
- khoj/interface/web/assets/org.min.js +1823 -0
- khoj/interface/web/assets/pico.min.css +5 -0
- khoj/interface/web/assets/purify.min.js +3 -0
- khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
- khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
- khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
- khoj/interface/web/assets/utils.js +33 -0
- khoj/interface/web/base_config.html +445 -0
- khoj/interface/web/chat.html +3546 -0
- khoj/interface/web/config.html +1011 -0
- khoj/interface/web/config_automation.html +1103 -0
- khoj/interface/web/content_source_computer_input.html +139 -0
- khoj/interface/web/content_source_github_input.html +216 -0
- khoj/interface/web/content_source_notion_input.html +94 -0
- khoj/interface/web/khoj.webmanifest +51 -0
- khoj/interface/web/login.html +219 -0
- khoj/interface/web/public_conversation.html +2006 -0
- khoj/interface/web/search.html +470 -0
- khoj/interface/web/utils.html +48 -0
- khoj/main.py +241 -0
- khoj/manage.py +22 -0
- khoj/migrations/__init__.py +0 -0
- khoj/migrations/migrate_offline_chat_default_model.py +69 -0
- khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
- khoj/migrations/migrate_offline_chat_schema.py +83 -0
- khoj/migrations/migrate_offline_model.py +29 -0
- khoj/migrations/migrate_processor_config_openai.py +67 -0
- khoj/migrations/migrate_server_pg.py +138 -0
- khoj/migrations/migrate_version.py +17 -0
- khoj/processor/__init__.py +0 -0
- khoj/processor/content/__init__.py +0 -0
- khoj/processor/content/docx/__init__.py +0 -0
- khoj/processor/content/docx/docx_to_entries.py +110 -0
- khoj/processor/content/github/__init__.py +0 -0
- khoj/processor/content/github/github_to_entries.py +224 -0
- khoj/processor/content/images/__init__.py +0 -0
- khoj/processor/content/images/image_to_entries.py +118 -0
- khoj/processor/content/markdown/__init__.py +0 -0
- khoj/processor/content/markdown/markdown_to_entries.py +165 -0
- khoj/processor/content/notion/notion_to_entries.py +260 -0
- khoj/processor/content/org_mode/__init__.py +0 -0
- khoj/processor/content/org_mode/org_to_entries.py +231 -0
- khoj/processor/content/org_mode/orgnode.py +532 -0
- khoj/processor/content/pdf/__init__.py +0 -0
- khoj/processor/content/pdf/pdf_to_entries.py +116 -0
- khoj/processor/content/plaintext/__init__.py +0 -0
- khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
- khoj/processor/content/text_to_entries.py +297 -0
- khoj/processor/conversation/__init__.py +0 -0
- khoj/processor/conversation/anthropic/__init__.py +0 -0
- khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
- khoj/processor/conversation/anthropic/utils.py +114 -0
- khoj/processor/conversation/offline/__init__.py +0 -0
- khoj/processor/conversation/offline/chat_model.py +231 -0
- khoj/processor/conversation/offline/utils.py +78 -0
- khoj/processor/conversation/offline/whisper.py +15 -0
- khoj/processor/conversation/openai/__init__.py +0 -0
- khoj/processor/conversation/openai/gpt.py +187 -0
- khoj/processor/conversation/openai/utils.py +129 -0
- khoj/processor/conversation/openai/whisper.py +13 -0
- khoj/processor/conversation/prompts.py +758 -0
- khoj/processor/conversation/utils.py +262 -0
- khoj/processor/embeddings.py +117 -0
- khoj/processor/speech/__init__.py +0 -0
- khoj/processor/speech/text_to_speech.py +51 -0
- khoj/processor/tools/__init__.py +0 -0
- khoj/processor/tools/online_search.py +225 -0
- khoj/routers/__init__.py +0 -0
- khoj/routers/api.py +626 -0
- khoj/routers/api_agents.py +43 -0
- khoj/routers/api_chat.py +1180 -0
- khoj/routers/api_config.py +434 -0
- khoj/routers/api_phone.py +86 -0
- khoj/routers/auth.py +181 -0
- khoj/routers/email.py +133 -0
- khoj/routers/helpers.py +1188 -0
- khoj/routers/indexer.py +349 -0
- khoj/routers/notion.py +91 -0
- khoj/routers/storage.py +35 -0
- khoj/routers/subscription.py +104 -0
- khoj/routers/twilio.py +36 -0
- khoj/routers/web_client.py +471 -0
- khoj/search_filter/__init__.py +0 -0
- khoj/search_filter/base_filter.py +15 -0
- khoj/search_filter/date_filter.py +217 -0
- khoj/search_filter/file_filter.py +30 -0
- khoj/search_filter/word_filter.py +29 -0
- khoj/search_type/__init__.py +0 -0
- khoj/search_type/text_search.py +241 -0
- khoj/utils/__init__.py +0 -0
- khoj/utils/cli.py +93 -0
- khoj/utils/config.py +81 -0
- khoj/utils/constants.py +24 -0
- khoj/utils/fs_syncer.py +249 -0
- khoj/utils/helpers.py +418 -0
- khoj/utils/initialization.py +146 -0
- khoj/utils/jsonl.py +43 -0
- khoj/utils/models.py +47 -0
- khoj/utils/rawconfig.py +160 -0
- khoj/utils/state.py +46 -0
- khoj/utils/yaml.py +43 -0
- khoj-1.16.1.dev15.dist-info/METADATA +178 -0
- khoj-1.16.1.dev15.dist-info/RECORD +242 -0
- khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
- khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
- khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/processor/conversation/utils.py
ADDED
@@ -0,0 +1,262 @@
import json
import logging
import math
import queue
from datetime import datetime
from time import perf_counter
from typing import Any, Dict, List, Optional

import tiktoken
from langchain.schema import ChatMessage
from llama_cpp.llama import Llama
from transformers import AutoTokenizer

from khoj.database.adapters import ConversationAdapters
from khoj.database.models import ClientApplication, KhojUser
from khoj.processor.conversation.offline.utils import download_model, infer_max_tokens
from khoj.utils import state
from khoj.utils.helpers import is_none_or_empty, merge_dicts

logger = logging.getLogger(__name__)
model_to_prompt_size = {
    "gpt-3.5-turbo": 12000,
    "gpt-3.5-turbo-0125": 12000,
    "gpt-4-0125-preview": 20000,
    "gpt-4-turbo-preview": 20000,
    "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500,
    "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500,
}
model_to_tokenizer: Dict[str, str] = {}


class ThreadedGenerator:
    def __init__(self, compiled_references, online_results, completion_func=None):
        self.queue = queue.Queue()
        self.compiled_references = compiled_references
        self.online_results = online_results
        self.completion_func = completion_func
        self.response = ""
        self.start_time = perf_counter()

    def __iter__(self):
        return self

    def __next__(self):
        item = self.queue.get()
        if item is StopIteration:
            time_to_response = perf_counter() - self.start_time
            logger.info(f"Chat streaming took: {time_to_response:.3f} seconds")
            if self.completion_func:
                # The completion func effectively acts as a callback.
                # It adds the aggregated response to the conversation history.
                self.completion_func(chat_response=self.response)
            raise StopIteration
        return item

    def send(self, data):
        if self.response == "":
            time_to_first_response = perf_counter() - self.start_time
            logger.info(f"First response took: {time_to_first_response:.3f} seconds")

        self.response += data
        self.queue.put(data)

    def close(self):
        if self.compiled_references and len(self.compiled_references) > 0:
            self.queue.put(f"### compiled references:{json.dumps(self.compiled_references)}")
        if self.online_results and len(self.online_results) > 0:
            self.queue.put(f"### compiled references:{json.dumps(self.online_results)}")
        self.queue.put(StopIteration)


def message_to_log(
    user_message, chat_response, user_message_metadata={}, khoj_message_metadata={}, conversation_log=[]
):
    """Create json logs from messages, metadata for conversation log"""
    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    khoj_response_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    human_log = merge_dicts({"message": user_message, "by": "you"}, user_message_metadata)

    # Create json log from GPT's response
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": chat_response, "by": "khoj", "created": khoj_response_time}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log


def save_to_conversation_log(
    q: str,
    chat_response: str,
    user: KhojUser,
    meta_log: Dict,
    user_message_time: str = None,
    compiled_references: List[Dict[str, Any]] = [],
    online_results: Dict[str, Any] = {},
    inferred_queries: List[str] = [],
    intent_type: str = "remember",
    client_application: ClientApplication = None,
    conversation_id: int = None,
    automation_id: str = None,
):
    user_message_time = user_message_time or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    updated_conversation = message_to_log(
        user_message=q,
        chat_response=chat_response,
        user_message_metadata={"created": user_message_time},
        khoj_message_metadata={
            "context": compiled_references,
            "intent": {"inferred-queries": inferred_queries, "type": intent_type},
            "onlineContext": online_results,
            "automationId": automation_id,
        },
        conversation_log=meta_log.get("chat", []),
    )
    ConversationAdapters.save_conversation(
        user,
        {"chat": updated_conversation},
        client_application=client_application,
        conversation_id=conversation_id,
        user_message=q,
    )

    logger.info(
        f"""
Saved Conversation Turn
You ({user.username}): "{q}"

Khoj: "{inferred_queries if ("text-to-image" in intent_type) else chat_response}"
""".strip()
    )


def generate_chatml_messages_with_context(
    user_message,
    system_message=None,
    conversation_log={},
    model_name="gpt-3.5-turbo",
    loaded_model: Optional[Llama] = None,
    max_prompt_size=None,
    tokenizer_name=None,
):
    """Generate messages for ChatGPT with context from previous conversation"""
    # Set max prompt size from user config or based on pre-configured for model and machine specs
    if not max_prompt_size:
        if loaded_model:
            max_prompt_size = infer_max_tokens(loaded_model.n_ctx(), model_to_prompt_size.get(model_name, math.inf))
        else:
            max_prompt_size = model_to_prompt_size.get(model_name, 2000)

    # Scale lookback turns proportional to max prompt size supported by model
    lookback_turns = max_prompt_size // 750

    # Extract Chat History for Context
    chat_logs = []
    for chat in conversation_log.get("chat", []):
        chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n"
        chat_logs += [chat["message"] + chat_notes]

    rest_backnforths: List[ChatMessage] = []
    # Extract in reverse chronological order
    for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
        if len(rest_backnforths) >= 2 * lookback_turns:
            break
        rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]

    # Format user and system messages to chatml format
    messages = []
    if not is_none_or_empty(user_message):
        messages.append(ChatMessage(content=user_message, role="user"))
    if len(rest_backnforths) > 0:
        messages += rest_backnforths
    if not is_none_or_empty(system_message):
        messages.append(ChatMessage(content=system_message, role="system"))

    # Truncate oldest messages from conversation history until under max supported prompt size by model
    messages = truncate_messages(messages, max_prompt_size, model_name, loaded_model, tokenizer_name)

    # Return message in chronological order
    return messages[::-1]


def truncate_messages(
    messages: list[ChatMessage],
    max_prompt_size,
    model_name: str,
    loaded_model: Optional[Llama] = None,
    tokenizer_name=None,
) -> list[ChatMessage]:
    """Truncate messages to fit within max prompt size supported by model"""

    default_tokenizer = "hf-internal-testing/llama-tokenizer"

    try:
        if loaded_model:
            encoder = loaded_model.tokenizer()
        elif model_name.startswith("gpt-"):
            encoder = tiktoken.encoding_for_model(model_name)
        elif tokenizer_name:
            if tokenizer_name in state.pretrained_tokenizers:
                encoder = state.pretrained_tokenizers[tokenizer_name]
            else:
                encoder = AutoTokenizer.from_pretrained(tokenizer_name)
                state.pretrained_tokenizers[tokenizer_name] = encoder
        else:
            encoder = download_model(model_name).tokenizer()
    except:
        if default_tokenizer in state.pretrained_tokenizers:
            encoder = state.pretrained_tokenizers[default_tokenizer]
        else:
            encoder = AutoTokenizer.from_pretrained(default_tokenizer)
            state.pretrained_tokenizers[default_tokenizer] = encoder
        logger.warning(
            f"Fallback to default chat model tokenizer: {tokenizer_name}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
        )

    # Extract system message from messages
    system_message = None
    for idx, message in enumerate(messages):
        if message.role == "system":
            system_message = messages.pop(idx)
            break

    system_message_tokens = (
        len(encoder.encode(system_message.content)) if system_message and type(system_message.content) == str else 0
    )

    tokens = sum([len(encoder.encode(message.content)) for message in messages if type(message.content) == str])

    # Drop older messages until under max supported prompt size by model
    while (tokens + system_message_tokens) > max_prompt_size and len(messages) > 1:
        messages.pop()
        tokens = sum([len(encoder.encode(message.content)) for message in messages if type(message.content) == str])

    # Truncate current message if still over max supported prompt size by model
    if (tokens + system_message_tokens) > max_prompt_size:
        current_message = "\n".join(messages[0].content.split("\n")[:-1]) if type(messages[0].content) == str else ""
        original_question = "\n".join(messages[0].content.split("\n")[-1:]) if type(messages[0].content) == str else ""
        original_question = f"\n{original_question}"
        original_question_tokens = len(encoder.encode(original_question))
        remaining_tokens = max_prompt_size - system_message_tokens
        if remaining_tokens > original_question_tokens:
            remaining_tokens -= original_question_tokens
            truncated_message = encoder.decode(encoder.encode(current_message)[:remaining_tokens]).strip()
            messages = [ChatMessage(content=truncated_message + original_question, role=messages[0].role)]
        else:
            truncated_message = encoder.decode(encoder.encode(original_question)[:remaining_tokens]).strip()
            messages = [ChatMessage(content=truncated_message, role=messages[0].role)]
        logger.debug(
            f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message}"
        )

    return messages + [system_message] if system_message else messages


def reciprocal_conversation_to_chatml(message_pair):
    """Convert a single back and forth between user and assistant to chatml format"""
    return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])]
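For orientation, here is a minimal usage sketch (not part of the package) of how the ThreadedGenerator above is meant to be driven: a producer thread streams chunks with send() and close() while the caller consumes the object as a plain iterator. The fake_llm producer below is hypothetical.

import threading

from khoj.processor.conversation.utils import ThreadedGenerator

def fake_llm(g: ThreadedGenerator):
    for chunk in ["Hello", ", ", "world", "!"]:
        g.send(chunk)  # each chunk is appended to g.response and queued for the consumer
    g.close()  # enqueues compiled references (if any) and the StopIteration sentinel

g = ThreadedGenerator(compiled_references=[], online_results={}, completion_func=None)
threading.Thread(target=fake_llm, args=(g,)).start()
print("".join(g))  # blocks on the queue until close() -> prints "Hello, world!"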
khoj/processor/embeddings.py
ADDED
@@ -0,0 +1,117 @@
import logging
from typing import List

import requests
import tqdm
from sentence_transformers import CrossEncoder, SentenceTransformer
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)
from torch import nn

from khoj.utils.helpers import get_device, merge_dicts
from khoj.utils.rawconfig import SearchResponse

logger = logging.getLogger(__name__)


class EmbeddingsModel:
    def __init__(
        self,
        model_name: str = "thenlper/gte-small",
        embeddings_inference_endpoint: str = None,
        embeddings_inference_endpoint_api_key: str = None,
        query_encode_kwargs: dict = {},
        docs_encode_kwargs: dict = {},
        model_kwargs: dict = {},
    ):
        default_query_encode_kwargs = {"show_progress_bar": False, "normalize_embeddings": True}
        default_docs_encode_kwargs = {"show_progress_bar": True, "normalize_embeddings": True}
        self.query_encode_kwargs = merge_dicts(query_encode_kwargs, default_query_encode_kwargs)
        self.docs_encode_kwargs = merge_dicts(docs_encode_kwargs, default_docs_encode_kwargs)
        self.model_kwargs = merge_dicts(model_kwargs, {"device": get_device()})
        self.model_name = model_name
        self.inference_endpoint = embeddings_inference_endpoint
        self.api_key = embeddings_inference_endpoint_api_key
        self.embeddings_model = SentenceTransformer(self.model_name, **self.model_kwargs)

    def inference_server_enabled(self) -> bool:
        return self.api_key is not None and self.inference_endpoint is not None

    def embed_query(self, query):
        if self.inference_server_enabled():
            return self.embed_with_api([query])[0]
        return self.embeddings_model.encode([query], **self.query_encode_kwargs)[0]

    @retry(
        retry=retry_if_exception_type(requests.exceptions.HTTPError),
        wait=wait_random_exponential(multiplier=1, max=10),
        stop=stop_after_attempt(5),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
    )
    def embed_with_api(self, docs):
        payload = {"inputs": docs}
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        try:
            response = requests.post(self.inference_endpoint, json=payload, headers=headers)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logger.error(
                f" Error while calling inference endpoint {self.inference_endpoint} with error {e}, response {response.json()} ",
                exc_info=True,
            )
            raise e
        return response.json()["embeddings"]

    def embed_documents(self, docs):
        if self.inference_server_enabled():
            if "huggingface" not in self.inference_endpoint:
                logger.warning(
                    f"Unsupported inference endpoint: {self.inference_endpoint}. Only HuggingFace supported. Generating embeddings on device instead."
                )
                return self.embeddings_model.encode(docs, **self.docs_encode_kwargs).tolist()
            # break up the docs payload in chunks of 1000 to avoid hitting rate limits
            embeddings = []
            with tqdm.tqdm(total=len(docs)) as pbar:
                for i in range(0, len(docs), 1000):
                    docs_to_embed = docs[i : i + 1000]
                    generated_embeddings = self.embed_with_api(docs_to_embed)
                    embeddings += generated_embeddings
                    pbar.update(1000)
            return embeddings
        return self.embeddings_model.encode(docs, **self.docs_encode_kwargs).tolist() if docs else []


class CrossEncoderModel:
    def __init__(
        self,
        model_name: str = "mixedbread-ai/mxbai-rerank-xsmall-v1",
        cross_encoder_inference_endpoint: str = None,
        cross_encoder_inference_endpoint_api_key: str = None,
    ):
        self.model_name = model_name
        self.cross_encoder_model = CrossEncoder(model_name=self.model_name, device=get_device())
        self.inference_endpoint = cross_encoder_inference_endpoint
        self.api_key = cross_encoder_inference_endpoint_api_key

    def inference_server_enabled(self) -> bool:
        return self.api_key is not None and self.inference_endpoint is not None

    def predict(self, query, hits: List[SearchResponse], key: str = "compiled"):
        if self.inference_server_enabled() and "huggingface" in self.inference_endpoint:
            target_url = f"{self.inference_endpoint}"
            payload = {"inputs": {"query": query, "passages": [hit.additional[key] for hit in hits]}}
            headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
            response = requests.post(target_url, json=payload, headers=headers)
            return response.json()["scores"]

        cross_inp = [[query, hit.additional[key]] for hit in hits]
        cross_scores = self.cross_encoder_model.predict(cross_inp, activation_fct=nn.Sigmoid())
        return cross_scores
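A minimal sketch (not from the package) of the local encode path of EmbeddingsModel above: with no inference endpoint or API key configured, inference_server_enabled() is False, so both methods encode on-device via SentenceTransformer.

from khoj.processor.embeddings import EmbeddingsModel

model = EmbeddingsModel()  # default thenlper/gte-small, no remote endpoint configured
query_vector = model.embed_query("Where are my notes on beekeeping?")
doc_vectors = model.embed_documents(["Note one", "Note two"])
# gte-small is assumed to produce 384-dimensional, L2-normalized vectors
print(len(doc_vectors), len(query_vector))  # expected: 2 384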
khoj/processor/speech/__init__.py
ADDED
File without changes
khoj/processor/speech/text_to_speech.py
ADDED
@@ -0,0 +1,51 @@
import json  # Used for working with JSON data
import os

import requests  # Used for making HTTP requests
from bs4 import BeautifulSoup
from markdown_it import MarkdownIt

# Define constants for the script
CHUNK_SIZE = 1024  # Size of chunks to read/write at a time
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY", None)  # Your API key for authentication
VOICE_ID = "RPEIZnKMqlQiZyZd1Dae"  # ID of the voice model to use. MALE - Christopher - friendly guy next door.
ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"  # Base URL for the Text-to-Speech API

markdown_renderer = MarkdownIt()


def is_eleven_labs_enabled():
    return ELEVEN_LABS_API_KEY is not None


def generate_text_to_speech(
    text_to_speak: str,
    voice_id: str = VOICE_ID,
):
    if not is_eleven_labs_enabled():
        return "Eleven Labs API key is not set"

    # Convert the incoming text from markdown format to plain text
    html = markdown_renderer.render(text_to_speak)
    text = "".join(BeautifulSoup(html, features="lxml").findAll(text=True))

    # Construct the URL for the Text-to-Speech API request
    tts_url = f"{ELEVEN_API_URL}/{voice_id}/stream"

    # Set up headers for the API request, including the API key for authentication
    headers = {"Accept": "application/json", "xi-api-key": ELEVEN_LABS_API_KEY}

    # Set up the data payload for the API request, including the text and voice settings
    data = {
        "text": text,
        # "model_id": "eleven_multilingual_v2",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.8, "style": 0.0, "use_speaker_boost": True},
    }

    # Make the POST request to the TTS API with headers and data, enabling streaming response
    response = requests.post(tts_url, headers=headers, json=data, stream=True)

    if response.ok:
        return response
    else:
        raise Exception(f"Failed to generate text-to-speech: {response.text}")
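A hypothetical caller sketch for generate_text_to_speech above, streaming the ElevenLabs audio to disk in CHUNK_SIZE pieces. Note the function returns the string "Eleven Labs API key is not set" rather than a response object when the API key is absent, so the happy path below assumes ELEVEN_LABS_API_KEY is set in the environment.

from khoj.processor.speech.text_to_speech import CHUNK_SIZE, generate_text_to_speech

response = generate_text_to_speech("Hello **world**")  # markdown is flattened to plain text first
# Assumes the API key is configured; otherwise response is an error string, not a Response
with open("speech.mp3", "wb") as audio_file:
    for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
        audio_file.write(chunk)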
khoj/processor/tools/__init__.py
ADDED
File without changes
khoj/processor/tools/online_search.py
ADDED
@@ -0,0 +1,225 @@
import asyncio
import json
import logging
import os
import urllib.parse
from collections import defaultdict
from typing import Callable, Dict, List, Optional, Tuple, Union

import aiohttp
from bs4 import BeautifulSoup
from markdownify import markdownify

from khoj.routers.helpers import (
    extract_relevant_info,
    generate_online_subqueries,
    infer_webpage_urls,
)
from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
from khoj.utils.rawconfig import LocationData

logger = logging.getLogger(__name__)

SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
SERPER_DEV_URL = "https://google.serper.dev/search"

JINA_READER_API_URL = "https://r.jina.ai/"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")

OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY")
OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI"
OLOSTEP_QUERY_PARAMS = {
    "timeout": 35,  # seconds
    "waitBeforeScraping": 1,  # seconds
    "saveHtml": "False",
    "saveMarkdown": "True",
    "removeCSSselectors": "default",
    "htmlTransformer": "none",
    "removeImages": "True",
    "fastLane": "True",
    # Similar to Stripe's API, the expand parameters avoid the need to make a second API call
    # to retrieve the dataset (from the dataset API) if you only need the markdown or html.
    "expandMarkdown": "True",
    "expandHtml": "False",
}
MAX_WEBPAGES_TO_READ = 1


async def search_online(
    query: str,
    conversation_history: dict,
    location: LocationData,
    send_status_func: Optional[Callable] = None,
    custom_filters: List[str] = [],
):
    query += " ".join(custom_filters)
    if not is_internet_connected():
        logger.warn("Cannot search online as not connected to internet")
        return {}

    # Breakdown the query into subqueries to get the correct answer
    subqueries = await generate_online_subqueries(query, conversation_history, location)
    response_dict = {}

    if subqueries:
        logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
        if send_status_func:
            subqueries_str = "\n- " + "\n- ".join(list(subqueries))
            await send_status_func(f"**🌐 Searching the Internet for**: {subqueries_str}")

    with timer(f"Internet searches for {list(subqueries)} took", logger):
        search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
        search_tasks = [search_func(subquery) for subquery in subqueries]
        search_results = await asyncio.gather(*search_tasks)
        response_dict = {subquery: search_result for subquery, search_result in search_results}

    # Gather distinct web page data from organic results of each subquery without an instant answer.
    # Content of web pages is directly available when Jina is used for search.
    webpages = {
        (organic.get("link"), subquery, organic.get("content"))
        for subquery in response_dict
        for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]
        if "answerBox" not in response_dict[subquery]
    }

    # Read, extract relevant info from the retrieved web pages
    if webpages:
        webpage_links = [link for link, _, _ in webpages]
        logger.info(f"🌐👀 Reading web pages at: {list(webpage_links)}")
        if send_status_func:
            webpage_links_str = "\n- " + "\n- ".join(list(webpage_links))
            await send_status_func(f"**📖 Reading web pages**: {webpage_links_str}")
    tasks = [read_webpage_and_extract_content(subquery, link, content) for link, subquery, content in webpages]
    results = await asyncio.gather(*tasks)

    # Collect extracted info from the retrieved web pages
    for subquery, webpage_extract, url in results:
        if webpage_extract is not None:
            response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract}

    return response_dict


async def search_with_google(query: str) -> Tuple[str, Dict[str, List[Dict]]]:
    payload = json.dumps({"q": query})
    headers = {"X-API-KEY": SERPER_DEV_API_KEY, "Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        async with session.post(SERPER_DEV_URL, headers=headers, data=payload) as response:
            if response.status != 200:
                logger.error(await response.text())
                return query, {}
            json_response = await response.json()
            extraction_fields = ["organic", "answerBox", "peopleAlsoAsk", "knowledgeGraph"]
            extracted_search_result = {
                field: json_response[field]
                for field in extraction_fields
                if not is_none_or_empty(json_response.get(field))
            }

            return query, extracted_search_result


async def read_webpages(
    query: str, conversation_history: dict, location: LocationData, send_status_func: Optional[Callable] = None
):
    "Infer web pages to read from the query and extract relevant information from them"
    logger.info(f"Inferring web pages to read")
    if send_status_func:
        await send_status_func(f"**🧐 Inferring web pages to read**")
    urls = await infer_webpage_urls(query, conversation_history, location)

    logger.info(f"Reading web pages at: {urls}")
    if send_status_func:
        webpage_links_str = "\n- " + "\n- ".join(list(urls))
        await send_status_func(f"**📖 Reading web pages**: {webpage_links_str}")
    tasks = [read_webpage_and_extract_content(query, url) for url in urls]
    results = await asyncio.gather(*tasks)

    response: Dict[str, Dict] = defaultdict(dict)
    response[query]["webpages"] = [
        {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None
    ]
    return response


async def read_webpage_and_extract_content(
    subquery: str, url: str, content: str = None
) -> Tuple[str, Union[None, str], str]:
    try:
        if is_none_or_empty(content):
            with timer(f"Reading web page at '{url}' took", logger):
                content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url)
        with timer(f"Extracting relevant information from web page at '{url}' took", logger):
            extracted_info = await extract_relevant_info(subquery, content)
        return subquery, extracted_info, url
    except Exception as e:
        logger.error(f"Failed to read web page at '{url}' with {e}")
        return subquery, None, url


async def read_webpage_at_url(web_url: str) -> str:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }

    async with aiohttp.ClientSession() as session:
        async with session.get(web_url, headers=headers, timeout=30) as response:
            response.raise_for_status()
            html = await response.text()
            parsed_html = BeautifulSoup(html, "html.parser")
            body = parsed_html.body.get_text(separator="\n", strip=True)
            return markdownify(body)


async def read_webpage_with_olostep(web_url: str) -> str:
    headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
    web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
    web_scraping_params["url"] = web_url

    async with aiohttp.ClientSession() as session:
        async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["markdown_content"]


async def read_webpage_with_jina(web_url: str) -> str:
    jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}"
    headers = {"Accept": "application/json", "X-Timeout": "30"}
    if JINA_API_KEY:
        headers["Authorization"] = f"Bearer {JINA_API_KEY}"

    async with aiohttp.ClientSession() as session:
        async with session.get(jina_reader_api_url, headers=headers) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json["data"]["content"]


async def search_with_jina(query: str) -> Tuple[str, Dict[str, List[Dict]]]:
    encoded_query = urllib.parse.quote(query)
    jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}"
    headers = {"Accept": "application/json"}
    if JINA_API_KEY:
        headers["Authorization"] = f"Bearer {JINA_API_KEY}"

    async with aiohttp.ClientSession() as session:
        async with session.get(jina_search_api_url, headers=headers) as response:
            if response.status != 200:
                logger.error(await response.text())
                return query, {}
            response_json = await response.json()
            parsed_response = [
                {
                    "title": item["title"],
                    "content": item.get("content"),
                    # rename description -> snippet for consistency
                    "snippet": item["description"],
                    # rename url -> link for consistency
                    "link": item["url"],
                }
                for item in response_json["data"]
            ]
            return query, {"organic": parsed_response}
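An illustrative driver (not in the package) for search_online above. It assumes a configured Khoj server, since generate_online_subqueries calls the chat model, and it assumes LocationData in khoj/utils/rawconfig.py accepts city, region and country fields. Serper is used when SERPER_DEV_API_KEY is set; otherwise search falls back to Jina.

import asyncio

from khoj.processor.tools.online_search import search_online
from khoj.utils.rawconfig import LocationData

async def main():
    # Field names on LocationData are assumed here for illustration
    location = LocationData(city="Berlin", region="Berlin", country="Germany")
    results = await search_online("When is the next solar eclipse?", conversation_history={}, location=location)
    for subquery, result in results.items():
        print(subquery, "->", list(result.keys()))  # e.g. ['organic', 'webpages']

asyncio.run(main())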
khoj/routers/__init__.py
ADDED
File without changes