khoj 1.28.3__py3-none-any.whl → 1.28.4.dev92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/configure.py +10 -14
- khoj/database/adapters/__init__.py +128 -44
- khoj/database/admin.py +6 -3
- khoj/database/management/commands/change_default_model.py +7 -72
- khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
- khoj/database/models/__init__.py +4 -6
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/1603-dc5fd983dbcd070d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/1970-c78f6acc8e16e30b.js +1 -0
- khoj/interface/compiled/_next/static/chunks/2261-748f7c327df3c8c1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3124-a4cea2eda163128d.js +1 -0
- khoj/interface/compiled/_next/static/chunks/3803-d74118a2d0182c52.js +1 -0
- khoj/interface/compiled/_next/static/chunks/5538-36aa824a75519c5b.js +1 -0
- khoj/interface/compiled/_next/static/chunks/5961-3c104d9736b7902b.js +3 -0
- khoj/interface/compiled/_next/static/chunks/8423-ebfa9bb9e2424ca3.js +1 -0
- khoj/interface/compiled/_next/static/chunks/9417-32c4db52ca42e681.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e9838b642913a071.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/page-4353b1a532795ad1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-d3edae545a1b5393.js → page-c9f13c865e739607.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-b0e7ff4baa3b5265.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/page-45720e1ed71e3ef5.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/{layout-d0f0a9067427fb20.js → layout-86561d2fac35a91a.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-ea462e20376b6dce.js → page-ecb8e1c192aa8834.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-ea6b73fdaf9b24ca.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-a5c277eff207959e.js → page-8e28deacb61f75aa.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{layout-a8f33dfe92f997fb.js → layout-254eaaf916449a60.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-2fab613a557d3cc5.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-cf7445cf0326bda3.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-30376aa7e9cfa342.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{main-f84cd3c1873cd842.js → main-1ea5c2e0fdef4626.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-8beec5b51cabb39a.js → webpack-27cf153c35b1338d.js} +1 -1
- khoj/interface/compiled/_next/static/css/{467a524c75e7d7c0.css → 0e9d53dcd7f11342.css} +1 -1
- khoj/interface/compiled/_next/static/css/{26c1c33d0423a7d8.css → 1f293605f2871853.css} +1 -1
- khoj/interface/compiled/_next/static/css/2d097a35da6bfe8d.css +1 -0
- khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +1 -0
- khoj/interface/compiled/_next/static/css/ed437164d77aa600.css +25 -0
- khoj/interface/compiled/_next/static/media/5455839c73f146e7-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/5984b96ba4822821-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/684adc3dde1b03f1-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/82e3b9a1bdaf0c26-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/8d1ea331386a0db8-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/91475f6526542a4f-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/b98b13dbc1c3b59c-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/c824d7a20139e39d-s.woff2 +0 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +1 -1
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +3 -3
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +3 -3
- khoj/interface/compiled/share/chat/index.html +1 -1
- khoj/interface/compiled/share/chat/index.txt +3 -3
- khoj/processor/content/docx/docx_to_entries.py +27 -21
- khoj/processor/content/github/github_to_entries.py +2 -2
- khoj/processor/content/images/image_to_entries.py +2 -2
- khoj/processor/content/markdown/markdown_to_entries.py +2 -2
- khoj/processor/content/notion/notion_to_entries.py +2 -2
- khoj/processor/content/org_mode/org_to_entries.py +2 -2
- khoj/processor/content/org_mode/orgnode.py +1 -1
- khoj/processor/content/pdf/pdf_to_entries.py +37 -29
- khoj/processor/content/plaintext/plaintext_to_entries.py +2 -2
- khoj/processor/content/text_to_entries.py +3 -4
- khoj/processor/conversation/anthropic/anthropic_chat.py +9 -1
- khoj/processor/conversation/google/gemini_chat.py +15 -2
- khoj/processor/conversation/google/utils.py +3 -1
- khoj/processor/conversation/offline/chat_model.py +4 -0
- khoj/processor/conversation/openai/gpt.py +6 -1
- khoj/processor/conversation/prompts.py +72 -13
- khoj/processor/conversation/utils.py +80 -13
- khoj/processor/image/generate.py +2 -0
- khoj/processor/tools/online_search.py +68 -18
- khoj/processor/tools/run_code.py +54 -20
- khoj/routers/api.py +10 -4
- khoj/routers/api_agents.py +8 -10
- khoj/routers/api_chat.py +89 -24
- khoj/routers/api_content.py +80 -8
- khoj/routers/helpers.py +176 -60
- khoj/routers/notion.py +1 -1
- khoj/routers/research.py +73 -31
- khoj/routers/web_client.py +0 -10
- khoj/search_type/text_search.py +3 -7
- khoj/utils/cli.py +2 -2
- khoj/utils/fs_syncer.py +2 -1
- khoj/utils/helpers.py +6 -3
- khoj/utils/rawconfig.py +32 -0
- khoj/utils/state.py +2 -1
- {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/METADATA +3 -3
- {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/RECORD +99 -105
- {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/WHEEL +1 -1
- khoj/interface/compiled/_next/static/chunks/1034-da58b679fcbb79c1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/1467-b331e469fe411347.js +0 -1
- khoj/interface/compiled/_next/static/chunks/1603-c1568f45947e9f2c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/1970-d44050bf658ae5cc.js +0 -1
- khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +0 -1
- khoj/interface/compiled/_next/static/chunks/3423-f4b7df2f6f3362f7.js +0 -1
- khoj/interface/compiled/_next/static/chunks/394-6bcb8c429f168f21.js +0 -3
- khoj/interface/compiled/_next/static/chunks/7113-f2e114d7034a0835.js +0 -1
- khoj/interface/compiled/_next/static/chunks/8423-da57554315eebcbe.js +0 -1
- khoj/interface/compiled/_next/static/chunks/8840-b8d7b9f0923c6651.js +0 -1
- khoj/interface/compiled/_next/static/chunks/9417-0d0fc7eb49a86abb.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-75636ab3a413fa8e.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/page-adbf3cd470da248f.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-96fcf62857bf8f30.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/page-222d348681b848a5.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/layout-7b30c541c05fb904.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/factchecker/page-bded0868a08ac4ba.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-3720f1362310bebb.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/page-210bd54db4841333.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-2df56074e42adaa0.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/page-a21b7e8890ed1209.js +0 -1
- khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
- khoj/interface/compiled/_next/static/css/553f9cdcc7a2bcd6.css +0 -1
- khoj/interface/compiled/_next/static/css/a795ee88875f4853.css +0 -25
- khoj/interface/compiled/_next/static/css/afd3d45cc65d55d8.css +0 -1
- khoj/interface/compiled/_next/static/media/0e790e04fd40ad16-s.p.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/4221e1667cd19c7d-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/6c276159aa0eb14b-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/6cc0b9500e4f9168-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/9d9319a7a2ac39c6-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/a75c8ea86756d52d-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/abce7c400ca31a51-s.woff2 +0 -0
- khoj/interface/compiled/_next/static/media/f759c939737fb668-s.woff2 +0 -0
- khoj/interface/compiled/factchecker/index.html +0 -1
- khoj/interface/compiled/factchecker/index.txt +0 -7
- /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_ssgManifest.js +0 -0
- {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/entry_points.txt +0 -0
- {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/licenses/LICENSE +0 -0
@@ -37,6 +37,7 @@ def extract_questions_gemini(
|
|
37
37
|
query_images: Optional[list[str]] = None,
|
38
38
|
vision_enabled: bool = False,
|
39
39
|
personality_context: Optional[str] = None,
|
40
|
+
query_files: str = None,
|
40
41
|
tracer: dict = {},
|
41
42
|
):
|
42
43
|
"""
|
@@ -83,9 +84,13 @@ def extract_questions_gemini(
|
|
83
84
|
images=query_images,
|
84
85
|
model_type=ChatModelOptions.ModelType.GOOGLE,
|
85
86
|
vision_enabled=vision_enabled,
|
87
|
+
attached_file_context=query_files,
|
86
88
|
)
|
87
89
|
|
88
|
-
messages = [
|
90
|
+
messages = []
|
91
|
+
|
92
|
+
messages.append(ChatMessage(content=prompt, role="user"))
|
93
|
+
messages.append(ChatMessage(content=system_prompt, role="system"))
|
89
94
|
|
90
95
|
response = gemini_send_message_to_model(
|
91
96
|
messages, api_key, model, response_type="json_object", temperature=temperature, tracer=tracer
|
@@ -108,7 +113,13 @@ def extract_questions_gemini(
|
|
108
113
|
|
109
114
|
|
110
115
|
def gemini_send_message_to_model(
|
111
|
-
messages,
|
116
|
+
messages,
|
117
|
+
api_key,
|
118
|
+
model,
|
119
|
+
response_type="text",
|
120
|
+
temperature=0,
|
121
|
+
model_kwargs=None,
|
122
|
+
tracer={},
|
112
123
|
):
|
113
124
|
"""
|
114
125
|
Send message to model
|
@@ -151,6 +162,7 @@ def converse_gemini(
|
|
151
162
|
agent: Agent = None,
|
152
163
|
query_images: Optional[list[str]] = None,
|
153
164
|
vision_available: bool = False,
|
165
|
+
query_files: str = None,
|
154
166
|
tracer={},
|
155
167
|
):
|
156
168
|
"""
|
@@ -209,6 +221,7 @@ def converse_gemini(
|
|
209
221
|
query_images=query_images,
|
210
222
|
vision_enabled=vision_available,
|
211
223
|
model_type=ChatModelOptions.ModelType.GOOGLE,
|
224
|
+
query_files=query_files,
|
212
225
|
)
|
213
226
|
|
214
227
|
messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
|
@@ -228,7 +228,9 @@ def format_messages_for_gemini(messages: list[ChatMessage], system_prompt: str =
|
|
228
228
|
if isinstance(message.content, list):
|
229
229
|
# Convert image_urls to PIL.Image and place them at beginning of list (better for Gemini)
|
230
230
|
message.content = [
|
231
|
-
get_image_from_url(item["image_url"]["url"]).content
|
231
|
+
get_image_from_url(item["image_url"]["url"]).content
|
232
|
+
if item["type"] == "image_url"
|
233
|
+
else item.get("text", "")
|
232
234
|
for item in sorted(message.content, key=lambda x: 0 if x["type"] == "image_url" else 1)
|
233
235
|
]
|
234
236
|
elif isinstance(message.content, str):
|
@@ -37,6 +37,7 @@ def extract_questions_offline(
|
|
37
37
|
max_prompt_size: int = None,
|
38
38
|
temperature: float = 0.7,
|
39
39
|
personality_context: Optional[str] = None,
|
40
|
+
query_files: str = None,
|
40
41
|
tracer: dict = {},
|
41
42
|
) -> List[str]:
|
42
43
|
"""
|
@@ -87,6 +88,7 @@ def extract_questions_offline(
|
|
87
88
|
loaded_model=offline_chat_model,
|
88
89
|
max_prompt_size=max_prompt_size,
|
89
90
|
model_type=ChatModelOptions.ModelType.OFFLINE,
|
91
|
+
query_files=query_files,
|
90
92
|
)
|
91
93
|
|
92
94
|
state.chat_lock.acquire()
|
@@ -152,6 +154,7 @@ def converse_offline(
|
|
152
154
|
location_data: LocationData = None,
|
153
155
|
user_name: str = None,
|
154
156
|
agent: Agent = None,
|
157
|
+
query_files: str = None,
|
155
158
|
tracer: dict = {},
|
156
159
|
) -> Union[ThreadedGenerator, Iterator[str]]:
|
157
160
|
"""
|
@@ -216,6 +219,7 @@ def converse_offline(
|
|
216
219
|
max_prompt_size=max_prompt_size,
|
217
220
|
tokenizer_name=tokenizer_name,
|
218
221
|
model_type=ChatModelOptions.ModelType.OFFLINE,
|
222
|
+
query_files=query_files,
|
219
223
|
)
|
220
224
|
|
221
225
|
truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
|
@@ -34,6 +34,7 @@ def extract_questions(
|
|
34
34
|
query_images: Optional[list[str]] = None,
|
35
35
|
vision_enabled: bool = False,
|
36
36
|
personality_context: Optional[str] = None,
|
37
|
+
query_files: str = None,
|
37
38
|
tracer: dict = {},
|
38
39
|
):
|
39
40
|
"""
|
@@ -79,9 +80,11 @@ def extract_questions(
|
|
79
80
|
images=query_images,
|
80
81
|
model_type=ChatModelOptions.ModelType.OPENAI,
|
81
82
|
vision_enabled=vision_enabled,
|
83
|
+
attached_file_context=query_files,
|
82
84
|
)
|
83
85
|
|
84
|
-
messages = [
|
86
|
+
messages = []
|
87
|
+
messages.append(ChatMessage(content=prompt, role="user"))
|
85
88
|
|
86
89
|
response = send_message_to_model(
|
87
90
|
messages,
|
@@ -148,6 +151,7 @@ def converse(
|
|
148
151
|
agent: Agent = None,
|
149
152
|
query_images: Optional[list[str]] = None,
|
150
153
|
vision_available: bool = False,
|
154
|
+
query_files: str = None,
|
151
155
|
tracer: dict = {},
|
152
156
|
):
|
153
157
|
"""
|
@@ -206,6 +210,7 @@ def converse(
|
|
206
210
|
query_images=query_images,
|
207
211
|
vision_enabled=vision_available,
|
208
212
|
model_type=ChatModelOptions.ModelType.OPENAI,
|
213
|
+
query_files=query_files,
|
209
214
|
)
|
210
215
|
truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
|
211
216
|
logger.debug(f"Conversation Context for GPT: {truncated_messages}")
|
@@ -870,25 +870,40 @@ Khoj:
|
|
870
870
|
# --
|
871
871
|
python_code_generation_prompt = PromptTemplate.from_template(
|
872
872
|
"""
|
873
|
-
You are Khoj, an advanced python programmer. You are tasked with constructing
|
873
|
+
You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query.
|
874
874
|
- The python program will run in a pyodide python sandbox with no network access.
|
875
|
-
- You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query
|
876
|
-
- The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet
|
875
|
+
- You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query.
|
876
|
+
- The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet.
|
877
|
+
- List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field.
|
878
|
+
- The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path).
|
877
879
|
- Do not try display images or plots in the code directly. The code should save the image or plot to a file instead.
|
878
880
|
- Write any document, charts etc. to be shared with the user to file. These files can be seen by the user.
|
879
881
|
- Use as much context from the previous questions and answers as required to generate your code.
|
880
882
|
{personality_context}
|
881
|
-
What code will you need to write
|
882
|
-
|
883
|
+
What code will you need to write to answer the user's question?
|
884
|
+
|
883
885
|
Current Date: {current_date}
|
884
886
|
User's Location: {location}
|
885
887
|
{username}
|
886
888
|
|
887
|
-
The JSON schema is of the form {{"
|
888
|
-
|
889
|
-
|
889
|
+
The response JSON schema is of the form {{"code": "<python_code>", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}}
|
890
|
+
Examples:
|
891
|
+
---
|
892
|
+
{{
|
893
|
+
"code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")"
|
894
|
+
}}
|
890
895
|
|
891
|
-
|
896
|
+
{{
|
897
|
+
"code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')",
|
898
|
+
"input_files": ["/home/linux/tasks.org"]
|
899
|
+
}}
|
900
|
+
|
901
|
+
{{
|
902
|
+
"code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')",
|
903
|
+
"input_links": ["https://population.un.org/world_population_by_year.csv"]
|
904
|
+
}}
|
905
|
+
|
906
|
+
Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else.
|
892
907
|
Context:
|
893
908
|
---
|
894
909
|
{context}
|
@@ -988,16 +1003,27 @@ You are an extremely smart and helpful title generator assistant. Given a user q
|
|
988
1003
|
|
989
1004
|
# Examples:
|
990
1005
|
User: Show a new Calvin and Hobbes quote every morning at 9am. My Current Location: Shanghai, China
|
991
|
-
|
1006
|
+
Assistant: Your daily Calvin and Hobbes Quote
|
992
1007
|
|
993
1008
|
User: Notify me when version 2.0.0 of the sentence transformers python package is released. My Current Location: Mexico City, Mexico
|
994
|
-
|
1009
|
+
Assistant: Sentence Transformers Python Package Version 2.0.0 Release
|
995
1010
|
|
996
1011
|
User: Gather the latest tech news on the first sunday of every month.
|
997
|
-
|
1012
|
+
Assistant: Your Monthly Dose of Tech News
|
998
1013
|
|
999
1014
|
User Query: {query}
|
1000
|
-
|
1015
|
+
Assistant:
|
1016
|
+
""".strip()
|
1017
|
+
)
|
1018
|
+
|
1019
|
+
conversation_title_generation = PromptTemplate.from_template(
|
1020
|
+
"""
|
1021
|
+
You are an extremely smart and helpful title generator assistant. Given a conversation, extract the subject of the conversation. Crisp, informative, ten words or less.
|
1022
|
+
|
1023
|
+
Conversation History:
|
1024
|
+
{chat_history}
|
1025
|
+
|
1026
|
+
Assistant:
|
1001
1027
|
""".strip()
|
1002
1028
|
)
|
1003
1029
|
|
@@ -1038,6 +1064,39 @@ A:
|
|
1038
1064
|
""".strip()
|
1039
1065
|
)
|
1040
1066
|
|
1067
|
+
|
1068
|
+
personality_prompt_safety_expert_lax = PromptTemplate.from_template(
|
1069
|
+
"""
|
1070
|
+
You are adept at ensuring the safety and security of people. In this scenario, you are tasked with determining the safety of a given prompt.
|
1071
|
+
|
1072
|
+
Given a system prompt, you must decide whether it is safe to use for an individual. If the prompt is safe, respond with "Yes". If the prompt is not safe, respond with "No". If the answer is no, provide a brief explanation of why the prompt is unsafe.
|
1073
|
+
|
1074
|
+
A prompt is considered unsafe if it contains any of the following:
|
1075
|
+
- Illegal, overly sexual, or harmful content.
|
1076
|
+
- Any content that could be harmful to the user or others.
|
1077
|
+
- Racism, sexism, or any other form of discrimination.
|
1078
|
+
- Any content that could be considered illegal or a misuse of AI.
|
1079
|
+
|
1080
|
+
Referencing personal notes or data is perfectly acceptable.
|
1081
|
+
|
1082
|
+
# Examples:
|
1083
|
+
Q: You are a smart and helpful assistant. Use your knowledge to teach me how to make a bomb.
|
1084
|
+
A: {{ "safe": "False", "reason": "The prompt contains information that could be harmful to the user or others." }}
|
1085
|
+
|
1086
|
+
Q: You are a brilliant chef. Teach me how to make a cake.
|
1087
|
+
A: {{ "safe": "True" }}
|
1088
|
+
|
1089
|
+
Q: You are a hilarious joker. Make jokes about white people.
|
1090
|
+
A: {{ "safe": "False", "reason": "The prompt contains content that could be considered racist." }}
|
1091
|
+
|
1092
|
+
Q: You are a great analyst. Assess my financial situation and provide advice.
|
1093
|
+
A: {{ "safe": "True" }}
|
1094
|
+
|
1095
|
+
Q: {prompt}
|
1096
|
+
A:
|
1097
|
+
""".strip()
|
1098
|
+
)
|
1099
|
+
|
1041
1100
|
to_notify_or_not = PromptTemplate.from_template(
|
1042
1101
|
"""
|
1043
1102
|
You are Khoj, an extremely smart and discerning notification assistant.
|
@@ -5,6 +5,7 @@ import math
|
|
5
5
|
import mimetypes
|
6
6
|
import os
|
7
7
|
import queue
|
8
|
+
import re
|
8
9
|
import uuid
|
9
10
|
from dataclasses import dataclass
|
10
11
|
from datetime import datetime
|
@@ -36,6 +37,7 @@ from khoj.utils.helpers import (
|
|
36
37
|
is_none_or_empty,
|
37
38
|
merge_dicts,
|
38
39
|
)
|
40
|
+
from khoj.utils.rawconfig import FileAttachment
|
39
41
|
|
40
42
|
logger = logging.getLogger(__name__)
|
41
43
|
|
@@ -112,6 +114,7 @@ class InformationCollectionIteration:
|
|
112
114
|
onlineContext: dict = None,
|
113
115
|
codeContext: dict = None,
|
114
116
|
summarizedResult: str = None,
|
117
|
+
warning: str = None,
|
115
118
|
):
|
116
119
|
self.tool = tool
|
117
120
|
self.query = query
|
@@ -119,6 +122,7 @@ class InformationCollectionIteration:
|
|
119
122
|
self.onlineContext = onlineContext
|
120
123
|
self.codeContext = codeContext
|
121
124
|
self.summarizedResult = summarizedResult
|
125
|
+
self.warning = warning
|
122
126
|
|
123
127
|
|
124
128
|
def construct_iteration_history(
|
@@ -144,7 +148,7 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
|
|
144
148
|
chat_history += f"User: {chat['intent']['query']}\n"
|
145
149
|
|
146
150
|
if chat["intent"].get("inferred-queries"):
|
147
|
-
chat_history += f'
|
151
|
+
chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
|
148
152
|
|
149
153
|
chat_history += f"{agent_name}: {chat['message']}\n\n"
|
150
154
|
elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
|
@@ -153,6 +157,16 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
|
|
153
157
|
elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
|
154
158
|
chat_history += f"User: {chat['intent']['query']}\n"
|
155
159
|
chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
|
160
|
+
elif chat["by"] == "you":
|
161
|
+
raw_query_files = chat.get("queryFiles")
|
162
|
+
if raw_query_files:
|
163
|
+
query_files: Dict[str, str] = {}
|
164
|
+
for file in raw_query_files:
|
165
|
+
query_files[file["name"]] = file["content"]
|
166
|
+
|
167
|
+
query_file_context = gather_raw_query_files(query_files)
|
168
|
+
chat_history += f"User: {query_file_context}\n"
|
169
|
+
|
156
170
|
return chat_history
|
157
171
|
|
158
172
|
|
@@ -241,8 +255,9 @@ def save_to_conversation_log(
|
|
241
255
|
conversation_id: str = None,
|
242
256
|
automation_id: str = None,
|
243
257
|
query_images: List[str] = None,
|
244
|
-
|
258
|
+
raw_query_files: List[FileAttachment] = [],
|
245
259
|
train_of_thought: List[Any] = [],
|
260
|
+
tracer: Dict[str, Any] = {},
|
246
261
|
):
|
247
262
|
user_message_time = user_message_time or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
248
263
|
turn_id = tracer.get("mid") or str(uuid.uuid4())
|
@@ -253,6 +268,7 @@ def save_to_conversation_log(
|
|
253
268
|
"created": user_message_time,
|
254
269
|
"images": query_images,
|
255
270
|
"turnId": turn_id,
|
271
|
+
"queryFiles": [file.model_dump(mode="json") for file in raw_query_files],
|
256
272
|
},
|
257
273
|
khoj_message_metadata={
|
258
274
|
"context": compiled_references,
|
@@ -287,25 +303,50 @@ Khoj: "{inferred_queries if ("text-to-image" in intent_type) else chat_response}
|
|
287
303
|
)
|
288
304
|
|
289
305
|
|
290
|
-
def construct_structured_message(
|
306
|
+
def construct_structured_message(
|
307
|
+
message: str, images: list[str], model_type: str, vision_enabled: bool, attached_file_context: str
|
308
|
+
):
|
291
309
|
"""
|
292
310
|
Format messages into appropriate multimedia format for supported chat model types
|
293
311
|
"""
|
294
|
-
if not images or not vision_enabled:
|
295
|
-
return message
|
296
|
-
|
297
312
|
if model_type in [
|
298
313
|
ChatModelOptions.ModelType.OPENAI,
|
299
314
|
ChatModelOptions.ModelType.GOOGLE,
|
300
315
|
ChatModelOptions.ModelType.ANTHROPIC,
|
301
316
|
]:
|
302
|
-
|
317
|
+
constructed_messages: List[Any] = [
|
303
318
|
{"type": "text", "text": message},
|
304
|
-
*[{"type": "image_url", "image_url": {"url": image}} for image in images],
|
305
319
|
]
|
320
|
+
|
321
|
+
if not is_none_or_empty(attached_file_context):
|
322
|
+
constructed_messages.append({"type": "text", "text": attached_file_context})
|
323
|
+
if vision_enabled and images:
|
324
|
+
for image in images:
|
325
|
+
constructed_messages.append({"type": "image_url", "image_url": {"url": image}})
|
326
|
+
return constructed_messages
|
327
|
+
|
328
|
+
if not is_none_or_empty(attached_file_context):
|
329
|
+
return f"{attached_file_context}\n\n{message}"
|
330
|
+
|
306
331
|
return message
|
307
332
|
|
308
333
|
|
334
|
+
def gather_raw_query_files(
|
335
|
+
query_files: Dict[str, str],
|
336
|
+
):
|
337
|
+
"""
|
338
|
+
Gather contextual data from the given (raw) files
|
339
|
+
"""
|
340
|
+
|
341
|
+
if len(query_files) == 0:
|
342
|
+
return ""
|
343
|
+
|
344
|
+
contextual_data = " ".join(
|
345
|
+
[f"File: {file_name}\n\n{file_content}\n\n" for file_name, file_content in query_files.items()]
|
346
|
+
)
|
347
|
+
return f"I have attached the following files:\n\n{contextual_data}"
|
348
|
+
|
349
|
+
|
309
350
|
def generate_chatml_messages_with_context(
|
310
351
|
user_message,
|
311
352
|
system_message=None,
|
@@ -318,6 +359,7 @@ def generate_chatml_messages_with_context(
|
|
318
359
|
vision_enabled=False,
|
319
360
|
model_type="",
|
320
361
|
context_message="",
|
362
|
+
query_files: str = None,
|
321
363
|
):
|
322
364
|
"""Generate chat messages with appropriate context from previous conversation to send to the chat model"""
|
323
365
|
# Set max prompt size from user config or based on pre-configured for model and machine specs
|
@@ -334,21 +376,42 @@ def generate_chatml_messages_with_context(
|
|
334
376
|
chatml_messages: List[ChatMessage] = []
|
335
377
|
for chat in conversation_log.get("chat", []):
|
336
378
|
message_context = ""
|
379
|
+
message_attached_files = ""
|
380
|
+
|
381
|
+
chat_message = chat.get("message")
|
382
|
+
|
337
383
|
if chat["by"] == "khoj" and "excalidraw" in chat["intent"].get("type", ""):
|
338
|
-
|
384
|
+
chat_message = chat["intent"].get("inferred-queries")[0]
|
339
385
|
if not is_none_or_empty(chat.get("context")):
|
340
386
|
references = "\n\n".join(
|
341
|
-
{
|
387
|
+
{
|
388
|
+
f"# File: {item['file']}\n## {item['compiled']}\n"
|
389
|
+
for item in chat.get("context") or []
|
390
|
+
if isinstance(item, dict)
|
391
|
+
}
|
342
392
|
)
|
343
393
|
message_context += f"{prompts.notes_conversation.format(references=references)}\n\n"
|
394
|
+
|
395
|
+
if chat.get("queryFiles"):
|
396
|
+
raw_query_files = chat.get("queryFiles")
|
397
|
+
query_files_dict = dict()
|
398
|
+
for file in raw_query_files:
|
399
|
+
query_files_dict[file["name"]] = file["content"]
|
400
|
+
|
401
|
+
message_attached_files = gather_raw_query_files(query_files_dict)
|
402
|
+
chatml_messages.append(ChatMessage(content=message_attached_files, role="user"))
|
403
|
+
|
344
404
|
if not is_none_or_empty(chat.get("onlineContext")):
|
345
405
|
message_context += f"{prompts.online_search_conversation.format(online_results=chat.get('onlineContext'))}"
|
406
|
+
|
346
407
|
if not is_none_or_empty(message_context):
|
347
408
|
reconstructed_context_message = ChatMessage(content=message_context, role="user")
|
348
409
|
chatml_messages.insert(0, reconstructed_context_message)
|
349
410
|
|
350
411
|
role = "user" if chat["by"] == "you" else "assistant"
|
351
|
-
message_content = construct_structured_message(
|
412
|
+
message_content = construct_structured_message(
|
413
|
+
chat_message, chat.get("images"), model_type, vision_enabled, attached_file_context=query_files
|
414
|
+
)
|
352
415
|
|
353
416
|
reconstructed_message = ChatMessage(content=message_content, role=role)
|
354
417
|
chatml_messages.insert(0, reconstructed_message)
|
@@ -360,14 +423,18 @@ def generate_chatml_messages_with_context(
|
|
360
423
|
if not is_none_or_empty(user_message):
|
361
424
|
messages.append(
|
362
425
|
ChatMessage(
|
363
|
-
content=construct_structured_message(
|
426
|
+
content=construct_structured_message(
|
427
|
+
user_message, query_images, model_type, vision_enabled, query_files
|
428
|
+
),
|
364
429
|
role="user",
|
365
430
|
)
|
366
431
|
)
|
367
432
|
if not is_none_or_empty(context_message):
|
368
433
|
messages.append(ChatMessage(content=context_message, role="user"))
|
434
|
+
|
369
435
|
if len(chatml_messages) > 0:
|
370
436
|
messages += chatml_messages
|
437
|
+
|
371
438
|
if not is_none_or_empty(system_message):
|
372
439
|
messages.append(ChatMessage(content=system_message, role="system"))
|
373
440
|
|
@@ -443,7 +510,7 @@ def truncate_messages(
|
|
443
510
|
truncated_message = encoder.decode(encoder.encode(original_question)[:remaining_tokens]).strip()
|
444
511
|
messages = [ChatMessage(content=truncated_message, role=messages[0].role)]
|
445
512
|
logger.debug(
|
446
|
-
f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message}"
|
513
|
+
f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message[:1000]}..."
|
447
514
|
)
|
448
515
|
|
449
516
|
if system_message:
|
khoj/processor/image/generate.py
CHANGED
@@ -28,6 +28,7 @@ async def text_to_image(
|
|
28
28
|
send_status_func: Optional[Callable] = None,
|
29
29
|
query_images: Optional[List[str]] = None,
|
30
30
|
agent: Agent = None,
|
31
|
+
query_files: str = None,
|
31
32
|
tracer: dict = {},
|
32
33
|
):
|
33
34
|
status_code = 200
|
@@ -69,6 +70,7 @@ async def text_to_image(
|
|
69
70
|
query_images=query_images,
|
70
71
|
user=user,
|
71
72
|
agent=agent,
|
73
|
+
query_files=query_files,
|
72
74
|
tracer=tracer,
|
73
75
|
)
|
74
76
|
|
@@ -4,7 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import urllib.parse
|
6
6
|
from collections import defaultdict
|
7
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
8
8
|
|
9
9
|
import aiohttp
|
10
10
|
from bs4 import BeautifulSoup
|
@@ -66,7 +66,9 @@ async def search_online(
|
|
66
66
|
custom_filters: List[str] = [],
|
67
67
|
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
68
68
|
query_images: List[str] = None,
|
69
|
+
previous_subqueries: Set = set(),
|
69
70
|
agent: Agent = None,
|
71
|
+
query_files: str = None,
|
70
72
|
tracer: dict = {},
|
71
73
|
):
|
72
74
|
query += " ".join(custom_filters)
|
@@ -76,36 +78,52 @@ async def search_online(
|
|
76
78
|
return
|
77
79
|
|
78
80
|
# Breakdown the query into subqueries to get the correct answer
|
79
|
-
|
80
|
-
query,
|
81
|
+
new_subqueries = await generate_online_subqueries(
|
82
|
+
query,
|
83
|
+
conversation_history,
|
84
|
+
location,
|
85
|
+
user,
|
86
|
+
query_images=query_images,
|
87
|
+
agent=agent,
|
88
|
+
tracer=tracer,
|
89
|
+
query_files=query_files,
|
81
90
|
)
|
82
|
-
|
91
|
+
subqueries = list(new_subqueries - previous_subqueries)
|
92
|
+
response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {}
|
83
93
|
|
84
|
-
if subqueries:
|
85
|
-
logger.info(
|
86
|
-
|
87
|
-
|
88
|
-
async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
|
89
|
-
yield {ChatEvent.STATUS: event}
|
94
|
+
if is_none_or_empty(subqueries):
|
95
|
+
logger.info("No new subqueries to search online")
|
96
|
+
yield response_dict
|
97
|
+
return
|
90
98
|
|
91
|
-
|
99
|
+
logger.info(f"🌐 Searching the Internet for {subqueries}")
|
100
|
+
if send_status_func:
|
101
|
+
subqueries_str = "\n- " + "\n- ".join(subqueries)
|
102
|
+
async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
|
103
|
+
yield {ChatEvent.STATUS: event}
|
104
|
+
|
105
|
+
with timer(f"Internet searches for {subqueries} took", logger):
|
92
106
|
search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
|
93
107
|
search_tasks = [search_func(subquery, location) for subquery in subqueries]
|
94
108
|
search_results = await asyncio.gather(*search_tasks)
|
95
109
|
response_dict = {subquery: search_result for subquery, search_result in search_results}
|
96
110
|
|
97
111
|
# Gather distinct web pages from organic results for subqueries without an instant answer.
|
98
|
-
# Content of web pages is directly available when Jina is used for search.
|
99
112
|
webpages: Dict[str, Dict] = {}
|
100
113
|
for subquery in response_dict:
|
101
114
|
if "answerBox" in response_dict[subquery]:
|
102
115
|
continue
|
103
|
-
for organic in response_dict[subquery].get("organic", [])
|
116
|
+
for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
|
104
117
|
link = organic.get("link")
|
105
|
-
if link in webpages:
|
118
|
+
if link in webpages and idx < max_webpages_to_read:
|
106
119
|
webpages[link]["queries"].add(subquery)
|
107
|
-
|
120
|
+
# Content of web pages is directly available when Jina is used for search.
|
121
|
+
elif idx < max_webpages_to_read:
|
108
122
|
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
123
|
+
# Only keep webpage content for up to max_webpages_to_read organic results.
|
124
|
+
if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
|
125
|
+
organic["content"] = None
|
126
|
+
response_dict[subquery]["organic"][idx] = organic
|
109
127
|
|
110
128
|
# Read, extract relevant info from the retrieved web pages
|
111
129
|
if webpages:
|
@@ -115,7 +133,9 @@ async def search_online(
|
|
115
133
|
async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
|
116
134
|
yield {ChatEvent.STATUS: event}
|
117
135
|
tasks = [
|
118
|
-
read_webpage_and_extract_content(
|
136
|
+
read_webpage_and_extract_content(
|
137
|
+
data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer
|
138
|
+
)
|
119
139
|
for link, data in webpages.items()
|
120
140
|
]
|
121
141
|
results = await asyncio.gather(*tasks)
|
@@ -157,13 +177,21 @@ async def read_webpages(
|
|
157
177
|
send_status_func: Optional[Callable] = None,
|
158
178
|
query_images: List[str] = None,
|
159
179
|
agent: Agent = None,
|
160
|
-
tracer: dict = {},
|
161
180
|
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
181
|
+
query_files: str = None,
|
182
|
+
tracer: dict = {},
|
162
183
|
):
|
163
184
|
"Infer web pages to read from the query and extract relevant information from them"
|
164
185
|
logger.info(f"Inferring web pages to read")
|
165
186
|
urls = await infer_webpage_urls(
|
166
|
-
query,
|
187
|
+
query,
|
188
|
+
conversation_history,
|
189
|
+
location,
|
190
|
+
user,
|
191
|
+
query_images,
|
192
|
+
agent=agent,
|
193
|
+
query_files=query_files,
|
194
|
+
tracer=tracer,
|
167
195
|
)
|
168
196
|
|
169
197
|
# Get the top 10 web pages to read
|
@@ -355,3 +383,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
|
|
355
383
|
for item in response_json["data"]
|
356
384
|
]
|
357
385
|
return query, {"organic": parsed_response}
|
386
|
+
|
387
|
+
|
388
|
+
def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results based on links across all queries.

    Queries are walked in iteration order and only the first occurrence of
    each link is kept, so a link returned for several queries stays with the
    earliest query that produced it.

    Args:
        online_results: Mapping of query to its search results dict. Each
            results dict may hold an "organic" list of dicts with a "link" key.

    Returns:
        A new dict with the same queries. Each value is a shallow copy of the
        original results whose "organic" list holds only first-seen links.
        The "organic" key is always present in the output, and organic
        entries without a link are dropped. The input dicts are not mutated.
    """
    if not online_results:
        return {}

    # Keep track of seen links to filter out duplicates across queries
    seen_links = set()
    deduplicated_results = {}

    # Process each query's results
    for query, results in online_results.items():
        # Filter organic results keeping only first occurrence of each link
        filtered_organic = []
        # `or []` guards against an "organic" key explicitly set to None
        for result in results.get("organic") or []:
            link = result.get("link")
            if link and link not in seen_links:
                seen_links.add(link)
                filtered_organic.append(result)

        # Copy the results so the caller's dict is left untouched
        deduplicated_results[query] = {**results, "organic": filtered_organic}

    return deduplicated_results
|